• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "Blitter.hpp"
16 
17 #include "Pipeline/ShaderCore.hpp"
18 #include "Reactor/Reactor.hpp"
19 #include "System/CPUID.hpp"
20 #include "System/Debug.hpp"
21 #include "System/Half.hpp"
22 #include "System/Memory.hpp"
23 #include "Vulkan/VkImage.hpp"
24 #include "Vulkan/VkImageView.hpp"
25 
26 #include <utility>
27 
28 #if defined(__i386__) || defined(__x86_64__)
29 #	include <xmmintrin.h>
30 #	include <emmintrin.h>
31 #endif
32 
33 namespace sw {
34 
PackFields(rr::Int4 const & ints,const sw::int4 shifts)35 static rr::RValue<rr::Int> PackFields(rr::Int4 const &ints, const sw::int4 shifts)
36 {
37 	return (rr::Int(ints.x) << shifts[0]) |
38 	       (rr::Int(ints.y) << shifts[1]) |
39 	       (rr::Int(ints.z) << shifts[2]) |
40 	       (rr::Int(ints.w) << shifts[3]);
41 }
42 
Blitter()43 Blitter::Blitter()
44     : blitMutex()
45     , blitCache(1024)
46     , cornerUpdateMutex()
47     , cornerUpdateCache(64)  // We only need one of these per format
48 {
49 }
50 
~Blitter()51 Blitter::~Blitter()
52 {
53 }
54 
clear(const void * pixel,vk::Format format,vk::Image * dest,const vk::Format & viewFormat,const VkImageSubresourceRange & subresourceRange,const VkRect2D * renderArea)55 void Blitter::clear(const void *pixel, vk::Format format, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea)
56 {
57 	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
58 	vk::Format dstFormat = viewFormat.getAspectFormat(aspect);
59 	if(dstFormat == VK_FORMAT_UNDEFINED)
60 	{
61 		return;
62 	}
63 
64 	VkClearColorValue clampedPixel;
65 	if(viewFormat.isSignedNormalized() || viewFormat.isUnsignedNormalized())
66 	{
67 		const float minValue = viewFormat.isSignedNormalized() ? -1.0f : 0.0f;
68 		memcpy(clampedPixel.float32, pixel, sizeof(VkClearColorValue));
69 		clampedPixel.float32[0] = sw::clamp(clampedPixel.float32[0], minValue, 1.0f);
70 		clampedPixel.float32[1] = sw::clamp(clampedPixel.float32[1], minValue, 1.0f);
71 		clampedPixel.float32[2] = sw::clamp(clampedPixel.float32[2], minValue, 1.0f);
72 		clampedPixel.float32[3] = sw::clamp(clampedPixel.float32[3], minValue, 1.0f);
73 		pixel = clampedPixel.float32;
74 	}
75 
76 	if(fastClear(pixel, format, dest, dstFormat, subresourceRange, renderArea))
77 	{
78 		return;
79 	}
80 
81 	State state(format, dstFormat, 1, dest->getSampleCountFlagBits(), Options{ 0xF });
82 	auto blitRoutine = getBlitRoutine(state);
83 	if(!blitRoutine)
84 	{
85 		return;
86 	}
87 
88 	VkImageSubresource subres = {
89 		subresourceRange.aspectMask,
90 		subresourceRange.baseMipLevel,
91 		subresourceRange.baseArrayLayer
92 	};
93 
94 	uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
95 	uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);
96 
97 	VkRect2D area = { { 0, 0 }, { 0, 0 } };
98 	if(renderArea)
99 	{
100 		ASSERT(subresourceRange.levelCount == 1);
101 		area = *renderArea;
102 	}
103 
104 	for(; subres.mipLevel <= lastMipLevel; subres.mipLevel++)
105 	{
106 		VkExtent3D extent = dest->getMipLevelExtent(aspect, subres.mipLevel);
107 		if(!renderArea)
108 		{
109 			area.extent.width = extent.width;
110 			area.extent.height = extent.height;
111 		}
112 
113 		BlitData data = {
114 			pixel, nullptr,  // source, dest
115 
116 			assert_cast<uint32_t>(format.bytes()),                                  // sPitchB
117 			assert_cast<uint32_t>(dest->rowPitchBytes(aspect, subres.mipLevel)),    // dPitchB
118 			0,                                                                      // sSliceB (unused in clear operations)
119 			assert_cast<uint32_t>(dest->slicePitchBytes(aspect, subres.mipLevel)),  // dSliceB
120 
121 			0.5f, 0.5f, 0.5f, 0.0f, 0.0f, 0.0f,  // x0, y0, z0, w, h, d
122 
123 			area.offset.x, static_cast<int>(area.offset.x + area.extent.width),   // x0d, x1d
124 			area.offset.y, static_cast<int>(area.offset.y + area.extent.height),  // y0d, y1d
125 			0, 1,                                                                 // z0d, z1d
126 
127 			0, 0, 0,  // sWidth, sHeight, sDepth
128 
129 			false,  // filter3D
130 		};
131 
132 		if(renderArea && dest->is3DSlice())
133 		{
134 			// Reinterpret layers as depth slices
135 			subres.arrayLayer = 0;
136 			for(uint32_t depth = subresourceRange.baseArrayLayer; depth <= lastLayer; depth++)
137 			{
138 				data.dest = dest->getTexelPointer({ 0, 0, static_cast<int32_t>(depth) }, subres);
139 				blitRoutine(&data);
140 			}
141 		}
142 		else
143 		{
144 			for(subres.arrayLayer = subresourceRange.baseArrayLayer; subres.arrayLayer <= lastLayer; subres.arrayLayer++)
145 			{
146 				for(uint32_t depth = 0; depth < extent.depth; depth++)
147 				{
148 					data.dest = dest->getTexelPointer({ 0, 0, static_cast<int32_t>(depth) }, subres);
149 
150 					blitRoutine(&data);
151 				}
152 			}
153 		}
154 	}
155 	dest->contentsChanged(subresourceRange);
156 }
157 
fastClear(const void * clearValue,vk::Format clearFormat,vk::Image * dest,const vk::Format & viewFormat,const VkImageSubresourceRange & subresourceRange,const VkRect2D * renderArea)158 bool Blitter::fastClear(const void *clearValue, vk::Format clearFormat, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea)
159 {
160 	if(clearFormat != VK_FORMAT_R32G32B32A32_SFLOAT &&
161 	   clearFormat != VK_FORMAT_D32_SFLOAT &&
162 	   clearFormat != VK_FORMAT_S8_UINT)
163 	{
164 		return false;
165 	}
166 
167 	union ClearValue
168 	{
169 		struct
170 		{
171 			float r;
172 			float g;
173 			float b;
174 			float a;
175 		};
176 
177 		float rgb[3];
178 
179 		float d;
180 		uint32_t d_as_u32;
181 
182 		uint32_t s;
183 	};
184 
185 	const ClearValue &c = *reinterpret_cast<const ClearValue *>(clearValue);
186 
187 	uint32_t packed = 0;
188 
189 	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
190 	switch(viewFormat)
191 	{
192 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
193 		packed = ((uint16_t)(31 * c.b + 0.5f) << 0) |
194 		         ((uint16_t)(63 * c.g + 0.5f) << 5) |
195 		         ((uint16_t)(31 * c.r + 0.5f) << 11);
196 		break;
197 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
198 		packed = ((uint16_t)(31 * c.r + 0.5f) << 0) |
199 		         ((uint16_t)(63 * c.g + 0.5f) << 5) |
200 		         ((uint16_t)(31 * c.b + 0.5f) << 11);
201 		break;
202 	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
203 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
204 	case VK_FORMAT_R8G8B8A8_UNORM:
205 		packed = ((uint32_t)(255 * c.a + 0.5f) << 24) |
206 		         ((uint32_t)(255 * c.b + 0.5f) << 16) |
207 		         ((uint32_t)(255 * c.g + 0.5f) << 8) |
208 		         ((uint32_t)(255 * c.r + 0.5f) << 0);
209 		break;
210 	case VK_FORMAT_B8G8R8A8_UNORM:
211 		packed = ((uint32_t)(255 * c.a + 0.5f) << 24) |
212 		         ((uint32_t)(255 * c.r + 0.5f) << 16) |
213 		         ((uint32_t)(255 * c.g + 0.5f) << 8) |
214 		         ((uint32_t)(255 * c.b + 0.5f) << 0);
215 		break;
216 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
217 		packed = R11G11B10F(c.rgb);
218 		break;
219 	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
220 		packed = RGB9E5(c.rgb);
221 		break;
222 	case VK_FORMAT_D32_SFLOAT:
223 		ASSERT(clearFormat == VK_FORMAT_D32_SFLOAT);
224 		packed = c.d_as_u32;  // float reinterpreted as uint32
225 		break;
226 	case VK_FORMAT_S8_UINT:
227 		ASSERT(clearFormat == VK_FORMAT_S8_UINT);
228 		packed = static_cast<uint8_t>(c.s);
229 		break;
230 	default:
231 		return false;
232 	}
233 
234 	VkImageSubresource subres = {
235 		subresourceRange.aspectMask,
236 		subresourceRange.baseMipLevel,
237 		subresourceRange.baseArrayLayer
238 	};
239 	uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
240 	uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);
241 
242 	VkRect2D area = { { 0, 0 }, { 0, 0 } };
243 	if(renderArea)
244 	{
245 		ASSERT(subresourceRange.levelCount == 1);
246 		area = *renderArea;
247 	}
248 
249 	for(; subres.mipLevel <= lastMipLevel; subres.mipLevel++)
250 	{
251 		int rowPitchBytes = dest->rowPitchBytes(aspect, subres.mipLevel);
252 		int slicePitchBytes = dest->slicePitchBytes(aspect, subres.mipLevel);
253 		VkExtent3D extent = dest->getMipLevelExtent(aspect, subres.mipLevel);
254 		if(!renderArea)
255 		{
256 			area.extent.width = extent.width;
257 			area.extent.height = extent.height;
258 		}
259 		if(dest->is3DSlice())
260 		{
261 			extent.depth = 1;  // The 3D image is instead interpreted as a 2D image with layers
262 		}
263 
264 		for(subres.arrayLayer = subresourceRange.baseArrayLayer; subres.arrayLayer <= lastLayer; subres.arrayLayer++)
265 		{
266 			for(uint32_t depth = 0; depth < extent.depth; depth++)
267 			{
268 				uint8_t *slice = (uint8_t *)dest->getTexelPointer(
269 				    { area.offset.x, area.offset.y, static_cast<int32_t>(depth) }, subres);
270 
271 				for(int j = 0; j < dest->getSampleCountFlagBits(); j++)
272 				{
273 					uint8_t *d = slice;
274 
275 					switch(viewFormat.bytes())
276 					{
277 					case 4:
278 						for(uint32_t i = 0; i < area.extent.height; i++)
279 						{
280 							ASSERT(d < dest->end());
281 							sw::clear((uint32_t *)d, packed, area.extent.width);
282 							d += rowPitchBytes;
283 						}
284 						break;
285 					case 2:
286 						for(uint32_t i = 0; i < area.extent.height; i++)
287 						{
288 							ASSERT(d < dest->end());
289 							sw::clear((uint16_t *)d, static_cast<uint16_t>(packed), area.extent.width);
290 							d += rowPitchBytes;
291 						}
292 						break;
293 					case 1:
294 						for(uint32_t i = 0; i < area.extent.height; i++)
295 						{
296 							ASSERT(d < dest->end());
297 							memset(d, packed, area.extent.width);
298 							d += rowPitchBytes;
299 						}
300 						break;
301 					default:
302 						assert(false);
303 					}
304 
305 					slice += slicePitchBytes;
306 				}
307 			}
308 		}
309 	}
310 	dest->contentsChanged(subresourceRange);
311 
312 	return true;
313 }
314 
readFloat4(Pointer<Byte> element,const State & state)315 Float4 Blitter::readFloat4(Pointer<Byte> element, const State &state)
316 {
317 	Float4 c(0.0f, 0.0f, 0.0f, 1.0f);
318 
319 	switch(state.sourceFormat)
320 	{
321 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
322 		c.w = Float(Int(*Pointer<Byte>(element)) & Int(0xF));
323 		c.x = Float((Int(*Pointer<Byte>(element)) >> 4) & Int(0xF));
324 		c.y = Float(Int(*Pointer<Byte>(element + 1)) & Int(0xF));
325 		c.z = Float((Int(*Pointer<Byte>(element + 1)) >> 4) & Int(0xF));
326 		break;
327 	case VK_FORMAT_R8_SINT:
328 	case VK_FORMAT_R8_SNORM:
329 		c.x = Float(Int(*Pointer<SByte>(element)));
330 		c.w = float(0x7F);
331 		break;
332 	case VK_FORMAT_R8_UNORM:
333 	case VK_FORMAT_R8_UINT:
334 	case VK_FORMAT_R8_SRGB:
335 		c.x = Float(Int(*Pointer<Byte>(element)));
336 		c.w = float(0xFF);
337 		break;
338 	case VK_FORMAT_R16_SINT:
339 	case VK_FORMAT_R16_SNORM:
340 		c.x = Float(Int(*Pointer<Short>(element)));
341 		c.w = float(0x7FFF);
342 		break;
343 	case VK_FORMAT_R16_UNORM:
344 	case VK_FORMAT_R16_UINT:
345 		c.x = Float(Int(*Pointer<UShort>(element)));
346 		c.w = float(0xFFFF);
347 		break;
348 	case VK_FORMAT_R32_SINT:
349 		c.x = Float(*Pointer<Int>(element));
350 		c.w = float(0x7FFFFFFF);
351 		break;
352 	case VK_FORMAT_R32_UINT:
353 		c.x = Float(*Pointer<UInt>(element));
354 		c.w = float(0xFFFFFFFF);
355 		break;
356 	case VK_FORMAT_B8G8R8A8_SRGB:
357 	case VK_FORMAT_B8G8R8A8_UNORM:
358 		c = Float4(*Pointer<Byte4>(element)).zyxw;
359 		break;
360 	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
361 	case VK_FORMAT_R8G8B8A8_SINT:
362 	case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
363 	case VK_FORMAT_R8G8B8A8_SNORM:
364 		c = Float4(*Pointer<SByte4>(element));
365 		break;
366 	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
367 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
368 	case VK_FORMAT_R8G8B8A8_UNORM:
369 	case VK_FORMAT_R8G8B8A8_UINT:
370 	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
371 	case VK_FORMAT_R8G8B8A8_SRGB:
372 		c = Float4(*Pointer<Byte4>(element));
373 		break;
374 	case VK_FORMAT_R16G16B16A16_SINT:
375 	case VK_FORMAT_R16G16B16A16_SNORM:
376 		c = Float4(*Pointer<Short4>(element));
377 		break;
378 	case VK_FORMAT_R16G16B16A16_UNORM:
379 	case VK_FORMAT_R16G16B16A16_UINT:
380 		c = Float4(*Pointer<UShort4>(element));
381 		break;
382 	case VK_FORMAT_R32G32B32A32_SINT:
383 		c = Float4(*Pointer<Int4>(element));
384 		break;
385 	case VK_FORMAT_R32G32B32A32_UINT:
386 		c = Float4(*Pointer<UInt4>(element));
387 		break;
388 	case VK_FORMAT_R8G8_SINT:
389 	case VK_FORMAT_R8G8_SNORM:
390 		c.x = Float(Int(*Pointer<SByte>(element + 0)));
391 		c.y = Float(Int(*Pointer<SByte>(element + 1)));
392 		c.w = float(0x7F);
393 		break;
394 	case VK_FORMAT_R8G8_UNORM:
395 	case VK_FORMAT_R8G8_UINT:
396 	case VK_FORMAT_R8G8_SRGB:
397 		c.x = Float(Int(*Pointer<Byte>(element + 0)));
398 		c.y = Float(Int(*Pointer<Byte>(element + 1)));
399 		c.w = float(0xFF);
400 		break;
401 	case VK_FORMAT_R16G16_SINT:
402 	case VK_FORMAT_R16G16_SNORM:
403 		c.x = Float(Int(*Pointer<Short>(element + 0)));
404 		c.y = Float(Int(*Pointer<Short>(element + 2)));
405 		c.w = float(0x7FFF);
406 		break;
407 	case VK_FORMAT_R16G16_UNORM:
408 	case VK_FORMAT_R16G16_UINT:
409 		c.x = Float(Int(*Pointer<UShort>(element + 0)));
410 		c.y = Float(Int(*Pointer<UShort>(element + 2)));
411 		c.w = float(0xFFFF);
412 		break;
413 	case VK_FORMAT_R32G32_SINT:
414 		c.x = Float(*Pointer<Int>(element + 0));
415 		c.y = Float(*Pointer<Int>(element + 4));
416 		c.w = float(0x7FFFFFFF);
417 		break;
418 	case VK_FORMAT_R32G32_UINT:
419 		c.x = Float(*Pointer<UInt>(element + 0));
420 		c.y = Float(*Pointer<UInt>(element + 4));
421 		c.w = float(0xFFFFFFFF);
422 		break;
423 	case VK_FORMAT_R32G32B32A32_SFLOAT:
424 		c = *Pointer<Float4>(element);
425 		break;
426 	case VK_FORMAT_R32G32_SFLOAT:
427 		c.x = *Pointer<Float>(element + 0);
428 		c.y = *Pointer<Float>(element + 4);
429 		break;
430 	case VK_FORMAT_R32_SFLOAT:
431 		c.x = *Pointer<Float>(element);
432 		break;
433 	case VK_FORMAT_R16G16B16A16_SFLOAT:
434 		c.w = Float(*Pointer<Half>(element + 6));
435 	case VK_FORMAT_R16G16B16_SFLOAT:
436 		c.z = Float(*Pointer<Half>(element + 4));
437 	case VK_FORMAT_R16G16_SFLOAT:
438 		c.y = Float(*Pointer<Half>(element + 2));
439 	case VK_FORMAT_R16_SFLOAT:
440 		c.x = Float(*Pointer<Half>(element));
441 		break;
442 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
443 		c = r11g11b10Unpack(*Pointer<UInt>(element));
444 		break;
445 	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
446 		// This type contains a common 5 bit exponent (E) and a 9 bit the mantissa for R, G and B.
447 		c.x = Float(*Pointer<UInt>(element) & UInt(0x000001FF));          // R's mantissa (bits 0-8)
448 		c.y = Float((*Pointer<UInt>(element) & UInt(0x0003FE00)) >> 9);   // G's mantissa (bits 9-17)
449 		c.z = Float((*Pointer<UInt>(element) & UInt(0x07FC0000)) >> 18);  // B's mantissa (bits 18-26)
450 		c *= Float4(
451 		    // 2^E, using the exponent (bits 27-31) and treating it as an unsigned integer value
452 		    Float(UInt(1) << ((*Pointer<UInt>(element) & UInt(0xF8000000)) >> 27)) *
453 		    // Since the 9 bit mantissa values currently stored in RGB were converted straight
454 		    // from int to float (in the [0, 1<<9] range instead of the [0, 1] range), they
455 		    // are (1 << 9) times too high.
456 		    // Also, the exponent has 5 bits and we compute the exponent bias of floating point
457 		    // formats using "2^(k-1) - 1", so, in this case, the exponent bias is 2^(5-1)-1 = 15
458 		    // Exponent bias (15) + number of mantissa bits per component (9) = 24
459 		    Float(1.0f / (1 << 24)));
460 		c.w = 1.0f;
461 		break;
462 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
463 		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
464 		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
465 		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
466 		c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
467 		break;
468 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
469 		c.w = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
470 		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
471 		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
472 		c.x = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
473 		break;
474 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
475 		c.w = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
476 		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
477 		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
478 		c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
479 		break;
480 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
481 		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
482 		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));
483 		c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
484 		break;
485 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
486 		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
487 		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));
488 		c.x = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
489 		break;
490 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
491 		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
492 		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07C0)) >> UShort(6)));
493 		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x003E)) >> UShort(1)));
494 		c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x0001)));
495 		break;
496 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
497 		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
498 		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07C0)) >> UShort(6)));
499 		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x003E)) >> UShort(1)));
500 		c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x0001)));
501 		break;
502 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
503 		c.w = Float(Int((*Pointer<UShort>(element) & UShort(0x8000)) >> UShort(15)));
504 		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x7C00)) >> UShort(10)));
505 		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x03E0)) >> UShort(5)));
506 		c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
507 		break;
508 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
509 	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
510 		c.x = Float(Int((*Pointer<UInt>(element) & UInt(0x000003FF))));
511 		c.y = Float(Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10));
512 		c.z = Float(Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20));
513 		c.w = Float(Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30));
514 		break;
515 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
516 	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
517 		c.z = Float(Int((*Pointer<UInt>(element) & UInt(0x000003FF))));
518 		c.y = Float(Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10));
519 		c.x = Float(Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20));
520 		c.w = Float(Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30));
521 		break;
522 	case VK_FORMAT_D16_UNORM:
523 		c.x = Float(Int((*Pointer<UShort>(element))));
524 		break;
525 	case VK_FORMAT_X8_D24_UNORM_PACK32:
526 		c.x = Float(Int((*Pointer<UInt>(element) & UInt(0xFFFFFF00)) >> 8));
527 		break;
528 	case VK_FORMAT_D32_SFLOAT:
529 		c.x = *Pointer<Float>(element);
530 		break;
531 	case VK_FORMAT_S8_UINT:
532 		c.x = Float(Int(*Pointer<Byte>(element)));
533 		break;
534 	default:
535 		UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
536 	}
537 
538 	return c;
539 }
540 
write(Float4 & c,Pointer<Byte> element,const State & state)541 void Blitter::write(Float4 &c, Pointer<Byte> element, const State &state)
542 {
543 	bool writeR = state.writeRed;
544 	bool writeG = state.writeGreen;
545 	bool writeB = state.writeBlue;
546 	bool writeA = state.writeAlpha;
547 	bool writeRGBA = writeR && writeG && writeB && writeA;
548 
549 	switch(state.destFormat)
550 	{
551 	case VK_FORMAT_R4G4_UNORM_PACK8:
552 		if(writeR | writeG)
553 		{
554 			if(!writeR)
555 			{
556 				*Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
557 				                          (*Pointer<Byte>(element) & Byte(0xF0));
558 			}
559 			else if(!writeG)
560 			{
561 				*Pointer<Byte>(element) = (*Pointer<Byte>(element) & Byte(0xF)) |
562 				                          (Byte(RoundInt(Float(c.x))) << Byte(4));
563 			}
564 			else
565 			{
566 				*Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
567 				                          (Byte(RoundInt(Float(c.x))) << Byte(4));
568 			}
569 		}
570 		break;
571 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
572 		if(writeRGBA)
573 		{
574 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 12, 8, 4, 0 }));
575 		}
576 		else
577 		{
578 			unsigned short mask = (writeA ? 0x000F : 0x0000) |
579 			                      (writeB ? 0x00F0 : 0x0000) |
580 			                      (writeG ? 0x0F00 : 0x0000) |
581 			                      (writeR ? 0xF000 : 0x0000);
582 			unsigned short unmask = ~mask;
583 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
584 			                            (UShort(PackFields(RoundInt(c) & Int4(0xF), { 12, 8, 4, 0 })) & UShort(mask));
585 		}
586 		break;
587 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
588 		if(writeRGBA)
589 		{
590 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 4, 8, 12, 0 }));
591 		}
592 		else
593 		{
594 			unsigned short mask = (writeA ? 0x000F : 0x0000) |
595 			                      (writeR ? 0x00F0 : 0x0000) |
596 			                      (writeG ? 0x0F00 : 0x0000) |
597 			                      (writeB ? 0xF000 : 0x0000);
598 			unsigned short unmask = ~mask;
599 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
600 			                            (UShort(PackFields(RoundInt(c) & Int4(0xF), { 4, 8, 12, 0 })) & UShort(mask));
601 		}
602 		break;
603 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
604 		if(writeRGBA)
605 		{
606 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 8, 4, 0, 12 }));
607 		}
608 		else
609 		{
610 			unsigned short mask = (writeB ? 0x000F : 0x0000) |
611 			                      (writeG ? 0x00F0 : 0x0000) |
612 			                      (writeR ? 0x0F00 : 0x0000) |
613 			                      (writeA ? 0xF000 : 0x0000);
614 			unsigned short unmask = ~mask;
615 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
616 			                            (UShort(PackFields(RoundInt(c) & Int4(0xF), { 8, 4, 0, 12 })) & UShort(mask));
617 		}
618 		break;
619 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
620 		if(writeRGBA)
621 		{
622 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 0, 4, 8, 12 }));
623 		}
624 		else
625 		{
626 			unsigned short mask = (writeR ? 0x000F : 0x0000) |
627 			                      (writeG ? 0x00F0 : 0x0000) |
628 			                      (writeB ? 0x0F00 : 0x0000) |
629 			                      (writeA ? 0xF000 : 0x0000);
630 			unsigned short unmask = ~mask;
631 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
632 			                            (UShort(PackFields(RoundInt(c) & Int4(0xF), { 0, 4, 8, 12 })) & UShort(mask));
633 		}
634 		break;
635 	case VK_FORMAT_B8G8R8A8_SRGB:
636 	case VK_FORMAT_B8G8R8A8_UNORM:
637 		if(writeRGBA)
638 		{
639 			Short4 c0 = RoundShort4(c.zyxw);
640 			*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
641 		}
642 		else
643 		{
644 			if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
645 			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
646 			if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
647 			if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
648 		}
649 		break;
650 	case VK_FORMAT_B8G8R8_SNORM:
651 		if(writeB) { *Pointer<SByte>(element + 0) = SByte(RoundInt(Float(c.z))); }
652 		if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
653 		if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
654 		break;
655 	case VK_FORMAT_B8G8R8_UNORM:
656 	case VK_FORMAT_B8G8R8_SRGB:
657 		if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
658 		if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
659 		if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
660 		break;
661 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
662 	case VK_FORMAT_R8G8B8A8_UNORM:
663 	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
664 	case VK_FORMAT_R8G8B8A8_SRGB:
665 	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
666 	case VK_FORMAT_R8G8B8A8_UINT:
667 	case VK_FORMAT_R8G8B8A8_USCALED:
668 	case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
669 		if(writeRGBA)
670 		{
671 			Short4 c0 = RoundShort4(c);
672 			*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
673 		}
674 		else
675 		{
676 			if(writeR) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.x))); }
677 			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
678 			if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
679 			if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
680 		}
681 		break;
682 	case VK_FORMAT_R32G32B32A32_SFLOAT:
683 		if(writeRGBA)
684 		{
685 			*Pointer<Float4>(element) = c;
686 		}
687 		else
688 		{
689 			if(writeR) { *Pointer<Float>(element) = c.x; }
690 			if(writeG) { *Pointer<Float>(element + 4) = c.y; }
691 			if(writeB) { *Pointer<Float>(element + 8) = c.z; }
692 			if(writeA) { *Pointer<Float>(element + 12) = c.w; }
693 		}
694 		break;
695 	case VK_FORMAT_R32G32B32_SFLOAT:
696 		if(writeR) { *Pointer<Float>(element) = c.x; }
697 		if(writeG) { *Pointer<Float>(element + 4) = c.y; }
698 		if(writeB) { *Pointer<Float>(element + 8) = c.z; }
699 		break;
700 	case VK_FORMAT_R32G32_SFLOAT:
701 		if(writeR && writeG)
702 		{
703 			*Pointer<Float2>(element) = Float2(c);
704 		}
705 		else
706 		{
707 			if(writeR) { *Pointer<Float>(element) = c.x; }
708 			if(writeG) { *Pointer<Float>(element + 4) = c.y; }
709 		}
710 		break;
711 	case VK_FORMAT_R32_SFLOAT:
712 		if(writeR) { *Pointer<Float>(element) = c.x; }
713 		break;
714 	case VK_FORMAT_R16G16B16A16_SFLOAT:
715 		if(writeA) { *Pointer<Half>(element + 6) = Half(c.w); }
716 		// [[fallthrough]]
717 	case VK_FORMAT_R16G16B16_SFLOAT:
718 		if(writeB) { *Pointer<Half>(element + 4) = Half(c.z); }
719 		// [[fallthrough]]
720 	case VK_FORMAT_R16G16_SFLOAT:
721 		if(writeG) { *Pointer<Half>(element + 2) = Half(c.y); }
722 		// [[fallthrough]]
723 	case VK_FORMAT_R16_SFLOAT:
724 		if(writeR) { *Pointer<Half>(element) = Half(c.x); }
725 		break;
726 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
727 		{
728 			UInt rgb = r11g11b10Pack(c);
729 
730 			UInt old = *Pointer<UInt>(element);
731 
732 			unsigned int mask = (writeR ? 0x000007FF : 0) |
733 			                    (writeG ? 0x003FF800 : 0) |
734 			                    (writeB ? 0xFFC00000 : 0);
735 
736 			*Pointer<UInt>(element) = (rgb & mask) | (old & ~mask);
737 		}
738 		break;
739 	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
740 		{
741 			ASSERT(writeRGBA);  // Can't sensibly write just part of this format.
742 
743 			// Vulkan 1.1.117 section 15.2.1 RGB to Shared Exponent Conversion
744 
745 			constexpr int N = 9;       // number of mantissa bits per component
746 			constexpr int B = 15;      // exponent bias
747 			constexpr int E_max = 31;  // maximum possible biased exponent value
748 
749 			// Maximum representable value.
750 			constexpr float sharedexp_max = ((static_cast<float>(1 << N) - 1) / static_cast<float>(1 << N)) * static_cast<float>(1 << (E_max - B));
751 
752 			// Clamp components to valid range. NaN becomes 0.
753 			Float red_c = Min(IfThenElse(!(c.x > 0), Float(0), Float(c.x)), sharedexp_max);
754 			Float green_c = Min(IfThenElse(!(c.y > 0), Float(0), Float(c.y)), sharedexp_max);
755 			Float blue_c = Min(IfThenElse(!(c.z > 0), Float(0), Float(c.z)), sharedexp_max);
756 
757 			// We're reducing the mantissa to 9 bits, so we must round up if the next
758 			// bit is 1. In other words add 0.5 to the new mantissa's position and
759 			// allow overflow into the exponent so we can scale correctly.
760 			constexpr int half = 1 << (23 - N);
761 			Float red_r = As<Float>(As<Int>(red_c) + half);
762 			Float green_r = As<Float>(As<Int>(green_c) + half);
763 			Float blue_r = As<Float>(As<Int>(blue_c) + half);
764 
765 			// The largest component determines the shared exponent. It can't be lower
766 			// than 0 (after bias subtraction) so also limit to the mimimum representable.
767 			constexpr float min_s = 0.5f / (1 << B);
768 			Float max_s = Max(Max(red_r, green_r), Max(blue_r, min_s));
769 
770 			// Obtain the reciprocal of the shared exponent by inverting the bits,
771 			// and scale by the new mantissa's size. Note that the IEEE-754 single-precision
772 			// format has an implicit leading 1, but this shared component format does not.
773 			Float scale = As<Float>((As<Int>(max_s) & 0x7F800000) ^ 0x7F800000) * (1 << (N - 2));
774 
775 			UInt R9 = RoundInt(red_c * scale);
776 			UInt G9 = UInt(RoundInt(green_c * scale));
777 			UInt B9 = UInt(RoundInt(blue_c * scale));
778 			UInt E5 = (As<UInt>(max_s) >> 23) - 127 + 15 + 1;
779 
780 			UInt E5B9G9R9 = (E5 << 27) | (B9 << 18) | (G9 << 9) | R9;
781 
782 			*Pointer<UInt>(element) = E5B9G9R9;
783 		}
784 		break;
785 	case VK_FORMAT_B8G8R8A8_SNORM:
786 		if(writeB) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.z))); }
787 		if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
788 		if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
789 		if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
790 		break;
791 	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
792 	case VK_FORMAT_R8G8B8A8_SINT:
793 	case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
794 	case VK_FORMAT_R8G8B8A8_SNORM:
795 	case VK_FORMAT_R8G8B8A8_SSCALED:
796 	case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
797 		if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
798 		// [[fallthrough]]
799 	case VK_FORMAT_R8G8B8_SINT:
800 	case VK_FORMAT_R8G8B8_SNORM:
801 	case VK_FORMAT_R8G8B8_SSCALED:
802 		if(writeB) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.z))); }
803 		// [[fallthrough]]
804 	case VK_FORMAT_R8G8_SINT:
805 	case VK_FORMAT_R8G8_SNORM:
806 	case VK_FORMAT_R8G8_SSCALED:
807 		if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
808 		// [[fallthrough]]
809 	case VK_FORMAT_R8_SINT:
810 	case VK_FORMAT_R8_SNORM:
811 	case VK_FORMAT_R8_SSCALED:
812 		if(writeR) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.x))); }
813 		break;
814 	case VK_FORMAT_R8G8B8_UINT:
815 	case VK_FORMAT_R8G8B8_UNORM:
816 	case VK_FORMAT_R8G8B8_USCALED:
817 	case VK_FORMAT_R8G8B8_SRGB:
818 		if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
819 		// [[fallthrough]]
820 	case VK_FORMAT_R8G8_UINT:
821 	case VK_FORMAT_R8G8_UNORM:
822 	case VK_FORMAT_R8G8_USCALED:
823 	case VK_FORMAT_R8G8_SRGB:
824 		if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
825 		// [[fallthrough]]
826 	case VK_FORMAT_R8_UINT:
827 	case VK_FORMAT_R8_UNORM:
828 	case VK_FORMAT_R8_USCALED:
829 	case VK_FORMAT_R8_SRGB:
830 		if(writeR) { *Pointer<Byte>(element) = Byte(RoundInt(Float(c.x))); }
831 		break;
832 	case VK_FORMAT_R16G16B16A16_SINT:
833 	case VK_FORMAT_R16G16B16A16_SNORM:
834 	case VK_FORMAT_R16G16B16A16_SSCALED:
835 		if(writeRGBA)
836 		{
837 			*Pointer<Short4>(element) = Short4(RoundInt(c));
838 		}
839 		else
840 		{
841 			if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
842 			if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
843 			if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
844 			if(writeA) { *Pointer<Short>(element + 6) = Short(RoundInt(Float(c.w))); }
845 		}
846 		break;
847 	case VK_FORMAT_R16G16B16_SINT:
848 	case VK_FORMAT_R16G16B16_SNORM:
849 	case VK_FORMAT_R16G16B16_SSCALED:
850 		if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
851 		if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
852 		if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
853 		break;
854 	case VK_FORMAT_R16G16_SINT:
855 	case VK_FORMAT_R16G16_SNORM:
856 	case VK_FORMAT_R16G16_SSCALED:
857 		if(writeR && writeG)
858 		{
859 			*Pointer<Short2>(element) = Short2(Short4(RoundInt(c)));
860 		}
861 		else
862 		{
863 			if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
864 			if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
865 		}
866 		break;
867 	case VK_FORMAT_R16_SINT:
868 	case VK_FORMAT_R16_SNORM:
869 	case VK_FORMAT_R16_SSCALED:
870 		if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
871 		break;
872 	case VK_FORMAT_R16G16B16A16_UINT:
873 	case VK_FORMAT_R16G16B16A16_UNORM:
874 	case VK_FORMAT_R16G16B16A16_USCALED:
875 		if(writeRGBA)
876 		{
877 			*Pointer<UShort4>(element) = UShort4(RoundInt(c));
878 		}
879 		else
880 		{
881 			if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
882 			if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
883 			if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
884 			if(writeA) { *Pointer<UShort>(element + 6) = UShort(RoundInt(Float(c.w))); }
885 		}
886 		break;
887 	case VK_FORMAT_R16G16B16_UINT:
888 	case VK_FORMAT_R16G16B16_UNORM:
889 	case VK_FORMAT_R16G16B16_USCALED:
890 		if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
891 		if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
892 		if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
893 		break;
894 	case VK_FORMAT_R16G16_UINT:
895 	case VK_FORMAT_R16G16_UNORM:
896 	case VK_FORMAT_R16G16_USCALED:
897 		if(writeR && writeG)
898 		{
899 			*Pointer<UShort2>(element) = UShort2(UShort4(RoundInt(c)));
900 		}
901 		else
902 		{
903 			if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
904 			if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
905 		}
906 		break;
907 	case VK_FORMAT_R16_UINT:
908 	case VK_FORMAT_R16_UNORM:
909 	case VK_FORMAT_R16_USCALED:
910 		if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
911 		break;
912 	case VK_FORMAT_R32G32B32A32_SINT:
913 		if(writeRGBA)
914 		{
915 			*Pointer<Int4>(element) = RoundInt(c);
916 		}
917 		else
918 		{
919 			if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
920 			if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
921 			if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
922 			if(writeA) { *Pointer<Int>(element + 12) = RoundInt(Float(c.w)); }
923 		}
924 		break;
925 	case VK_FORMAT_R32G32B32_SINT:
926 		if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
927 		// [[fallthrough]]
928 	case VK_FORMAT_R32G32_SINT:
929 		if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
930 		// [[fallthrough]]
931 	case VK_FORMAT_R32_SINT:
932 		if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
933 		break;
934 	case VK_FORMAT_R32G32B32A32_UINT:
935 		if(writeRGBA)
936 		{
937 			*Pointer<UInt4>(element) = UInt4(RoundInt(c));
938 		}
939 		else
940 		{
941 			if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
942 			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
943 			if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
944 			if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(RoundInt(Float(c.w))); }
945 		}
946 		break;
947 	case VK_FORMAT_R32G32B32_UINT:
948 		if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
949 		// [[fallthrough]]
950 	case VK_FORMAT_R32G32_UINT:
951 		if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
952 		// [[fallthrough]]
953 	case VK_FORMAT_R32_UINT:
954 		if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
955 		break;
956 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
957 		if(writeR && writeG && writeB)
958 		{
959 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c.xyzz), { 11, 5, 0, 0 }));
960 		}
961 		else
962 		{
963 			unsigned short mask = (writeB ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeR ? 0xF800 : 0x0000);
964 			unsigned short unmask = ~mask;
965 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
966 			                            (UShort(PackFields(RoundInt(c.xyzz), { 11, 5, 0, 0 })) &
967 			                             UShort(mask));
968 		}
969 		break;
970 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
971 		if(writeR && writeG && writeB)
972 		{
973 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c.zyxx), { 11, 5, 0, 0 }));
974 		}
975 		else
976 		{
977 			unsigned short mask = (writeR ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeB ? 0xF800 : 0x0000);
978 			unsigned short unmask = ~mask;
979 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
980 			                            (UShort(PackFields(RoundInt(c.zyxx), { 11, 5, 0, 0 })) &
981 			                             UShort(mask));
982 		}
983 		break;
984 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
985 		if(writeRGBA)
986 		{
987 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 11, 6, 1, 0 }));
988 		}
989 		else
990 		{
991 			unsigned short mask = (writeA ? 0x8000 : 0x0000) |
992 			                      (writeR ? 0x7C00 : 0x0000) |
993 			                      (writeG ? 0x03E0 : 0x0000) |
994 			                      (writeB ? 0x001F : 0x0000);
995 			unsigned short unmask = ~mask;
996 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
997 			                            (UShort(PackFields(RoundInt(c), { 11, 6, 1, 0 })) &
998 			                             UShort(mask));
999 		}
1000 		break;
1001 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1002 		if(writeRGBA)
1003 		{
1004 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 1, 6, 11, 0 }));
1005 		}
1006 		else
1007 		{
1008 			unsigned short mask = (writeA ? 0x8000 : 0x0000) |
1009 			                      (writeR ? 0x7C00 : 0x0000) |
1010 			                      (writeG ? 0x03E0 : 0x0000) |
1011 			                      (writeB ? 0x001F : 0x0000);
1012 			unsigned short unmask = ~mask;
1013 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
1014 			                            (UShort(PackFields(RoundInt(c), { 1, 6, 11, 0 })) &
1015 			                             UShort(mask));
1016 		}
1017 		break;
1018 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1019 		if(writeRGBA)
1020 		{
1021 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 10, 5, 0, 15 }));
1022 		}
1023 		else
1024 		{
1025 			unsigned short mask = (writeA ? 0x8000 : 0x0000) |
1026 			                      (writeR ? 0x7C00 : 0x0000) |
1027 			                      (writeG ? 0x03E0 : 0x0000) |
1028 			                      (writeB ? 0x001F : 0x0000);
1029 			unsigned short unmask = ~mask;
1030 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
1031 			                            (UShort(PackFields(RoundInt(c), { 10, 5, 0, 15 })) &
1032 			                             UShort(mask));
1033 		}
1034 		break;
1035 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1036 	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
1037 	case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
1038 		if(writeRGBA)
1039 		{
1040 			*Pointer<UInt>(element) = As<UInt>(PackFields(RoundInt(c), { 0, 10, 20, 30 }));
1041 		}
1042 		else
1043 		{
1044 			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
1045 			                    (writeB ? 0x3FF00000 : 0x0000) |
1046 			                    (writeG ? 0x000FFC00 : 0x0000) |
1047 			                    (writeR ? 0x000003FF : 0x0000);
1048 			unsigned int unmask = ~mask;
1049 			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
1050 			                          (As<UInt>(PackFields(RoundInt(c), { 0, 10, 20, 30 })) &
1051 			                           UInt(mask));
1052 		}
1053 		break;
1054 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1055 	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
1056 	case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
1057 		if(writeRGBA)
1058 		{
1059 			*Pointer<UInt>(element) = As<UInt>(PackFields(RoundInt(c), { 20, 10, 0, 30 }));
1060 		}
1061 		else
1062 		{
1063 			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
1064 			                    (writeR ? 0x3FF00000 : 0x0000) |
1065 			                    (writeG ? 0x000FFC00 : 0x0000) |
1066 			                    (writeB ? 0x000003FF : 0x0000);
1067 			unsigned int unmask = ~mask;
1068 			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
1069 			                          (As<UInt>(PackFields(RoundInt(c), { 20, 10, 0, 30 })) &
1070 			                           UInt(mask));
1071 		}
1072 		break;
1073 	case VK_FORMAT_D16_UNORM:
1074 		*Pointer<UShort>(element) = UShort(RoundInt(Float(c.x)));
1075 		break;
1076 	case VK_FORMAT_X8_D24_UNORM_PACK32:
1077 		*Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) << 8);
1078 		break;
1079 	case VK_FORMAT_D32_SFLOAT:
1080 		*Pointer<Float>(element) = c.x;
1081 		break;
1082 	case VK_FORMAT_S8_UINT:
1083 		*Pointer<Byte>(element) = Byte(RoundInt(Float(c.x)));
1084 		break;
1085 	default:
1086 		UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
1087 		break;
1088 	}
1089 }
1090 
readInt4(Pointer<Byte> element,const State & state)1091 Int4 Blitter::readInt4(Pointer<Byte> element, const State &state)
1092 {
1093 	Int4 c(0, 0, 0, 1);
1094 
1095 	switch(state.sourceFormat)
1096 	{
1097 	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
1098 	case VK_FORMAT_R8G8B8A8_SINT:
1099 		c = Insert(c, Int(*Pointer<SByte>(element + 3)), 3);
1100 		c = Insert(c, Int(*Pointer<SByte>(element + 2)), 2);
1101 		// [[fallthrough]]
1102 	case VK_FORMAT_R8G8_SINT:
1103 		c = Insert(c, Int(*Pointer<SByte>(element + 1)), 1);
1104 		// [[fallthrough]]
1105 	case VK_FORMAT_R8_SINT:
1106 		c = Insert(c, Int(*Pointer<SByte>(element)), 0);
1107 		break;
1108 	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
1109 		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000003FF))), 0);
1110 		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10), 1);
1111 		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20), 2);
1112 		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30), 3);
1113 		break;
1114 	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
1115 		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000003FF))), 2);
1116 		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10), 1);
1117 		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20), 0);
1118 		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30), 3);
1119 		break;
1120 	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
1121 	case VK_FORMAT_R8G8B8A8_UINT:
1122 		c = Insert(c, Int(*Pointer<Byte>(element + 3)), 3);
1123 		c = Insert(c, Int(*Pointer<Byte>(element + 2)), 2);
1124 		// [[fallthrough]]
1125 	case VK_FORMAT_R8G8_UINT:
1126 		c = Insert(c, Int(*Pointer<Byte>(element + 1)), 1);
1127 		// [[fallthrough]]
1128 	case VK_FORMAT_R8_UINT:
1129 	case VK_FORMAT_S8_UINT:
1130 		c = Insert(c, Int(*Pointer<Byte>(element)), 0);
1131 		break;
1132 	case VK_FORMAT_R16G16B16A16_SINT:
1133 		c = Insert(c, Int(*Pointer<Short>(element + 6)), 3);
1134 		c = Insert(c, Int(*Pointer<Short>(element + 4)), 2);
1135 		// [[fallthrough]]
1136 	case VK_FORMAT_R16G16_SINT:
1137 		c = Insert(c, Int(*Pointer<Short>(element + 2)), 1);
1138 		// [[fallthrough]]
1139 	case VK_FORMAT_R16_SINT:
1140 		c = Insert(c, Int(*Pointer<Short>(element)), 0);
1141 		break;
1142 	case VK_FORMAT_R16G16B16A16_UINT:
1143 		c = Insert(c, Int(*Pointer<UShort>(element + 6)), 3);
1144 		c = Insert(c, Int(*Pointer<UShort>(element + 4)), 2);
1145 		// [[fallthrough]]
1146 	case VK_FORMAT_R16G16_UINT:
1147 		c = Insert(c, Int(*Pointer<UShort>(element + 2)), 1);
1148 		// [[fallthrough]]
1149 	case VK_FORMAT_R16_UINT:
1150 		c = Insert(c, Int(*Pointer<UShort>(element)), 0);
1151 		break;
1152 	case VK_FORMAT_R32G32B32A32_SINT:
1153 	case VK_FORMAT_R32G32B32A32_UINT:
1154 		c = *Pointer<Int4>(element);
1155 		break;
1156 	case VK_FORMAT_R32G32_SINT:
1157 	case VK_FORMAT_R32G32_UINT:
1158 		c = Insert(c, *Pointer<Int>(element + 4), 1);
1159 		// [[fallthrough]]
1160 	case VK_FORMAT_R32_SINT:
1161 	case VK_FORMAT_R32_UINT:
1162 		c = Insert(c, *Pointer<Int>(element), 0);
1163 		break;
1164 	default:
1165 		UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
1166 	}
1167 
1168 	return c;
1169 }
1170 
// Writes the unnormalized integer color 'c' (RGBA component order) to the
// texel at 'element' in the destination format. The value is first saturated
// to the destination's representable range, then stored with the
// destination's component ordering/packing, honoring the per-channel color
// write mask from 'state'.
void Blitter::write(Int4 &c, Pointer<Byte> element, const State &state)
{
	bool writeR = state.writeRed;
	bool writeG = state.writeGreen;
	bool writeB = state.writeBlue;
	bool writeA = state.writeAlpha;
	bool writeRGBA = writeR && writeG && writeB && writeA;

	// Integer blits never convert between signed and unsigned interpretations.
	ASSERT(state.sourceFormat.isUnsigned() == state.destFormat.isUnsigned());

	// First pass: saturate each component to the destination format's range.
	switch(state.destFormat)
	{
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
		// 10-bit RGB components, 2-bit alpha.
		c = Min(As<UInt4>(c), UInt4(0x03FF, 0x03FF, 0x03FF, 0x0003));
		break;
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
	case VK_FORMAT_R8G8B8A8_UINT:
	case VK_FORMAT_R8G8B8_UINT:
	case VK_FORMAT_R8G8_UINT:
	case VK_FORMAT_R8_UINT:
	case VK_FORMAT_R8G8B8A8_USCALED:
	case VK_FORMAT_R8G8B8_USCALED:
	case VK_FORMAT_R8G8_USCALED:
	case VK_FORMAT_R8_USCALED:
	case VK_FORMAT_S8_UINT:
		c = Min(As<UInt4>(c), UInt4(0xFF));
		break;
	case VK_FORMAT_R16G16B16A16_UINT:
	case VK_FORMAT_R16G16B16_UINT:
	case VK_FORMAT_R16G16_UINT:
	case VK_FORMAT_R16_UINT:
	case VK_FORMAT_R16G16B16A16_USCALED:
	case VK_FORMAT_R16G16B16_USCALED:
	case VK_FORMAT_R16G16_USCALED:
	case VK_FORMAT_R16_USCALED:
		c = Min(As<UInt4>(c), UInt4(0xFFFF));
		break;
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
	case VK_FORMAT_R8G8B8A8_SINT:
	case VK_FORMAT_R8G8_SINT:
	case VK_FORMAT_R8_SINT:
	case VK_FORMAT_R8G8B8A8_SSCALED:
	case VK_FORMAT_R8G8B8_SSCALED:
	case VK_FORMAT_R8G8_SSCALED:
	case VK_FORMAT_R8_SSCALED:
		c = Min(Max(c, Int4(-0x80)), Int4(0x7F));
		break;
	case VK_FORMAT_R16G16B16A16_SINT:
	case VK_FORMAT_R16G16B16_SINT:
	case VK_FORMAT_R16G16_SINT:
	case VK_FORMAT_R16_SINT:
	case VK_FORMAT_R16G16B16A16_SSCALED:
	case VK_FORMAT_R16G16B16_SSCALED:
	case VK_FORMAT_R16G16_SSCALED:
	case VK_FORMAT_R16_SSCALED:
		c = Min(Max(c, Int4(-0x8000)), Int4(0x7FFF));
		break;
	default:
		// 32-bit destinations need no saturation.
		break;
	}

	// Second pass: store the components with the destination's byte ordering
	// and bit packing. The fallthrough chains share the store code for the
	// narrower variants of each component layout.
	switch(state.destFormat)
	{
	// BGRA byte order: B at offset 0, G at 1, R at 2, A at 3.
	case VK_FORMAT_B8G8R8A8_SINT:
	case VK_FORMAT_B8G8R8A8_SSCALED:
		if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_B8G8R8_SINT:
	case VK_FORMAT_B8G8R8_SSCALED:
		if(writeB) { *Pointer<SByte>(element) = SByte(Extract(c, 2)); }
		if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
		if(writeR) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 0)); }
		break;
	// RGBA byte order, signed 8-bit.
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
	case VK_FORMAT_R8G8B8A8_SINT:
	case VK_FORMAT_R8G8B8A8_SSCALED:
	case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
		if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_R8G8B8_SINT:
	case VK_FORMAT_R8G8B8_SSCALED:
		if(writeB) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R8G8_SINT:
	case VK_FORMAT_R8G8_SSCALED:
		if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R8_SINT:
	case VK_FORMAT_R8_SSCALED:
		if(writeR) { *Pointer<SByte>(element) = SByte(Extract(c, 0)); }
		break;
	// Packed 2:10:10:10, R in the low bits.
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
	case VK_FORMAT_A2B10G10R10_SINT_PACK32:
	case VK_FORMAT_A2B10G10R10_USCALED_PACK32:
	case VK_FORMAT_A2B10G10R10_SSCALED_PACK32:
		if(writeRGBA)
		{
			*Pointer<UInt>(element) = As<UInt>(PackFields(c, { 0, 10, 20, 30 }));
		}
		else
		{
			// Partial write mask: merge the new fields with the existing texel.
			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
			                    (writeB ? 0x3FF00000 : 0x0000) |
			                    (writeG ? 0x000FFC00 : 0x0000) |
			                    (writeR ? 0x000003FF : 0x0000);
			unsigned int unmask = ~mask;
			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
			                          (As<UInt>(PackFields(c, { 0, 10, 20, 30 })) & UInt(mask));
		}
		break;
	// Packed 2:10:10:10, B in the low bits.
	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
	case VK_FORMAT_A2R10G10B10_SINT_PACK32:
	case VK_FORMAT_A2R10G10B10_USCALED_PACK32:
	case VK_FORMAT_A2R10G10B10_SSCALED_PACK32:
		if(writeRGBA)
		{
			*Pointer<UInt>(element) = As<UInt>(PackFields(c, { 20, 10, 0, 30 }));
		}
		else
		{
			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
			                    (writeR ? 0x3FF00000 : 0x0000) |
			                    (writeG ? 0x000FFC00 : 0x0000) |
			                    (writeB ? 0x000003FF : 0x0000);
			unsigned int unmask = ~mask;
			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
			                          (As<UInt>(PackFields(c, { 20, 10, 0, 30 })) & UInt(mask));
		}
		break;
	// BGRA byte order, unsigned 8-bit.
	case VK_FORMAT_B8G8R8A8_UINT:
	case VK_FORMAT_B8G8R8A8_USCALED:
		if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_B8G8R8_UINT:
	case VK_FORMAT_B8G8R8_USCALED:
	case VK_FORMAT_B8G8R8_SRGB:
		if(writeB) { *Pointer<Byte>(element) = Byte(Extract(c, 2)); }
		if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
		if(writeR) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 0)); }
		break;
	// RGBA byte order, unsigned 8-bit.
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
	case VK_FORMAT_R8G8B8A8_UINT:
	case VK_FORMAT_R8G8B8A8_USCALED:
	case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
		if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_R8G8B8_UINT:
	case VK_FORMAT_R8G8B8_USCALED:
		if(writeB) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R8G8_UINT:
	case VK_FORMAT_R8G8_USCALED:
		if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R8_UINT:
	case VK_FORMAT_R8_USCALED:
	case VK_FORMAT_S8_UINT:
		if(writeR) { *Pointer<Byte>(element) = Byte(Extract(c, 0)); }
		break;
	// Signed 16-bit components.
	case VK_FORMAT_R16G16B16A16_SINT:
	case VK_FORMAT_R16G16B16A16_SSCALED:
		if(writeA) { *Pointer<Short>(element + 6) = Short(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_R16G16B16_SINT:
	case VK_FORMAT_R16G16B16_SSCALED:
		if(writeB) { *Pointer<Short>(element + 4) = Short(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R16G16_SINT:
	case VK_FORMAT_R16G16_SSCALED:
		if(writeG) { *Pointer<Short>(element + 2) = Short(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R16_SINT:
	case VK_FORMAT_R16_SSCALED:
		if(writeR) { *Pointer<Short>(element) = Short(Extract(c, 0)); }
		break;
	// Unsigned 16-bit components.
	case VK_FORMAT_R16G16B16A16_UINT:
	case VK_FORMAT_R16G16B16A16_USCALED:
		if(writeA) { *Pointer<UShort>(element + 6) = UShort(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_R16G16B16_UINT:
	case VK_FORMAT_R16G16B16_USCALED:
		if(writeB) { *Pointer<UShort>(element + 4) = UShort(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R16G16_UINT:
	case VK_FORMAT_R16G16_USCALED:
		if(writeG) { *Pointer<UShort>(element + 2) = UShort(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R16_UINT:
	case VK_FORMAT_R16_USCALED:
		if(writeR) { *Pointer<UShort>(element) = UShort(Extract(c, 0)); }
		break;
	// Signed 32-bit components.
	case VK_FORMAT_R32G32B32A32_SINT:
		if(writeRGBA)
		{
			// Full mask: store the whole vector at once.
			*Pointer<Int4>(element) = c;
		}
		else
		{
			if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
			if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
			if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
			if(writeA) { *Pointer<Int>(element + 12) = Extract(c, 3); }
		}
		break;
	case VK_FORMAT_R32G32B32_SINT:
		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
		if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
		if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
		break;
	case VK_FORMAT_R32G32_SINT:
		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
		if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
		break;
	case VK_FORMAT_R32_SINT:
		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
		break;
	// Unsigned 32-bit components.
	case VK_FORMAT_R32G32B32A32_UINT:
		if(writeRGBA)
		{
			*Pointer<UInt4>(element) = As<UInt4>(c);
		}
		else
		{
			if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
			if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
			if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(Extract(c, 3)); }
		}
		break;
	case VK_FORMAT_R32G32B32_UINT:
		if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R32G32_UINT:
		if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R32_UINT:
		if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
		break;
	default:
		UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
	}
}
1414 
// Converts 'value' from the source format's scale to the destination
// format's scale, performing sRGB <-> linear conversion and float->fixed
// range clamping where required. 'preScaled' indicates the value already had
// this function applied once (e.g. before multisample averaging), so only
// the remaining linear -> sRGB step and final scaling are performed.
void Blitter::ApplyScaleAndClamp(Float4 &value, const State &state, bool preScaled)
{
	float4 scale{}, unscale{};

	if(state.clearOperation &&
	   state.sourceFormat.isUnnormalizedInteger() &&
	   !state.destFormat.isUnnormalizedInteger())
	{
		// If we're clearing a buffer from an int or uint color into a normalized color,
		// then the whole range of the int or uint color must be scaled between 0 and 1.
		switch(state.sourceFormat)
		{
		case VK_FORMAT_R32G32B32A32_SINT:
			unscale = float4(static_cast<float>(0x7FFFFFFF));
			break;
		case VK_FORMAT_R32G32B32A32_UINT:
			unscale = float4(static_cast<float>(0xFFFFFFFF));
			break;
		default:
			UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
		}
	}
	else
	{
		unscale = state.sourceFormat.getScale();
	}

	scale = state.destFormat.getScale();

	bool srcSRGB = state.sourceFormat.isSRGBformat();
	bool dstSRGB = state.destFormat.isSRGBformat();

	if(state.allowSRGBConversion && ((srcSRGB && !preScaled) || dstSRGB))  // One of the formats is sRGB encoded.
	{
		// Bring the value into [0, 1] space for the transfer function, convert
		// only the RGB components (alpha is always linear), then rescale.
		value *= preScaled ? Float4(1.0f / scale.x, 1.0f / scale.y, 1.0f / scale.z, 1.0f / scale.w) :  // Unapply scale
		             Float4(1.0f / unscale.x, 1.0f / unscale.y, 1.0f / unscale.z, 1.0f / unscale.w);   // Apply unscale
		value.xyz = (srcSRGB && !preScaled) ? sRGBtoLinear(value) : linearToSRGB(value);
		value *= Float4(scale.x, scale.y, scale.z, scale.w);  // Apply scale
	}
	else if(unscale != scale)
	{
		// No sRGB conversion needed: rescale in a single multiply.
		value *= Float4(scale.x / unscale.x, scale.y / unscale.y, scale.z / unscale.z, scale.w / unscale.w);
	}

	if(state.sourceFormat.isFloatFormat() && !state.destFormat.isFloatFormat())
	{
		// Float -> fixed-point conversion: clamp to the destination's
		// representable range, with a 0 lower bound for unsigned components.
		value = Min(value, Float4(scale.x, scale.y, scale.z, scale.w));

		value = Max(value, Float4(state.destFormat.isUnsignedComponent(0) ? 0.0f : -scale.x,
		                          state.destFormat.isUnsignedComponent(1) ? 0.0f : -scale.y,
		                          state.destFormat.isUnsignedComponent(2) ? 0.0f : -scale.z,
		                          state.destFormat.isUnsignedComponent(3) ? 0.0f : -scale.w));
	}

	if(!state.sourceFormat.isUnsigned() && state.destFormat.isUnsigned())
	{
		// Signed -> unsigned: negative values are not representable.
		value = Max(value, Float4(0.0f));
	}
}
1474 
ComputeOffset(Int & x,Int & y,Int & pitchB,int bytes)1475 Int Blitter::ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes)
1476 {
1477 	return y * pitchB + x * bytes;
1478 }
1479 
ComputeOffset(Int & x,Int & y,Int & z,Int & sliceB,Int & pitchB,int bytes)1480 Int Blitter::ComputeOffset(Int &x, Int &y, Int &z, Int &sliceB, Int &pitchB, int bytes)
1481 {
1482 	return z * sliceB + y * pitchB + x * bytes;
1483 }
1484 
sample(Pointer<Byte> & source,Float & x,Float & y,Float & z,Int & sWidth,Int & sHeight,Int & sDepth,Int & sSliceB,Int & sPitchB,const State & state)1485 Float4 Blitter::sample(Pointer<Byte> &source, Float &x, Float &y, Float &z,
1486                        Int &sWidth, Int &sHeight, Int &sDepth,
1487                        Int &sSliceB, Int &sPitchB, const State &state)
1488 {
1489 	bool intSrc = state.sourceFormat.isUnnormalizedInteger();
1490 	int srcBytes = state.sourceFormat.bytes();
1491 
1492 	Float4 color;
1493 
1494 	bool preScaled = false;
1495 	if(!state.filter || intSrc)
1496 	{
1497 		Int X = Int(x);
1498 		Int Y = Int(y);
1499 		Int Z = Int(z);
1500 
1501 		if(state.clampToEdge)
1502 		{
1503 			X = Clamp(X, 0, sWidth - 1);
1504 			Y = Clamp(Y, 0, sHeight - 1);
1505 			Z = Clamp(Z, 0, sDepth - 1);
1506 		}
1507 
1508 		Pointer<Byte> s = source + ComputeOffset(X, Y, Z, sSliceB, sPitchB, srcBytes);
1509 
1510 		color = readFloat4(s, state);
1511 
1512 		if(state.srcSamples > 1)  // Resolve multisampled source
1513 		{
1514 			if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat())  // sRGB -> RGB
1515 			{
1516 				ApplyScaleAndClamp(color, state);
1517 				preScaled = true;
1518 			}
1519 			Float4 accum = color;
1520 			for(int sample = 1; sample < state.srcSamples; sample++)
1521 			{
1522 				s += sSliceB;
1523 				color = readFloat4(s, state);
1524 
1525 				if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat())  // sRGB -> RGB
1526 				{
1527 					ApplyScaleAndClamp(color, state);
1528 					preScaled = true;
1529 				}
1530 				accum += color;
1531 			}
1532 			color = accum * Float4(1.0f / static_cast<float>(state.srcSamples));
1533 		}
1534 	}
1535 	else  // Bilinear filtering
1536 	{
1537 		Float X = x;
1538 		Float Y = y;
1539 		Float Z = z;
1540 
1541 		if(state.clampToEdge)
1542 		{
1543 			X = Min(Max(x, 0.5f), Float(sWidth) - 0.5f);
1544 			Y = Min(Max(y, 0.5f), Float(sHeight) - 0.5f);
1545 			Z = Min(Max(z, 0.5f), Float(sDepth) - 0.5f);
1546 		}
1547 
1548 		Float x0 = X - 0.5f;
1549 		Float y0 = Y - 0.5f;
1550 		Float z0 = Z - 0.5f;
1551 
1552 		Int X0 = Max(Int(x0), 0);
1553 		Int Y0 = Max(Int(y0), 0);
1554 		Int Z0 = Max(Int(z0), 0);
1555 
1556 		Int X1 = X0 + 1;
1557 		Int Y1 = Y0 + 1;
1558 		X1 = IfThenElse(X1 >= sWidth, X0, X1);
1559 		Y1 = IfThenElse(Y1 >= sHeight, Y0, Y1);
1560 
1561 		if(state.filter3D)
1562 		{
1563 			Int Z1 = Z0 + 1;
1564 			Z1 = IfThenElse(Z1 >= sHeight, Z0, Z1);
1565 
1566 			Pointer<Byte> s000 = source + ComputeOffset(X0, Y0, Z0, sSliceB, sPitchB, srcBytes);
1567 			Pointer<Byte> s010 = source + ComputeOffset(X1, Y0, Z0, sSliceB, sPitchB, srcBytes);
1568 			Pointer<Byte> s100 = source + ComputeOffset(X0, Y1, Z0, sSliceB, sPitchB, srcBytes);
1569 			Pointer<Byte> s110 = source + ComputeOffset(X1, Y1, Z0, sSliceB, sPitchB, srcBytes);
1570 			Pointer<Byte> s001 = source + ComputeOffset(X0, Y0, Z1, sSliceB, sPitchB, srcBytes);
1571 			Pointer<Byte> s011 = source + ComputeOffset(X1, Y0, Z1, sSliceB, sPitchB, srcBytes);
1572 			Pointer<Byte> s101 = source + ComputeOffset(X0, Y1, Z1, sSliceB, sPitchB, srcBytes);
1573 			Pointer<Byte> s111 = source + ComputeOffset(X1, Y1, Z1, sSliceB, sPitchB, srcBytes);
1574 
1575 			Float4 c000 = readFloat4(s000, state);
1576 			Float4 c010 = readFloat4(s010, state);
1577 			Float4 c100 = readFloat4(s100, state);
1578 			Float4 c110 = readFloat4(s110, state);
1579 			Float4 c001 = readFloat4(s001, state);
1580 			Float4 c011 = readFloat4(s011, state);
1581 			Float4 c101 = readFloat4(s101, state);
1582 			Float4 c111 = readFloat4(s111, state);
1583 
1584 			if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat())  // sRGB -> RGB
1585 			{
1586 				ApplyScaleAndClamp(c000, state);
1587 				ApplyScaleAndClamp(c010, state);
1588 				ApplyScaleAndClamp(c100, state);
1589 				ApplyScaleAndClamp(c110, state);
1590 				ApplyScaleAndClamp(c001, state);
1591 				ApplyScaleAndClamp(c011, state);
1592 				ApplyScaleAndClamp(c101, state);
1593 				ApplyScaleAndClamp(c111, state);
1594 				preScaled = true;
1595 			}
1596 
1597 			Float4 fx = Float4(x0 - Float(X0));
1598 			Float4 fy = Float4(y0 - Float(Y0));
1599 			Float4 fz = Float4(z0 - Float(Z0));
1600 			Float4 ix = Float4(1.0f) - fx;
1601 			Float4 iy = Float4(1.0f) - fy;
1602 			Float4 iz = Float4(1.0f) - fz;
1603 
1604 			color = ((c000 * ix + c010 * fx) * iy +
1605 			         (c100 * ix + c110 * fx) * fy) *
1606 			            iz +
1607 			        ((c001 * ix + c011 * fx) * iy +
1608 			         (c101 * ix + c111 * fx) * fy) *
1609 			            fz;
1610 		}
1611 		else
1612 		{
1613 			Pointer<Byte> s00 = source + ComputeOffset(X0, Y0, Z0, sSliceB, sPitchB, srcBytes);
1614 			Pointer<Byte> s01 = source + ComputeOffset(X1, Y0, Z0, sSliceB, sPitchB, srcBytes);
1615 			Pointer<Byte> s10 = source + ComputeOffset(X0, Y1, Z0, sSliceB, sPitchB, srcBytes);
1616 			Pointer<Byte> s11 = source + ComputeOffset(X1, Y1, Z0, sSliceB, sPitchB, srcBytes);
1617 
1618 			Float4 c00 = readFloat4(s00, state);
1619 			Float4 c01 = readFloat4(s01, state);
1620 			Float4 c10 = readFloat4(s10, state);
1621 			Float4 c11 = readFloat4(s11, state);
1622 
1623 			if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat())  // sRGB -> RGB
1624 			{
1625 				ApplyScaleAndClamp(c00, state);
1626 				ApplyScaleAndClamp(c01, state);
1627 				ApplyScaleAndClamp(c10, state);
1628 				ApplyScaleAndClamp(c11, state);
1629 				preScaled = true;
1630 			}
1631 
1632 			Float4 fx = Float4(x0 - Float(X0));
1633 			Float4 fy = Float4(y0 - Float(Y0));
1634 			Float4 ix = Float4(1.0f) - fx;
1635 			Float4 iy = Float4(1.0f) - fy;
1636 
1637 			color = (c00 * ix + c01 * fx) * iy +
1638 			        (c10 * ix + c11 * fx) * fy;
1639 		}
1640 	}
1641 
1642 	ApplyScaleAndClamp(color, state, preScaled);
1643 
1644 	return color;
1645 }
1646 
Blitter::BlitRoutineType Blitter::generate(const State &state)
{
	// JIT-compiles a blit routine specialized for 'state' (source/dest formats,
	// sample counts, filter and clear options). The returned routine takes a
	// single BlitData* argument with the per-call parameters.
	BlitFunction function;
	{
		Pointer<Byte> blit(function.Arg<0>());

		// Unpack the runtime parameters from BlitData.
		Pointer<Byte> source = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData, source));
		Pointer<Byte> dest = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData, dest));
		Int sPitchB = *Pointer<Int>(blit + OFFSET(BlitData, sPitchB));
		Int dPitchB = *Pointer<Int>(blit + OFFSET(BlitData, dPitchB));
		Int sSliceB = *Pointer<Int>(blit + OFFSET(BlitData, sSliceB));
		Int dSliceB = *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));

		// Source-space start coordinate and per-destination-texel step size
		// for each dimension.
		Float x0 = *Pointer<Float>(blit + OFFSET(BlitData, x0));
		Float y0 = *Pointer<Float>(blit + OFFSET(BlitData, y0));
		Float z0 = *Pointer<Float>(blit + OFFSET(BlitData, z0));
		Float w = *Pointer<Float>(blit + OFFSET(BlitData, w));
		Float h = *Pointer<Float>(blit + OFFSET(BlitData, h));
		Float d = *Pointer<Float>(blit + OFFSET(BlitData, d));

		// Destination box bounds, in texels.
		Int x0d = *Pointer<Int>(blit + OFFSET(BlitData, x0d));
		Int x1d = *Pointer<Int>(blit + OFFSET(BlitData, x1d));
		Int y0d = *Pointer<Int>(blit + OFFSET(BlitData, y0d));
		Int y1d = *Pointer<Int>(blit + OFFSET(BlitData, y1d));
		Int z0d = *Pointer<Int>(blit + OFFSET(BlitData, z0d));
		Int z1d = *Pointer<Int>(blit + OFFSET(BlitData, z1d));

		// Source mip level extent, used for edge clamping in the sampling path.
		Int sWidth = *Pointer<Int>(blit + OFFSET(BlitData, sWidth));
		Int sHeight = *Pointer<Int>(blit + OFFSET(BlitData, sHeight));
		Int sDepth = *Pointer<Int>(blit + OFFSET(BlitData, sDepth));

		bool intSrc = state.sourceFormat.isUnnormalizedInteger();
		bool intDst = state.destFormat.isUnnormalizedInteger();
		bool intBoth = intSrc && intDst;
		int srcBytes = state.sourceFormat.bytes();
		int dstBytes = state.destFormat.bytes();

		// For clear operations 'source' points at the constant clear color,
		// which is read once here, outside the loops.
		bool hasConstantColorI = false;
		Int4 constantColorI;
		bool hasConstantColorF = false;
		Float4 constantColorF;
		if(state.clearOperation)
		{
			if(intBoth)  // Integer types
			{
				constantColorI = readInt4(source, state);
				hasConstantColorI = true;
			}
			else
			{
				constantColorF = readFloat4(source, state);
				hasConstantColorF = true;

				ApplyScaleAndClamp(constantColorF, state);
			}
		}

		For(Int k = z0d, k < z1d, k++)
		{
			// Clears use a constant source coordinate; blits step by the ratio.
			Float z = state.clearOperation ? RValue<Float>(z0) : z0 + Float(k) * d;
			Pointer<Byte> destSlice = dest + k * dSliceB;

			For(Int j = y0d, j < y1d, j++)
			{
				Float y = state.clearOperation ? RValue<Float>(y0) : y0 + Float(j) * h;
				Pointer<Byte> destLine = destSlice + j * dPitchB;

				For(Int i = x0d, i < x1d, i++)
				{
					Float x = state.clearOperation ? RValue<Float>(x0) : x0 + Float(i) * w;
					// Note: this 'd' (destination texel pointer) shadows the
					// outer Float 'd' (depth step), which is no longer needed here.
					Pointer<Byte> d = destLine + i * dstBytes;

					if(hasConstantColorI)
					{
						// Multisampled destinations store their samples as
						// consecutive slices, hence the dSliceB stride per sample.
						for(int s = 0; s < state.destSamples; s++)
						{
							write(constantColorI, d, state);

							d += dSliceB;
						}
					}
					else if(hasConstantColorF)
					{
						for(int s = 0; s < state.destSamples; s++)
						{
							write(constantColorF, d, state);

							d += dSliceB;
						}
					}
					else if(intBoth)  // Integer types do not support filtering
					{
						Int X = Int(x);
						Int Y = Int(y);
						Int Z = Int(z);

						if(state.clampToEdge)
						{
							X = Clamp(X, 0, sWidth - 1);
							Y = Clamp(Y, 0, sHeight - 1);
							Z = Clamp(Z, 0, sDepth - 1);
						}

						Pointer<Byte> s = source + ComputeOffset(X, Y, Z, sSliceB, sPitchB, srcBytes);

						// When both formats are true integer types, we don't go to float to avoid losing precision
						Int4 color = readInt4(s, state);
						// Note: the loop counter 's' shadows the source pointer 's'
						// above, which has already been consumed by readInt4.
						for(int s = 0; s < state.destSamples; s++)
						{
							write(color, d, state);

							d += dSliceB;
						}
					}
					else
					{
						Float4 color = sample(source, x, y, z, sWidth, sHeight, sDepth, sSliceB, sPitchB, state);

						for(int s = 0; s < state.destSamples; s++)
						{
							write(color, d, state);

							d += dSliceB;
						}
					}
				}
			}
		}
	}

	return function("BlitRoutine");
}
1779 
getBlitRoutine(const State & state)1780 Blitter::BlitRoutineType Blitter::getBlitRoutine(const State &state)
1781 {
1782 	marl::lock lock(blitMutex);
1783 	auto blitRoutine = blitCache.lookup(state);
1784 
1785 	if(!blitRoutine)
1786 	{
1787 		blitRoutine = generate(state);
1788 		blitCache.add(state, blitRoutine);
1789 	}
1790 
1791 	return blitRoutine;
1792 }
1793 
getCornerUpdateRoutine(const State & state)1794 Blitter::CornerUpdateRoutineType Blitter::getCornerUpdateRoutine(const State &state)
1795 {
1796 	marl::lock lock(cornerUpdateMutex);
1797 	auto cornerUpdateRoutine = cornerUpdateCache.lookup(state);
1798 
1799 	if(!cornerUpdateRoutine)
1800 	{
1801 		cornerUpdateRoutine = generateCornerUpdate(state);
1802 		cornerUpdateCache.add(state, cornerUpdateRoutine);
1803 	}
1804 
1805 	return cornerUpdateRoutine;
1806 }
1807 
void Blitter::blit(const vk::Image *src, vk::Image *dst, VkImageBlit2KHR region, VkFilter filter)
{
	// Performs a scaled copy from 'src' to 'dst' over every layer of the
	// region, using a JIT-compiled routine specialized for the format pair.
	ASSERT(src->getFormat() != VK_FORMAT_UNDEFINED);
	ASSERT(dst->getFormat() != VK_FORMAT_UNDEFINED);

	// Vulkan 1.2 section 18.5. Image Copies with Scaling:
	// "The layerCount member of srcSubresource and dstSubresource must match"
	// "The aspectMask member of srcSubresource and dstSubresource must match"
	ASSERT(region.srcSubresource.layerCount == region.dstSubresource.layerCount);
	ASSERT(region.srcSubresource.aspectMask == region.dstSubresource.aspectMask);

	// Normalize the region so the destination box is positive in every
	// dimension. Mirroring is preserved by swapping the source offsets
	// together with the destination offsets.
	if(region.dstOffsets[0].x > region.dstOffsets[1].x)
	{
		std::swap(region.srcOffsets[0].x, region.srcOffsets[1].x);
		std::swap(region.dstOffsets[0].x, region.dstOffsets[1].x);
	}

	if(region.dstOffsets[0].y > region.dstOffsets[1].y)
	{
		std::swap(region.srcOffsets[0].y, region.srcOffsets[1].y);
		std::swap(region.dstOffsets[0].y, region.dstOffsets[1].y);
	}

	if(region.dstOffsets[0].z > region.dstOffsets[1].z)
	{
		std::swap(region.srcOffsets[0].z, region.srcOffsets[1].z);
		std::swap(region.dstOffsets[0].z, region.dstOffsets[1].z);
	}

	VkImageAspectFlagBits srcAspect = static_cast<VkImageAspectFlagBits>(region.srcSubresource.aspectMask);
	VkImageAspectFlagBits dstAspect = static_cast<VkImageAspectFlagBits>(region.dstSubresource.aspectMask);
	VkExtent3D srcExtent = src->getMipLevelExtent(srcAspect, region.srcSubresource.mipLevel);

	// Source texels stepped per destination texel, in each dimension.
	float widthRatio = static_cast<float>(region.srcOffsets[1].x - region.srcOffsets[0].x) /
	                   static_cast<float>(region.dstOffsets[1].x - region.dstOffsets[0].x);
	float heightRatio = static_cast<float>(region.srcOffsets[1].y - region.srcOffsets[0].y) /
	                    static_cast<float>(region.dstOffsets[1].y - region.dstOffsets[0].y);
	float depthRatio = static_cast<float>(region.srcOffsets[1].z - region.srcOffsets[0].z) /
	                   static_cast<float>(region.dstOffsets[1].z - region.dstOffsets[0].z);
	// Source coordinate of the center of the first destination texel.
	float x0 = region.srcOffsets[0].x + (0.5f - region.dstOffsets[0].x) * widthRatio;
	float y0 = region.srcOffsets[0].y + (0.5f - region.dstOffsets[0].y) * heightRatio;
	float z0 = region.srcOffsets[0].z + (0.5f - region.dstOffsets[0].z) * depthRatio;

	auto srcFormat = src->getFormat(srcAspect);
	auto dstFormat = dst->getFormat(dstAspect);

	bool doFilter = (filter != VK_FILTER_NEAREST);
	// sRGB decode/encode is only needed when filtering, resolving, or when the
	// two formats differ in sRGB-ness.
	bool allowSRGBConversion =
	    doFilter ||
	    (src->getSampleCountFlagBits() > 1) ||
	    (srcFormat.isSRGBformat() != dstFormat.isSRGBformat());

	State state(srcFormat, dstFormat, src->getSampleCountFlagBits(), dst->getSampleCountFlagBits(),
	            Options{ doFilter, allowSRGBConversion });
	// Clamp when any part of the (filtered) source footprint can fall outside
	// the source mip level.
	state.clampToEdge = (region.srcOffsets[0].x < 0) ||
	                    (region.srcOffsets[0].y < 0) ||
	                    (static_cast<uint32_t>(region.srcOffsets[1].x) > srcExtent.width) ||
	                    (static_cast<uint32_t>(region.srcOffsets[1].y) > srcExtent.height) ||
	                    (doFilter && ((x0 < 0.5f) || (y0 < 0.5f)));
	// Trilinear (3D) filtering only when the blit actually scales in depth.
	state.filter3D = (region.srcOffsets[1].z - region.srcOffsets[0].z) !=
	                 (region.dstOffsets[1].z - region.dstOffsets[0].z);

	auto blitRoutine = getBlitRoutine(state);
	if(!blitRoutine)
	{
		return;
	}

	BlitData data = {
		nullptr,                                                                                 // source
		nullptr,                                                                                 // dest
		assert_cast<uint32_t>(src->rowPitchBytes(srcAspect, region.srcSubresource.mipLevel)),    // sPitchB
		assert_cast<uint32_t>(dst->rowPitchBytes(dstAspect, region.dstSubresource.mipLevel)),    // dPitchB
		assert_cast<uint32_t>(src->slicePitchBytes(srcAspect, region.srcSubresource.mipLevel)),  // sSliceB
		assert_cast<uint32_t>(dst->slicePitchBytes(dstAspect, region.dstSubresource.mipLevel)),  // dSliceB

		x0,
		y0,
		z0,
		widthRatio,
		heightRatio,
		depthRatio,

		region.dstOffsets[0].x,  // x0d
		region.dstOffsets[1].x,  // x1d
		region.dstOffsets[0].y,  // y0d
		region.dstOffsets[1].y,  // y1d
		region.dstOffsets[0].z,  // z0d
		region.dstOffsets[1].z,  // z1d

		static_cast<int>(srcExtent.width),   // sWidth
		static_cast<int>(srcExtent.height),  // sHeight
		static_cast<int>(srcExtent.depth),   // sDepth

		false,  // filter3D
	};

	VkImageSubresource srcSubres = {
		region.srcSubresource.aspectMask,
		region.srcSubresource.mipLevel,
		region.srcSubresource.baseArrayLayer
	};

	VkImageSubresource dstSubres = {
		region.dstSubresource.aspectMask,
		region.dstSubresource.mipLevel,
		region.dstSubresource.baseArrayLayer
	};

	VkImageSubresourceRange dstSubresRange = {
		region.dstSubresource.aspectMask,
		region.dstSubresource.mipLevel,
		1,  // levelCount
		region.dstSubresource.baseArrayLayer,
		region.dstSubresource.layerCount
	};

	// NOTE(review): this queries the *source* image with the *destination*
	// subresource range; dst->getLastLayerIndex(dstSubresRange) looks intended.
	// Layer counts match (asserted above), so this only differs when the base
	// layers or array sizes differ — confirm against vk::Image::getLastLayerIndex.
	uint32_t lastLayer = src->getLastLayerIndex(dstSubresRange);

	// Run the routine once per array layer.
	for(; dstSubres.arrayLayer <= lastLayer; srcSubres.arrayLayer++, dstSubres.arrayLayer++)
	{
		data.source = src->getTexelPointer({ 0, 0, 0 }, srcSubres);
		data.dest = dst->getTexelPointer({ 0, 0, 0 }, dstSubres);

		ASSERT(data.source < src->end());
		ASSERT(data.dest < dst->end());

		blitRoutine(&data);
	}

	dst->contentsChanged(dstSubresRange);
}
1940 
resolveDepth(const vk::ImageView * src,vk::ImageView * dst,const VkResolveModeFlagBits depthResolveMode)1941 static void resolveDepth(const vk::ImageView *src, vk::ImageView *dst, const VkResolveModeFlagBits depthResolveMode)
1942 {
1943 	if(depthResolveMode == VK_RESOLVE_MODE_NONE)
1944 	{
1945 		return;
1946 	}
1947 
1948 	vk::Format format = src->getFormat(VK_IMAGE_ASPECT_DEPTH_BIT);
1949 	VkExtent2D extent = src->getMipLevelExtent(0, VK_IMAGE_ASPECT_DEPTH_BIT);
1950 	int width = extent.width;
1951 	int height = extent.height;
1952 	int pitch = src->rowPitchBytes(VK_IMAGE_ASPECT_DEPTH_BIT, 0);
1953 
1954 	// To support other resolve modes, get the slice bytes and get a pointer to each sample plane.
1955 	// Then modify the loop below to include logic for handling each new mode.
1956 	uint8_t *source = (uint8_t *)src->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_DEPTH_BIT, 0, 0);
1957 	uint8_t *dest = (uint8_t *)dst->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_DEPTH_BIT, 0, 0);
1958 
1959 	size_t formatSize = format.bytes();
1960 	// TODO(b/167558951) support other resolve modes.
1961 	ASSERT(depthResolveMode == VK_RESOLVE_MODE_SAMPLE_ZERO_BIT);
1962 	for(int y = 0; y < height; y++)
1963 	{
1964 		memcpy(dest, source, formatSize * width);
1965 
1966 		source += pitch;
1967 		dest += pitch;
1968 	}
1969 
1970 	dst->contentsChanged(vk::Image::DIRECT_MEMORY_ACCESS);
1971 }
1972 
resolveStencil(const vk::ImageView * src,vk::ImageView * dst,const VkResolveModeFlagBits stencilResolveMode)1973 static void resolveStencil(const vk::ImageView *src, vk::ImageView *dst, const VkResolveModeFlagBits stencilResolveMode)
1974 {
1975 	if(stencilResolveMode == VK_RESOLVE_MODE_NONE)
1976 	{
1977 		return;
1978 	}
1979 
1980 	VkExtent2D extent = src->getMipLevelExtent(0, VK_IMAGE_ASPECT_STENCIL_BIT);
1981 	int width = extent.width;
1982 	int height = extent.height;
1983 	int pitch = src->rowPitchBytes(VK_IMAGE_ASPECT_STENCIL_BIT, 0);
1984 
1985 	// To support other resolve modes, use src->slicePitchBytes() and get a pointer to each sample's slice.
1986 	// Then modify the loop below to include logic for handling each new mode.
1987 	uint8_t *source = reinterpret_cast<uint8_t *>(src->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0));
1988 	uint8_t *dest = reinterpret_cast<uint8_t *>(dst->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0));
1989 
1990 	// TODO(b/167558951) support other resolve modes.
1991 	ASSERT(stencilResolveMode == VK_RESOLVE_MODE_SAMPLE_ZERO_BIT);
1992 	for(int y = 0; y < height; y++)
1993 	{
1994 		// Stencil is always 8 bits, so the width of the resource we're resolving is
1995 		// the number of bytes in each row we need to copy during for SAMPLE_ZERO
1996 		memcpy(dest, source, width);
1997 
1998 		source += pitch;
1999 		dest += pitch;
2000 	}
2001 
2002 	dst->contentsChanged(vk::Image::DIRECT_MEMORY_ACCESS);
2003 }
2004 
resolveDepthStencil(const vk::ImageView * src,vk::ImageView * dst,VkResolveModeFlagBits depthResolveMode,VkResolveModeFlagBits stencilResolveMode)2005 void Blitter::resolveDepthStencil(const vk::ImageView *src, vk::ImageView *dst, VkResolveModeFlagBits depthResolveMode, VkResolveModeFlagBits stencilResolveMode)
2006 {
2007 	VkImageSubresourceRange srcRange = src->getSubresourceRange();
2008 	VkImageSubresourceRange dstRange = src->getSubresourceRange();
2009 	ASSERT(src->getFormat() == dst->getFormat());
2010 	ASSERT(srcRange.layerCount == 1 && dstRange.layerCount == 1);
2011 	ASSERT(srcRange.aspectMask == dstRange.aspectMask);
2012 
2013 	if(srcRange.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
2014 	{
2015 		resolveDepth(src, dst, depthResolveMode);
2016 	}
2017 	if(srcRange.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
2018 	{
2019 		resolveStencil(src, dst, stencilResolveMode);
2020 	}
2021 }
2022 
resolve(const vk::Image * src,vk::Image * dst,VkImageResolve2KHR region)2023 void Blitter::resolve(const vk::Image *src, vk::Image *dst, VkImageResolve2KHR region)
2024 {
2025 	// "The aspectMask member of srcSubresource and dstSubresource must only contain VK_IMAGE_ASPECT_COLOR_BIT"
2026 	ASSERT(region.srcSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
2027 	ASSERT(region.dstSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
2028 	// "The layerCount member of srcSubresource and dstSubresource must match"
2029 	ASSERT(region.srcSubresource.layerCount == region.dstSubresource.layerCount);
2030 
2031 	// We use this method both for explicit resolves from vkCmdResolveImage, and implicit ones for resolve attachments.
2032 	// - vkCmdResolveImage: "srcImage and dstImage must have been created with the same image format."
2033 	// - VkSubpassDescription: "each resolve attachment that is not VK_ATTACHMENT_UNUSED must have the same VkFormat as its corresponding color attachment."
2034 	ASSERT(src->getFormat() == dst->getFormat());
2035 
2036 	if(fastResolve(src, dst, region))
2037 	{
2038 		return;
2039 	}
2040 
2041 	// Fall back to a generic blit which performs the resolve.
2042 	VkImageBlit2KHR blitRegion;
2043 	blitRegion.sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR;
2044 	blitRegion.pNext = nullptr;
2045 
2046 	blitRegion.srcOffsets[0] = blitRegion.srcOffsets[1] = region.srcOffset;
2047 	blitRegion.srcOffsets[1].x += region.extent.width;
2048 	blitRegion.srcOffsets[1].y += region.extent.height;
2049 	blitRegion.srcOffsets[1].z += region.extent.depth;
2050 
2051 	blitRegion.dstOffsets[0] = blitRegion.dstOffsets[1] = region.dstOffset;
2052 	blitRegion.dstOffsets[1].x += region.extent.width;
2053 	blitRegion.dstOffsets[1].y += region.extent.height;
2054 	blitRegion.dstOffsets[1].z += region.extent.depth;
2055 
2056 	blitRegion.srcSubresource = region.srcSubresource;
2057 	blitRegion.dstSubresource = region.dstSubresource;
2058 
2059 	blit(src, dst, blitRegion, VK_FILTER_NEAREST);
2060 }
2061 
// Per-byte average of two packed 8-bit quads, rounding up — the scalar
// equivalent of _mm_avg_epu8. For each byte: (x & y) + ((x ^ y) >> 1) is the
// floor of the average without inter-byte carries; adding the low bit of
// (x ^ y) rounds halves up.
static inline uint32_t averageByte4(uint32_t x, uint32_t y)
{
	const uint32_t diff = x ^ y;

	return (x & y) + ((diff >> 1) & 0x7F7F7F7F) + (diff & 0x01010101);
}
2066 
bool Blitter::fastResolve(const vk::Image *src, vk::Image *dst, VkImageResolve2KHR region)
{
	// Whole-image 4x-MSAA color resolve by direct memory averaging. Returns
	// false whenever the region, layout, or format isn't supported so the
	// caller can fall back to the generic blit path.

	// Only full-image, single-layer, 2D, zero-offset resolves are handled.
	if(region.dstOffset != VkOffset3D{ 0, 0, 0 })
	{
		return false;
	}

	if(region.srcOffset != VkOffset3D{ 0, 0, 0 })
	{
		return false;
	}

	if(region.srcSubresource.layerCount != 1)
	{
		return false;
	}

	if(region.extent != src->getExtent() ||
	   region.extent != dst->getExtent() ||
	   region.extent.depth != 1)
	{
		return false;
	}

	VkImageSubresource srcSubresource = {
		region.srcSubresource.aspectMask,
		region.srcSubresource.mipLevel,
		region.srcSubresource.baseArrayLayer
	};

	VkImageSubresource dstSubresource = {
		region.dstSubresource.aspectMask,
		region.dstSubresource.mipLevel,
		region.dstSubresource.baseArrayLayer
	};

	VkImageSubresourceRange dstSubresourceRange = {
		region.dstSubresource.aspectMask,
		region.dstSubresource.mipLevel,
		1,  // levelCount
		region.dstSubresource.baseArrayLayer,
		region.dstSubresource.layerCount
	};

	void *source = src->getTexelPointer({ 0, 0, 0 }, srcSubresource);
	uint8_t *dest = reinterpret_cast<uint8_t *>(dst->getTexelPointer({ 0, 0, 0 }, dstSubresource));

	auto format = src->getFormat();
	auto samples = src->getSampleCountFlagBits();
	auto extent = src->getExtent();

	int width = extent.width;
	int height = extent.height;
	int pitch = src->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, region.srcSubresource.mipLevel);
	int slice = src->slicePitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, region.srcSubresource.mipLevel);

	// The four sample planes are stored as consecutive slices.
	uint8_t *source0 = (uint8_t *)source;
	uint8_t *source1 = source0 + slice;
	uint8_t *source2 = source1 + slice;
	uint8_t *source3 = source2 + slice;

	[[maybe_unused]] const bool SSE2 = CPUID::supportsSSE2();

	// Only 8-bit-per-channel, 4-component color formats are supported here.
	if(format == VK_FORMAT_R8G8B8A8_UNORM || format == VK_FORMAT_B8G8R8A8_UNORM || format == VK_FORMAT_A8B8G8R8_UNORM_PACK32)
	{
		if(samples == 4)
		{
			for(int y = 0; y < height; y++)
			{
				int x = 0;

#if defined(__i386__) || defined(__x86_64__)
				// Vectorized path: average 4 texels (16 bytes) per iteration.
				if(SSE2)
				{
					for(; (x + 3) < width; x += 4)
					{
						__m128i c0 = _mm_loadu_si128((__m128i *)(source0 + 4 * x));
						__m128i c1 = _mm_loadu_si128((__m128i *)(source1 + 4 * x));
						__m128i c2 = _mm_loadu_si128((__m128i *)(source2 + 4 * x));
						__m128i c3 = _mm_loadu_si128((__m128i *)(source3 + 4 * x));

						c0 = _mm_avg_epu8(c0, c1);
						c2 = _mm_avg_epu8(c2, c3);
						c0 = _mm_avg_epu8(c0, c2);

						_mm_storeu_si128((__m128i *)(dest + 4 * x), c0);
					}
				}
#endif

				// Scalar tail (or full row on non-x86). averageByte4 rounds up
				// exactly like _mm_avg_epu8, so both paths produce identical results.
				for(; x < width; x++)
				{
					uint32_t c0 = *(uint32_t *)(source0 + 4 * x);
					uint32_t c1 = *(uint32_t *)(source1 + 4 * x);
					uint32_t c2 = *(uint32_t *)(source2 + 4 * x);
					uint32_t c3 = *(uint32_t *)(source3 + 4 * x);

					uint32_t c01 = averageByte4(c0, c1);
					uint32_t c23 = averageByte4(c2, c3);
					uint32_t c03 = averageByte4(c01, c23);

					*(uint32_t *)(dest + 4 * x) = c03;
				}

				source0 += pitch;
				source1 += pitch;
				source2 += pitch;
				source3 += pitch;
				dest += pitch;

				ASSERT(source0 < src->end());
				ASSERT(source3 < src->end());
				ASSERT(dest < dst->end());
			}
		}
		else
			UNSUPPORTED("Samples: %d", samples);
	}
	else
	{
		return false;
	}

	dst->contentsChanged(dstSubresourceRange);

	return true;
}
2194 
copy(const vk::Image * src,uint8_t * dst,unsigned int dstPitch)2195 void Blitter::copy(const vk::Image *src, uint8_t *dst, unsigned int dstPitch)
2196 {
2197 	VkExtent3D extent = src->getExtent();
2198 	size_t rowBytes = src->getFormat(VK_IMAGE_ASPECT_COLOR_BIT).bytes() * extent.width;
2199 	unsigned int srcPitch = src->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, 0);
2200 	ASSERT(dstPitch >= rowBytes && srcPitch >= rowBytes && src->getMipLevelExtent(VK_IMAGE_ASPECT_COLOR_BIT, 0).height >= extent.height);
2201 
2202 	const uint8_t *s = (uint8_t *)src->getTexelPointer({ 0, 0, 0 }, { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0 });
2203 	uint8_t *d = dst;
2204 
2205 	for(uint32_t y = 0; y < extent.height; y++)
2206 	{
2207 		memcpy(d, s, rowBytes);
2208 
2209 		s += srcPitch;
2210 		d += dstPitch;
2211 	}
2212 }
2213 
computeCubeCorner(Pointer<Byte> & layer,Int & x0,Int & x1,Int & y0,Int & y1,Int & pitchB,const State & state)2214 void Blitter::computeCubeCorner(Pointer<Byte> &layer, Int &x0, Int &x1, Int &y0, Int &y1, Int &pitchB, const State &state)
2215 {
2216 	int bytes = state.sourceFormat.bytes();
2217 
2218 	Float4 c = readFloat4(layer + ComputeOffset(x0, y1, pitchB, bytes), state) +
2219 	           readFloat4(layer + ComputeOffset(x1, y0, pitchB, bytes), state) +
2220 	           readFloat4(layer + ComputeOffset(x1, y1, pitchB, bytes), state);
2221 
2222 	c *= Float4(1.0f / 3.0f);
2223 
2224 	write(c, layer + ComputeOffset(x0, y0, pitchB, bytes), state);
2225 }
2226 
Blitter::CornerUpdateRoutineType Blitter::generateCornerUpdate(const State &state)
{
	// JIT-compiles a routine that fills the four corner border texels of all
	// six cube faces, taking a single CubeBorderData* argument.

	// Reading and writing from/to the same image
	ASSERT(state.sourceFormat == state.destFormat);
	ASSERT(state.srcSamples == state.destSamples);

	// Vulkan 1.2: "If samples is not VK_SAMPLE_COUNT_1_BIT, then imageType must be
	// VK_IMAGE_TYPE_2D, flags must not contain VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT"
	ASSERT(state.srcSamples == 1);

	CornerUpdateFunction function;
	{
		Pointer<Byte> blit(function.Arg<0>());

		// Unpack CubeBorderData: pointer to the first face, row pitch,
		// per-face byte size, and the (square) face dimension.
		Pointer<Byte> layers = *Pointer<Pointer<Byte>>(blit + OFFSET(CubeBorderData, layers));
		Int pitchB = *Pointer<Int>(blit + OFFSET(CubeBorderData, pitchB));
		UInt layerSize = *Pointer<Int>(blit + OFFSET(CubeBorderData, layerSize));
		UInt dim = *Pointer<Int>(blit + OFFSET(CubeBorderData, dim));

		// Low Border, Low Pixel, High Border, High Pixel
		Int LB(-1), LP(0), HB(dim), HP(dim - 1);

		// Update all four corners of each of the six consecutive faces.
		for(int face = 0; face < 6; face++)
		{
			computeCubeCorner(layers, LB, LP, LB, LP, pitchB, state);
			computeCubeCorner(layers, LB, LP, HB, HP, pitchB, state);
			computeCubeCorner(layers, HB, HP, LB, LP, pitchB, state);
			computeCubeCorner(layers, HB, HP, HB, HP, pitchB, state);
			layers = layers + layerSize;
		}
	}

	// NOTE(review): the routine name "BlitRoutine" is the same label used by
	// generate(); presumably just a debug name — confirm before renaming.
	return function("BlitRoutine");
}
2261 
void Blitter::updateBorders(const vk::Image *image, const VkImageSubresource &subresource)
{
	// Refreshes the 1-texel border of all six cube faces (edges copied from
	// adjacent faces, corners averaged) so that seamless cube filtering can
	// sample across face boundaries. 'subresource' identifies the +X face.
	ASSERT(image->getArrayLayers() >= (subresource.arrayLayer + 6));

	// From Vulkan 1.1 spec, section 11.5. Image Views:
	// "For cube and cube array image views, the layers of the image view starting
	//  at baseArrayLayer correspond to faces in the order +X, -X, +Y, -Y, +Z, -Z."
	VkImageSubresource posX = subresource;
	VkImageSubresource negX = posX;
	negX.arrayLayer++;
	VkImageSubresource posY = negX;
	posY.arrayLayer++;
	VkImageSubresource negY = posY;
	negY.arrayLayer++;
	VkImageSubresource posZ = negY;
	posZ.arrayLayer++;
	VkImageSubresource negZ = posZ;
	negZ.arrayLayer++;

	// Copy top / bottom
	copyCubeEdge(image, posX, BOTTOM, negY, RIGHT);
	copyCubeEdge(image, posY, BOTTOM, posZ, TOP);
	copyCubeEdge(image, posZ, BOTTOM, negY, TOP);
	copyCubeEdge(image, negX, BOTTOM, negY, LEFT);
	copyCubeEdge(image, negY, BOTTOM, negZ, BOTTOM);
	copyCubeEdge(image, negZ, BOTTOM, negY, BOTTOM);

	copyCubeEdge(image, posX, TOP, posY, RIGHT);
	copyCubeEdge(image, posY, TOP, negZ, TOP);
	copyCubeEdge(image, posZ, TOP, posY, BOTTOM);
	copyCubeEdge(image, negX, TOP, posY, LEFT);
	copyCubeEdge(image, negY, TOP, posZ, BOTTOM);
	copyCubeEdge(image, negZ, TOP, posY, TOP);

	// Copy left / right
	copyCubeEdge(image, posX, RIGHT, negZ, LEFT);
	copyCubeEdge(image, posY, RIGHT, posX, TOP);
	copyCubeEdge(image, posZ, RIGHT, posX, LEFT);
	copyCubeEdge(image, negX, RIGHT, posZ, LEFT);
	copyCubeEdge(image, negY, RIGHT, posX, BOTTOM);
	copyCubeEdge(image, negZ, RIGHT, negX, LEFT);

	copyCubeEdge(image, posX, LEFT, posZ, RIGHT);
	copyCubeEdge(image, posY, LEFT, negX, TOP);
	copyCubeEdge(image, posZ, LEFT, negX, RIGHT);
	copyCubeEdge(image, negX, LEFT, negZ, RIGHT);
	copyCubeEdge(image, negY, LEFT, negX, BOTTOM);
	copyCubeEdge(image, negZ, LEFT, posX, RIGHT);

	// Compute corner colors
	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresource.aspectMask);
	vk::Format format = image->getFormat(aspect);
	VkSampleCountFlagBits samples = image->getSampleCountFlagBits();
	State state(format, format, samples, samples, Options{ 0xF });

	// Vulkan 1.2: "If samples is not VK_SAMPLE_COUNT_1_BIT, then imageType must be
	// VK_IMAGE_TYPE_2D, flags must not contain VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT"
	ASSERT(samples == VK_SAMPLE_COUNT_1_BIT);

	auto cornerUpdateRoutine = getCornerUpdateRoutine(state);
	if(!cornerUpdateRoutine)
	{
		return;
	}

	VkExtent3D extent = image->getMipLevelExtent(aspect, subresource.mipLevel);
	CubeBorderData data = {
		image->getTexelPointer({ 0, 0, 0 }, posX),
		assert_cast<uint32_t>(image->rowPitchBytes(aspect, subresource.mipLevel)),
		assert_cast<uint32_t>(image->getLayerSize(aspect)),
		extent.width
	};
	cornerUpdateRoutine(&data);
}
2336 
void Blitter::copyCubeEdge(const vk::Image *image,
                           const VkImageSubresource &dstSubresource, Edge dstEdge,
                           const VkImageSubresource &srcSubresource, Edge srcEdge)
{
	// Copies one face's edge row/column of texels into the adjacent face's
	// border (the texels just outside the face), excluding the corners.
	ASSERT(srcSubresource.aspectMask == dstSubresource.aspectMask);
	ASSERT(srcSubresource.mipLevel == dstSubresource.mipLevel);
	ASSERT(srcSubresource.arrayLayer != dstSubresource.arrayLayer);

	// Figure out if the edges to be copied in reverse order respectively from one another
	// The copy should be reversed whenever the same edges are contiguous or if we're
	// copying top <-> right or bottom <-> left. This is explained by the layout, which is:
	//
	//      | +y |
	// | -x | +z | +x | -z |
	//      | -y |

	bool reverse = (srcEdge == dstEdge) ||
	               ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
	               ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
	               ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
	               ((srcEdge == LEFT) && (dstEdge == BOTTOM));

	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(srcSubresource.aspectMask);
	int bytes = image->getFormat(aspect).bytes();
	int pitchB = image->rowPitchBytes(aspect, srcSubresource.mipLevel);

	VkExtent3D extent = image->getMipLevelExtent(aspect, srcSubresource.mipLevel);
	int w = extent.width;
	int h = extent.height;
	if(w != h)
	{
		UNSUPPORTED("Cube doesn't have square faces : (%d, %d)", w, h);
	}

	// Src is expressed in the regular [0, width-1], [0, height-1] space
	bool srcHorizontal = ((srcEdge == TOP) || (srcEdge == BOTTOM));
	// Step by one texel along a horizontal edge, or one row along a vertical one.
	int srcDelta = srcHorizontal ? bytes : pitchB;
	VkOffset3D srcOffset = { (srcEdge == RIGHT) ? (w - 1) : 0, (srcEdge == BOTTOM) ? (h - 1) : 0, 0 };

	// Dst contains borders, so it is expressed in the [-1, width], [-1, height] space
	bool dstHorizontal = ((dstEdge == TOP) || (dstEdge == BOTTOM));
	// A negative delta walks the destination edge backwards for reversed copies.
	int dstDelta = (dstHorizontal ? bytes : pitchB) * (reverse ? -1 : 1);
	VkOffset3D dstOffset = { (dstEdge == RIGHT) ? w : -1, (dstEdge == BOTTOM) ? h : -1, 0 };

	// Don't write in the corners
	if(dstHorizontal)
	{
		// For reversed copies start at the far end (just inside the far corner).
		dstOffset.x += reverse ? w : 1;
	}
	else
	{
		dstOffset.y += reverse ? h : 1;
	}

	const uint8_t *src = static_cast<const uint8_t *>(image->getTexelPointer(srcOffset, srcSubresource));
	uint8_t *dst = static_cast<uint8_t *>(image->getTexelPointer(dstOffset, dstSubresource));
	// NOTE(review): these are upper-bound checks only; when dstDelta is
	// negative, dst + w * dstDelta points before the starting texel.
	ASSERT((src < image->end()) && ((src + (w * srcDelta)) < image->end()));
	ASSERT((dst < image->end()) && ((dst + (w * dstDelta)) < image->end()));

	// Copy w texels, one at a time (src and dst strides generally differ).
	for(int i = 0; i < w; ++i, dst += dstDelta, src += srcDelta)
	{
		memcpy(dst, src, bytes);
	}
}
2401 
2402 }  // namespace sw
2403