• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "Blitter.hpp"
16 
17 #include "Pipeline/ShaderCore.hpp"
18 #include "Reactor/Reactor.hpp"
19 #include "System/CPUID.hpp"
20 #include "System/Debug.hpp"
21 #include "System/Half.hpp"
22 #include "System/Memory.hpp"
23 #include "Vulkan/VkBuffer.hpp"
24 #include "Vulkan/VkImage.hpp"
25 #include "Vulkan/VkImageView.hpp"
26 
27 #include <utility>
28 
29 #if defined(__i386__) || defined(__x86_64__)
30 #	include <xmmintrin.h>
31 #	include <emmintrin.h>
32 #endif
33 
34 namespace {
PackFields(rr::Int4 const & ints,const sw::int4 shifts)35 rr::RValue<rr::Int> PackFields(rr::Int4 const &ints, const sw::int4 shifts)
36 {
37 	return (rr::Int(ints.x) << shifts[0]) |
38 	       (rr::Int(ints.y) << shifts[1]) |
39 	       (rr::Int(ints.z) << shifts[2]) |
40 	       (rr::Int(ints.w) << shifts[3]);
41 }
42 }  // namespace
43 
44 namespace sw {
45 
// Blitter owns two caches of generated routines, each guarded by its own
// mutex: blit routines (keyed on the full blit State) and corner-update
// routines (keyed per format only, hence the much smaller capacity).
Blitter::Blitter()
    : blitMutex()
    , blitCache(1024)
    , cornerUpdateMutex()
    , cornerUpdateCache(64)  // We only need one of these per format
{
}
53 
~Blitter()54 Blitter::~Blitter()
55 {
56 }
57 
// Clears `dest` over the given subresource range (or only `renderArea`, when
// non-null) with the value at `pixel`, encoded as `format`. A plain
// memory-fill fast path is tried first; otherwise a generated blit routine
// replicates the single source texel over the destination area.
void Blitter::clear(const void *pixel, vk::Format format, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea)
{
	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
	vk::Format dstFormat = viewFormat.getAspectFormat(aspect);
	if(dstFormat == VK_FORMAT_UNDEFINED)
	{
		return;
	}

	// Clamp the clear color to the representable range of normalized formats
	// before packing/blitting, so out-of-range inputs don't overflow fields.
	VkClearColorValue clampedPixel;
	if(viewFormat.isSignedNormalized() || viewFormat.isUnsignedNormalized())
	{
		const float minValue = viewFormat.isSignedNormalized() ? -1.0f : 0.0f;
		memcpy(clampedPixel.float32, pixel, sizeof(VkClearColorValue));
		clampedPixel.float32[0] = sw::clamp(clampedPixel.float32[0], minValue, 1.0f);
		clampedPixel.float32[1] = sw::clamp(clampedPixel.float32[1], minValue, 1.0f);
		clampedPixel.float32[2] = sw::clamp(clampedPixel.float32[2], minValue, 1.0f);
		clampedPixel.float32[3] = sw::clamp(clampedPixel.float32[3], minValue, 1.0f);
		pixel = clampedPixel.float32;
	}

	if(fastClear(pixel, format, dest, dstFormat, subresourceRange, renderArea))
	{
		return;
	}

	// Slow path: treat the clear value as a 1x1x1 source image and "blit" it
	// over the destination with all four channels enabled (0xF write mask).
	State state(format, dstFormat, 1, dest->getSampleCountFlagBits(), Options{ 0xF });
	auto blitRoutine = getBlitRoutine(state);
	if(!blitRoutine)
	{
		return;
	}

	VkImageSubresource subres = {
		subresourceRange.aspectMask,
		subresourceRange.baseMipLevel,
		subresourceRange.baseArrayLayer
	};

	uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
	uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);

	VkRect2D area = { { 0, 0 }, { 0, 0 } };
	if(renderArea)
	{
		// A render area only makes sense for a single mip level.
		ASSERT(subresourceRange.levelCount == 1);
		area = *renderArea;
	}

	for(; subres.mipLevel <= lastMipLevel; subres.mipLevel++)
	{
		VkExtent3D extent = dest->getMipLevelExtent(aspect, subres.mipLevel);
		if(!renderArea)
		{
			// No explicit render area: clear the entire mip level.
			area.extent.width = extent.width;
			area.extent.height = extent.height;
		}

		BlitData data = {
			pixel, nullptr,  // source, dest

			assert_cast<uint32_t>(format.bytes()),                                  // sPitchB
			assert_cast<uint32_t>(dest->rowPitchBytes(aspect, subres.mipLevel)),    // dPitchB
			0,                                                                      // sSliceB (unused in clear operations)
			assert_cast<uint32_t>(dest->slicePitchBytes(aspect, subres.mipLevel)),  // dSliceB

			0.5f, 0.5f, 0.5f, 0.0f, 0.0f, 0.0f,  // x0, y0, z0, w, h, d

			area.offset.x, static_cast<int>(area.offset.x + area.extent.width),   // x0d, x1d
			area.offset.y, static_cast<int>(area.offset.y + area.extent.height),  // y0d, y1d
			0, 1,                                                                 // z0d, z1d

			0, 0, 0,  // sWidth, sHeight, sDepth

			false,  // filter3D
		};

		if(renderArea && dest->is3DSlice())
		{
			// Reinterpret layers as depth slices
			subres.arrayLayer = 0;
			for(uint32_t depth = subresourceRange.baseArrayLayer; depth <= lastLayer; depth++)
			{
				data.dest = dest->getTexelPointer({ 0, 0, static_cast<int32_t>(depth) }, subres);
				blitRoutine(&data);
			}
		}
		else
		{
			// Clear every layer and, for 3D images, every depth slice.
			for(subres.arrayLayer = subresourceRange.baseArrayLayer; subres.arrayLayer <= lastLayer; subres.arrayLayer++)
			{
				for(uint32_t depth = 0; depth < extent.depth; depth++)
				{
					data.dest = dest->getTexelPointer({ 0, 0, static_cast<int32_t>(depth) }, subres);

					blitRoutine(&data);
				}
			}
		}
	}
	dest->contentsChanged(subresourceRange);
}
160 
// Fast path for clear(): packs the clear value into a single 8/16/32-bit
// pattern and fills destination memory directly, without a generated routine.
// Only the three clear-value encodings listed below and the destination
// formats in the switch are handled; returns false otherwise so the caller
// can fall back to the generic path.
bool Blitter::fastClear(const void *clearValue, vk::Format clearFormat, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea)
{
	if(clearFormat != VK_FORMAT_R32G32B32A32_SFLOAT &&
	   clearFormat != VK_FORMAT_D32_SFLOAT &&
	   clearFormat != VK_FORMAT_S8_UINT)
	{
		return false;
	}

	// Overlays the three supported clear-value layouts (RGBA floats, a depth
	// float, or a stencil integer) so the switch below can read whichever
	// interpretation matches `viewFormat`.
	union ClearValue
	{
		struct
		{
			float r;
			float g;
			float b;
			float a;
		};

		float rgb[3];

		float d;
		uint32_t d_as_u32;

		uint32_t s;
	};

	const ClearValue &c = *reinterpret_cast<const ClearValue *>(clearValue);

	uint32_t packed = 0;

	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
	switch(viewFormat)
	{
	case VK_FORMAT_R5G6B5_UNORM_PACK16:
		// + 0.5f implements round-to-nearest for the unorm conversion.
		packed = ((uint16_t)(31 * c.b + 0.5f) << 0) |
		         ((uint16_t)(63 * c.g + 0.5f) << 5) |
		         ((uint16_t)(31 * c.r + 0.5f) << 11);
		break;
	case VK_FORMAT_B5G6R5_UNORM_PACK16:
		packed = ((uint16_t)(31 * c.r + 0.5f) << 0) |
		         ((uint16_t)(63 * c.g + 0.5f) << 5) |
		         ((uint16_t)(31 * c.b + 0.5f) << 11);
		break;
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
	case VK_FORMAT_R8G8B8A8_UNORM:
		packed = ((uint32_t)(255 * c.a + 0.5f) << 24) |
		         ((uint32_t)(255 * c.b + 0.5f) << 16) |
		         ((uint32_t)(255 * c.g + 0.5f) << 8) |
		         ((uint32_t)(255 * c.r + 0.5f) << 0);
		break;
	case VK_FORMAT_B8G8R8A8_UNORM:
		packed = ((uint32_t)(255 * c.a + 0.5f) << 24) |
		         ((uint32_t)(255 * c.r + 0.5f) << 16) |
		         ((uint32_t)(255 * c.g + 0.5f) << 8) |
		         ((uint32_t)(255 * c.b + 0.5f) << 0);
		break;
	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
		packed = R11G11B10F(c.rgb);
		break;
	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
		packed = RGB9E5(c.rgb);
		break;
	case VK_FORMAT_D32_SFLOAT:
		ASSERT(clearFormat == VK_FORMAT_D32_SFLOAT);
		packed = c.d_as_u32;  // float reinterpreted as uint32
		break;
	case VK_FORMAT_S8_UINT:
		ASSERT(clearFormat == VK_FORMAT_S8_UINT);
		packed = static_cast<uint8_t>(c.s);
		break;
	default:
		return false;
	}

	VkImageSubresource subres = {
		subresourceRange.aspectMask,
		subresourceRange.baseMipLevel,
		subresourceRange.baseArrayLayer
	};
	uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
	uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);

	VkRect2D area = { { 0, 0 }, { 0, 0 } };
	if(renderArea)
	{
		// A render area only makes sense for a single mip level.
		ASSERT(subresourceRange.levelCount == 1);
		area = *renderArea;
	}

	for(; subres.mipLevel <= lastMipLevel; subres.mipLevel++)
	{
		int rowPitchBytes = dest->rowPitchBytes(aspect, subres.mipLevel);
		int slicePitchBytes = dest->slicePitchBytes(aspect, subres.mipLevel);
		VkExtent3D extent = dest->getMipLevelExtent(aspect, subres.mipLevel);
		if(!renderArea)
		{
			area.extent.width = extent.width;
			area.extent.height = extent.height;
		}
		if(dest->is3DSlice())
		{
			extent.depth = 1;  // The 3D image is instead interpreted as a 2D image with layers
		}

		for(subres.arrayLayer = subresourceRange.baseArrayLayer; subres.arrayLayer <= lastLayer; subres.arrayLayer++)
		{
			for(uint32_t depth = 0; depth < extent.depth; depth++)
			{
				uint8_t *slice = (uint8_t *)dest->getTexelPointer(
				    { area.offset.x, area.offset.y, static_cast<int32_t>(depth) }, subres);

				// Fill every sample; samples are laid out slicePitchBytes apart.
				for(int j = 0; j < dest->getSampleCountFlagBits(); j++)
				{
					uint8_t *d = slice;

					// Fill row by row with the packed pattern, sized to the
					// destination texel width.
					switch(viewFormat.bytes())
					{
					case 4:
						for(uint32_t i = 0; i < area.extent.height; i++)
						{
							ASSERT(d < dest->end());
							sw::clear((uint32_t *)d, packed, area.extent.width);
							d += rowPitchBytes;
						}
						break;
					case 2:
						for(uint32_t i = 0; i < area.extent.height; i++)
						{
							ASSERT(d < dest->end());
							sw::clear((uint16_t *)d, static_cast<uint16_t>(packed), area.extent.width);
							d += rowPitchBytes;
						}
						break;
					case 1:
						for(uint32_t i = 0; i < area.extent.height; i++)
						{
							ASSERT(d < dest->end());
							memset(d, packed, area.extent.width);
							d += rowPitchBytes;
						}
						break;
					default:
						// Unreachable: every format accepted by the switch
						// above is 1, 2 or 4 bytes per texel.
						assert(false);
					}

					slice += slicePitchBytes;
				}
			}
		}
	}
	dest->contentsChanged(subresourceRange);

	return true;
}
317 
// Emits Reactor code that reads one texel of `state.sourceFormat` at
// `element` and widens it to a Float4. Integer and normalized formats are
// returned as raw, unscaled field values (e.g. a 5-bit field yields 0..31);
// normalization/scaling is applied by later pipeline stages. Missing
// components default to (0, 0, 0, 1). For some R/RG formats, w is set to the
// channel type's maximum value — presumably so it normalizes to 1.0 after
// scaling; confirm against the caller's scale step.
Float4 Blitter::readFloat4(Pointer<Byte> element, const State &state)
{
	Float4 c(0.0f, 0.0f, 0.0f, 1.0f);

	switch(state.sourceFormat)
	{
	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
		c.w = Float(Int(*Pointer<Byte>(element)) & Int(0xF));
		c.x = Float((Int(*Pointer<Byte>(element)) >> 4) & Int(0xF));
		c.y = Float(Int(*Pointer<Byte>(element + 1)) & Int(0xF));
		c.z = Float((Int(*Pointer<Byte>(element + 1)) >> 4) & Int(0xF));
		break;
	case VK_FORMAT_R8_SINT:
	case VK_FORMAT_R8_SNORM:
		c.x = Float(Int(*Pointer<SByte>(element)));
		c.w = float(0x7F);
		break;
	case VK_FORMAT_R8_UNORM:
	case VK_FORMAT_R8_UINT:
	case VK_FORMAT_R8_SRGB:
		c.x = Float(Int(*Pointer<Byte>(element)));
		c.w = float(0xFF);
		break;
	case VK_FORMAT_R16_SINT:
	case VK_FORMAT_R16_SNORM:
		c.x = Float(Int(*Pointer<Short>(element)));
		c.w = float(0x7FFF);
		break;
	case VK_FORMAT_R16_UNORM:
	case VK_FORMAT_R16_UINT:
		c.x = Float(Int(*Pointer<UShort>(element)));
		c.w = float(0xFFFF);
		break;
	case VK_FORMAT_R32_SINT:
		c.x = Float(*Pointer<Int>(element));
		c.w = float(0x7FFFFFFF);
		break;
	case VK_FORMAT_R32_UINT:
		c.x = Float(*Pointer<UInt>(element));
		c.w = float(0xFFFFFFFF);
		break;
	case VK_FORMAT_B8G8R8A8_SRGB:
	case VK_FORMAT_B8G8R8A8_UNORM:
		// Swizzle BGRA -> RGBA.
		c = Float4(*Pointer<Byte4>(element)).zyxw;
		break;
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
	case VK_FORMAT_R8G8B8A8_SINT:
	case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
	case VK_FORMAT_R8G8B8A8_SNORM:
		c = Float4(*Pointer<SByte4>(element));
		break;
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
	case VK_FORMAT_R8G8B8A8_UNORM:
	case VK_FORMAT_R8G8B8A8_UINT:
	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
	case VK_FORMAT_R8G8B8A8_SRGB:
		c = Float4(*Pointer<Byte4>(element));
		break;
	case VK_FORMAT_R16G16B16A16_SINT:
	case VK_FORMAT_R16G16B16A16_SNORM:
		c = Float4(*Pointer<Short4>(element));
		break;
	case VK_FORMAT_R16G16B16A16_UNORM:
	case VK_FORMAT_R16G16B16A16_UINT:
		c = Float4(*Pointer<UShort4>(element));
		break;
	case VK_FORMAT_R32G32B32A32_SINT:
		c = Float4(*Pointer<Int4>(element));
		break;
	case VK_FORMAT_R32G32B32A32_UINT:
		c = Float4(*Pointer<UInt4>(element));
		break;
	case VK_FORMAT_R8G8_SINT:
	case VK_FORMAT_R8G8_SNORM:
		c.x = Float(Int(*Pointer<SByte>(element + 0)));
		c.y = Float(Int(*Pointer<SByte>(element + 1)));
		c.w = float(0x7F);
		break;
	case VK_FORMAT_R8G8_UNORM:
	case VK_FORMAT_R8G8_UINT:
	case VK_FORMAT_R8G8_SRGB:
		c.x = Float(Int(*Pointer<Byte>(element + 0)));
		c.y = Float(Int(*Pointer<Byte>(element + 1)));
		c.w = float(0xFF);
		break;
	case VK_FORMAT_R16G16_SINT:
	case VK_FORMAT_R16G16_SNORM:
		c.x = Float(Int(*Pointer<Short>(element + 0)));
		c.y = Float(Int(*Pointer<Short>(element + 2)));
		c.w = float(0x7FFF);
		break;
	case VK_FORMAT_R16G16_UNORM:
	case VK_FORMAT_R16G16_UINT:
		c.x = Float(Int(*Pointer<UShort>(element + 0)));
		c.y = Float(Int(*Pointer<UShort>(element + 2)));
		c.w = float(0xFFFF);
		break;
	case VK_FORMAT_R32G32_SINT:
		c.x = Float(*Pointer<Int>(element + 0));
		c.y = Float(*Pointer<Int>(element + 4));
		c.w = float(0x7FFFFFFF);
		break;
	case VK_FORMAT_R32G32_UINT:
		c.x = Float(*Pointer<UInt>(element + 0));
		c.y = Float(*Pointer<UInt>(element + 4));
		c.w = float(0xFFFFFFFF);
		break;
	case VK_FORMAT_R32G32B32A32_SFLOAT:
		c = *Pointer<Float4>(element);
		break;
	case VK_FORMAT_R32G32_SFLOAT:
		c.x = *Pointer<Float>(element + 0);
		c.y = *Pointer<Float>(element + 4);
		break;
	case VK_FORMAT_R32_SFLOAT:
		c.x = *Pointer<Float>(element);
		break;
	// Intentional fallthrough cascade: each wider half-float format reads its
	// extra component, then falls into the narrower format's cases.
	case VK_FORMAT_R16G16B16A16_SFLOAT:
		c.w = Float(*Pointer<Half>(element + 6));
		// [[fallthrough]]
	case VK_FORMAT_R16G16B16_SFLOAT:
		c.z = Float(*Pointer<Half>(element + 4));
		// [[fallthrough]]
	case VK_FORMAT_R16G16_SFLOAT:
		c.y = Float(*Pointer<Half>(element + 2));
		// [[fallthrough]]
	case VK_FORMAT_R16_SFLOAT:
		c.x = Float(*Pointer<Half>(element));
		break;
	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
		c = r11g11b10Unpack(*Pointer<UInt>(element));
		break;
	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
		// This type contains a common 5 bit exponent (E) and a 9 bit the mantissa for R, G and B.
		c.x = Float(*Pointer<UInt>(element) & UInt(0x000001FF));          // R's mantissa (bits 0-8)
		c.y = Float((*Pointer<UInt>(element) & UInt(0x0003FE00)) >> 9);   // G's mantissa (bits 9-17)
		c.z = Float((*Pointer<UInt>(element) & UInt(0x07FC0000)) >> 18);  // B's mantissa (bits 18-26)
		c *= Float4(
		    // 2^E, using the exponent (bits 27-31) and treating it as an unsigned integer value
		    Float(UInt(1) << ((*Pointer<UInt>(element) & UInt(0xF8000000)) >> 27)) *
		    // Since the 9 bit mantissa values currently stored in RGB were converted straight
		    // from int to float (in the [0, 1<<9] range instead of the [0, 1] range), they
		    // are (1 << 9) times too high.
		    // Also, the exponent has 5 bits and we compute the exponent bias of floating point
		    // formats using "2^(k-1) - 1", so, in this case, the exponent bias is 2^(5-1)-1 = 15
		    // Exponent bias (15) + number of mantissa bits per component (9) = 24
		    Float(1.0f / (1 << 24)));
		c.w = 1.0f;
		break;
	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
		c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
		break;
	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
		c.w = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
		c.x = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
		break;
	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
		c.w = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
		c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
		break;
	case VK_FORMAT_R5G6B5_UNORM_PACK16:
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));
		c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
		break;
	case VK_FORMAT_B5G6R5_UNORM_PACK16:
		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));
		c.x = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
		break;
	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07C0)) >> UShort(6)));
		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x003E)) >> UShort(1)));
		c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x0001)));
		break;
	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07C0)) >> UShort(6)));
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x003E)) >> UShort(1)));
		c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x0001)));
		break;
	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
		c.w = Float(Int((*Pointer<UShort>(element) & UShort(0x8000)) >> UShort(15)));
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x7C00)) >> UShort(10)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x03E0)) >> UShort(5)));
		c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
		break;
	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
		c.x = Float(Int((*Pointer<UInt>(element) & UInt(0x000003FF))));
		c.y = Float(Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10));
		c.z = Float(Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20));
		c.w = Float(Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30));
		break;
	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
		c.z = Float(Int((*Pointer<UInt>(element) & UInt(0x000003FF))));
		c.y = Float(Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10));
		c.x = Float(Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20));
		c.w = Float(Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30));
		break;
	case VK_FORMAT_D16_UNORM:
		c.x = Float(Int((*Pointer<UShort>(element))));
		break;
	case VK_FORMAT_X8_D24_UNORM_PACK32:
		c.x = Float(Int((*Pointer<UInt>(element) & UInt(0xFFFFFF00)) >> 8));
		break;
	case VK_FORMAT_D32_SFLOAT:
		c.x = *Pointer<Float>(element);
		break;
	case VK_FORMAT_S8_UINT:
		c.x = Float(Int(*Pointer<Byte>(element)));
		break;
	default:
		UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
	}

	return c;
}
543 
write(Float4 & c,Pointer<Byte> element,const State & state)544 void Blitter::write(Float4 &c, Pointer<Byte> element, const State &state)
545 {
546 	bool writeR = state.writeRed;
547 	bool writeG = state.writeGreen;
548 	bool writeB = state.writeBlue;
549 	bool writeA = state.writeAlpha;
550 	bool writeRGBA = writeR && writeG && writeB && writeA;
551 
552 	switch(state.destFormat)
553 	{
554 	case VK_FORMAT_R4G4_UNORM_PACK8:
555 		if(writeR | writeG)
556 		{
557 			if(!writeR)
558 			{
559 				*Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
560 				                          (*Pointer<Byte>(element) & Byte(0xF0));
561 			}
562 			else if(!writeG)
563 			{
564 				*Pointer<Byte>(element) = (*Pointer<Byte>(element) & Byte(0xF)) |
565 				                          (Byte(RoundInt(Float(c.x))) << Byte(4));
566 			}
567 			else
568 			{
569 				*Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
570 				                          (Byte(RoundInt(Float(c.x))) << Byte(4));
571 			}
572 		}
573 		break;
574 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
575 		if(writeRGBA)
576 		{
577 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 12, 8, 4, 0 }));
578 		}
579 		else
580 		{
581 			unsigned short mask = (writeA ? 0x000F : 0x0000) |
582 			                      (writeB ? 0x00F0 : 0x0000) |
583 			                      (writeG ? 0x0F00 : 0x0000) |
584 			                      (writeR ? 0xF000 : 0x0000);
585 			unsigned short unmask = ~mask;
586 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
587 			                            (UShort(PackFields(RoundInt(c) & Int4(0xF), { 12, 8, 4, 0 })) & UShort(mask));
588 		}
589 		break;
590 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
591 		if(writeRGBA)
592 		{
593 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 4, 8, 12, 0 }));
594 		}
595 		else
596 		{
597 			unsigned short mask = (writeA ? 0x000F : 0x0000) |
598 			                      (writeR ? 0x00F0 : 0x0000) |
599 			                      (writeG ? 0x0F00 : 0x0000) |
600 			                      (writeB ? 0xF000 : 0x0000);
601 			unsigned short unmask = ~mask;
602 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
603 			                            (UShort(PackFields(RoundInt(c) & Int4(0xF), { 4, 8, 12, 0 })) & UShort(mask));
604 		}
605 		break;
606 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
607 		if(writeRGBA)
608 		{
609 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 8, 4, 0, 12 }));
610 		}
611 		else
612 		{
613 			unsigned short mask = (writeB ? 0x000F : 0x0000) |
614 			                      (writeG ? 0x00F0 : 0x0000) |
615 			                      (writeR ? 0x0F00 : 0x0000) |
616 			                      (writeA ? 0xF000 : 0x0000);
617 			unsigned short unmask = ~mask;
618 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
619 			                            (UShort(PackFields(RoundInt(c) & Int4(0xF), { 8, 4, 0, 12 })) & UShort(mask));
620 		}
621 		break;
622 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
623 		if(writeRGBA)
624 		{
625 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 0, 4, 8, 12 }));
626 		}
627 		else
628 		{
629 			unsigned short mask = (writeR ? 0x000F : 0x0000) |
630 			                      (writeG ? 0x00F0 : 0x0000) |
631 			                      (writeB ? 0x0F00 : 0x0000) |
632 			                      (writeA ? 0xF000 : 0x0000);
633 			unsigned short unmask = ~mask;
634 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
635 			                            (UShort(PackFields(RoundInt(c) & Int4(0xF), { 0, 4, 8, 12 })) & UShort(mask));
636 		}
637 		break;
638 	case VK_FORMAT_B8G8R8A8_SRGB:
639 	case VK_FORMAT_B8G8R8A8_UNORM:
640 		if(writeRGBA)
641 		{
642 			Short4 c0 = RoundShort4(c.zyxw);
643 			*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
644 		}
645 		else
646 		{
647 			if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
648 			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
649 			if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
650 			if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
651 		}
652 		break;
653 	case VK_FORMAT_B8G8R8_SNORM:
654 		if(writeB) { *Pointer<SByte>(element + 0) = SByte(RoundInt(Float(c.z))); }
655 		if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
656 		if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
657 		break;
658 	case VK_FORMAT_B8G8R8_UNORM:
659 	case VK_FORMAT_B8G8R8_SRGB:
660 		if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
661 		if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
662 		if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
663 		break;
664 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
665 	case VK_FORMAT_R8G8B8A8_UNORM:
666 	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
667 	case VK_FORMAT_R8G8B8A8_SRGB:
668 	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
669 	case VK_FORMAT_R8G8B8A8_UINT:
670 	case VK_FORMAT_R8G8B8A8_USCALED:
671 	case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
672 		if(writeRGBA)
673 		{
674 			Short4 c0 = RoundShort4(c);
675 			*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
676 		}
677 		else
678 		{
679 			if(writeR) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.x))); }
680 			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
681 			if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
682 			if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
683 		}
684 		break;
685 	case VK_FORMAT_R32G32B32A32_SFLOAT:
686 		if(writeRGBA)
687 		{
688 			*Pointer<Float4>(element) = c;
689 		}
690 		else
691 		{
692 			if(writeR) { *Pointer<Float>(element) = c.x; }
693 			if(writeG) { *Pointer<Float>(element + 4) = c.y; }
694 			if(writeB) { *Pointer<Float>(element + 8) = c.z; }
695 			if(writeA) { *Pointer<Float>(element + 12) = c.w; }
696 		}
697 		break;
698 	case VK_FORMAT_R32G32B32_SFLOAT:
699 		if(writeR) { *Pointer<Float>(element) = c.x; }
700 		if(writeG) { *Pointer<Float>(element + 4) = c.y; }
701 		if(writeB) { *Pointer<Float>(element + 8) = c.z; }
702 		break;
703 	case VK_FORMAT_R32G32_SFLOAT:
704 		if(writeR && writeG)
705 		{
706 			*Pointer<Float2>(element) = Float2(c);
707 		}
708 		else
709 		{
710 			if(writeR) { *Pointer<Float>(element) = c.x; }
711 			if(writeG) { *Pointer<Float>(element + 4) = c.y; }
712 		}
713 		break;
714 	case VK_FORMAT_R32_SFLOAT:
715 		if(writeR) { *Pointer<Float>(element) = c.x; }
716 		break;
717 	case VK_FORMAT_R16G16B16A16_SFLOAT:
718 		if(writeA) { *Pointer<Half>(element + 6) = Half(c.w); }
719 		// [[fallthrough]]
720 	case VK_FORMAT_R16G16B16_SFLOAT:
721 		if(writeB) { *Pointer<Half>(element + 4) = Half(c.z); }
722 		// [[fallthrough]]
723 	case VK_FORMAT_R16G16_SFLOAT:
724 		if(writeG) { *Pointer<Half>(element + 2) = Half(c.y); }
725 		// [[fallthrough]]
726 	case VK_FORMAT_R16_SFLOAT:
727 		if(writeR) { *Pointer<Half>(element) = Half(c.x); }
728 		break;
729 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
730 		{
731 			UInt rgb = r11g11b10Pack(c);
732 
733 			UInt old = *Pointer<UInt>(element);
734 
735 			unsigned int mask = (writeR ? 0x000007FF : 0) |
736 			                    (writeG ? 0x003FF800 : 0) |
737 			                    (writeB ? 0xFFC00000 : 0);
738 
739 			*Pointer<UInt>(element) = (rgb & mask) | (old & ~mask);
740 		}
741 		break;
742 	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
743 		{
744 			ASSERT(writeRGBA);  // Can't sensibly write just part of this format.
745 
746 			// Vulkan 1.1.117 section 15.2.1 RGB to Shared Exponent Conversion
747 
748 			constexpr int N = 9;       // number of mantissa bits per component
749 			constexpr int B = 15;      // exponent bias
750 			constexpr int E_max = 31;  // maximum possible biased exponent value
751 
752 			// Maximum representable value.
753 			constexpr float sharedexp_max = ((static_cast<float>(1 << N) - 1) / static_cast<float>(1 << N)) * static_cast<float>(1 << (E_max - B));
754 
755 			// Clamp components to valid range. NaN becomes 0.
756 			Float red_c = Min(IfThenElse(!(c.x > 0), Float(0), Float(c.x)), sharedexp_max);
757 			Float green_c = Min(IfThenElse(!(c.y > 0), Float(0), Float(c.y)), sharedexp_max);
758 			Float blue_c = Min(IfThenElse(!(c.z > 0), Float(0), Float(c.z)), sharedexp_max);
759 
760 			// We're reducing the mantissa to 9 bits, so we must round up if the next
761 			// bit is 1. In other words add 0.5 to the new mantissa's position and
762 			// allow overflow into the exponent so we can scale correctly.
763 			constexpr int half = 1 << (23 - N);
764 			Float red_r = As<Float>(As<Int>(red_c) + half);
765 			Float green_r = As<Float>(As<Int>(green_c) + half);
766 			Float blue_r = As<Float>(As<Int>(blue_c) + half);
767 
768 			// The largest component determines the shared exponent. It can't be lower
769 			// than 0 (after bias subtraction) so also limit to the mimimum representable.
770 			constexpr float min_s = 0.5f / (1 << B);
771 			Float max_s = Max(Max(red_r, green_r), Max(blue_r, min_s));
772 
773 			// Obtain the reciprocal of the shared exponent by inverting the bits,
774 			// and scale by the new mantissa's size. Note that the IEEE-754 single-precision
775 			// format has an implicit leading 1, but this shared component format does not.
776 			Float scale = As<Float>((As<Int>(max_s) & 0x7F800000) ^ 0x7F800000) * (1 << (N - 2));
777 
778 			UInt R9 = RoundInt(red_c * scale);
779 			UInt G9 = UInt(RoundInt(green_c * scale));
780 			UInt B9 = UInt(RoundInt(blue_c * scale));
781 			UInt E5 = (As<UInt>(max_s) >> 23) - 127 + 15 + 1;
782 
783 			UInt E5B9G9R9 = (E5 << 27) | (B9 << 18) | (G9 << 9) | R9;
784 
785 			*Pointer<UInt>(element) = E5B9G9R9;
786 		}
787 		break;
788 	case VK_FORMAT_B8G8R8A8_SNORM:
789 		if(writeB) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.z))); }
790 		if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
791 		if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
792 		if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
793 		break;
794 	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
795 	case VK_FORMAT_R8G8B8A8_SINT:
796 	case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
797 	case VK_FORMAT_R8G8B8A8_SNORM:
798 	case VK_FORMAT_R8G8B8A8_SSCALED:
799 	case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
800 		if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
801 		// [[fallthrough]]
802 	case VK_FORMAT_R8G8B8_SINT:
803 	case VK_FORMAT_R8G8B8_SNORM:
804 	case VK_FORMAT_R8G8B8_SSCALED:
805 		if(writeB) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.z))); }
806 		// [[fallthrough]]
807 	case VK_FORMAT_R8G8_SINT:
808 	case VK_FORMAT_R8G8_SNORM:
809 	case VK_FORMAT_R8G8_SSCALED:
810 		if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
811 		// [[fallthrough]]
812 	case VK_FORMAT_R8_SINT:
813 	case VK_FORMAT_R8_SNORM:
814 	case VK_FORMAT_R8_SSCALED:
815 		if(writeR) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.x))); }
816 		break;
817 	case VK_FORMAT_R8G8B8_UINT:
818 	case VK_FORMAT_R8G8B8_UNORM:
819 	case VK_FORMAT_R8G8B8_USCALED:
820 	case VK_FORMAT_R8G8B8_SRGB:
821 		if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
822 		// [[fallthrough]]
823 	case VK_FORMAT_R8G8_UINT:
824 	case VK_FORMAT_R8G8_UNORM:
825 	case VK_FORMAT_R8G8_USCALED:
826 	case VK_FORMAT_R8G8_SRGB:
827 		if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
828 		// [[fallthrough]]
829 	case VK_FORMAT_R8_UINT:
830 	case VK_FORMAT_R8_UNORM:
831 	case VK_FORMAT_R8_USCALED:
832 	case VK_FORMAT_R8_SRGB:
833 		if(writeR) { *Pointer<Byte>(element) = Byte(RoundInt(Float(c.x))); }
834 		break;
835 	case VK_FORMAT_R16G16B16A16_SINT:
836 	case VK_FORMAT_R16G16B16A16_SNORM:
837 	case VK_FORMAT_R16G16B16A16_SSCALED:
838 		if(writeRGBA)
839 		{
840 			*Pointer<Short4>(element) = Short4(RoundInt(c));
841 		}
842 		else
843 		{
844 			if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
845 			if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
846 			if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
847 			if(writeA) { *Pointer<Short>(element + 6) = Short(RoundInt(Float(c.w))); }
848 		}
849 		break;
850 	case VK_FORMAT_R16G16B16_SINT:
851 	case VK_FORMAT_R16G16B16_SNORM:
852 	case VK_FORMAT_R16G16B16_SSCALED:
853 		if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
854 		if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
855 		if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
856 		break;
857 	case VK_FORMAT_R16G16_SINT:
858 	case VK_FORMAT_R16G16_SNORM:
859 	case VK_FORMAT_R16G16_SSCALED:
860 		if(writeR && writeG)
861 		{
862 			*Pointer<Short2>(element) = Short2(Short4(RoundInt(c)));
863 		}
864 		else
865 		{
866 			if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
867 			if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
868 		}
869 		break;
870 	case VK_FORMAT_R16_SINT:
871 	case VK_FORMAT_R16_SNORM:
872 	case VK_FORMAT_R16_SSCALED:
873 		if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
874 		break;
875 	case VK_FORMAT_R16G16B16A16_UINT:
876 	case VK_FORMAT_R16G16B16A16_UNORM:
877 	case VK_FORMAT_R16G16B16A16_USCALED:
878 		if(writeRGBA)
879 		{
880 			*Pointer<UShort4>(element) = UShort4(RoundInt(c));
881 		}
882 		else
883 		{
884 			if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
885 			if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
886 			if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
887 			if(writeA) { *Pointer<UShort>(element + 6) = UShort(RoundInt(Float(c.w))); }
888 		}
889 		break;
890 	case VK_FORMAT_R16G16B16_UINT:
891 	case VK_FORMAT_R16G16B16_UNORM:
892 	case VK_FORMAT_R16G16B16_USCALED:
893 		if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
894 		if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
895 		if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
896 		break;
897 	case VK_FORMAT_R16G16_UINT:
898 	case VK_FORMAT_R16G16_UNORM:
899 	case VK_FORMAT_R16G16_USCALED:
900 		if(writeR && writeG)
901 		{
902 			*Pointer<UShort2>(element) = UShort2(UShort4(RoundInt(c)));
903 		}
904 		else
905 		{
906 			if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
907 			if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
908 		}
909 		break;
910 	case VK_FORMAT_R16_UINT:
911 	case VK_FORMAT_R16_UNORM:
912 	case VK_FORMAT_R16_USCALED:
913 		if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
914 		break;
915 	case VK_FORMAT_R32G32B32A32_SINT:
916 		if(writeRGBA)
917 		{
918 			*Pointer<Int4>(element) = RoundInt(c);
919 		}
920 		else
921 		{
922 			if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
923 			if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
924 			if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
925 			if(writeA) { *Pointer<Int>(element + 12) = RoundInt(Float(c.w)); }
926 		}
927 		break;
928 	case VK_FORMAT_R32G32B32_SINT:
929 		if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
930 		// [[fallthrough]]
931 	case VK_FORMAT_R32G32_SINT:
932 		if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
933 		// [[fallthrough]]
934 	case VK_FORMAT_R32_SINT:
935 		if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
936 		break;
937 	case VK_FORMAT_R32G32B32A32_UINT:
938 		if(writeRGBA)
939 		{
940 			*Pointer<UInt4>(element) = UInt4(RoundInt(c));
941 		}
942 		else
943 		{
944 			if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
945 			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
946 			if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
947 			if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(RoundInt(Float(c.w))); }
948 		}
949 		break;
950 	case VK_FORMAT_R32G32B32_UINT:
951 		if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
952 		// [[fallthrough]]
953 	case VK_FORMAT_R32G32_UINT:
954 		if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
955 		// [[fallthrough]]
956 	case VK_FORMAT_R32_UINT:
957 		if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
958 		break;
959 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
960 		if(writeR && writeG && writeB)
961 		{
962 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c.xyzz), { 11, 5, 0, 0 }));
963 		}
964 		else
965 		{
966 			unsigned short mask = (writeB ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeR ? 0xF800 : 0x0000);
967 			unsigned short unmask = ~mask;
968 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
969 			                            (UShort(PackFields(RoundInt(c.xyzz), { 11, 5, 0, 0 })) &
970 			                             UShort(mask));
971 		}
972 		break;
973 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
974 		if(writeR && writeG && writeB)
975 		{
976 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c.zyxx), { 11, 5, 0, 0 }));
977 		}
978 		else
979 		{
980 			unsigned short mask = (writeR ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeB ? 0xF800 : 0x0000);
981 			unsigned short unmask = ~mask;
982 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
983 			                            (UShort(PackFields(RoundInt(c.zyxx), { 11, 5, 0, 0 })) &
984 			                             UShort(mask));
985 		}
986 		break;
987 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
988 		if(writeRGBA)
989 		{
990 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 11, 6, 1, 0 }));
991 		}
992 		else
993 		{
994 			unsigned short mask = (writeA ? 0x8000 : 0x0000) |
995 			                      (writeR ? 0x7C00 : 0x0000) |
996 			                      (writeG ? 0x03E0 : 0x0000) |
997 			                      (writeB ? 0x001F : 0x0000);
998 			unsigned short unmask = ~mask;
999 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
1000 			                            (UShort(PackFields(RoundInt(c), { 11, 6, 1, 0 })) &
1001 			                             UShort(mask));
1002 		}
1003 		break;
1004 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1005 		if(writeRGBA)
1006 		{
1007 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 1, 6, 11, 0 }));
1008 		}
1009 		else
1010 		{
1011 			unsigned short mask = (writeA ? 0x8000 : 0x0000) |
1012 			                      (writeR ? 0x7C00 : 0x0000) |
1013 			                      (writeG ? 0x03E0 : 0x0000) |
1014 			                      (writeB ? 0x001F : 0x0000);
1015 			unsigned short unmask = ~mask;
1016 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
1017 			                            (UShort(PackFields(RoundInt(c), { 1, 6, 11, 0 })) &
1018 			                             UShort(mask));
1019 		}
1020 		break;
1021 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1022 		if(writeRGBA)
1023 		{
1024 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 10, 5, 0, 15 }));
1025 		}
1026 		else
1027 		{
1028 			unsigned short mask = (writeA ? 0x8000 : 0x0000) |
1029 			                      (writeR ? 0x7C00 : 0x0000) |
1030 			                      (writeG ? 0x03E0 : 0x0000) |
1031 			                      (writeB ? 0x001F : 0x0000);
1032 			unsigned short unmask = ~mask;
1033 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
1034 			                            (UShort(PackFields(RoundInt(c), { 10, 5, 0, 15 })) &
1035 			                             UShort(mask));
1036 		}
1037 		break;
1038 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1039 	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
1040 	case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
1041 		if(writeRGBA)
1042 		{
1043 			*Pointer<UInt>(element) = As<UInt>(PackFields(RoundInt(c), { 0, 10, 20, 30 }));
1044 		}
1045 		else
1046 		{
1047 			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
1048 			                    (writeB ? 0x3FF00000 : 0x0000) |
1049 			                    (writeG ? 0x000FFC00 : 0x0000) |
1050 			                    (writeR ? 0x000003FF : 0x0000);
1051 			unsigned int unmask = ~mask;
1052 			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
1053 			                          (As<UInt>(PackFields(RoundInt(c), { 0, 10, 20, 30 })) &
1054 			                           UInt(mask));
1055 		}
1056 		break;
1057 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1058 	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
1059 	case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
1060 		if(writeRGBA)
1061 		{
1062 			*Pointer<UInt>(element) = As<UInt>(PackFields(RoundInt(c), { 20, 10, 0, 30 }));
1063 		}
1064 		else
1065 		{
1066 			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
1067 			                    (writeR ? 0x3FF00000 : 0x0000) |
1068 			                    (writeG ? 0x000FFC00 : 0x0000) |
1069 			                    (writeB ? 0x000003FF : 0x0000);
1070 			unsigned int unmask = ~mask;
1071 			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
1072 			                          (As<UInt>(PackFields(RoundInt(c), { 20, 10, 0, 30 })) &
1073 			                           UInt(mask));
1074 		}
1075 		break;
1076 	case VK_FORMAT_D16_UNORM:
1077 		*Pointer<UShort>(element) = UShort(RoundInt(Float(c.x)));
1078 		break;
1079 	case VK_FORMAT_X8_D24_UNORM_PACK32:
1080 		*Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) << 8);
1081 		break;
1082 	case VK_FORMAT_D32_SFLOAT:
1083 		*Pointer<Float>(element) = c.x;
1084 		break;
1085 	case VK_FORMAT_S8_UINT:
1086 		*Pointer<Byte>(element) = Byte(RoundInt(Float(c.x)));
1087 		break;
1088 	default:
1089 		UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
1090 		break;
1091 	}
1092 }
1093 
// Reads a texel of an unnormalized-integer source format at 'element' and
// returns it widened to a signed Int4. Components the format lacks keep the
// default (0, 0, 0, 1). UINT component bits are zero-extended via the
// unsigned pointer reads; SINT components are sign-extended via SByte/Short.
Int4 Blitter::readInt4(Pointer<Byte> element, const State &state)
{
	Int4 c(0, 0, 0, 1);

	switch(state.sourceFormat)
	{
	// 8-bit signed formats: the cases cascade so each format reads only the
	// components it actually stores.
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
	case VK_FORMAT_R8G8B8A8_SINT:
		c = Insert(c, Int(*Pointer<SByte>(element + 3)), 3);
		c = Insert(c, Int(*Pointer<SByte>(element + 2)), 2);
		// [[fallthrough]]
	case VK_FORMAT_R8G8_SINT:
		c = Insert(c, Int(*Pointer<SByte>(element + 1)), 1);
		// [[fallthrough]]
	case VK_FORMAT_R8_SINT:
		c = Insert(c, Int(*Pointer<SByte>(element)), 0);
		break;
	// Packed 10:10:10:2 formats: unpack each field by mask and shift.
	// The two variants differ only in whether bits 0-9 hold R or B.
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000003FF))), 0);
		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10), 1);
		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20), 2);
		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30), 3);
		break;
	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000003FF))), 2);
		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10), 1);
		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20), 0);
		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30), 3);
		break;
	// 8-bit unsigned formats (stencil S8 reads like a single-channel R8).
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
	case VK_FORMAT_R8G8B8A8_UINT:
		c = Insert(c, Int(*Pointer<Byte>(element + 3)), 3);
		c = Insert(c, Int(*Pointer<Byte>(element + 2)), 2);
		// [[fallthrough]]
	case VK_FORMAT_R8G8_UINT:
		c = Insert(c, Int(*Pointer<Byte>(element + 1)), 1);
		// [[fallthrough]]
	case VK_FORMAT_R8_UINT:
	case VK_FORMAT_S8_UINT:
		c = Insert(c, Int(*Pointer<Byte>(element)), 0);
		break;
	// 16-bit signed formats.
	case VK_FORMAT_R16G16B16A16_SINT:
		c = Insert(c, Int(*Pointer<Short>(element + 6)), 3);
		c = Insert(c, Int(*Pointer<Short>(element + 4)), 2);
		// [[fallthrough]]
	case VK_FORMAT_R16G16_SINT:
		c = Insert(c, Int(*Pointer<Short>(element + 2)), 1);
		// [[fallthrough]]
	case VK_FORMAT_R16_SINT:
		c = Insert(c, Int(*Pointer<Short>(element)), 0);
		break;
	// 16-bit unsigned formats.
	case VK_FORMAT_R16G16B16A16_UINT:
		c = Insert(c, Int(*Pointer<UShort>(element + 6)), 3);
		c = Insert(c, Int(*Pointer<UShort>(element + 4)), 2);
		// [[fallthrough]]
	case VK_FORMAT_R16G16_UINT:
		c = Insert(c, Int(*Pointer<UShort>(element + 2)), 1);
		// [[fallthrough]]
	case VK_FORMAT_R16_UINT:
		c = Insert(c, Int(*Pointer<UShort>(element)), 0);
		break;
	// 32-bit formats: SINT and UINT share the raw bit pattern in an Int4.
	case VK_FORMAT_R32G32B32A32_SINT:
	case VK_FORMAT_R32G32B32A32_UINT:
		c = *Pointer<Int4>(element);
		break;
	case VK_FORMAT_R32G32_SINT:
	case VK_FORMAT_R32G32_UINT:
		c = Insert(c, *Pointer<Int>(element + 4), 1);
		// [[fallthrough]]
	case VK_FORMAT_R32_SINT:
	case VK_FORMAT_R32_UINT:
		c = Insert(c, *Pointer<Int>(element), 0);
		break;
	default:
		UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
	}

	return c;
}
1173 
// Writes an unnormalized integer color 'c' to 'element' in the destination
// format, honoring the per-channel color write mask from 'state'. The value
// is first clamped to the destination's representable range (as unsigned for
// UINT/USCALED formats, signed for SINT/SSCALED), then packed and stored.
// Packed formats with a partial write mask do a read-modify-write.
void Blitter::write(Int4 &c, Pointer<Byte> element, const State &state)
{
	bool writeR = state.writeRed;
	bool writeG = state.writeGreen;
	bool writeB = state.writeBlue;
	bool writeA = state.writeAlpha;
	bool writeRGBA = writeR && writeG && writeB && writeA;  // Fast path: store all four channels at once.

	// This integer path performs no signed<->unsigned range conversion, so
	// source and destination must agree on signedness.
	ASSERT(state.sourceFormat.isUnsigned() == state.destFormat.isUnsigned());

	// First pass: clamp 'c' to the destination format's range.
	switch(state.destFormat)
	{
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
		// 10 bits per color channel, 2 bits of alpha.
		c = Min(As<UInt4>(c), UInt4(0x03FF, 0x03FF, 0x03FF, 0x0003));
		break;
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
	case VK_FORMAT_R8G8B8A8_UINT:
	case VK_FORMAT_R8G8B8_UINT:
	case VK_FORMAT_R8G8_UINT:
	case VK_FORMAT_R8_UINT:
	case VK_FORMAT_R8G8B8A8_USCALED:
	case VK_FORMAT_R8G8B8_USCALED:
	case VK_FORMAT_R8G8_USCALED:
	case VK_FORMAT_R8_USCALED:
	case VK_FORMAT_S8_UINT:
		c = Min(As<UInt4>(c), UInt4(0xFF));
		break;
	case VK_FORMAT_R16G16B16A16_UINT:
	case VK_FORMAT_R16G16B16_UINT:
	case VK_FORMAT_R16G16_UINT:
	case VK_FORMAT_R16_UINT:
	case VK_FORMAT_R16G16B16A16_USCALED:
	case VK_FORMAT_R16G16B16_USCALED:
	case VK_FORMAT_R16G16_USCALED:
	case VK_FORMAT_R16_USCALED:
		c = Min(As<UInt4>(c), UInt4(0xFFFF))&#59;
		break;
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
	case VK_FORMAT_R8G8B8A8_SINT:
	case VK_FORMAT_R8G8_SINT:
	case VK_FORMAT_R8_SINT:
	case VK_FORMAT_R8G8B8A8_SSCALED:
	case VK_FORMAT_R8G8B8_SSCALED:
	case VK_FORMAT_R8G8_SSCALED:
	case VK_FORMAT_R8_SSCALED:
		c = Min(Max(c, Int4(-0x80)), Int4(0x7F));
		break;
	case VK_FORMAT_R16G16B16A16_SINT:
	case VK_FORMAT_R16G16B16_SINT:
	case VK_FORMAT_R16G16_SINT:
	case VK_FORMAT_R16_SINT:
	case VK_FORMAT_R16G16B16A16_SSCALED:
	case VK_FORMAT_R16G16B16_SSCALED:
	case VK_FORMAT_R16G16_SSCALED:
	case VK_FORMAT_R16_SSCALED:
		c = Min(Max(c, Int4(-0x8000)), Int4(0x7FFF));
		break;
	default:
		// 32-bit formats store the full Int range; no clamping needed.
		break;
	}

	// Second pass: pack and store. The cases cascade so each format writes
	// only the components it actually stores.
	switch(state.destFormat)
	{
	case VK_FORMAT_B8G8R8A8_SINT:
	case VK_FORMAT_B8G8R8A8_SSCALED:
		if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_B8G8R8_SINT:
	case VK_FORMAT_B8G8R8_SSCALED:
		if(writeB) { *Pointer<SByte>(element) = SByte(Extract(c, 2)); }
		if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
		if(writeR) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 0)); }
		break;
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
	case VK_FORMAT_R8G8B8A8_SINT:
	case VK_FORMAT_R8G8B8A8_SSCALED:
	case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
		if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_R8G8B8_SINT:
	case VK_FORMAT_R8G8B8_SSCALED:
		if(writeB) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R8G8_SINT:
	case VK_FORMAT_R8G8_SSCALED:
		if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R8_SINT:
	case VK_FORMAT_R8_SSCALED:
		if(writeR) { *Pointer<SByte>(element) = SByte(Extract(c, 0)); }
		break;
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
	case VK_FORMAT_A2B10G10R10_SINT_PACK32:
	case VK_FORMAT_A2B10G10R10_USCALED_PACK32:
	case VK_FORMAT_A2B10G10R10_SSCALED_PACK32:
		if(writeRGBA)
		{
			*Pointer<UInt>(element) = As<UInt>(PackFields(c, { 0, 10, 20, 30 }));
		}
		else
		{
			// Partial mask: merge the written fields into the existing texel.
			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
			                    (writeB ? 0x3FF00000 : 0x0000) |
			                    (writeG ? 0x000FFC00 : 0x0000) |
			                    (writeR ? 0x000003FF : 0x0000);
			unsigned int unmask = ~mask;
			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
			                          (As<UInt>(PackFields(c, { 0, 10, 20, 30 })) & UInt(mask));
		}
		break;
	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
	case VK_FORMAT_A2R10G10B10_SINT_PACK32:
	case VK_FORMAT_A2R10G10B10_USCALED_PACK32:
	case VK_FORMAT_A2R10G10B10_SSCALED_PACK32:
		if(writeRGBA)
		{
			*Pointer<UInt>(element) = As<UInt>(PackFields(c, { 20, 10, 0, 30 }));
		}
		else
		{
			// Partial mask: merge the written fields into the existing texel.
			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
			                    (writeR ? 0x3FF00000 : 0x0000) |
			                    (writeG ? 0x000FFC00 : 0x0000) |
			                    (writeB ? 0x000003FF : 0x0000);
			unsigned int unmask = ~mask;
			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
			                          (As<UInt>(PackFields(c, { 20, 10, 0, 30 })) & UInt(mask));
		}
		break;
	case VK_FORMAT_B8G8R8A8_UINT:
	case VK_FORMAT_B8G8R8A8_USCALED:
		if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_B8G8R8_UINT:
	case VK_FORMAT_B8G8R8_USCALED:
	case VK_FORMAT_B8G8R8_SRGB:
		if(writeB) { *Pointer<Byte>(element) = Byte(Extract(c, 2)); }
		if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
		if(writeR) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 0)); }
		break;
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
	case VK_FORMAT_R8G8B8A8_UINT:
	case VK_FORMAT_R8G8B8A8_USCALED:
	case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
		if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_R8G8B8_UINT:
	case VK_FORMAT_R8G8B8_USCALED:
		if(writeB) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R8G8_UINT:
	case VK_FORMAT_R8G8_USCALED:
		if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R8_UINT:
	case VK_FORMAT_R8_USCALED:
	case VK_FORMAT_S8_UINT:
		if(writeR) { *Pointer<Byte>(element) = Byte(Extract(c, 0)); }
		break;
	case VK_FORMAT_R16G16B16A16_SINT:
	case VK_FORMAT_R16G16B16A16_SSCALED:
		if(writeA) { *Pointer<Short>(element + 6) = Short(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_R16G16B16_SINT:
	case VK_FORMAT_R16G16B16_SSCALED:
		if(writeB) { *Pointer<Short>(element + 4) = Short(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R16G16_SINT:
	case VK_FORMAT_R16G16_SSCALED:
		if(writeG) { *Pointer<Short>(element + 2) = Short(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R16_SINT:
	case VK_FORMAT_R16_SSCALED:
		if(writeR) { *Pointer<Short>(element) = Short(Extract(c, 0)); }
		break;
	case VK_FORMAT_R16G16B16A16_UINT:
	case VK_FORMAT_R16G16B16A16_USCALED:
		if(writeA) { *Pointer<UShort>(element + 6) = UShort(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_R16G16B16_UINT:
	case VK_FORMAT_R16G16B16_USCALED:
		if(writeB) { *Pointer<UShort>(element + 4) = UShort(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R16G16_UINT:
	case VK_FORMAT_R16G16_USCALED:
		if(writeG) { *Pointer<UShort>(element + 2) = UShort(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R16_UINT:
	case VK_FORMAT_R16_USCALED:
		if(writeR) { *Pointer<UShort>(element) = UShort(Extract(c, 0)); }
		break;
	case VK_FORMAT_R32G32B32A32_SINT:
		if(writeRGBA)
		{
			*Pointer<Int4>(element) = c;
		}
		else
		{
			if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
			if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
			if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
			if(writeA) { *Pointer<Int>(element + 12) = Extract(c, 3); }
		}
		break;
	case VK_FORMAT_R32G32B32_SINT:
		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
		if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
		if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
		break;
	case VK_FORMAT_R32G32_SINT:
		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
		if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
		break;
	case VK_FORMAT_R32_SINT:
		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
		break;
	case VK_FORMAT_R32G32B32A32_UINT:
		if(writeRGBA)
		{
			*Pointer<UInt4>(element) = As<UInt4>(c);
		}
		else
		{
			if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
			if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
			if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(Extract(c, 3)); }
		}
		break;
	case VK_FORMAT_R32G32B32_UINT:
		if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R32G32_UINT:
		if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R32_UINT:
		if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
		break;
	default:
		UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
	}
}
1417 
// Converts 'value' from the source format's numeric range to the destination
// format's range, applying sRGB encoding/decoding and range clamping as
// needed. 'preScaled' indicates the value was already brought into the
// destination's scale by an earlier call (e.g. per-tap sRGB decode during
// filtering), so only the final conversion steps are applied.
void Blitter::ApplyScaleAndClamp(Float4 &value, const State &state, bool preScaled)
{
	float4 scale{}, unscale{};

	if(state.clearOperation &&
	   state.sourceFormat.isUnnormalizedInteger() &&
	   !state.destFormat.isUnnormalizedInteger())
	{
		// If we're clearing a buffer from an int or uint color into a normalized color,
		// then the whole range of the int or uint color must be scaled between 0 and 1.
		switch(state.sourceFormat)
		{
		case VK_FORMAT_R32G32B32A32_SINT:
			unscale = float4(static_cast<float>(0x7FFFFFFF));
			break;
		case VK_FORMAT_R32G32B32A32_UINT:
			unscale = float4(static_cast<float>(0xFFFFFFFF));
			break;
		default:
			UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
		}
	}
	else
	{
		unscale = state.sourceFormat.getScale();
	}

	scale = state.destFormat.getScale();

	bool srcSRGB = state.sourceFormat.isSRGBformat();
	bool dstSRGB = state.destFormat.isSRGBformat();

	if(state.allowSRGBConversion && ((srcSRGB && !preScaled) || dstSRGB))  // One of the formats is sRGB encoded.
	{
		// sRGB conversion operates on [0, 1] values, so normalize first
		// (dividing by the scale already applied if preScaled), convert,
		// then bring the result into the destination's range.
		value *= preScaled ? Float4(1.0f / scale.x, 1.0f / scale.y, 1.0f / scale.z, 1.0f / scale.w) :  // Unapply scale
		             Float4(1.0f / unscale.x, 1.0f / unscale.y, 1.0f / unscale.z, 1.0f / unscale.w);   // Apply unscale
		value = (srcSRGB && !preScaled) ? sRGBtoLinear(value) : LinearToSRGB(value);
		value *= Float4(scale.x, scale.y, scale.z, scale.w);  // Apply scale
	}
	else if(unscale != scale)
	{
		// No sRGB involved: convert ranges with a single combined factor.
		value *= Float4(scale.x / unscale.x, scale.y / unscale.y, scale.z / unscale.z, scale.w / unscale.w);
	}

	if(state.sourceFormat.isFloatFormat() && !state.destFormat.isFloatFormat())
	{
		// Clamp float sources into the destination's representable range,
		// allowing negative values only for signed destination components.
		value = Min(value, Float4(scale.x, scale.y, scale.z, scale.w));

		value = Max(value, Float4(state.destFormat.isUnsignedComponent(0) ? 0.0f : -scale.x,
		                          state.destFormat.isUnsignedComponent(1) ? 0.0f : -scale.y,
		                          state.destFormat.isUnsignedComponent(2) ? 0.0f : -scale.z,
		                          state.destFormat.isUnsignedComponent(3) ? 0.0f : -scale.w));
	}

	if(!state.sourceFormat.isUnsigned() && state.destFormat.isUnsigned())
	{
		// Signed -> unsigned: negative values clamp to zero.
		value = Max(value, Float4(0.0f));
	}
}
1477 
ComputeOffset(Int & x,Int & y,Int & pitchB,int bytes)1478 Int Blitter::ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes)
1479 {
1480 	return y * pitchB + x * bytes;
1481 }
1482 
ComputeOffset(Int & x,Int & y,Int & z,Int & sliceB,Int & pitchB,int bytes)1483 Int Blitter::ComputeOffset(Int &x, Int &y, Int &z, Int &sliceB, Int &pitchB, int bytes)
1484 {
1485 	return z * sliceB + y * pitchB + x * bytes;
1486 }
1487 
// Encodes the RGB components of a linear-space color into sRGB space;
// the alpha component is passed through unchanged.
Float4 Blitter::LinearToSRGB(const Float4 &c)
{
	// Piecewise sRGB transfer function, evaluated branchlessly: the linear
	// segment is clamped at the crossover point (0.0031308), so taking the
	// per-component Max selects the correct segment everywhere.
	Float4 linearSegment = Min(c, Float4(0.0031308f)) * Float4(12.92f);
	Float4 gammaSegment = Float4(1.055f) * power(c, Float4(1.0f / 2.4f)) - Float4(0.055f);

	Float4 encoded = c;
	encoded.xyz = Max(linearSegment, gammaSegment);

	return encoded;
}
1498 
// Decodes the RGB components of an sRGB-encoded color into linear space;
// the alpha component is passed through unchanged.
Float4 Blitter::sRGBtoLinear(const Float4 &c)
{
	// Both segments of the piecewise transfer function are computed, then a
	// per-component bit-select picks the linear segment below the 0.04045
	// threshold and the gamma segment above it.
	Float4 linearSegment = c * Float4(1.0f / 12.92f);
	Float4 gammaSegment = power((c + Float4(0.055f)) * Float4(1.0f / 1.055f), Float4(2.4f));

	Int4 useLinear = CmpLT(c, Float4(0.04045f));

	Float4 decoded = c;
	decoded.xyz = As<Float4>((useLinear & As<Int4>(linearSegment)) | (~useLinear & As<Int4>(gammaSegment)));  // TODO: IfThenElse()

	return decoded;
}
1511 
sample(Pointer<Byte> & source,Float & x,Float & y,Float & z,Int & sWidth,Int & sHeight,Int & sDepth,Int & sSliceB,Int & sPitchB,const State & state)1512 Float4 Blitter::sample(Pointer<Byte> &source, Float &x, Float &y, Float &z,
1513                        Int &sWidth, Int &sHeight, Int &sDepth,
1514                        Int &sSliceB, Int &sPitchB, const State &state)
1515 {
1516 	bool intSrc = state.sourceFormat.isUnnormalizedInteger();
1517 	int srcBytes = state.sourceFormat.bytes();
1518 
1519 	Float4 color;
1520 
1521 	bool preScaled = false;
1522 	if(!state.filter || intSrc)
1523 	{
1524 		Int X = Int(x);
1525 		Int Y = Int(y);
1526 		Int Z = Int(z);
1527 
1528 		if(state.clampToEdge)
1529 		{
1530 			X = Clamp(X, 0, sWidth - 1);
1531 			Y = Clamp(Y, 0, sHeight - 1);
1532 			Z = Clamp(Z, 0, sDepth - 1);
1533 		}
1534 
1535 		Pointer<Byte> s = source + ComputeOffset(X, Y, Z, sSliceB, sPitchB, srcBytes);
1536 
1537 		color = readFloat4(s, state);
1538 
1539 		if(state.srcSamples > 1)  // Resolve multisampled source
1540 		{
1541 			if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat())  // sRGB -> RGB
1542 			{
1543 				ApplyScaleAndClamp(color, state);
1544 				preScaled = true;
1545 			}
1546 			Float4 accum = color;
1547 			for(int sample = 1; sample < state.srcSamples; sample++)
1548 			{
1549 				s += sSliceB;
1550 				color = readFloat4(s, state);
1551 
1552 				if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat())  // sRGB -> RGB
1553 				{
1554 					ApplyScaleAndClamp(color, state);
1555 					preScaled = true;
1556 				}
1557 				accum += color;
1558 			}
1559 			color = accum * Float4(1.0f / static_cast<float>(state.srcSamples));
1560 		}
1561 	}
1562 	else  // Bilinear filtering
1563 	{
1564 		Float X = x;
1565 		Float Y = y;
1566 		Float Z = z;
1567 
1568 		if(state.clampToEdge)
1569 		{
1570 			X = Min(Max(x, 0.5f), Float(sWidth) - 0.5f);
1571 			Y = Min(Max(y, 0.5f), Float(sHeight) - 0.5f);
1572 			Z = Min(Max(z, 0.5f), Float(sDepth) - 0.5f);
1573 		}
1574 
1575 		Float x0 = X - 0.5f;
1576 		Float y0 = Y - 0.5f;
1577 		Float z0 = Z - 0.5f;
1578 
1579 		Int X0 = Max(Int(x0), 0);
1580 		Int Y0 = Max(Int(y0), 0);
1581 		Int Z0 = Max(Int(z0), 0);
1582 
1583 		Int X1 = X0 + 1;
1584 		Int Y1 = Y0 + 1;
1585 		X1 = IfThenElse(X1 >= sWidth, X0, X1);
1586 		Y1 = IfThenElse(Y1 >= sHeight, Y0, Y1);
1587 
1588 		if(state.filter3D)
1589 		{
1590 			Int Z1 = Z0 + 1;
1591 			Z1 = IfThenElse(Z1 >= sHeight, Z0, Z1);
1592 
1593 			Pointer<Byte> s000 = source + ComputeOffset(X0, Y0, Z0, sSliceB, sPitchB, srcBytes);
1594 			Pointer<Byte> s010 = source + ComputeOffset(X1, Y0, Z0, sSliceB, sPitchB, srcBytes);
1595 			Pointer<Byte> s100 = source + ComputeOffset(X0, Y1, Z0, sSliceB, sPitchB, srcBytes);
1596 			Pointer<Byte> s110 = source + ComputeOffset(X1, Y1, Z0, sSliceB, sPitchB, srcBytes);
1597 			Pointer<Byte> s001 = source + ComputeOffset(X0, Y0, Z1, sSliceB, sPitchB, srcBytes);
1598 			Pointer<Byte> s011 = source + ComputeOffset(X1, Y0, Z1, sSliceB, sPitchB, srcBytes);
1599 			Pointer<Byte> s101 = source + ComputeOffset(X0, Y1, Z1, sSliceB, sPitchB, srcBytes);
1600 			Pointer<Byte> s111 = source + ComputeOffset(X1, Y1, Z1, sSliceB, sPitchB, srcBytes);
1601 
1602 			Float4 c000 = readFloat4(s000, state);
1603 			Float4 c010 = readFloat4(s010, state);
1604 			Float4 c100 = readFloat4(s100, state);
1605 			Float4 c110 = readFloat4(s110, state);
1606 			Float4 c001 = readFloat4(s001, state);
1607 			Float4 c011 = readFloat4(s011, state);
1608 			Float4 c101 = readFloat4(s101, state);
1609 			Float4 c111 = readFloat4(s111, state);
1610 
1611 			if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat())  // sRGB -> RGB
1612 			{
1613 				ApplyScaleAndClamp(c000, state);
1614 				ApplyScaleAndClamp(c010, state);
1615 				ApplyScaleAndClamp(c100, state);
1616 				ApplyScaleAndClamp(c110, state);
1617 				ApplyScaleAndClamp(c001, state);
1618 				ApplyScaleAndClamp(c011, state);
1619 				ApplyScaleAndClamp(c101, state);
1620 				ApplyScaleAndClamp(c111, state);
1621 				preScaled = true;
1622 			}
1623 
1624 			Float4 fx = Float4(x0 - Float(X0));
1625 			Float4 fy = Float4(y0 - Float(Y0));
1626 			Float4 fz = Float4(z0 - Float(Z0));
1627 			Float4 ix = Float4(1.0f) - fx;
1628 			Float4 iy = Float4(1.0f) - fy;
1629 			Float4 iz = Float4(1.0f) - fz;
1630 
1631 			color = ((c000 * ix + c010 * fx) * iy +
1632 			         (c100 * ix + c110 * fx) * fy) *
1633 			            iz +
1634 			        ((c001 * ix + c011 * fx) * iy +
1635 			         (c101 * ix + c111 * fx) * fy) *
1636 			            fz;
1637 		}
1638 		else
1639 		{
1640 			Pointer<Byte> s00 = source + ComputeOffset(X0, Y0, Z0, sSliceB, sPitchB, srcBytes);
1641 			Pointer<Byte> s01 = source + ComputeOffset(X1, Y0, Z0, sSliceB, sPitchB, srcBytes);
1642 			Pointer<Byte> s10 = source + ComputeOffset(X0, Y1, Z0, sSliceB, sPitchB, srcBytes);
1643 			Pointer<Byte> s11 = source + ComputeOffset(X1, Y1, Z0, sSliceB, sPitchB, srcBytes);
1644 
1645 			Float4 c00 = readFloat4(s00, state);
1646 			Float4 c01 = readFloat4(s01, state);
1647 			Float4 c10 = readFloat4(s10, state);
1648 			Float4 c11 = readFloat4(s11, state);
1649 
1650 			if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat())  // sRGB -> RGB
1651 			{
1652 				ApplyScaleAndClamp(c00, state);
1653 				ApplyScaleAndClamp(c01, state);
1654 				ApplyScaleAndClamp(c10, state);
1655 				ApplyScaleAndClamp(c11, state);
1656 				preScaled = true;
1657 			}
1658 
1659 			Float4 fx = Float4(x0 - Float(X0));
1660 			Float4 fy = Float4(y0 - Float(Y0));
1661 			Float4 ix = Float4(1.0f) - fx;
1662 			Float4 iy = Float4(1.0f) - fy;
1663 
1664 			color = (c00 * ix + c01 * fx) * iy +
1665 			        (c10 * ix + c11 * fx) * fy;
1666 		}
1667 	}
1668 
1669 	ApplyScaleAndClamp(color, state, preScaled);
1670 
1671 	return color;
1672 }
1673 
// Generates a JIT-compiled routine for the blit described by `state`. The
// routine takes a BlitData* at run time and loops over the destination box,
// writing either a constant clear color or a sampled/converted source texel.
Blitter::BlitRoutineType Blitter::generate(const State &state)
{
	BlitFunction function;
	{
		Pointer<Byte> blit(function.Arg<0>());

		// Unpack the per-invocation parameters from the BlitData argument.
		Pointer<Byte> source = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData, source));
		Pointer<Byte> dest = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData, dest));
		Int sPitchB = *Pointer<Int>(blit + OFFSET(BlitData, sPitchB));
		Int dPitchB = *Pointer<Int>(blit + OFFSET(BlitData, dPitchB));
		Int sSliceB = *Pointer<Int>(blit + OFFSET(BlitData, sSliceB));
		Int dSliceB = *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));

		// Source-space starting coordinate (x0, y0, z0) and per-destination-texel
		// step (w, h, d) used to map destination texels back into the source.
		Float x0 = *Pointer<Float>(blit + OFFSET(BlitData, x0));
		Float y0 = *Pointer<Float>(blit + OFFSET(BlitData, y0));
		Float z0 = *Pointer<Float>(blit + OFFSET(BlitData, z0));
		Float w = *Pointer<Float>(blit + OFFSET(BlitData, w));
		Float h = *Pointer<Float>(blit + OFFSET(BlitData, h));
		Float d = *Pointer<Float>(blit + OFFSET(BlitData, d));

		// Destination box bounds, in texels.
		Int x0d = *Pointer<Int>(blit + OFFSET(BlitData, x0d));
		Int x1d = *Pointer<Int>(blit + OFFSET(BlitData, x1d));
		Int y0d = *Pointer<Int>(blit + OFFSET(BlitData, y0d));
		Int y1d = *Pointer<Int>(blit + OFFSET(BlitData, y1d));
		Int z0d = *Pointer<Int>(blit + OFFSET(BlitData, z0d));
		Int z1d = *Pointer<Int>(blit + OFFSET(BlitData, z1d));

		// Source image dimensions, used for edge clamping inside sample().
		Int sWidth = *Pointer<Int>(blit + OFFSET(BlitData, sWidth));
		Int sHeight = *Pointer<Int>(blit + OFFSET(BlitData, sHeight));
		Int sDepth = *Pointer<Int>(blit + OFFSET(BlitData, sDepth));

		bool intSrc = state.sourceFormat.isUnnormalizedInteger();
		bool intDst = state.destFormat.isUnnormalizedInteger();
		bool intBoth = intSrc && intDst;
		int srcBytes = state.sourceFormat.bytes();
		int dstBytes = state.destFormat.bytes();

		// For clear operations, `source` points at a single constant texel:
		// read and convert it once here, outside the loops below.
		bool hasConstantColorI = false;
		Int4 constantColorI;
		bool hasConstantColorF = false;
		Float4 constantColorF;
		if(state.clearOperation)
		{
			if(intBoth)  // Integer types
			{
				constantColorI = readInt4(source, state);
				hasConstantColorI = true;
			}
			else
			{
				constantColorF = readFloat4(source, state);
				hasConstantColorF = true;

				ApplyScaleAndClamp(constantColorF, state);
			}
		}

		// Iterate over the destination box: slices (k), rows (j), columns (i).
		For(Int k = z0d, k < z1d, k++)
		{
			// For clears every destination texel maps to the same source texel.
			Float z = state.clearOperation ? RValue<Float>(z0) : z0 + Float(k) * d;
			Pointer<Byte> destSlice = dest + k * dSliceB;

			For(Int j = y0d, j < y1d, j++)
			{
				Float y = state.clearOperation ? RValue<Float>(y0) : y0 + Float(j) * h;
				Pointer<Byte> destLine = destSlice + j * dPitchB;

				For(Int i = x0d, i < x1d, i++)
				{
					Float x = state.clearOperation ? RValue<Float>(x0) : x0 + Float(i) * w;
					// Note: this `d` (destination texel pointer) shadows the outer
					// Float `d` depth-step, which is no longer needed at this point.
					Pointer<Byte> d = destLine + i * dstBytes;

					if(hasConstantColorI)
					{
						// Replicate the constant into every destination sample;
						// successive sample planes are dSliceB bytes apart.
						for(int s = 0; s < state.destSamples; s++)
						{
							write(constantColorI, d, state);

							d += dSliceB;
						}
					}
					else if(hasConstantColorF)
					{
						for(int s = 0; s < state.destSamples; s++)
						{
							write(constantColorF, d, state);

							d += dSliceB;
						}
					}
					else if(intBoth)  // Integer types do not support filtering
					{
						Int X = Int(x);
						Int Y = Int(y);
						Int Z = Int(z);

						if(state.clampToEdge)
						{
							X = Clamp(X, 0, sWidth - 1);
							Y = Clamp(Y, 0, sHeight - 1);
							Z = Clamp(Z, 0, sDepth - 1);
						}

						Pointer<Byte> s = source + ComputeOffset(X, Y, Z, sSliceB, sPitchB, srcBytes);

						// When both formats are true integer types, we don't go to float to avoid losing precision
						// (note: the loop index `s` below shadows the source pointer `s` above).
						Int4 color = readInt4(s, state);
						for(int s = 0; s < state.destSamples; s++)
						{
							write(color, d, state);

							d += dSliceB;
						}
					}
					else
					{
						// General path: sample (with optional filtering) and convert.
						Float4 color = sample(source, x, y, z, sWidth, sHeight, sDepth, sSliceB, sPitchB, state);

						for(int s = 0; s < state.destSamples; s++)
						{
							write(color, d, state);

							d += dSliceB;
						}
					}
				}
			}
		}
	}

	return function("BlitRoutine");
}
1806 
getBlitRoutine(const State & state)1807 Blitter::BlitRoutineType Blitter::getBlitRoutine(const State &state)
1808 {
1809 	marl::lock lock(blitMutex);
1810 	auto blitRoutine = blitCache.lookup(state);
1811 
1812 	if(!blitRoutine)
1813 	{
1814 		blitRoutine = generate(state);
1815 		blitCache.add(state, blitRoutine);
1816 	}
1817 
1818 	return blitRoutine;
1819 }
1820 
getCornerUpdateRoutine(const State & state)1821 Blitter::CornerUpdateRoutineType Blitter::getCornerUpdateRoutine(const State &state)
1822 {
1823 	marl::lock lock(cornerUpdateMutex);
1824 	auto cornerUpdateRoutine = cornerUpdateCache.lookup(state);
1825 
1826 	if(!cornerUpdateRoutine)
1827 	{
1828 		cornerUpdateRoutine = generateCornerUpdate(state);
1829 		cornerUpdateCache.add(state, cornerUpdateRoutine);
1830 	}
1831 
1832 	return cornerUpdateRoutine;
1833 }
1834 
// Performs a (possibly scaled and filtered) blit of `region` from `src` to
// `dst`, by building a BlitData descriptor and dispatching to a cached
// JIT-compiled routine, once per array layer.
void Blitter::blit(const vk::Image *src, vk::Image *dst, VkImageBlit2KHR region, VkFilter filter)
{
	ASSERT(src->getFormat() != VK_FORMAT_UNDEFINED);
	ASSERT(dst->getFormat() != VK_FORMAT_UNDEFINED);

	// Vulkan 1.2 section 18.5. Image Copies with Scaling:
	// "The layerCount member of srcSubresource and dstSubresource must match"
	// "The aspectMask member of srcSubresource and dstSubresource must match"
	ASSERT(region.srcSubresource.layerCount == region.dstSubresource.layerCount);
	ASSERT(region.srcSubresource.aspectMask == region.dstSubresource.aspectMask);

	// Normalize so the destination box has positive extent on every axis; a
	// mirrored blit is preserved by swapping the source offsets along with it.
	if(region.dstOffsets[0].x > region.dstOffsets[1].x)
	{
		std::swap(region.srcOffsets[0].x, region.srcOffsets[1].x);
		std::swap(region.dstOffsets[0].x, region.dstOffsets[1].x);
	}

	if(region.dstOffsets[0].y > region.dstOffsets[1].y)
	{
		std::swap(region.srcOffsets[0].y, region.srcOffsets[1].y);
		std::swap(region.dstOffsets[0].y, region.dstOffsets[1].y);
	}

	if(region.dstOffsets[0].z > region.dstOffsets[1].z)
	{
		std::swap(region.srcOffsets[0].z, region.srcOffsets[1].z);
		std::swap(region.dstOffsets[0].z, region.dstOffsets[1].z);
	}

	VkImageAspectFlagBits srcAspect = static_cast<VkImageAspectFlagBits>(region.srcSubresource.aspectMask);
	VkImageAspectFlagBits dstAspect = static_cast<VkImageAspectFlagBits>(region.dstSubresource.aspectMask);
	VkExtent3D srcExtent = src->getMipLevelExtent(srcAspect, region.srcSubresource.mipLevel);

	// Source texels advanced per destination texel, on each axis.
	float widthRatio = static_cast<float>(region.srcOffsets[1].x - region.srcOffsets[0].x) /
	                   static_cast<float>(region.dstOffsets[1].x - region.dstOffsets[0].x);
	float heightRatio = static_cast<float>(region.srcOffsets[1].y - region.srcOffsets[0].y) /
	                    static_cast<float>(region.dstOffsets[1].y - region.dstOffsets[0].y);
	float depthRatio = static_cast<float>(region.srcOffsets[1].z - region.srcOffsets[0].z) /
	                   static_cast<float>(region.dstOffsets[1].z - region.dstOffsets[0].z);
	// Source-space coordinate of the center of destination texel (0, 0, 0).
	float x0 = region.srcOffsets[0].x + (0.5f - region.dstOffsets[0].x) * widthRatio;
	float y0 = region.srcOffsets[0].y + (0.5f - region.dstOffsets[0].y) * heightRatio;
	float z0 = region.srcOffsets[0].z + (0.5f - region.dstOffsets[0].z) * depthRatio;

	auto srcFormat = src->getFormat(srcAspect);
	auto dstFormat = dst->getFormat(dstAspect);

	bool doFilter = (filter != VK_FILTER_NEAREST);
	bool allowSRGBConversion =
	    doFilter ||
	    (src->getSampleCountFlagBits() > 1) ||
	    (srcFormat.isSRGBformat() != dstFormat.isSRGBformat());

	State state(srcFormat, dstFormat, src->getSampleCountFlagBits(), dst->getSampleCountFlagBits(),
	            Options{ doFilter, allowSRGBConversion });
	// Clamp reads to the source edges whenever the region (or the half-texel
	// filtering footprint) can reach outside the source image.
	state.clampToEdge = (region.srcOffsets[0].x < 0) ||
	                    (region.srcOffsets[0].y < 0) ||
	                    (static_cast<uint32_t>(region.srcOffsets[1].x) > srcExtent.width) ||
	                    (static_cast<uint32_t>(region.srcOffsets[1].y) > srcExtent.height) ||
	                    (doFilter && ((x0 < 0.5f) || (y0 < 0.5f)));
	// 3D filtering is only needed when the blit actually rescales in depth.
	state.filter3D = (region.srcOffsets[1].z - region.srcOffsets[0].z) !=
	                 (region.dstOffsets[1].z - region.dstOffsets[0].z);

	auto blitRoutine = getBlitRoutine(state);
	if(!blitRoutine)
	{
		return;
	}

	// Field order must match the BlitData struct layout (see the trailing comments).
	BlitData data = {
		nullptr,                                                                                 // source
		nullptr,                                                                                 // dest
		assert_cast<uint32_t>(src->rowPitchBytes(srcAspect, region.srcSubresource.mipLevel)),    // sPitchB
		assert_cast<uint32_t>(dst->rowPitchBytes(dstAspect, region.dstSubresource.mipLevel)),    // dPitchB
		assert_cast<uint32_t>(src->slicePitchBytes(srcAspect, region.srcSubresource.mipLevel)),  // sSliceB
		assert_cast<uint32_t>(dst->slicePitchBytes(dstAspect, region.dstSubresource.mipLevel)),  // dSliceB

		x0,
		y0,
		z0,
		widthRatio,
		heightRatio,
		depthRatio,

		region.dstOffsets[0].x,  // x0d
		region.dstOffsets[1].x,  // x1d
		region.dstOffsets[0].y,  // y0d
		region.dstOffsets[1].y,  // y1d
		region.dstOffsets[0].z,  // z0d
		region.dstOffsets[1].z,  // z1d

		static_cast<int>(srcExtent.width),   // sWidth
		static_cast<int>(srcExtent.height),  // sHeight
		static_cast<int>(srcExtent.depth),   // sDepth

		false,  // filter3D
	};

	VkImageSubresource srcSubres = {
		region.srcSubresource.aspectMask,
		region.srcSubresource.mipLevel,
		region.srcSubresource.baseArrayLayer
	};

	VkImageSubresource dstSubres = {
		region.dstSubresource.aspectMask,
		region.dstSubresource.mipLevel,
		region.dstSubresource.baseArrayLayer
	};

	VkImageSubresourceRange dstSubresRange = {
		region.dstSubresource.aspectMask,
		region.dstSubresource.mipLevel,
		1,  // levelCount
		region.dstSubresource.baseArrayLayer,
		region.dstSubresource.layerCount
	};

	// NOTE(review): queries src with the dst range — the layerCounts are
	// asserted equal above so the count matches; confirm this is intentional.
	uint32_t lastLayer = src->getLastLayerIndex(dstSubresRange);

	// Run the routine once per array layer, advancing both subresources in step.
	for(; dstSubres.arrayLayer <= lastLayer; srcSubres.arrayLayer++, dstSubres.arrayLayer++)
	{
		data.source = src->getTexelPointer({ 0, 0, 0 }, srcSubres);
		data.dest = dst->getTexelPointer({ 0, 0, 0 }, dstSubres);

		ASSERT(data.source < src->end());
		ASSERT(data.dest < dst->end());

		blitRoutine(&data);
	}

	dst->contentsChanged(dstSubresRange);
}
1967 
resolveDepth(const vk::ImageView * src,vk::ImageView * dst,const VkSubpassDescriptionDepthStencilResolve & dsrDesc)1968 static void resolveDepth(const vk::ImageView *src, vk::ImageView *dst, const VkSubpassDescriptionDepthStencilResolve &dsrDesc)
1969 {
1970 	if(dsrDesc.depthResolveMode == VK_RESOLVE_MODE_NONE)
1971 	{
1972 		return;
1973 	}
1974 
1975 	vk::Format format = src->getFormat(VK_IMAGE_ASPECT_DEPTH_BIT);
1976 	VkExtent2D extent = src->getMipLevelExtent(0, VK_IMAGE_ASPECT_DEPTH_BIT);
1977 	int width = extent.width;
1978 	int height = extent.height;
1979 	int pitch = src->rowPitchBytes(VK_IMAGE_ASPECT_DEPTH_BIT, 0);
1980 
1981 	// To support other resolve modes, get the slice bytes and get a pointer to each sample plane.
1982 	// Then modify the loop below to include logic for handling each new mode.
1983 	uint8_t *source = (uint8_t *)src->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_DEPTH_BIT, 0, 0);
1984 	uint8_t *dest = (uint8_t *)dst->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_DEPTH_BIT, 0, 0);
1985 
1986 	size_t formatSize = format.bytes();
1987 	// TODO(b/167558951) support other resolve modes.
1988 	ASSERT(dsrDesc.depthResolveMode == VK_RESOLVE_MODE_SAMPLE_ZERO_BIT);
1989 	for(int y = 0; y < height; y++)
1990 	{
1991 		memcpy(dest, source, formatSize * width);
1992 
1993 		source += pitch;
1994 		dest += pitch;
1995 	}
1996 
1997 	dst->contentsChanged(vk::Image::DIRECT_MEMORY_ACCESS);
1998 }
1999 
resolveStencil(const vk::ImageView * src,vk::ImageView * dst,const VkSubpassDescriptionDepthStencilResolve & dsrDesc)2000 static void resolveStencil(const vk::ImageView *src, vk::ImageView *dst, const VkSubpassDescriptionDepthStencilResolve &dsrDesc)
2001 {
2002 	if(dsrDesc.stencilResolveMode == VK_RESOLVE_MODE_NONE)
2003 	{
2004 		return;
2005 	}
2006 
2007 	VkExtent2D extent = src->getMipLevelExtent(0, VK_IMAGE_ASPECT_STENCIL_BIT);
2008 	int width = extent.width;
2009 	int height = extent.height;
2010 	int pitch = src->rowPitchBytes(VK_IMAGE_ASPECT_STENCIL_BIT, 0);
2011 
2012 	// To support other resolve modes, use src->slicePitchBytes() and get a pointer to each sample's slice.
2013 	// Then modify the loop below to include logic for handling each new mode.
2014 	uint8_t *source = reinterpret_cast<uint8_t *>(src->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0));
2015 	uint8_t *dest = reinterpret_cast<uint8_t *>(dst->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0));
2016 
2017 	// TODO(b/167558951) support other resolve modes.
2018 	ASSERT(dsrDesc.stencilResolveMode == VK_RESOLVE_MODE_SAMPLE_ZERO_BIT);
2019 	for(int y = 0; y < height; y++)
2020 	{
2021 		// Stencil is always 8 bits, so the width of the resource we're resolving is
2022 		// the number of bytes in each row we need to copy during for SAMPLE_ZERO
2023 		memcpy(dest, source, width);
2024 
2025 		source += pitch;
2026 		dest += pitch;
2027 	}
2028 
2029 	dst->contentsChanged(vk::Image::DIRECT_MEMORY_ACCESS);
2030 }
2031 
resolveDepthStencil(const vk::ImageView * src,vk::ImageView * dst,const VkSubpassDescriptionDepthStencilResolve & dsrDesc)2032 void Blitter::resolveDepthStencil(const vk::ImageView *src, vk::ImageView *dst, const VkSubpassDescriptionDepthStencilResolve &dsrDesc)
2033 {
2034 	VkImageSubresourceRange srcRange = src->getSubresourceRange();
2035 	VkImageSubresourceRange dstRange = src->getSubresourceRange();
2036 	ASSERT(src->getFormat() == dst->getFormat());
2037 	ASSERT(srcRange.layerCount == 1 && dstRange.layerCount == 1);
2038 	ASSERT(srcRange.aspectMask == dstRange.aspectMask);
2039 
2040 	if(srcRange.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
2041 	{
2042 		resolveDepth(src, dst, dsrDesc);
2043 	}
2044 	if(srcRange.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
2045 	{
2046 		resolveStencil(src, dst, dsrDesc);
2047 	}
2048 }
2049 
resolve(const vk::Image * src,vk::Image * dst,VkImageResolve2KHR region)2050 void Blitter::resolve(const vk::Image *src, vk::Image *dst, VkImageResolve2KHR region)
2051 {
2052 	// "The aspectMask member of srcSubresource and dstSubresource must only contain VK_IMAGE_ASPECT_COLOR_BIT"
2053 	ASSERT(region.srcSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
2054 	ASSERT(region.dstSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
2055 	// "The layerCount member of srcSubresource and dstSubresource must match"
2056 	ASSERT(region.srcSubresource.layerCount == region.dstSubresource.layerCount);
2057 
2058 	// We use this method both for explicit resolves from vkCmdResolveImage, and implicit ones for resolve attachments.
2059 	// - vkCmdResolveImage: "srcImage and dstImage must have been created with the same image format."
2060 	// - VkSubpassDescription: "each resolve attachment that is not VK_ATTACHMENT_UNUSED must have the same VkFormat as its corresponding color attachment."
2061 	ASSERT(src->getFormat() == dst->getFormat());
2062 
2063 	if(fastResolve(src, dst, region))
2064 	{
2065 		return;
2066 	}
2067 
2068 	// Fall back to a generic blit which performs the resolve.
2069 	VkImageBlit2KHR blitRegion;
2070 	blitRegion.sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR;
2071 	blitRegion.pNext = nullptr;
2072 
2073 	blitRegion.srcOffsets[0] = blitRegion.srcOffsets[1] = region.srcOffset;
2074 	blitRegion.srcOffsets[1].x += region.extent.width;
2075 	blitRegion.srcOffsets[1].y += region.extent.height;
2076 	blitRegion.srcOffsets[1].z += region.extent.depth;
2077 
2078 	blitRegion.dstOffsets[0] = blitRegion.dstOffsets[1] = region.dstOffset;
2079 	blitRegion.dstOffsets[1].x += region.extent.width;
2080 	blitRegion.dstOffsets[1].y += region.extent.height;
2081 	blitRegion.dstOffsets[1].z += region.extent.depth;
2082 
2083 	blitRegion.srcSubresource = region.srcSubresource;
2084 	blitRegion.dstSubresource = region.dstSubresource;
2085 
2086 	blit(src, dst, blitRegion, VK_FILTER_NEAREST);
2087 }
2088 
// Computes the per-byte rounding average of two values holding four packed
// 8-bit lanes each, i.e. (a + b + 1) / 2 per lane, without unpacking:
// (x & y) contributes the shared bits, (x ^ y) >> 1 the halved differing bits
// (masked so no bit crosses a lane), and the low differing bit rounds up.
// Matches the result of SSE2's _mm_avg_epu8.
static inline uint32_t averageByte4(uint32_t x, uint32_t y)
{
	const uint32_t diff = x ^ y;

	return (x & y) + ((diff >> 1) & 0x7F7F7F7F) + (diff & 0x01010101);
}
2093 
// Attempts an optimized CPU resolve for the common case: a full-image, 2D,
// single-layer, 4-sample resolve of an 8-bit-per-channel RGBA/BGRA format.
// Returns false when the region doesn't qualify, so the caller can fall back
// to the generic blit path.
bool Blitter::fastResolve(const vk::Image *src, vk::Image *dst, VkImageResolve2KHR region)
{
	// Only whole-image resolves are handled here.
	if(region.dstOffset != VkOffset3D{ 0, 0, 0 })
	{
		return false;
	}

	if(region.srcOffset != VkOffset3D{ 0, 0, 0 })
	{
		return false;
	}

	if(region.srcSubresource.layerCount != 1)
	{
		return false;
	}

	if(region.extent != src->getExtent() ||
	   region.extent != dst->getExtent() ||
	   region.extent.depth != 1)
	{
		return false;
	}

	VkImageSubresource srcSubresource = {
		region.srcSubresource.aspectMask,
		region.srcSubresource.mipLevel,
		region.srcSubresource.baseArrayLayer
	};

	VkImageSubresource dstSubresource = {
		region.dstSubresource.aspectMask,
		region.dstSubresource.mipLevel,
		region.dstSubresource.baseArrayLayer
	};

	VkImageSubresourceRange dstSubresourceRange = {
		region.dstSubresource.aspectMask,
		region.dstSubresource.mipLevel,
		1,  // levelCount
		region.dstSubresource.baseArrayLayer,
		region.dstSubresource.layerCount
	};

	void *source = src->getTexelPointer({ 0, 0, 0 }, srcSubresource);
	uint8_t *dest = reinterpret_cast<uint8_t *>(dst->getTexelPointer({ 0, 0, 0 }, dstSubresource));

	auto format = src->getFormat();
	auto samples = src->getSampleCountFlagBits();
	auto extent = src->getExtent();

	int width = extent.width;
	int height = extent.height;
	int pitch = src->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, region.srcSubresource.mipLevel);
	int slice = src->slicePitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, region.srcSubresource.mipLevel);

	// The four sample planes are stored slice-pitch bytes apart.
	uint8_t *source0 = (uint8_t *)source;
	uint8_t *source1 = source0 + slice;
	uint8_t *source2 = source1 + slice;
	uint8_t *source3 = source2 + slice;

	// maybe_unused: only read inside the x86-only #if block below.
	[[maybe_unused]] const bool SSE2 = CPUID::supportsSSE2();

	if(format == VK_FORMAT_R8G8B8A8_UNORM || format == VK_FORMAT_B8G8R8A8_UNORM || format == VK_FORMAT_A8B8G8R8_UNORM_PACK32)
	{
		if(samples == 4)
		{
			for(int y = 0; y < height; y++)
			{
				int x = 0;

#if defined(__i386__) || defined(__x86_64__)
				if(SSE2)
				{
					// Average 4 pixels (16 bytes) per iteration. Cascading
					// _mm_avg_epu8 (which rounds up) averages the 4 samples.
					for(; (x + 3) < width; x += 4)
					{
						__m128i c0 = _mm_loadu_si128((__m128i *)(source0 + 4 * x));
						__m128i c1 = _mm_loadu_si128((__m128i *)(source1 + 4 * x));
						__m128i c2 = _mm_loadu_si128((__m128i *)(source2 + 4 * x));
						__m128i c3 = _mm_loadu_si128((__m128i *)(source3 + 4 * x));

						c0 = _mm_avg_epu8(c0, c1);
						c2 = _mm_avg_epu8(c2, c3);
						c0 = _mm_avg_epu8(c0, c2);

						_mm_storeu_si128((__m128i *)(dest + 4 * x), c0);
					}
				}
#endif

				// Scalar tail (and non-x86 fallback): same cascaded average
				// per 32-bit pixel via the bit-twiddled averageByte4().
				for(; x < width; x++)
				{
					uint32_t c0 = *(uint32_t *)(source0 + 4 * x);
					uint32_t c1 = *(uint32_t *)(source1 + 4 * x);
					uint32_t c2 = *(uint32_t *)(source2 + 4 * x);
					uint32_t c3 = *(uint32_t *)(source3 + 4 * x);

					uint32_t c01 = averageByte4(c0, c1);
					uint32_t c23 = averageByte4(c2, c3);
					uint32_t c03 = averageByte4(c01, c23);

					*(uint32_t *)(dest + 4 * x) = c03;
				}

				source0 += pitch;
				source1 += pitch;
				source2 += pitch;
				source3 += pitch;
				dest += pitch;

				ASSERT(source0 < src->end());
				ASSERT(source3 < src->end());
				ASSERT(dest < dst->end());
			}
		}
		else
			UNSUPPORTED("Samples: %d", samples);
	}
	else
	{
		return false;
	}

	dst->contentsChanged(dstSubresourceRange);

	return true;
}
2221 
copy(const vk::Image * src,uint8_t * dst,unsigned int dstPitch)2222 void Blitter::copy(const vk::Image *src, uint8_t *dst, unsigned int dstPitch)
2223 {
2224 	VkExtent3D extent = src->getExtent();
2225 	size_t rowBytes = src->getFormat(VK_IMAGE_ASPECT_COLOR_BIT).bytes() * extent.width;
2226 	unsigned int srcPitch = src->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, 0);
2227 	ASSERT(dstPitch >= rowBytes && srcPitch >= rowBytes && src->getMipLevelExtent(VK_IMAGE_ASPECT_COLOR_BIT, 0).height >= extent.height);
2228 
2229 	const uint8_t *s = (uint8_t *)src->getTexelPointer({ 0, 0, 0 }, { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0 });
2230 	uint8_t *d = dst;
2231 
2232 	for(uint32_t y = 0; y < extent.height; y++)
2233 	{
2234 		memcpy(d, s, rowBytes);
2235 
2236 		s += srcPitch;
2237 		d += dstPitch;
2238 	}
2239 }
2240 
computeCubeCorner(Pointer<Byte> & layer,Int & x0,Int & x1,Int & y0,Int & y1,Int & pitchB,const State & state)2241 void Blitter::computeCubeCorner(Pointer<Byte> &layer, Int &x0, Int &x1, Int &y0, Int &y1, Int &pitchB, const State &state)
2242 {
2243 	int bytes = state.sourceFormat.bytes();
2244 
2245 	Float4 c = readFloat4(layer + ComputeOffset(x0, y1, pitchB, bytes), state) +
2246 	           readFloat4(layer + ComputeOffset(x1, y0, pitchB, bytes), state) +
2247 	           readFloat4(layer + ComputeOffset(x1, y1, pitchB, bytes), state);
2248 
2249 	c *= Float4(1.0f / 3.0f);
2250 
2251 	write(c, layer + ComputeOffset(x0, y0, pitchB, bytes), state);
2252 }
2253 
// Generates a JIT-compiled routine that fills the four corner border texels
// of all six faces of a cube map, given a CubeBorderData* at run time.
Blitter::CornerUpdateRoutineType Blitter::generateCornerUpdate(const State &state)
{
	// Reading and writing from/to the same image
	ASSERT(state.sourceFormat == state.destFormat);
	ASSERT(state.srcSamples == state.destSamples);

	// Vulkan 1.2: "If samples is not VK_SAMPLE_COUNT_1_BIT, then imageType must be
	// VK_IMAGE_TYPE_2D, flags must not contain VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT"
	ASSERT(state.srcSamples == 1);

	CornerUpdateFunction function;
	{
		Pointer<Byte> blit(function.Arg<0>());

		// Unpack the CubeBorderData argument: base pointer of the first face,
		// row pitch, per-face layer size, and the face dimension (width == height).
		Pointer<Byte> layers = *Pointer<Pointer<Byte>>(blit + OFFSET(CubeBorderData, layers));
		Int pitchB = *Pointer<Int>(blit + OFFSET(CubeBorderData, pitchB));
		UInt layerSize = *Pointer<Int>(blit + OFFSET(CubeBorderData, layerSize));
		UInt dim = *Pointer<Int>(blit + OFFSET(CubeBorderData, dim));

		// Low Border, Low Pixel, High Border, High Pixel
		Int LB(-1), LP(0), HB(dim), HP(dim - 1);

		// For each face, average the neighbors into all four corner border texels.
		for(int face = 0; face < 6; face++)
		{
			computeCubeCorner(layers, LB, LP, LB, LP, pitchB, state);
			computeCubeCorner(layers, LB, LP, HB, HP, pitchB, state);
			computeCubeCorner(layers, HB, HP, LB, LP, pitchB, state);
			computeCubeCorner(layers, HB, HP, HB, HP, pitchB, state);
			layers = layers + layerSize;
		}
	}

	// NOTE(review): the routine label reuses "BlitRoutine" — presumably only a
	// debug name; confirm whether "CornerUpdateRoutine" was intended.
	return function("BlitRoutine");
}
2288 
// Updates the one-texel border of all six faces of a cube map image so that
// seamless filtering can read across face edges: each face's border rows and
// columns are copied from the adjacent face, then the corners are filled by a
// JIT-compiled averaging routine.
void Blitter::updateBorders(const vk::Image *image, const VkImageSubresource &subresource)
{
	// The six faces live in six consecutive array layers starting at `subresource`.
	ASSERT(image->getArrayLayers() >= (subresource.arrayLayer + 6));

	// From Vulkan 1.1 spec, section 11.5. Image Views:
	// "For cube and cube array image views, the layers of the image view starting
	//  at baseArrayLayer correspond to faces in the order +X, -X, +Y, -Y, +Z, -Z."
	VkImageSubresource posX = subresource;
	VkImageSubresource negX = posX;
	negX.arrayLayer++;
	VkImageSubresource posY = negX;
	posY.arrayLayer++;
	VkImageSubresource negY = posY;
	negY.arrayLayer++;
	VkImageSubresource posZ = negY;
	posZ.arrayLayer++;
	VkImageSubresource negZ = posZ;
	negZ.arrayLayer++;

	// The copies below encode the cube face adjacency: each call copies the
	// named edge of the source face into the named border of the destination
	// face. Do not reorder or "simplify" without re-deriving the adjacency.

	// Copy top / bottom
	copyCubeEdge(image, posX, BOTTOM, negY, RIGHT);
	copyCubeEdge(image, posY, BOTTOM, posZ, TOP);
	copyCubeEdge(image, posZ, BOTTOM, negY, TOP);
	copyCubeEdge(image, negX, BOTTOM, negY, LEFT);
	copyCubeEdge(image, negY, BOTTOM, negZ, BOTTOM);
	copyCubeEdge(image, negZ, BOTTOM, negY, BOTTOM);

	copyCubeEdge(image, posX, TOP, posY, RIGHT);
	copyCubeEdge(image, posY, TOP, negZ, TOP);
	copyCubeEdge(image, posZ, TOP, posY, BOTTOM);
	copyCubeEdge(image, negX, TOP, posY, LEFT);
	copyCubeEdge(image, negY, TOP, posZ, BOTTOM);
	copyCubeEdge(image, negZ, TOP, posY, TOP);

	// Copy left / right
	copyCubeEdge(image, posX, RIGHT, negZ, LEFT);
	copyCubeEdge(image, posY, RIGHT, posX, TOP);
	copyCubeEdge(image, posZ, RIGHT, posX, LEFT);
	copyCubeEdge(image, negX, RIGHT, posZ, LEFT);
	copyCubeEdge(image, negY, RIGHT, posX, BOTTOM);
	copyCubeEdge(image, negZ, RIGHT, negX, LEFT);

	copyCubeEdge(image, posX, LEFT, posZ, RIGHT);
	copyCubeEdge(image, posY, LEFT, negX, TOP);
	copyCubeEdge(image, posZ, LEFT, negX, RIGHT);
	copyCubeEdge(image, negX, LEFT, negZ, RIGHT);
	copyCubeEdge(image, negY, LEFT, negX, BOTTOM);
	copyCubeEdge(image, negZ, LEFT, posX, RIGHT);

	// Compute corner colors
	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresource.aspectMask);
	vk::Format format = image->getFormat(aspect);
	VkSampleCountFlagBits samples = image->getSampleCountFlagBits();
	// NOTE(review): Options{ 0xF } — presumably a full RGBA write mask; confirm
	// against the Options constructor.
	State state(format, format, samples, samples, Options{ 0xF });

	// Vulkan 1.2: "If samples is not VK_SAMPLE_COUNT_1_BIT, then imageType must be
	// VK_IMAGE_TYPE_2D, flags must not contain VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT"
	ASSERT(samples == VK_SAMPLE_COUNT_1_BIT);

	auto cornerUpdateRoutine = getCornerUpdateRoutine(state);
	if(!cornerUpdateRoutine)
	{
		return;
	}

	VkExtent3D extent = image->getMipLevelExtent(aspect, subresource.mipLevel);
	CubeBorderData data = {
		image->getTexelPointer({ 0, 0, 0 }, posX),
		assert_cast<uint32_t>(image->rowPitchBytes(aspect, subresource.mipLevel)),
		assert_cast<uint32_t>(image->getLayerSize(aspect)),
		extent.width
	};
	cornerUpdateRoutine(&data);
}
2363 
// Copies one edge of the source cube face into the border (the texel ring just
// outside the face) of the destination cube face, texel by texel, reversing
// the traversal direction when the faces meet with opposite orientations.
void Blitter::copyCubeEdge(const vk::Image *image,
                           const VkImageSubresource &dstSubresource, Edge dstEdge,
                           const VkImageSubresource &srcSubresource, Edge srcEdge)
{
	ASSERT(srcSubresource.aspectMask == dstSubresource.aspectMask);
	ASSERT(srcSubresource.mipLevel == dstSubresource.mipLevel);
	ASSERT(srcSubresource.arrayLayer != dstSubresource.arrayLayer);

	// Figure out if the edges to be copied in reverse order respectively from one another
	// The copy should be reversed whenever the same edges are contiguous or if we're
	// copying top <-> right or bottom <-> left. This is explained by the layout, which is:
	//
	//      | +y |
	// | -x | +z | +x | -z |
	//      | -y |

	bool reverse = (srcEdge == dstEdge) ||
	               ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
	               ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
	               ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
	               ((srcEdge == LEFT) && (dstEdge == BOTTOM));

	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(srcSubresource.aspectMask);
	int bytes = image->getFormat(aspect).bytes();
	int pitchB = image->rowPitchBytes(aspect, srcSubresource.mipLevel);

	VkExtent3D extent = image->getMipLevelExtent(aspect, srcSubresource.mipLevel);
	int w = extent.width;
	int h = extent.height;
	if(w != h)
	{
		UNSUPPORTED("Cube doesn't have square faces : (%d, %d)", w, h);
	}

	// Src is expressed in the regular [0, width-1], [0, height-1] space
	// Horizontal edges step one texel (bytes) at a time; vertical edges step one row (pitchB).
	bool srcHorizontal = ((srcEdge == TOP) || (srcEdge == BOTTOM));
	int srcDelta = srcHorizontal ? bytes : pitchB;
	VkOffset3D srcOffset = { (srcEdge == RIGHT) ? (w - 1) : 0, (srcEdge == BOTTOM) ? (h - 1) : 0, 0 };

	// Dst contains borders, so it is expressed in the [-1, width], [-1, height] space
	// A reversed copy walks the destination edge backwards (negative delta).
	bool dstHorizontal = ((dstEdge == TOP) || (dstEdge == BOTTOM));
	int dstDelta = (dstHorizontal ? bytes : pitchB) * (reverse ? -1 : 1);
	VkOffset3D dstOffset = { (dstEdge == RIGHT) ? w : -1, (dstEdge == BOTTOM) ? h : -1, 0 };

	// Don't write in the corners
	// (start one texel in from the corner; for a reversed copy, start at the far end).
	if(dstHorizontal)
	{
		dstOffset.x += reverse ? w : 1;
	}
	else
	{
		dstOffset.y += reverse ? h : 1;
	}

	const uint8_t *src = static_cast<const uint8_t *>(image->getTexelPointer(srcOffset, srcSubresource));
	uint8_t *dst = static_cast<uint8_t *>(image->getTexelPointer(dstOffset, dstSubresource));
	ASSERT((src < image->end()) && ((src + (w * srcDelta)) < image->end()));
	ASSERT((dst < image->end()) && ((dst + (w * dstDelta)) < image->end()));

	// Copy the w edge texels one at a time (they are not contiguous in general).
	for(int i = 0; i < w; ++i, dst += dstDelta, src += srcDelta)
	{
		memcpy(dst, src, bytes);
	}
}
2428 
2429 }  // namespace sw
2430