// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "Blitter.hpp"

#include "Pipeline/ShaderCore.hpp"
#include "Reactor/Reactor.hpp"
#include "System/CPUID.hpp"
#include "System/Debug.hpp"
#include "System/Half.hpp"
#include "System/Memory.hpp"
#include "Vulkan/VkBuffer.hpp"
#include "Vulkan/VkImage.hpp"
#include "Vulkan/VkImageView.hpp"

#include <utility>

#if defined(__i386__) || defined(__x86_64__)
#    include <xmmintrin.h>
#    include <emmintrin.h>
#endif

namespace {
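// Packs the four lanes of 'ints' into a single 32-bit word, shifting each
// lane to the bit position given by the matching entry of 'shifts'.
// For example, PackFields(ints, { 12, 8, 4, 0 }) with lanes (1, 2, 3, 0)
// produces 0x1230, i.e. a 4:4:4:4 packed layout.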
rr::RValue<rr::Int> PackFields(rr::Int4 const &ints, const sw::int4 shifts)
{
    return (rr::Int(ints.x) << shifts[0]) |
           (rr::Int(ints.y) << shifts[1]) |
           (rr::Int(ints.z) << shifts[2]) |
           (rr::Int(ints.w) << shifts[3]);
}
}  // namespace

namespace sw {

Blitter::Blitter()
    : blitMutex()
    , blitCache(1024)
    , cornerUpdateMutex()
    , cornerUpdateCache(64)  // We only need one of these per format
{
}

Blitter::~Blitter()
{
}

void Blitter::clear(const void *pixel, vk::Format format, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea)
{
    VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
    vk::Format dstFormat = viewFormat.getAspectFormat(aspect);
    if(dstFormat == VK_FORMAT_UNDEFINED)
    {
        return;
    }

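    // Clear values for normalized formats are clamped to the representable
    // range up front, so packing the value below cannot overflow the format.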
    VkClearColorValue clampedPixel;
    if(viewFormat.isSignedNormalized() || viewFormat.isUnsignedNormalized())
    {
        const float minValue = viewFormat.isSignedNormalized() ? -1.0f : 0.0f;
        memcpy(clampedPixel.float32, pixel, sizeof(VkClearColorValue));
        clampedPixel.float32[0] = sw::clamp(clampedPixel.float32[0], minValue, 1.0f);
        clampedPixel.float32[1] = sw::clamp(clampedPixel.float32[1], minValue, 1.0f);
        clampedPixel.float32[2] = sw::clamp(clampedPixel.float32[2], minValue, 1.0f);
        clampedPixel.float32[3] = sw::clamp(clampedPixel.float32[3], minValue, 1.0f);
        pixel = clampedPixel.float32;
    }

    if(fastClear(pixel, format, dest, dstFormat, subresourceRange, renderArea))
    {
        return;
    }

    State state(format, dstFormat, 1, dest->getSampleCountFlagBits(), Options{ 0xF });
    auto blitRoutine = getBlitRoutine(state);
    if(!blitRoutine)
    {
        return;
    }

    VkImageSubresource subres = {
        subresourceRange.aspectMask,
        subresourceRange.baseMipLevel,
        subresourceRange.baseArrayLayer
    };

    uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
    uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);

    VkRect2D area = { { 0, 0 }, { 0, 0 } };
    if(renderArea)
    {
        ASSERT(subresourceRange.levelCount == 1);
        area = *renderArea;
    }

    for(; subres.mipLevel <= lastMipLevel; subres.mipLevel++)
    {
        VkExtent3D extent = dest->getMipLevelExtent(aspect, subres.mipLevel);
        if(!renderArea)
        {
            area.extent.width = extent.width;
            area.extent.height = extent.height;
        }

        BlitData data = {
            pixel, nullptr,  // source, dest

            assert_cast<uint32_t>(format.bytes()),                                  // sPitchB
            assert_cast<uint32_t>(dest->rowPitchBytes(aspect, subres.mipLevel)),    // dPitchB
            0,                                                                      // sSliceB (unused in clear operations)
            assert_cast<uint32_t>(dest->slicePitchBytes(aspect, subres.mipLevel)),  // dSliceB

            0.5f, 0.5f, 0.5f, 0.0f, 0.0f, 0.0f,  // x0, y0, z0, w, h, d

            area.offset.x, static_cast<int>(area.offset.x + area.extent.width),   // x0d, x1d
            area.offset.y, static_cast<int>(area.offset.y + area.extent.height),  // y0d, y1d
            0, 1,                                                                 // z0d, z1d

            0, 0, 0,  // sWidth, sHeight, sDepth

            false,  // filter3D
        };

        if(renderArea && dest->is3DSlice())
        {
            // Reinterpret layers as depth slices
            subres.arrayLayer = 0;
            for(uint32_t depth = subresourceRange.baseArrayLayer; depth <= lastLayer; depth++)
            {
                data.dest = dest->getTexelPointer({ 0, 0, static_cast<int32_t>(depth) }, subres);
                blitRoutine(&data);
            }
        }
        else
        {
            for(subres.arrayLayer = subresourceRange.baseArrayLayer; subres.arrayLayer <= lastLayer; subres.arrayLayer++)
            {
                for(uint32_t depth = 0; depth < extent.depth; depth++)
                {
                    data.dest = dest->getTexelPointer({ 0, 0, static_cast<int32_t>(depth) }, subres);

                    blitRoutine(&data);
                }
            }
        }
    }
    dest->contentsChanged(subresourceRange);
}

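// fastClear handles the common solid-fill case without a generated routine:
// the clear color is packed into a single texel value once on the CPU (for
// example, opaque red packs to 0xF800 in R5G6B5), then written out with tight
// per-row fill loops. Formats it can't pack fall back to the regular blitter.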
bool Blitter::fastClear(const void *clearValue, vk::Format clearFormat, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea)
{
    if(clearFormat != VK_FORMAT_R32G32B32A32_SFLOAT &&
       clearFormat != VK_FORMAT_D32_SFLOAT &&
       clearFormat != VK_FORMAT_S8_UINT)
    {
        return false;
    }

    union ClearValue
    {
        struct
        {
            float r;
            float g;
            float b;
            float a;
        };

        float rgb[3];

        float d;
        uint32_t d_as_u32;

        uint32_t s;
    };

    const ClearValue &c = *reinterpret_cast<const ClearValue *>(clearValue);

    uint32_t packed = 0;

    VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
    switch(viewFormat)
    {
    case VK_FORMAT_R5G6B5_UNORM_PACK16:
        packed = ((uint16_t)(31 * c.b + 0.5f) << 0) |
                 ((uint16_t)(63 * c.g + 0.5f) << 5) |
                 ((uint16_t)(31 * c.r + 0.5f) << 11);
        break;
    case VK_FORMAT_B5G6R5_UNORM_PACK16:
        packed = ((uint16_t)(31 * c.r + 0.5f) << 0) |
                 ((uint16_t)(63 * c.g + 0.5f) << 5) |
                 ((uint16_t)(31 * c.b + 0.5f) << 11);
        break;
    case VK_FORMAT_A8B8G8R8_UINT_PACK32:
    case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
    case VK_FORMAT_R8G8B8A8_UNORM:
        packed = ((uint32_t)(255 * c.a + 0.5f) << 24) |
                 ((uint32_t)(255 * c.b + 0.5f) << 16) |
                 ((uint32_t)(255 * c.g + 0.5f) << 8) |
                 ((uint32_t)(255 * c.r + 0.5f) << 0);
        break;
    case VK_FORMAT_B8G8R8A8_UNORM:
        packed = ((uint32_t)(255 * c.a + 0.5f) << 24) |
                 ((uint32_t)(255 * c.r + 0.5f) << 16) |
                 ((uint32_t)(255 * c.g + 0.5f) << 8) |
                 ((uint32_t)(255 * c.b + 0.5f) << 0);
        break;
    case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
        packed = R11G11B10F(c.rgb);
        break;
    case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
        packed = RGB9E5(c.rgb);
        break;
    case VK_FORMAT_D32_SFLOAT:
        ASSERT(clearFormat == VK_FORMAT_D32_SFLOAT);
        packed = c.d_as_u32;  // float reinterpreted as uint32
        break;
    case VK_FORMAT_S8_UINT:
        ASSERT(clearFormat == VK_FORMAT_S8_UINT);
        packed = static_cast<uint8_t>(c.s);
        break;
    default:
        return false;
    }

    VkImageSubresource subres = {
        subresourceRange.aspectMask,
        subresourceRange.baseMipLevel,
        subresourceRange.baseArrayLayer
    };
    uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
    uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);

    VkRect2D area = { { 0, 0 }, { 0, 0 } };
    if(renderArea)
    {
        ASSERT(subresourceRange.levelCount == 1);
        area = *renderArea;
    }

    for(; subres.mipLevel <= lastMipLevel; subres.mipLevel++)
    {
        int rowPitchBytes = dest->rowPitchBytes(aspect, subres.mipLevel);
        int slicePitchBytes = dest->slicePitchBytes(aspect, subres.mipLevel);
        VkExtent3D extent = dest->getMipLevelExtent(aspect, subres.mipLevel);
        if(!renderArea)
        {
            area.extent.width = extent.width;
            area.extent.height = extent.height;
        }
        if(dest->is3DSlice())
        {
            extent.depth = 1;  // The 3D image is instead interpreted as a 2D image with layers
        }

        for(subres.arrayLayer = subresourceRange.baseArrayLayer; subres.arrayLayer <= lastLayer; subres.arrayLayer++)
        {
            for(uint32_t depth = 0; depth < extent.depth; depth++)
            {
                uint8_t *slice = (uint8_t *)dest->getTexelPointer(
                    { area.offset.x, area.offset.y, static_cast<int32_t>(depth) }, subres);

                for(int j = 0; j < dest->getSampleCountFlagBits(); j++)
                {
                    uint8_t *d = slice;

                    switch(viewFormat.bytes())
                    {
                    case 4:
                        for(uint32_t i = 0; i < area.extent.height; i++)
                        {
                            ASSERT(d < dest->end());
                            sw::clear((uint32_t *)d, packed, area.extent.width);
                            d += rowPitchBytes;
                        }
                        break;
                    case 2:
                        for(uint32_t i = 0; i < area.extent.height; i++)
                        {
                            ASSERT(d < dest->end());
                            sw::clear((uint16_t *)d, static_cast<uint16_t>(packed), area.extent.width);
                            d += rowPitchBytes;
                        }
                        break;
                    case 1:
                        for(uint32_t i = 0; i < area.extent.height; i++)
                        {
                            ASSERT(d < dest->end());
                            memset(d, packed, area.extent.width);
                            d += rowPitchBytes;
                        }
                        break;
                    default:
                        assert(false);
                    }

                    slice += slicePitchBytes;
                }
            }
        }
    }
    dest->contentsChanged(subresourceRange);

    return true;
}

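// Reads one texel as four floats. Components are returned in their raw
// (unnormalized) range; normalization happens later in ApplyScaleAndClamp().
// For formats without an alpha channel, w receives the format's maximum
// component value so that scaling to the destination range yields opaque alpha.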
Float4 Blitter::readFloat4(Pointer<Byte> element, const State &state)
{
    Float4 c(0.0f, 0.0f, 0.0f, 1.0f);

    switch(state.sourceFormat)
    {
    case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
        c.w = Float(Int(*Pointer<Byte>(element)) & Int(0xF));
        c.x = Float((Int(*Pointer<Byte>(element)) >> 4) & Int(0xF));
        c.y = Float(Int(*Pointer<Byte>(element + 1)) & Int(0xF));
        c.z = Float((Int(*Pointer<Byte>(element + 1)) >> 4) & Int(0xF));
        break;
    case VK_FORMAT_R8_SINT:
    case VK_FORMAT_R8_SNORM:
        c.x = Float(Int(*Pointer<SByte>(element)));
        c.w = float(0x7F);
        break;
    case VK_FORMAT_R8_UNORM:
    case VK_FORMAT_R8_UINT:
    case VK_FORMAT_R8_SRGB:
        c.x = Float(Int(*Pointer<Byte>(element)));
        c.w = float(0xFF);
        break;
    case VK_FORMAT_R16_SINT:
    case VK_FORMAT_R16_SNORM:
        c.x = Float(Int(*Pointer<Short>(element)));
        c.w = float(0x7FFF);
        break;
    case VK_FORMAT_R16_UNORM:
    case VK_FORMAT_R16_UINT:
        c.x = Float(Int(*Pointer<UShort>(element)));
        c.w = float(0xFFFF);
        break;
    case VK_FORMAT_R32_SINT:
        c.x = Float(*Pointer<Int>(element));
        c.w = float(0x7FFFFFFF);
        break;
    case VK_FORMAT_R32_UINT:
        c.x = Float(*Pointer<UInt>(element));
        c.w = float(0xFFFFFFFF);
        break;
    case VK_FORMAT_B8G8R8A8_SRGB:
    case VK_FORMAT_B8G8R8A8_UNORM:
        c = Float4(*Pointer<Byte4>(element)).zyxw;
        break;
    case VK_FORMAT_A8B8G8R8_SINT_PACK32:
    case VK_FORMAT_R8G8B8A8_SINT:
    case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
    case VK_FORMAT_R8G8B8A8_SNORM:
        c = Float4(*Pointer<SByte4>(element));
        break;
    case VK_FORMAT_A8B8G8R8_UINT_PACK32:
    case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
    case VK_FORMAT_R8G8B8A8_UNORM:
    case VK_FORMAT_R8G8B8A8_UINT:
    case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
    case VK_FORMAT_R8G8B8A8_SRGB:
        c = Float4(*Pointer<Byte4>(element));
        break;
    case VK_FORMAT_R16G16B16A16_SINT:
    case VK_FORMAT_R16G16B16A16_SNORM:
        c = Float4(*Pointer<Short4>(element));
        break;
    case VK_FORMAT_R16G16B16A16_UNORM:
    case VK_FORMAT_R16G16B16A16_UINT:
        c = Float4(*Pointer<UShort4>(element));
        break;
    case VK_FORMAT_R32G32B32A32_SINT:
        c = Float4(*Pointer<Int4>(element));
        break;
    case VK_FORMAT_R32G32B32A32_UINT:
        c = Float4(*Pointer<UInt4>(element));
        break;
    case VK_FORMAT_R8G8_SINT:
    case VK_FORMAT_R8G8_SNORM:
        c.x = Float(Int(*Pointer<SByte>(element + 0)));
        c.y = Float(Int(*Pointer<SByte>(element + 1)));
        c.w = float(0x7F);
        break;
    case VK_FORMAT_R8G8_UNORM:
    case VK_FORMAT_R8G8_UINT:
    case VK_FORMAT_R8G8_SRGB:
        c.x = Float(Int(*Pointer<Byte>(element + 0)));
        c.y = Float(Int(*Pointer<Byte>(element + 1)));
        c.w = float(0xFF);
        break;
    case VK_FORMAT_R16G16_SINT:
    case VK_FORMAT_R16G16_SNORM:
        c.x = Float(Int(*Pointer<Short>(element + 0)));
        c.y = Float(Int(*Pointer<Short>(element + 2)));
        c.w = float(0x7FFF);
        break;
    case VK_FORMAT_R16G16_UNORM:
    case VK_FORMAT_R16G16_UINT:
        c.x = Float(Int(*Pointer<UShort>(element + 0)));
        c.y = Float(Int(*Pointer<UShort>(element + 2)));
        c.w = float(0xFFFF);
        break;
    case VK_FORMAT_R32G32_SINT:
        c.x = Float(*Pointer<Int>(element + 0));
        c.y = Float(*Pointer<Int>(element + 4));
        c.w = float(0x7FFFFFFF);
        break;
    case VK_FORMAT_R32G32_UINT:
        c.x = Float(*Pointer<UInt>(element + 0));
        c.y = Float(*Pointer<UInt>(element + 4));
        c.w = float(0xFFFFFFFF);
        break;
    case VK_FORMAT_R32G32B32A32_SFLOAT:
        c = *Pointer<Float4>(element);
        break;
    case VK_FORMAT_R32G32_SFLOAT:
        c.x = *Pointer<Float>(element + 0);
        c.y = *Pointer<Float>(element + 4);
        break;
    case VK_FORMAT_R32_SFLOAT:
        c.x = *Pointer<Float>(element);
        break;
    case VK_FORMAT_R16G16B16A16_SFLOAT:
        c.w = Float(*Pointer<Half>(element + 6));
        // [[fallthrough]]
    case VK_FORMAT_R16G16B16_SFLOAT:
        c.z = Float(*Pointer<Half>(element + 4));
        // [[fallthrough]]
    case VK_FORMAT_R16G16_SFLOAT:
        c.y = Float(*Pointer<Half>(element + 2));
        // [[fallthrough]]
    case VK_FORMAT_R16_SFLOAT:
        c.x = Float(*Pointer<Half>(element));
        break;
    case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
        c = r11g11b10Unpack(*Pointer<UInt>(element));
        break;
    case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
        // This type contains a common 5 bit exponent (E) and a 9 bit mantissa for each of R, G and B.
        c.x = Float(*Pointer<UInt>(element) & UInt(0x000001FF));          // R's mantissa (bits 0-8)
        c.y = Float((*Pointer<UInt>(element) & UInt(0x0003FE00)) >> 9);   // G's mantissa (bits 9-17)
        c.z = Float((*Pointer<UInt>(element) & UInt(0x07FC0000)) >> 18);  // B's mantissa (bits 18-26)
        c *= Float4(
            // 2^E, using the exponent (bits 27-31) and treating it as an unsigned integer value
            Float(UInt(1) << ((*Pointer<UInt>(element) & UInt(0xF8000000)) >> 27)) *
            // Since the 9 bit mantissa values currently stored in RGB were converted straight
            // from int to float (in the [0, 1<<9] range instead of the [0, 1] range), they
            // are (1 << 9) times too high.
            // Also, the exponent has 5 bits and we compute the exponent bias of floating point
            // formats using "2^(k-1) - 1", so, in this case, the exponent bias is 2^(5-1)-1 = 15
            // Exponent bias (15) + number of mantissa bits per component (9) = 24
            Float(1.0f / (1 << 24)));
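        // Worked example: an exponent field of 15 (the bias value) with a red
        // mantissa of 0x100 decodes to 256 * 2^15 / 2^24 = 256 / 512 = 0.5.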
        c.w = 1.0f;
        break;
    case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
        c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
        c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
        c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
        c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
        break;
    case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
        c.w = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
        c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
        c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
        c.x = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
        break;
    case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
        c.w = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
        c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
        c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
        c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
        break;
    case VK_FORMAT_R5G6B5_UNORM_PACK16:
        c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
        c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));
        c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
        break;
    case VK_FORMAT_B5G6R5_UNORM_PACK16:
        c.z = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
        c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));
        c.x = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
        break;
    case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
        c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
        c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07C0)) >> UShort(6)));
        c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x003E)) >> UShort(1)));
        c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x0001)));
        break;
    case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
        c.z = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
        c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07C0)) >> UShort(6)));
        c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x003E)) >> UShort(1)));
        c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x0001)));
        break;
    case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
        c.w = Float(Int((*Pointer<UShort>(element) & UShort(0x8000)) >> UShort(15)));
        c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x7C00)) >> UShort(10)));
        c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x03E0)) >> UShort(5)));
        c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
        break;
    case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
    case VK_FORMAT_A2B10G10R10_UINT_PACK32:
        c.x = Float(Int((*Pointer<UInt>(element) & UInt(0x000003FF))));
        c.y = Float(Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10));
        c.z = Float(Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20));
        c.w = Float(Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30));
        break;
    case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
    case VK_FORMAT_A2R10G10B10_UINT_PACK32:
        c.z = Float(Int((*Pointer<UInt>(element) & UInt(0x000003FF))));
        c.y = Float(Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10));
        c.x = Float(Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20));
        c.w = Float(Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30));
        break;
    case VK_FORMAT_D16_UNORM:
        c.x = Float(Int((*Pointer<UShort>(element))));
        break;
    case VK_FORMAT_X8_D24_UNORM_PACK32:
        c.x = Float(Int((*Pointer<UInt>(element) & UInt(0xFFFFFF00)) >> 8));
        break;
    case VK_FORMAT_D32_SFLOAT:
        c.x = *Pointer<Float>(element);
        break;
    case VK_FORMAT_S8_UINT:
        c.x = Float(Int(*Pointer<Byte>(element)));
        break;
    default:
        UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
    }

    return c;
}

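// Writes one texel from four floats, honoring the per-channel write mask:
// when all written channels are enabled, packed formats get a single store;
// partial masks fall back to per-component stores or a read-modify-write of
// the packed word.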
void Blitter::write(Float4 &c, Pointer<Byte> element, const State &state)
{
    bool writeR = state.writeRed;
    bool writeG = state.writeGreen;
    bool writeB = state.writeBlue;
    bool writeA = state.writeAlpha;
    bool writeRGBA = writeR && writeG && writeB && writeA;

    switch(state.destFormat)
    {
    case VK_FORMAT_R4G4_UNORM_PACK8:
        if(writeR || writeG)
        {
            if(!writeR)
            {
                *Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
                                          (*Pointer<Byte>(element) & Byte(0xF0));
            }
            else if(!writeG)
            {
                *Pointer<Byte>(element) = (*Pointer<Byte>(element) & Byte(0xF)) |
                                          (Byte(RoundInt(Float(c.x))) << Byte(4));
            }
            else
            {
                *Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
                                          (Byte(RoundInt(Float(c.x))) << Byte(4));
            }
        }
        break;
    case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
        if(writeRGBA)
        {
            *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 12, 8, 4, 0 }));
        }
        else
        {
            unsigned short mask = (writeA ? 0x000F : 0x0000) |
                                  (writeB ? 0x00F0 : 0x0000) |
                                  (writeG ? 0x0F00 : 0x0000) |
                                  (writeR ? 0xF000 : 0x0000);
            unsigned short unmask = ~mask;
            *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
                                        (UShort(PackFields(RoundInt(c) & Int4(0xF), { 12, 8, 4, 0 })) & UShort(mask));
        }
        break;
    case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
        if(writeRGBA)
        {
            *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 4, 8, 12, 0 }));
        }
        else
        {
            unsigned short mask = (writeA ? 0x000F : 0x0000) |
                                  (writeR ? 0x00F0 : 0x0000) |
                                  (writeG ? 0x0F00 : 0x0000) |
                                  (writeB ? 0xF000 : 0x0000);
            unsigned short unmask = ~mask;
            *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
                                        (UShort(PackFields(RoundInt(c) & Int4(0xF), { 4, 8, 12, 0 })) & UShort(mask));
        }
        break;
    case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
        if(writeRGBA)
        {
            *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 8, 4, 0, 12 }));
        }
        else
        {
            unsigned short mask = (writeB ? 0x000F : 0x0000) |
                                  (writeG ? 0x00F0 : 0x0000) |
                                  (writeR ? 0x0F00 : 0x0000) |
                                  (writeA ? 0xF000 : 0x0000);
            unsigned short unmask = ~mask;
            *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
                                        (UShort(PackFields(RoundInt(c) & Int4(0xF), { 8, 4, 0, 12 })) & UShort(mask));
        }
        break;
    case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
        if(writeRGBA)
        {
            *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 0, 4, 8, 12 }));
        }
        else
        {
            unsigned short mask = (writeR ? 0x000F : 0x0000) |
                                  (writeG ? 0x00F0 : 0x0000) |
                                  (writeB ? 0x0F00 : 0x0000) |
                                  (writeA ? 0xF000 : 0x0000);
            unsigned short unmask = ~mask;
            *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
                                        (UShort(PackFields(RoundInt(c) & Int4(0xF), { 0, 4, 8, 12 })) & UShort(mask));
        }
        break;
    case VK_FORMAT_B8G8R8A8_SRGB:
    case VK_FORMAT_B8G8R8A8_UNORM:
        if(writeRGBA)
        {
            Short4 c0 = RoundShort4(c.zyxw);
            *Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
        }
        else
        {
            if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
            if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
            if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
            if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
        }
        break;
    case VK_FORMAT_B8G8R8_SNORM:
        if(writeB) { *Pointer<SByte>(element + 0) = SByte(RoundInt(Float(c.z))); }
        if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
        if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
        break;
    case VK_FORMAT_B8G8R8_UNORM:
    case VK_FORMAT_B8G8R8_SRGB:
        if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
        if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
        if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
        break;
    case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
    case VK_FORMAT_R8G8B8A8_UNORM:
    case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
    case VK_FORMAT_R8G8B8A8_SRGB:
    case VK_FORMAT_A8B8G8R8_UINT_PACK32:
    case VK_FORMAT_R8G8B8A8_UINT:
    case VK_FORMAT_R8G8B8A8_USCALED:
    case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
        if(writeRGBA)
        {
            Short4 c0 = RoundShort4(c);
            *Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
        }
        else
        {
            if(writeR) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.x))); }
            if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
            if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
            if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
        }
        break;
    case VK_FORMAT_R32G32B32A32_SFLOAT:
        if(writeRGBA)
        {
            *Pointer<Float4>(element) = c;
        }
        else
        {
            if(writeR) { *Pointer<Float>(element) = c.x; }
            if(writeG) { *Pointer<Float>(element + 4) = c.y; }
            if(writeB) { *Pointer<Float>(element + 8) = c.z; }
            if(writeA) { *Pointer<Float>(element + 12) = c.w; }
        }
        break;
    case VK_FORMAT_R32G32B32_SFLOAT:
        if(writeR) { *Pointer<Float>(element) = c.x; }
        if(writeG) { *Pointer<Float>(element + 4) = c.y; }
        if(writeB) { *Pointer<Float>(element + 8) = c.z; }
        break;
    case VK_FORMAT_R32G32_SFLOAT:
        if(writeR && writeG)
        {
            *Pointer<Float2>(element) = Float2(c);
        }
        else
        {
            if(writeR) { *Pointer<Float>(element) = c.x; }
            if(writeG) { *Pointer<Float>(element + 4) = c.y; }
        }
        break;
    case VK_FORMAT_R32_SFLOAT:
        if(writeR) { *Pointer<Float>(element) = c.x; }
        break;
    case VK_FORMAT_R16G16B16A16_SFLOAT:
        if(writeA) { *Pointer<Half>(element + 6) = Half(c.w); }
        // [[fallthrough]]
    case VK_FORMAT_R16G16B16_SFLOAT:
        if(writeB) { *Pointer<Half>(element + 4) = Half(c.z); }
        // [[fallthrough]]
    case VK_FORMAT_R16G16_SFLOAT:
        if(writeG) { *Pointer<Half>(element + 2) = Half(c.y); }
        // [[fallthrough]]
    case VK_FORMAT_R16_SFLOAT:
        if(writeR) { *Pointer<Half>(element) = Half(c.x); }
        break;
    case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
    {
        UInt rgb = r11g11b10Pack(c);

        UInt old = *Pointer<UInt>(element);

        unsigned int mask = (writeR ? 0x000007FF : 0) |
                            (writeG ? 0x003FF800 : 0) |
                            (writeB ? 0xFFC00000 : 0);

        *Pointer<UInt>(element) = (rgb & mask) | (old & ~mask);
    }
    break;
    case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
    {
        ASSERT(writeRGBA);  // Can't sensibly write just part of this format.

        // Vulkan 1.1.117 section 15.2.1 RGB to Shared Exponent Conversion

        constexpr int N = 9;       // number of mantissa bits per component
        constexpr int B = 15;      // exponent bias
        constexpr int E_max = 31;  // maximum possible biased exponent value

        // Maximum representable value.
        constexpr float sharedexp_max = ((static_cast<float>(1 << N) - 1) / static_cast<float>(1 << N)) * static_cast<float>(1 << (E_max - B));

        // Clamp components to valid range. NaN becomes 0.
        Float red_c = Min(IfThenElse(!(c.x > 0), Float(0), Float(c.x)), sharedexp_max);
        Float green_c = Min(IfThenElse(!(c.y > 0), Float(0), Float(c.y)), sharedexp_max);
        Float blue_c = Min(IfThenElse(!(c.z > 0), Float(0), Float(c.z)), sharedexp_max);

        // We're reducing the mantissa to 9 bits, so we must round up if the next
        // bit is 1. In other words add 0.5 to the new mantissa's position and
        // allow overflow into the exponent so we can scale correctly.
        constexpr int half = 1 << (23 - N);
        Float red_r = As<Float>(As<Int>(red_c) + half);
        Float green_r = As<Float>(As<Int>(green_c) + half);
        Float blue_r = As<Float>(As<Int>(blue_c) + half);

        // The largest component determines the shared exponent. It can't be lower
        // than 0 (after bias subtraction) so also limit to the minimum representable.
        constexpr float min_s = 0.5f / (1 << B);
        Float max_s = Max(Max(red_r, green_r), Max(blue_r, min_s));

        // Obtain the reciprocal of the shared exponent by inverting the bits,
        // and scale by the new mantissa's size. Note that the IEEE-754 single-precision
        // format has an implicit leading 1, but this shared component format does not.
        Float scale = As<Float>((As<Int>(max_s) & 0x7F800000) ^ 0x7F800000) * (1 << (N - 2));

        UInt R9 = UInt(RoundInt(red_c * scale));
        UInt G9 = UInt(RoundInt(green_c * scale));
        UInt B9 = UInt(RoundInt(blue_c * scale));
        UInt E5 = (As<UInt>(max_s) >> 23) - 127 + 15 + 1;

        UInt E5B9G9R9 = (E5 << 27) | (B9 << 18) | (G9 << 9) | R9;

        *Pointer<UInt>(element) = E5B9G9R9;
    }
    break;
    case VK_FORMAT_B8G8R8A8_SNORM:
        if(writeB) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.z))); }
        if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
        if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
        if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
        break;
    case VK_FORMAT_A8B8G8R8_SINT_PACK32:
    case VK_FORMAT_R8G8B8A8_SINT:
    case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
    case VK_FORMAT_R8G8B8A8_SNORM:
    case VK_FORMAT_R8G8B8A8_SSCALED:
    case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
        if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
        // [[fallthrough]]
    case VK_FORMAT_R8G8B8_SINT:
    case VK_FORMAT_R8G8B8_SNORM:
    case VK_FORMAT_R8G8B8_SSCALED:
        if(writeB) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.z))); }
        // [[fallthrough]]
    case VK_FORMAT_R8G8_SINT:
    case VK_FORMAT_R8G8_SNORM:
    case VK_FORMAT_R8G8_SSCALED:
        if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
        // [[fallthrough]]
    case VK_FORMAT_R8_SINT:
    case VK_FORMAT_R8_SNORM:
    case VK_FORMAT_R8_SSCALED:
        if(writeR) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.x))); }
        break;
    case VK_FORMAT_R8G8B8_UINT:
    case VK_FORMAT_R8G8B8_UNORM:
    case VK_FORMAT_R8G8B8_USCALED:
    case VK_FORMAT_R8G8B8_SRGB:
        if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
        // [[fallthrough]]
    case VK_FORMAT_R8G8_UINT:
    case VK_FORMAT_R8G8_UNORM:
    case VK_FORMAT_R8G8_USCALED:
    case VK_FORMAT_R8G8_SRGB:
        if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
        // [[fallthrough]]
    case VK_FORMAT_R8_UINT:
    case VK_FORMAT_R8_UNORM:
    case VK_FORMAT_R8_USCALED:
    case VK_FORMAT_R8_SRGB:
        if(writeR) { *Pointer<Byte>(element) = Byte(RoundInt(Float(c.x))); }
        break;
    case VK_FORMAT_R16G16B16A16_SINT:
    case VK_FORMAT_R16G16B16A16_SNORM:
    case VK_FORMAT_R16G16B16A16_SSCALED:
        if(writeRGBA)
        {
            *Pointer<Short4>(element) = Short4(RoundInt(c));
        }
        else
        {
            if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
            if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
            if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
            if(writeA) { *Pointer<Short>(element + 6) = Short(RoundInt(Float(c.w))); }
        }
        break;
    case VK_FORMAT_R16G16B16_SINT:
    case VK_FORMAT_R16G16B16_SNORM:
    case VK_FORMAT_R16G16B16_SSCALED:
        if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
        if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
        if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
        break;
    case VK_FORMAT_R16G16_SINT:
    case VK_FORMAT_R16G16_SNORM:
    case VK_FORMAT_R16G16_SSCALED:
        if(writeR && writeG)
        {
            *Pointer<Short2>(element) = Short2(Short4(RoundInt(c)));
        }
        else
        {
            if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
            if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
        }
        break;
    case VK_FORMAT_R16_SINT:
    case VK_FORMAT_R16_SNORM:
    case VK_FORMAT_R16_SSCALED:
        if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
        break;
    case VK_FORMAT_R16G16B16A16_UINT:
    case VK_FORMAT_R16G16B16A16_UNORM:
    case VK_FORMAT_R16G16B16A16_USCALED:
        if(writeRGBA)
        {
            *Pointer<UShort4>(element) = UShort4(RoundInt(c));
        }
        else
        {
            if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
            if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
            if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
            if(writeA) { *Pointer<UShort>(element + 6) = UShort(RoundInt(Float(c.w))); }
        }
        break;
    case VK_FORMAT_R16G16B16_UINT:
    case VK_FORMAT_R16G16B16_UNORM:
    case VK_FORMAT_R16G16B16_USCALED:
        if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
        if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
        if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
        break;
    case VK_FORMAT_R16G16_UINT:
    case VK_FORMAT_R16G16_UNORM:
    case VK_FORMAT_R16G16_USCALED:
        if(writeR && writeG)
        {
            *Pointer<UShort2>(element) = UShort2(UShort4(RoundInt(c)));
        }
        else
        {
            if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
            if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
        }
        break;
    case VK_FORMAT_R16_UINT:
    case VK_FORMAT_R16_UNORM:
    case VK_FORMAT_R16_USCALED:
        if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
        break;
    case VK_FORMAT_R32G32B32A32_SINT:
        if(writeRGBA)
        {
            *Pointer<Int4>(element) = RoundInt(c);
        }
        else
        {
            if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
            if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
            if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
            if(writeA) { *Pointer<Int>(element + 12) = RoundInt(Float(c.w)); }
        }
        break;
    case VK_FORMAT_R32G32B32_SINT:
        if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
        // [[fallthrough]]
    case VK_FORMAT_R32G32_SINT:
        if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
        // [[fallthrough]]
    case VK_FORMAT_R32_SINT:
        if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
        break;
    case VK_FORMAT_R32G32B32A32_UINT:
        if(writeRGBA)
        {
            *Pointer<UInt4>(element) = UInt4(RoundInt(c));
        }
        else
        {
            if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
            if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
            if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
            if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(RoundInt(Float(c.w))); }
        }
        break;
    case VK_FORMAT_R32G32B32_UINT:
        if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
        // [[fallthrough]]
    case VK_FORMAT_R32G32_UINT:
        if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
        // [[fallthrough]]
    case VK_FORMAT_R32_UINT:
        if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
        break;
    case VK_FORMAT_R5G6B5_UNORM_PACK16:
        if(writeR && writeG && writeB)
        {
            *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c.xyzz), { 11, 5, 0, 0 }));
        }
        else
        {
            unsigned short mask = (writeB ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeR ? 0xF800 : 0x0000);
            unsigned short unmask = ~mask;
            *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
                                        (UShort(PackFields(RoundInt(c.xyzz), { 11, 5, 0, 0 })) & UShort(mask));
        }
        break;
    case VK_FORMAT_B5G6R5_UNORM_PACK16:
        if(writeR && writeG && writeB)
        {
            *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c.zyxx), { 11, 5, 0, 0 }));
        }
        else
        {
            unsigned short mask = (writeR ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeB ? 0xF800 : 0x0000);
            unsigned short unmask = ~mask;
            *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
                                        (UShort(PackFields(RoundInt(c.zyxx), { 11, 5, 0, 0 })) & UShort(mask));
        }
        break;
    case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
        if(writeRGBA)
        {
            *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 11, 6, 1, 0 }));
        }
        else
        {
            unsigned short mask = (writeA ? 0x8000 : 0x0000) |
                                  (writeR ? 0x7C00 : 0x0000) |
                                  (writeG ? 0x03E0 : 0x0000) |
                                  (writeB ? 0x001F : 0x0000);
            unsigned short unmask = ~mask;
            *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
                                        (UShort(PackFields(RoundInt(c), { 11, 6, 1, 0 })) & UShort(mask));
        }
        break;
    case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
        if(writeRGBA)
        {
            *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 1, 6, 11, 0 }));
        }
        else
        {
            unsigned short mask = (writeA ? 0x8000 : 0x0000) |
                                  (writeR ? 0x7C00 : 0x0000) |
                                  (writeG ? 0x03E0 : 0x0000) |
                                  (writeB ? 0x001F : 0x0000);
            unsigned short unmask = ~mask;
            *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
                                        (UShort(PackFields(RoundInt(c), { 1, 6, 11, 0 })) & UShort(mask));
        }
        break;
    case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
        if(writeRGBA)
        {
            *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 10, 5, 0, 15 }));
        }
        else
        {
            unsigned short mask = (writeA ? 0x8000 : 0x0000) |
                                  (writeR ? 0x7C00 : 0x0000) |
                                  (writeG ? 0x03E0 : 0x0000) |
                                  (writeB ? 0x001F : 0x0000);
            unsigned short unmask = ~mask;
            *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
                                        (UShort(PackFields(RoundInt(c), { 10, 5, 0, 15 })) & UShort(mask));
        }
        break;
    case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
    case VK_FORMAT_A2B10G10R10_UINT_PACK32:
    case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
        if(writeRGBA)
        {
            *Pointer<UInt>(element) = As<UInt>(PackFields(RoundInt(c), { 0, 10, 20, 30 }));
        }
        else
        {
            unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
                                (writeB ? 0x3FF00000 : 0x0000) |
                                (writeG ? 0x000FFC00 : 0x0000) |
                                (writeR ? 0x000003FF : 0x0000);
            unsigned int unmask = ~mask;
            *Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
                                      (As<UInt>(PackFields(RoundInt(c), { 0, 10, 20, 30 })) & UInt(mask));
        }
        break;
    case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
    case VK_FORMAT_A2R10G10B10_UINT_PACK32:
    case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
        if(writeRGBA)
        {
            *Pointer<UInt>(element) = As<UInt>(PackFields(RoundInt(c), { 20, 10, 0, 30 }));
        }
        else
        {
            unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
                                (writeR ? 0x3FF00000 : 0x0000) |
                                (writeG ? 0x000FFC00 : 0x0000) |
                                (writeB ? 0x000003FF : 0x0000);
            unsigned int unmask = ~mask;
            *Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
                                      (As<UInt>(PackFields(RoundInt(c), { 20, 10, 0, 30 })) & UInt(mask));
        }
        break;
    case VK_FORMAT_D16_UNORM:
        *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x)));
        break;
    case VK_FORMAT_X8_D24_UNORM_PACK32:
        *Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) << 8);
        break;
    case VK_FORMAT_D32_SFLOAT:
        *Pointer<Float>(element) = c.x;
        break;
    case VK_FORMAT_S8_UINT:
        *Pointer<Byte>(element) = Byte(RoundInt(Float(c.x)));
        break;
    default:
        UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
        break;
    }
}

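// Integer variant of readFloat4, used for unnormalized integer formats where
// a float round-trip could lose precision. w defaults to 1 for formats that
// have no alpha channel.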
Int4 Blitter::readInt4(Pointer<Byte> element, const State &state)
{
    Int4 c(0, 0, 0, 1);

    switch(state.sourceFormat)
    {
    case VK_FORMAT_A8B8G8R8_SINT_PACK32:
    case VK_FORMAT_R8G8B8A8_SINT:
        c = Insert(c, Int(*Pointer<SByte>(element + 3)), 3);
        c = Insert(c, Int(*Pointer<SByte>(element + 2)), 2);
        // [[fallthrough]]
    case VK_FORMAT_R8G8_SINT:
        c = Insert(c, Int(*Pointer<SByte>(element + 1)), 1);
        // [[fallthrough]]
    case VK_FORMAT_R8_SINT:
        c = Insert(c, Int(*Pointer<SByte>(element)), 0);
        break;
    case VK_FORMAT_A2B10G10R10_UINT_PACK32:
        c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000003FF))), 0);
        c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10), 1);
        c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20), 2);
        c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30), 3);
        break;
    case VK_FORMAT_A2R10G10B10_UINT_PACK32:
        c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000003FF))), 2);
        c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10), 1);
        c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20), 0);
        c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30), 3);
        break;
    case VK_FORMAT_A8B8G8R8_UINT_PACK32:
    case VK_FORMAT_R8G8B8A8_UINT:
        c = Insert(c, Int(*Pointer<Byte>(element + 3)), 3);
        c = Insert(c, Int(*Pointer<Byte>(element + 2)), 2);
        // [[fallthrough]]
    case VK_FORMAT_R8G8_UINT:
        c = Insert(c, Int(*Pointer<Byte>(element + 1)), 1);
        // [[fallthrough]]
    case VK_FORMAT_R8_UINT:
    case VK_FORMAT_S8_UINT:
        c = Insert(c, Int(*Pointer<Byte>(element)), 0);
        break;
    case VK_FORMAT_R16G16B16A16_SINT:
        c = Insert(c, Int(*Pointer<Short>(element + 6)), 3);
        c = Insert(c, Int(*Pointer<Short>(element + 4)), 2);
        // [[fallthrough]]
    case VK_FORMAT_R16G16_SINT:
        c = Insert(c, Int(*Pointer<Short>(element + 2)), 1);
        // [[fallthrough]]
    case VK_FORMAT_R16_SINT:
        c = Insert(c, Int(*Pointer<Short>(element)), 0);
        break;
    case VK_FORMAT_R16G16B16A16_UINT:
        c = Insert(c, Int(*Pointer<UShort>(element + 6)), 3);
        c = Insert(c, Int(*Pointer<UShort>(element + 4)), 2);
        // [[fallthrough]]
    case VK_FORMAT_R16G16_UINT:
        c = Insert(c, Int(*Pointer<UShort>(element + 2)), 1);
        // [[fallthrough]]
    case VK_FORMAT_R16_UINT:
        c = Insert(c, Int(*Pointer<UShort>(element)), 0);
        break;
    case VK_FORMAT_R32G32B32A32_SINT:
    case VK_FORMAT_R32G32B32A32_UINT:
        c = *Pointer<Int4>(element);
        break;
    case VK_FORMAT_R32G32_SINT:
    case VK_FORMAT_R32G32_UINT:
        c = Insert(c, *Pointer<Int>(element + 4), 1);
        // [[fallthrough]]
    case VK_FORMAT_R32_SINT:
    case VK_FORMAT_R32_UINT:
        c = Insert(c, *Pointer<Int>(element), 0);
        break;
    default:
        UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
    }

    return c;
}

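// Integer variant of write(). The first switch clamps the value to the range
// representable by the destination format; the second performs the store,
// honoring the per-channel write mask.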
void Blitter::write(Int4 &c, Pointer<Byte> element, const State &state)
{
    bool writeR = state.writeRed;
    bool writeG = state.writeGreen;
    bool writeB = state.writeBlue;
    bool writeA = state.writeAlpha;
    bool writeRGBA = writeR && writeG && writeB && writeA;

    ASSERT(state.sourceFormat.isUnsigned() == state.destFormat.isUnsigned());

    switch(state.destFormat)
    {
    case VK_FORMAT_A2B10G10R10_UINT_PACK32:
    case VK_FORMAT_A2R10G10B10_UINT_PACK32:
        c = Min(As<UInt4>(c), UInt4(0x03FF, 0x03FF, 0x03FF, 0x0003));
        break;
    case VK_FORMAT_A8B8G8R8_UINT_PACK32:
    case VK_FORMAT_R8G8B8A8_UINT:
    case VK_FORMAT_R8G8B8_UINT:
    case VK_FORMAT_R8G8_UINT:
    case VK_FORMAT_R8_UINT:
    case VK_FORMAT_R8G8B8A8_USCALED:
    case VK_FORMAT_R8G8B8_USCALED:
    case VK_FORMAT_R8G8_USCALED:
    case VK_FORMAT_R8_USCALED:
    case VK_FORMAT_S8_UINT:
        c = Min(As<UInt4>(c), UInt4(0xFF));
        break;
    case VK_FORMAT_R16G16B16A16_UINT:
    case VK_FORMAT_R16G16B16_UINT:
    case VK_FORMAT_R16G16_UINT:
    case VK_FORMAT_R16_UINT:
    case VK_FORMAT_R16G16B16A16_USCALED:
    case VK_FORMAT_R16G16B16_USCALED:
    case VK_FORMAT_R16G16_USCALED:
    case VK_FORMAT_R16_USCALED:
        c = Min(As<UInt4>(c), UInt4(0xFFFF));
        break;
    case VK_FORMAT_A8B8G8R8_SINT_PACK32:
    case VK_FORMAT_R8G8B8A8_SINT:
    case VK_FORMAT_R8G8_SINT:
    case VK_FORMAT_R8_SINT:
    case VK_FORMAT_R8G8B8A8_SSCALED:
    case VK_FORMAT_R8G8B8_SSCALED:
    case VK_FORMAT_R8G8_SSCALED:
    case VK_FORMAT_R8_SSCALED:
        c = Min(Max(c, Int4(-0x80)), Int4(0x7F));
        break;
    case VK_FORMAT_R16G16B16A16_SINT:
    case VK_FORMAT_R16G16B16_SINT:
    case VK_FORMAT_R16G16_SINT:
    case VK_FORMAT_R16_SINT:
    case VK_FORMAT_R16G16B16A16_SSCALED:
    case VK_FORMAT_R16G16B16_SSCALED:
    case VK_FORMAT_R16G16_SSCALED:
    case VK_FORMAT_R16_SSCALED:
        c = Min(Max(c, Int4(-0x8000)), Int4(0x7FFF));
        break;
    default:
        break;
    }

    switch(state.destFormat)
    {
    case VK_FORMAT_B8G8R8A8_SINT:
    case VK_FORMAT_B8G8R8A8_SSCALED:
        if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
        // [[fallthrough]]
    case VK_FORMAT_B8G8R8_SINT:
    case VK_FORMAT_B8G8R8_SSCALED:
        if(writeB) { *Pointer<SByte>(element) = SByte(Extract(c, 2)); }
        if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
        if(writeR) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 0)); }
        break;
    case VK_FORMAT_A8B8G8R8_SINT_PACK32:
    case VK_FORMAT_R8G8B8A8_SINT:
    case VK_FORMAT_R8G8B8A8_SSCALED:
    case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
        if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
        // [[fallthrough]]
    case VK_FORMAT_R8G8B8_SINT:
    case VK_FORMAT_R8G8B8_SSCALED:
        if(writeB) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 2)); }
        // [[fallthrough]]
    case VK_FORMAT_R8G8_SINT:
    case VK_FORMAT_R8G8_SSCALED:
        if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
        // [[fallthrough]]
    case VK_FORMAT_R8_SINT:
    case VK_FORMAT_R8_SSCALED:
        if(writeR) { *Pointer<SByte>(element) = SByte(Extract(c, 0)); }
        break;
    case VK_FORMAT_A2B10G10R10_UINT_PACK32:
    case VK_FORMAT_A2B10G10R10_SINT_PACK32:
    case VK_FORMAT_A2B10G10R10_USCALED_PACK32:
    case VK_FORMAT_A2B10G10R10_SSCALED_PACK32:
        if(writeRGBA)
        {
            *Pointer<UInt>(element) = As<UInt>(PackFields(c, { 0, 10, 20, 30 }));
        }
        else
        {
            unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
                                (writeB ? 0x3FF00000 : 0x0000) |
                                (writeG ? 0x000FFC00 : 0x0000) |
                                (writeR ? 0x000003FF : 0x0000);
            unsigned int unmask = ~mask;
            *Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
                                      (As<UInt>(PackFields(c, { 0, 10, 20, 30 })) & UInt(mask));
        }
        break;
    case VK_FORMAT_A2R10G10B10_UINT_PACK32:
    case VK_FORMAT_A2R10G10B10_SINT_PACK32:
    case VK_FORMAT_A2R10G10B10_USCALED_PACK32:
    case VK_FORMAT_A2R10G10B10_SSCALED_PACK32:
        if(writeRGBA)
        {
            *Pointer<UInt>(element) = As<UInt>(PackFields(c, { 20, 10, 0, 30 }));
        }
        else
        {
            unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
                                (writeR ? 0x3FF00000 : 0x0000) |
                                (writeG ? 0x000FFC00 : 0x0000) |
                                (writeB ? 0x000003FF : 0x0000);
            unsigned int unmask = ~mask;
            *Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
                                      (As<UInt>(PackFields(c, { 20, 10, 0, 30 })) & UInt(mask));
        }
        break;
    case VK_FORMAT_B8G8R8A8_UINT:
    case VK_FORMAT_B8G8R8A8_USCALED:
        if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
        // [[fallthrough]]
    case VK_FORMAT_B8G8R8_UINT:
    case VK_FORMAT_B8G8R8_USCALED:
    case VK_FORMAT_B8G8R8_SRGB:
        if(writeB) { *Pointer<Byte>(element) = Byte(Extract(c, 2)); }
        if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
        if(writeR) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 0)); }
        break;
    case VK_FORMAT_A8B8G8R8_UINT_PACK32:
    case VK_FORMAT_R8G8B8A8_UINT:
    case VK_FORMAT_R8G8B8A8_USCALED:
    case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
        if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
        // [[fallthrough]]
    case VK_FORMAT_R8G8B8_UINT:
    case VK_FORMAT_R8G8B8_USCALED:
        if(writeB) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 2)); }
        // [[fallthrough]]
    case VK_FORMAT_R8G8_UINT:
    case VK_FORMAT_R8G8_USCALED:
        if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
        // [[fallthrough]]
    case VK_FORMAT_R8_UINT:
    case VK_FORMAT_R8_USCALED:
    case VK_FORMAT_S8_UINT:
        if(writeR) { *Pointer<Byte>(element) = Byte(Extract(c, 0)); }
        break;
    case VK_FORMAT_R16G16B16A16_SINT:
    case VK_FORMAT_R16G16B16A16_SSCALED:
        if(writeA) { *Pointer<Short>(element + 6) = Short(Extract(c, 3)); }
        // [[fallthrough]]
    case VK_FORMAT_R16G16B16_SINT:
    case VK_FORMAT_R16G16B16_SSCALED:
        if(writeB) { *Pointer<Short>(element + 4) = Short(Extract(c, 2)); }
        // [[fallthrough]]
    case VK_FORMAT_R16G16_SINT:
    case VK_FORMAT_R16G16_SSCALED:
        if(writeG) { *Pointer<Short>(element + 2) = Short(Extract(c, 1)); }
        // [[fallthrough]]
    case VK_FORMAT_R16_SINT:
    case VK_FORMAT_R16_SSCALED:
        if(writeR) { *Pointer<Short>(element) = Short(Extract(c, 0)); }
        break;
    case VK_FORMAT_R16G16B16A16_UINT:
    case VK_FORMAT_R16G16B16A16_USCALED:
        if(writeA) { *Pointer<UShort>(element + 6) = UShort(Extract(c, 3)); }
        // [[fallthrough]]
    case VK_FORMAT_R16G16B16_UINT:
    case VK_FORMAT_R16G16B16_USCALED:
        if(writeB) { *Pointer<UShort>(element + 4) = UShort(Extract(c, 2)); }
        // [[fallthrough]]
    case VK_FORMAT_R16G16_UINT:
    case VK_FORMAT_R16G16_USCALED:
        if(writeG) { *Pointer<UShort>(element + 2) = UShort(Extract(c, 1)); }
        // [[fallthrough]]
    case VK_FORMAT_R16_UINT:
    case VK_FORMAT_R16_USCALED:
        if(writeR) { *Pointer<UShort>(element) = UShort(Extract(c, 0)); }
        break;
    case VK_FORMAT_R32G32B32A32_SINT:
        if(writeRGBA)
        {
            *Pointer<Int4>(element) = c;
        }
        else
        {
            if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
            if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
            if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
            if(writeA) { *Pointer<Int>(element + 12) = Extract(c, 3); }
        }
        break;
    case VK_FORMAT_R32G32B32_SINT:
        if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
        if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
        if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
        break;
    case VK_FORMAT_R32G32_SINT:
        if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
        if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
        break;
    case VK_FORMAT_R32_SINT:
        if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
        break;
    case VK_FORMAT_R32G32B32A32_UINT:
        if(writeRGBA)
        {
            *Pointer<UInt4>(element) = As<UInt4>(c);
        }
        else
        {
            if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
            if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
            if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
            if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(Extract(c, 3)); }
        }
        break;
    case VK_FORMAT_R32G32B32_UINT:
        if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
        // [[fallthrough]]
    case VK_FORMAT_R32G32_UINT:
        if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
        // [[fallthrough]]
    case VK_FORMAT_R32_UINT:
        if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
        break;
    default:
        UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
    }
}

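// Converts between source and destination representations: 'unscale' is the
// maximum component magnitude of the source format and 'scale' that of the
// destination, with optional sRGB encoding/decoding in between. 'preScaled'
// indicates the value was already brought into the destination scale by an
// earlier call (multisample resolve or filtering of sRGB sources).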
void Blitter::ApplyScaleAndClamp(Float4 &value, const State &state, bool preScaled)
{
    float4 scale{}, unscale{};

    if(state.clearOperation &&
       state.sourceFormat.isUnnormalizedInteger() &&
       !state.destFormat.isUnnormalizedInteger())
    {
        // If we're clearing a buffer from an int or uint color into a normalized color,
        // then the whole range of the int or uint color must be scaled between 0 and 1.
        switch(state.sourceFormat)
        {
        case VK_FORMAT_R32G32B32A32_SINT:
            unscale = float4(static_cast<float>(0x7FFFFFFF));
            break;
        case VK_FORMAT_R32G32B32A32_UINT:
            unscale = float4(static_cast<float>(0xFFFFFFFF));
            break;
        default:
            UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
        }
    }
    else
    {
        unscale = state.sourceFormat.getScale();
    }

    scale = state.destFormat.getScale();

    bool srcSRGB = state.sourceFormat.isSRGBformat();
    bool dstSRGB = state.destFormat.isSRGBformat();

    if(state.allowSRGBConversion && ((srcSRGB && !preScaled) || dstSRGB))  // One of the formats is sRGB encoded.
    {
        value *= preScaled ? Float4(1.0f / scale.x, 1.0f / scale.y, 1.0f / scale.z, 1.0f / scale.w) :          // Unapply scale
                             Float4(1.0f / unscale.x, 1.0f / unscale.y, 1.0f / unscale.z, 1.0f / unscale.w);   // Apply unscale
        value = (srcSRGB && !preScaled) ? sRGBtoLinear(value) : LinearToSRGB(value);
        value *= Float4(scale.x, scale.y, scale.z, scale.w);  // Apply scale
    }
    else if(unscale != scale)
    {
        value *= Float4(scale.x / unscale.x, scale.y / unscale.y, scale.z / unscale.z, scale.w / unscale.w);
    }

    if(state.sourceFormat.isFloatFormat() && !state.destFormat.isFloatFormat())
    {
        value = Min(value, Float4(scale.x, scale.y, scale.z, scale.w));

        value = Max(value, Float4(state.destFormat.isUnsignedComponent(0) ? 0.0f : -scale.x,
                                  state.destFormat.isUnsignedComponent(1) ? 0.0f : -scale.y,
                                  state.destFormat.isUnsignedComponent(2) ? 0.0f : -scale.z,
                                  state.destFormat.isUnsignedComponent(3) ? 0.0f : -scale.w));
    }

    if(!state.sourceFormat.isUnsigned() && state.destFormat.isUnsigned())
    {
        value = Max(value, Float4(0.0f));
    }
}

Int Blitter::ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes)
{
    return y * pitchB + x * bytes;
}

Int Blitter::ComputeOffset(Int &x, Int &y, Int &z, Int &sliceB, Int &pitchB, int bytes)
{
    return z * sliceB + y * pitchB + x * bytes;
}

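// sRGB transfer functions (piecewise, per the sRGB specification): linear
// values below 0.0031308 map linearly with slope 12.92; above, a 2.4 gamma
// curve applies. The inverse uses the matching 0.04045 threshold. Alpha is
// left untouched in both directions.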
Float4 Blitter::LinearToSRGB(const Float4 &c)
{
    Float4 lc = Min(c, Float4(0.0031308f)) * Float4(12.92f);
    Float4 ec = Float4(1.055f) * power(c, Float4(1.0f / 2.4f)) - Float4(0.055f);

    Float4 s = c;
    s.xyz = Max(lc, ec);

    return s;
}

Float4 Blitter::sRGBtoLinear(const Float4 &c)
{
    Float4 lc = c * Float4(1.0f / 12.92f);
    Float4 ec = power((c + Float4(0.055f)) * Float4(1.0f / 1.055f), Float4(2.4f));

    Int4 linear = CmpLT(c, Float4(0.04045f));

    Float4 s = c;
    s.xyz = As<Float4>((linear & As<Int4>(lc)) | (~linear & As<Int4>(ec)));  // TODO: IfThenElse()

    return s;
}

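// Samples the source at (x, y, z): point sampling (always used for
// unnormalized integer formats), with optional multisample resolve, or
// bilinear/trilinear filtering. The returned color is already scaled and
// clamped for the destination format.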
Float4 Blitter::sample(Pointer<Byte> &source, Float &x, Float &y, Float &z,
                       Int &sWidth, Int &sHeight, Int &sDepth,
                       Int &sSliceB, Int &sPitchB, const State &state)
{
    bool intSrc = state.sourceFormat.isUnnormalizedInteger();
    int srcBytes = state.sourceFormat.bytes();

    Float4 color;

    bool preScaled = false;
    if(!state.filter || intSrc)
    {
        Int X = Int(x);
        Int Y = Int(y);
        Int Z = Int(z);

        if(state.clampToEdge)
        {
            X = Clamp(X, 0, sWidth - 1);
            Y = Clamp(Y, 0, sHeight - 1);
            Z = Clamp(Z, 0, sDepth - 1);
        }

        Pointer<Byte> s = source + ComputeOffset(X, Y, Z, sSliceB, sPitchB, srcBytes);

        color = readFloat4(s, state);

        if(state.srcSamples > 1)  // Resolve multisampled source
        {
            if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat())  // sRGB -> RGB
            {
                ApplyScaleAndClamp(color, state);
                preScaled = true;
            }
            Float4 accum = color;
            for(int sample = 1; sample < state.srcSamples; sample++)
            {
                s += sSliceB;
                color = readFloat4(s, state);

                if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat())  // sRGB -> RGB
                {
                    ApplyScaleAndClamp(color, state);
                    preScaled = true;
                }
                accum += color;
            }
            color = accum * Float4(1.0f / static_cast<float>(state.srcSamples));
        }
    }
    else  // Bilinear filtering
    {
        Float X = x;
        Float Y = y;
        Float Z = z;

        if(state.clampToEdge)
        {
            X = Min(Max(x, 0.5f), Float(sWidth) - 0.5f);
            Y = Min(Max(y, 0.5f), Float(sHeight) - 0.5f);
            Z = Min(Max(z, 0.5f), Float(sDepth) - 0.5f);
        }

        Float x0 = X - 0.5f;
        Float y0 = Y - 0.5f;
        Float z0 = Z - 0.5f;

        Int X0 = Max(Int(x0), 0);
        Int Y0 = Max(Int(y0), 0);
        Int Z0 = Max(Int(z0), 0);

        Int X1 = X0 + 1;
        Int Y1 = Y0 + 1;
        X1 = IfThenElse(X1 >= sWidth, X0, X1);
        Y1 = IfThenElse(Y1 >= sHeight, Y0, Y1);

        if(state.filter3D)
        {
            Int Z1 = Z0 + 1;
            Z1 = IfThenElse(Z1 >= sDepth, Z0, Z1);

            Pointer<Byte> s000 = source + ComputeOffset(X0, Y0, Z0, sSliceB, sPitchB, srcBytes);
            Pointer<Byte> s010 = source + ComputeOffset(X1, Y0, Z0, sSliceB, sPitchB, srcBytes);
            Pointer<Byte> s100 = source + ComputeOffset(X0, Y1, Z0, sSliceB, sPitchB, srcBytes);
            Pointer<Byte> s110 = source + ComputeOffset(X1, Y1, Z0, sSliceB, sPitchB, srcBytes);
            Pointer<Byte> s001 = source + ComputeOffset(X0, Y0, Z1, sSliceB, sPitchB, srcBytes);
            Pointer<Byte> s011 = source + ComputeOffset(X1, Y0, Z1, sSliceB, sPitchB, srcBytes);
            Pointer<Byte> s101 = source + ComputeOffset(X0, Y1, Z1, sSliceB, sPitchB, srcBytes);
            Pointer<Byte> s111 = source + ComputeOffset(X1, Y1, Z1, sSliceB, sPitchB, srcBytes);

            Float4 c000 = readFloat4(s000, state);
            Float4 c010 = readFloat4(s010, state);
            Float4 c100 = readFloat4(s100, state);
            Float4 c110 = readFloat4(s110, state);
            Float4 c001 = readFloat4(s001, state);
            Float4 c011 = readFloat4(s011, state);
            Float4 c101 = readFloat4(s101, state);
            Float4 c111 = readFloat4(s111, state);

            if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat())  // sRGB -> RGB
            {
                ApplyScaleAndClamp(c000, state);
                ApplyScaleAndClamp(c010, state);
                ApplyScaleAndClamp(c100, state);
                ApplyScaleAndClamp(c110, state);
                ApplyScaleAndClamp(c001, state);
                ApplyScaleAndClamp(c011, state);
                ApplyScaleAndClamp(c101, state);
                ApplyScaleAndClamp(c111, state);
                preScaled = true;
            }

            Float4 fx = Float4(x0 - Float(X0));
            Float4 fy = Float4(y0 - Float(Y0));
            Float4 fz = Float4(z0 - Float(Z0));
            Float4 ix = Float4(1.0f) - fx;
            Float4 iy = Float4(1.0f) - fy;
            Float4 iz = Float4(1.0f) - fz;

            color = ((c000 * ix + c010 * fx) * iy +
                     (c100 * ix + c110 * fx) * fy) *
                        iz +
                    ((c001 * ix + c011 * fx) * iy +
                     (c101 * ix + c111 * fx) * fy) *
                        fz;
        }
        else
        {
            Pointer<Byte> s00 = source + ComputeOffset(X0, Y0, Z0, sSliceB, sPitchB, srcBytes);
            Pointer<Byte> s01 = source + ComputeOffset(X1, Y0, Z0, sSliceB, sPitchB, srcBytes);
            Pointer<Byte> s10 = source + ComputeOffset(X0, Y1, Z0, sSliceB, sPitchB, srcBytes);
            Pointer<Byte> s11 = source + ComputeOffset(X1, Y1, Z0, sSliceB, sPitchB, srcBytes);

            Float4 c00 = readFloat4(s00, state);
            Float4 c01 = readFloat4(s01, state);
            Float4 c10 = readFloat4(s10, state);
            Float4 c11 = readFloat4(s11, state);

            if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat())  // sRGB -> RGB
            {
                ApplyScaleAndClamp(c00, state);
                ApplyScaleAndClamp(c01, state);
                ApplyScaleAndClamp(c10, state);
                ApplyScaleAndClamp(c11, state);
                preScaled = true;
            }

            Float4 fx = Float4(x0 - Float(X0));
            Float4 fy = Float4(y0 - Float(Y0));
            Float4 ix = Float4(1.0f) - fx;
            Float4 iy = Float4(1.0f) - fy;

            color = (c00 * ix + c01 * fx) * iy +
                    (c10 * ix + c11 * fx) * fy;
        }
    }

    ApplyScaleAndClamp(color, state, preScaled);

    return color;
}

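// Builds a JIT-compiled blit routine specialized for the given State. The
// routine receives a single BlitData pointer, reads the blit parameters via
// OFFSET(), and walks every destination texel in a z/y/x loop nest. For clear
// operations the constant color is read and converted once, outside the loops;
// otherwise each destination texel samples the source at the mapped coordinate.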
Blitter::BlitRoutineType Blitter::generate(const State &state)
{
    BlitFunction function;
    {
        Pointer<Byte> blit(function.Arg<0>());

        Pointer<Byte> source = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData, source));
        Pointer<Byte> dest = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData, dest));
        Int sPitchB = *Pointer<Int>(blit + OFFSET(BlitData, sPitchB));
        Int dPitchB = *Pointer<Int>(blit + OFFSET(BlitData, dPitchB));
        Int sSliceB = *Pointer<Int>(blit + OFFSET(BlitData, sSliceB));
        Int dSliceB = *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));

        Float x0 = *Pointer<Float>(blit + OFFSET(BlitData, x0));
        Float y0 = *Pointer<Float>(blit + OFFSET(BlitData, y0));
        Float z0 = *Pointer<Float>(blit + OFFSET(BlitData, z0));
        Float w = *Pointer<Float>(blit + OFFSET(BlitData, w));
        Float h = *Pointer<Float>(blit + OFFSET(BlitData, h));
        Float d = *Pointer<Float>(blit + OFFSET(BlitData, d));

        Int x0d = *Pointer<Int>(blit + OFFSET(BlitData, x0d));
        Int x1d = *Pointer<Int>(blit + OFFSET(BlitData, x1d));
        Int y0d = *Pointer<Int>(blit + OFFSET(BlitData, y0d));
        Int y1d = *Pointer<Int>(blit + OFFSET(BlitData, y1d));
        Int z0d = *Pointer<Int>(blit + OFFSET(BlitData, z0d));
        Int z1d = *Pointer<Int>(blit + OFFSET(BlitData, z1d));

        Int sWidth = *Pointer<Int>(blit + OFFSET(BlitData, sWidth));
        Int sHeight = *Pointer<Int>(blit + OFFSET(BlitData, sHeight));
        Int sDepth = *Pointer<Int>(blit + OFFSET(BlitData, sDepth));

        bool intSrc = state.sourceFormat.isUnnormalizedInteger();
        bool intDst = state.destFormat.isUnnormalizedInteger();
        bool intBoth = intSrc && intDst;
        int srcBytes = state.sourceFormat.bytes();
        int dstBytes = state.destFormat.bytes();

        bool hasConstantColorI = false;
        Int4 constantColorI;
        bool hasConstantColorF = false;
        Float4 constantColorF;
        if(state.clearOperation)
        {
            if(intBoth)  // Integer types
            {
                constantColorI = readInt4(source, state);
                hasConstantColorI = true;
            }
            else
            {
                constantColorF = readFloat4(source, state);
                hasConstantColorF = true;

                ApplyScaleAndClamp(constantColorF, state);
            }
        }

        For(Int k = z0d, k < z1d, k++)
        {
            Float z = state.clearOperation ? RValue<Float>(z0) : z0 + Float(k) * d;
            Pointer<Byte> destSlice = dest + k * dSliceB;

            For(Int j = y0d, j < y1d, j++)
            {
                Float y = state.clearOperation ? RValue<Float>(y0) : y0 + Float(j) * h;
                Pointer<Byte> destLine = destSlice + j * dPitchB;

                For(Int i = x0d, i < x1d, i++)
                {
                    Float x = state.clearOperation ? RValue<Float>(x0) : x0 + Float(i) * w;
                    Pointer<Byte> d = destLine + i * dstBytes;

                    if(hasConstantColorI)
                    {
                        for(int s = 0; s < state.destSamples; s++)
                        {
                            write(constantColorI, d, state);

                            d += dSliceB;
                        }
                    }
                    else if(hasConstantColorF)
                    {
                        for(int s = 0; s < state.destSamples; s++)
                        {
                            write(constantColorF, d, state);

                            d += dSliceB;
                        }
                    }
                    else if(intBoth)  // Integer types do not support filtering
                    {
                        Int X = Int(x);
                        Int Y = Int(y);
                        Int Z = Int(z);

                        if(state.clampToEdge)
                        {
                            X = Clamp(X, 0, sWidth - 1);
                            Y = Clamp(Y, 0, sHeight - 1);
                            Z = Clamp(Z, 0, sDepth - 1);
                        }

                        Pointer<Byte> s = source + ComputeOffset(X, Y, Z, sSliceB, sPitchB, srcBytes);

                        // When both formats are true integer types, we don't go to float to avoid losing precision
                        Int4 color = readInt4(s, state);
                        for(int s = 0; s < state.destSamples; s++)
                        {
                            write(color, d, state);

                            d += dSliceB;
                        }
                    }
                    else
                    {
                        Float4 color = sample(source, x, y, z, sWidth, sHeight, sDepth, sSliceB, sPitchB, state);

                        for(int s = 0; s < state.destSamples; s++)
                        {
                            write(color, d, state);

                            d += dSliceB;
                        }
                    }
                }
            }
        }
    }

    return function("BlitRoutine");
}

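// Returns the cached routine for this State, generating and caching it on a
// miss. Lookup and insertion happen under blitMutex, so concurrent blits with
// the same State share a single compiled routine.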
Blitter::BlitRoutineType Blitter::getBlitRoutine(const State &state)
{
    marl::lock lock(blitMutex);
    auto blitRoutine = blitCache.lookup(state);

    if(!blitRoutine)
    {
        blitRoutine = generate(state);
        blitCache.add(state, blitRoutine);
    }

    return blitRoutine;
}

Blitter::CornerUpdateRoutineType Blitter::getCornerUpdateRoutine(const State &state)
{
    marl::lock lock(cornerUpdateMutex);
    auto cornerUpdateRoutine = cornerUpdateCache.lookup(state);

    if(!cornerUpdateRoutine)
    {
        cornerUpdateRoutine = generateCornerUpdate(state);
        cornerUpdateCache.add(state, cornerUpdateRoutine);
    }

    return cornerUpdateRoutine;
}

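// Performs a scaled blit between two images. Destination texels are mapped
// back into source space: texel i samples the source at x0 + i * widthRatio,
// where x0 folds in the half-texel center offset. For example, a 2x downscale
// (widthRatio = 2.0, all offsets zero) gives x0 = 1.0, so destination texel 0
// samples at source x = 1.0, the midpoint between source texels 0 and 1, which
// a bilinear filter averages equally.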
void Blitter::blit(const vk::Image *src, vk::Image *dst, VkImageBlit2KHR region, VkFilter filter)
{
    ASSERT(src->getFormat() != VK_FORMAT_UNDEFINED);
    ASSERT(dst->getFormat() != VK_FORMAT_UNDEFINED);

    // Vulkan 1.2 section 18.5. Image Copies with Scaling:
    // "The layerCount member of srcSubresource and dstSubresource must match"
    // "The aspectMask member of srcSubresource and dstSubresource must match"
    ASSERT(region.srcSubresource.layerCount == region.dstSubresource.layerCount);
    ASSERT(region.srcSubresource.aspectMask == region.dstSubresource.aspectMask);

    if(region.dstOffsets[0].x > region.dstOffsets[1].x)
    {
        std::swap(region.srcOffsets[0].x, region.srcOffsets[1].x);
        std::swap(region.dstOffsets[0].x, region.dstOffsets[1].x);
    }

    if(region.dstOffsets[0].y > region.dstOffsets[1].y)
    {
        std::swap(region.srcOffsets[0].y, region.srcOffsets[1].y);
        std::swap(region.dstOffsets[0].y, region.dstOffsets[1].y);
    }

    if(region.dstOffsets[0].z > region.dstOffsets[1].z)
    {
        std::swap(region.srcOffsets[0].z, region.srcOffsets[1].z);
        std::swap(region.dstOffsets[0].z, region.dstOffsets[1].z);
    }

    VkImageAspectFlagBits srcAspect = static_cast<VkImageAspectFlagBits>(region.srcSubresource.aspectMask);
    VkImageAspectFlagBits dstAspect = static_cast<VkImageAspectFlagBits>(region.dstSubresource.aspectMask);
    VkExtent3D srcExtent = src->getMipLevelExtent(srcAspect, region.srcSubresource.mipLevel);

    float widthRatio = static_cast<float>(region.srcOffsets[1].x - region.srcOffsets[0].x) /
                       static_cast<float>(region.dstOffsets[1].x - region.dstOffsets[0].x);
    float heightRatio = static_cast<float>(region.srcOffsets[1].y - region.srcOffsets[0].y) /
                        static_cast<float>(region.dstOffsets[1].y - region.dstOffsets[0].y);
    float depthRatio = static_cast<float>(region.srcOffsets[1].z - region.srcOffsets[0].z) /
                       static_cast<float>(region.dstOffsets[1].z - region.dstOffsets[0].z);
    float x0 = region.srcOffsets[0].x + (0.5f - region.dstOffsets[0].x) * widthRatio;
    float y0 = region.srcOffsets[0].y + (0.5f - region.dstOffsets[0].y) * heightRatio;
    float z0 = region.srcOffsets[0].z + (0.5f - region.dstOffsets[0].z) * depthRatio;

    auto srcFormat = src->getFormat(srcAspect);
    auto dstFormat = dst->getFormat(dstAspect);

    bool doFilter = (filter != VK_FILTER_NEAREST);
    bool allowSRGBConversion =
        doFilter ||
        (src->getSampleCountFlagBits() > 1) ||
        (srcFormat.isSRGBformat() != dstFormat.isSRGBformat());

    State state(srcFormat, dstFormat, src->getSampleCountFlagBits(), dst->getSampleCountFlagBits(),
                Options{ doFilter, allowSRGBConversion });
    state.clampToEdge = (region.srcOffsets[0].x < 0) ||
                        (region.srcOffsets[0].y < 0) ||
                        (static_cast<uint32_t>(region.srcOffsets[1].x) > srcExtent.width) ||
                        (static_cast<uint32_t>(region.srcOffsets[1].y) > srcExtent.height) ||
                        (doFilter && ((x0 < 0.5f) || (y0 < 0.5f)));
    state.filter3D = (region.srcOffsets[1].z - region.srcOffsets[0].z) !=
                     (region.dstOffsets[1].z - region.dstOffsets[0].z);

    auto blitRoutine = getBlitRoutine(state);
    if(!blitRoutine)
    {
        return;
    }

    BlitData data = {
        nullptr,                                                                                // source
        nullptr,                                                                                // dest
        assert_cast<uint32_t>(src->rowPitchBytes(srcAspect, region.srcSubresource.mipLevel)),   // sPitchB
        assert_cast<uint32_t>(dst->rowPitchBytes(dstAspect, region.dstSubresource.mipLevel)),   // dPitchB
        assert_cast<uint32_t>(src->slicePitchBytes(srcAspect, region.srcSubresource.mipLevel)), // sSliceB
        assert_cast<uint32_t>(dst->slicePitchBytes(dstAspect, region.dstSubresource.mipLevel)), // dSliceB

        x0,
        y0,
        z0,
        widthRatio,
        heightRatio,
        depthRatio,

        region.dstOffsets[0].x,  // x0d
        region.dstOffsets[1].x,  // x1d
        region.dstOffsets[0].y,  // y0d
        region.dstOffsets[1].y,  // y1d
        region.dstOffsets[0].z,  // z0d
        region.dstOffsets[1].z,  // z1d

        static_cast<int>(srcExtent.width),   // sWidth
        static_cast<int>(srcExtent.height),  // sHeight
        static_cast<int>(srcExtent.depth),   // sDepth

        false,  // filter3D
    };

    VkImageSubresource srcSubres = {
        region.srcSubresource.aspectMask,
        region.srcSubresource.mipLevel,
        region.srcSubresource.baseArrayLayer
    };

    VkImageSubresource dstSubres = {
        region.dstSubresource.aspectMask,
        region.dstSubresource.mipLevel,
        region.dstSubresource.baseArrayLayer
    };

    VkImageSubresourceRange dstSubresRange = {
        region.dstSubresource.aspectMask,
        region.dstSubresource.mipLevel,
        1,  // levelCount
        region.dstSubresource.baseArrayLayer,
        region.dstSubresource.layerCount
    };

    uint32_t lastLayer = src->getLastLayerIndex(dstSubresRange);

    for(; dstSubres.arrayLayer <= lastLayer; srcSubres.arrayLayer++, dstSubres.arrayLayer++)
    {
        data.source = src->getTexelPointer({ 0, 0, 0 }, srcSubres);
        data.dest = dst->getTexelPointer({ 0, 0, 0 }, dstSubres);

        ASSERT(data.source < src->end());
        ASSERT(data.dest < dst->end());

        blitRoutine(&data);
    }

    dst->contentsChanged(dstSubresRange);
}

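// The samples of a multisampled image are stored as consecutive slices, so
// VK_RESOLVE_MODE_SAMPLE_ZERO_BIT amounts to copying the first slice row by
// row. Min/max/average modes would need per-sample plane pointers (see the
// TODO below).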
static void resolveDepth(const vk::ImageView *src, vk::ImageView *dst, const VkSubpassDescriptionDepthStencilResolve &dsrDesc)
{
    if(dsrDesc.depthResolveMode == VK_RESOLVE_MODE_NONE)
    {
        return;
    }

    vk::Format format = src->getFormat(VK_IMAGE_ASPECT_DEPTH_BIT);
    VkExtent2D extent = src->getMipLevelExtent(0, VK_IMAGE_ASPECT_DEPTH_BIT);
    int width = extent.width;
    int height = extent.height;
    int pitch = src->rowPitchBytes(VK_IMAGE_ASPECT_DEPTH_BIT, 0);

    // To support other resolve modes, get the slice bytes and get a pointer to each sample plane.
    // Then modify the loop below to include logic for handling each new mode.
    uint8_t *source = (uint8_t *)src->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_DEPTH_BIT, 0, 0);
    uint8_t *dest = (uint8_t *)dst->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_DEPTH_BIT, 0, 0);

    size_t formatSize = format.bytes();
    // TODO(b/167558951) support other resolve modes.
    ASSERT(dsrDesc.depthResolveMode == VK_RESOLVE_MODE_SAMPLE_ZERO_BIT);
    for(int y = 0; y < height; y++)
    {
        memcpy(dest, source, formatSize * width);

        source += pitch;
        dest += pitch;
    }

    dst->contentsChanged(vk::Image::DIRECT_MEMORY_ACCESS);
}

static void resolveStencil(const vk::ImageView *src, vk::ImageView *dst, const VkSubpassDescriptionDepthStencilResolve &dsrDesc)
{
    if(dsrDesc.stencilResolveMode == VK_RESOLVE_MODE_NONE)
    {
        return;
    }

    VkExtent2D extent = src->getMipLevelExtent(0, VK_IMAGE_ASPECT_STENCIL_BIT);
    int width = extent.width;
    int height = extent.height;
    int pitch = src->rowPitchBytes(VK_IMAGE_ASPECT_STENCIL_BIT, 0);

    // To support other resolve modes, use src->slicePitchBytes() and get a pointer to each sample's slice.
    // Then modify the loop below to include logic for handling each new mode.
    uint8_t *source = reinterpret_cast<uint8_t *>(src->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0));
    uint8_t *dest = reinterpret_cast<uint8_t *>(dst->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0));

    // TODO(b/167558951) support other resolve modes.
    ASSERT(dsrDesc.stencilResolveMode == VK_RESOLVE_MODE_SAMPLE_ZERO_BIT);
    for(int y = 0; y < height; y++)
    {
        // Stencil is always 8 bits, so the width of the resource we're resolving
        // is also the number of bytes to copy per row for SAMPLE_ZERO.
        memcpy(dest, source, width);

        source += pitch;
        dest += pitch;
    }

    dst->contentsChanged(vk::Image::DIRECT_MEMORY_ACCESS);
}

void Blitter::resolveDepthStencil(const vk::ImageView *src, vk::ImageView *dst, const VkSubpassDescriptionDepthStencilResolve &dsrDesc)
{
    VkImageSubresourceRange srcRange = src->getSubresourceRange();
    VkImageSubresourceRange dstRange = dst->getSubresourceRange();
    ASSERT(src->getFormat() == dst->getFormat());
    ASSERT(srcRange.layerCount == 1 && dstRange.layerCount == 1);
    ASSERT(srcRange.aspectMask == dstRange.aspectMask);

    if(srcRange.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
    {
        resolveDepth(src, dst, dsrDesc);
    }
    if(srcRange.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
    {
        resolveStencil(src, dst, dsrDesc);
    }
}

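// Resolves a multisampled color image into a single-sampled one. A handful of
// common cases take the fastResolve() path below; everything else is expressed
// as a 1:1-scale nearest blit, which averages the samples in Blitter::sample().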
void Blitter::resolve(const vk::Image *src, vk::Image *dst, VkImageResolve2KHR region)
{
    // "The aspectMask member of srcSubresource and dstSubresource must only contain VK_IMAGE_ASPECT_COLOR_BIT"
    ASSERT(region.srcSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
    ASSERT(region.dstSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
    // "The layerCount member of srcSubresource and dstSubresource must match"
    ASSERT(region.srcSubresource.layerCount == region.dstSubresource.layerCount);

    // We use this method both for explicit resolves from vkCmdResolveImage, and implicit ones for resolve attachments.
    // - vkCmdResolveImage: "srcImage and dstImage must have been created with the same image format."
    // - VkSubpassDescription: "each resolve attachment that is not VK_ATTACHMENT_UNUSED must have the same VkFormat as its corresponding color attachment."
    ASSERT(src->getFormat() == dst->getFormat());

    if(fastResolve(src, dst, region))
    {
        return;
    }

    // Fall back to a generic blit which performs the resolve.
    VkImageBlit2KHR blitRegion;
    blitRegion.sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR;
    blitRegion.pNext = nullptr;

    blitRegion.srcOffsets[0] = blitRegion.srcOffsets[1] = region.srcOffset;
    blitRegion.srcOffsets[1].x += region.extent.width;
    blitRegion.srcOffsets[1].y += region.extent.height;
    blitRegion.srcOffsets[1].z += region.extent.depth;

    blitRegion.dstOffsets[0] = blitRegion.dstOffsets[1] = region.dstOffset;
    blitRegion.dstOffsets[1].x += region.extent.width;
    blitRegion.dstOffsets[1].y += region.extent.height;
    blitRegion.dstOffsets[1].z += region.extent.depth;

    blitRegion.srcSubresource = region.srcSubresource;
    blitRegion.dstSubresource = region.dstSubresource;

    blit(src, dst, blitRegion, VK_FILTER_NEAREST);
}

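// Rounding average of four packed bytes (SWAR). (x & y) + ((x ^ y) >> 1) is
// the overflow-free floor((x + y) / 2); masking the shifted term with
// 0x7F7F7F7F keeps bits from crossing byte lanes, and adding (x ^ y) & 0x01010101
// restores the rounding bit, matching the round-half-up behavior of _mm_avg_epu8.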
static inline uint32_t averageByte4(uint32_t x, uint32_t y)
{
    return (x & y) + (((x ^ y) >> 1) & 0x7F7F7F7F) + ((x ^ y) & 0x01010101);
}

bool Blitter::fastResolve(const vk::Image *src, vk::Image *dst, VkImageResolve2KHR region)
{
    if(region.dstOffset != VkOffset3D{ 0, 0, 0 })
    {
        return false;
    }

    if(region.srcOffset != VkOffset3D{ 0, 0, 0 })
    {
        return false;
    }

    if(region.srcSubresource.layerCount != 1)
    {
        return false;
    }

    if(region.extent != src->getExtent() ||
       region.extent != dst->getExtent() ||
       region.extent.depth != 1)
    {
        return false;
    }

    VkImageSubresource srcSubresource = {
        region.srcSubresource.aspectMask,
        region.srcSubresource.mipLevel,
        region.srcSubresource.baseArrayLayer
    };

    VkImageSubresource dstSubresource = {
        region.dstSubresource.aspectMask,
        region.dstSubresource.mipLevel,
        region.dstSubresource.baseArrayLayer
    };

    VkImageSubresourceRange dstSubresourceRange = {
        region.dstSubresource.aspectMask,
        region.dstSubresource.mipLevel,
        1,  // levelCount
        region.dstSubresource.baseArrayLayer,
        region.dstSubresource.layerCount
    };

    void *source = src->getTexelPointer({ 0, 0, 0 }, srcSubresource);
    uint8_t *dest = reinterpret_cast<uint8_t *>(dst->getTexelPointer({ 0, 0, 0 }, dstSubresource));

    auto format = src->getFormat();
    auto samples = src->getSampleCountFlagBits();
    auto extent = src->getExtent();

    int width = extent.width;
    int height = extent.height;
    int pitch = src->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, region.srcSubresource.mipLevel);
    int slice = src->slicePitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, region.srcSubresource.mipLevel);

    uint8_t *source0 = (uint8_t *)source;
    uint8_t *source1 = source0 + slice;
    uint8_t *source2 = source1 + slice;
    uint8_t *source3 = source2 + slice;

    [[maybe_unused]] const bool SSE2 = CPUID::supportsSSE2();

    if(format == VK_FORMAT_R8G8B8A8_UNORM || format == VK_FORMAT_B8G8R8A8_UNORM || format == VK_FORMAT_A8B8G8R8_UNORM_PACK32)
    {
        if(samples == 4)
        {
            for(int y = 0; y < height; y++)
            {
                int x = 0;

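                // With SSE2, average four RGBA8 pixels per iteration using
                // _mm_avg_epu8 (per-byte rounding average); the scalar loop
                // below handles the remaining pixels and non-SSE2 builds.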
#if defined(__i386__) || defined(__x86_64__)
                if(SSE2)
                {
                    for(; (x + 3) < width; x += 4)
                    {
                        __m128i c0 = _mm_loadu_si128((__m128i *)(source0 + 4 * x));
                        __m128i c1 = _mm_loadu_si128((__m128i *)(source1 + 4 * x));
                        __m128i c2 = _mm_loadu_si128((__m128i *)(source2 + 4 * x));
                        __m128i c3 = _mm_loadu_si128((__m128i *)(source3 + 4 * x));

                        c0 = _mm_avg_epu8(c0, c1);
                        c2 = _mm_avg_epu8(c2, c3);
                        c0 = _mm_avg_epu8(c0, c2);

                        _mm_storeu_si128((__m128i *)(dest + 4 * x), c0);
                    }
                }
#endif

                for(; x < width; x++)
                {
                    uint32_t c0 = *(uint32_t *)(source0 + 4 * x);
                    uint32_t c1 = *(uint32_t *)(source1 + 4 * x);
                    uint32_t c2 = *(uint32_t *)(source2 + 4 * x);
                    uint32_t c3 = *(uint32_t *)(source3 + 4 * x);

                    uint32_t c01 = averageByte4(c0, c1);
                    uint32_t c23 = averageByte4(c2, c3);
                    uint32_t c03 = averageByte4(c01, c23);

                    *(uint32_t *)(dest + 4 * x) = c03;
                }

                source0 += pitch;
                source1 += pitch;
                source2 += pitch;
                source3 += pitch;
                dest += pitch;

                ASSERT(source0 < src->end());
                ASSERT(source3 < src->end());
                ASSERT(dest < dst->end());
            }
        }
        else
            UNSUPPORTED("Samples: %d", samples);
    }
    else
    {
        return false;
    }

    dst->contentsChanged(dstSubresourceRange);

    return true;
}

void Blitter::copy(const vk::Image *src, uint8_t *dst, unsigned int dstPitch)
{
    VkExtent3D extent = src->getExtent();
    size_t rowBytes = src->getFormat(VK_IMAGE_ASPECT_COLOR_BIT).bytes() * extent.width;
    unsigned int srcPitch = src->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, 0);
    ASSERT(dstPitch >= rowBytes && srcPitch >= rowBytes && src->getMipLevelExtent(VK_IMAGE_ASPECT_COLOR_BIT, 0).height >= extent.height);

    const uint8_t *s = (uint8_t *)src->getTexelPointer({ 0, 0, 0 }, { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0 });
    uint8_t *d = dst;

    for(uint32_t y = 0; y < extent.height; y++)
    {
        memcpy(d, s, rowBytes);

        s += srcPitch;
        d += dstPitch;
    }
}

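// Writes the border corner texel at (x0, y0) as the average of its three
// adjacent texels (x0, y1), (x1, y0) and (x1, y1). A cube corner has no fourth
// neighbor, so the average of the three texels that meet there is used.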
void Blitter::computeCubeCorner(Pointer<Byte> &layer, Int &x0, Int &x1, Int &y0, Int &y1, Int &pitchB, const State &state)
{
    int bytes = state.sourceFormat.bytes();

    Float4 c = readFloat4(layer + ComputeOffset(x0, y1, pitchB, bytes), state) +
               readFloat4(layer + ComputeOffset(x1, y0, pitchB, bytes), state) +
               readFloat4(layer + ComputeOffset(x1, y1, pitchB, bytes), state);

    c *= Float4(1.0f / 3.0f);

    write(c, layer + ComputeOffset(x0, y0, pitchB, bytes), state);
}

Blitter::CornerUpdateRoutineType Blitter::generateCornerUpdate(const State &state)
{
    // Reading and writing from/to the same image
    ASSERT(state.sourceFormat == state.destFormat);
    ASSERT(state.srcSamples == state.destSamples);

    // Vulkan 1.2: "If samples is not VK_SAMPLE_COUNT_1_BIT, then imageType must be
    // VK_IMAGE_TYPE_2D, flags must not contain VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT"
    ASSERT(state.srcSamples == 1);

    CornerUpdateFunction function;
    {
        Pointer<Byte> blit(function.Arg<0>());

        Pointer<Byte> layers = *Pointer<Pointer<Byte>>(blit + OFFSET(CubeBorderData, layers));
        Int pitchB = *Pointer<Int>(blit + OFFSET(CubeBorderData, pitchB));
        UInt layerSize = *Pointer<Int>(blit + OFFSET(CubeBorderData, layerSize));
        UInt dim = *Pointer<Int>(blit + OFFSET(CubeBorderData, dim));

        // Low Border, Low Pixel, High Border, High Pixel
        Int LB(-1), LP(0), HB(dim), HP(dim - 1);

        for(int face = 0; face < 6; face++)
        {
            computeCubeCorner(layers, LB, LP, LB, LP, pitchB, state);
            computeCubeCorner(layers, LB, LP, HB, HP, pitchB, state);
            computeCubeCorner(layers, HB, HP, LB, LP, pitchB, state);
            computeCubeCorner(layers, HB, HP, HB, HP, pitchB, state);
            layers = layers + layerSize;
        }
    }

    return function("CornerUpdateRoutine");
}

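// Refreshes the one-texel border around the six faces of a cube-compatible
// image: edges shared between adjacent faces are copied across, then the four
// corners of every face are filled with averaged colors, so filtering that
// reads past a face edge sees the neighboring face's texels.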
void Blitter::updateBorders(const vk::Image *image, const VkImageSubresource &subresource)
{
    ASSERT(image->getArrayLayers() >= (subresource.arrayLayer + 6));

    // From Vulkan 1.1 spec, section 11.5. Image Views:
    // "For cube and cube array image views, the layers of the image view starting
    // at baseArrayLayer correspond to faces in the order +X, -X, +Y, -Y, +Z, -Z."
    VkImageSubresource posX = subresource;
    VkImageSubresource negX = posX;
    negX.arrayLayer++;
    VkImageSubresource posY = negX;
    posY.arrayLayer++;
    VkImageSubresource negY = posY;
    negY.arrayLayer++;
    VkImageSubresource posZ = negY;
    posZ.arrayLayer++;
    VkImageSubresource negZ = posZ;
    negZ.arrayLayer++;

    // Copy top / bottom
    copyCubeEdge(image, posX, BOTTOM, negY, RIGHT);
    copyCubeEdge(image, posY, BOTTOM, posZ, TOP);
    copyCubeEdge(image, posZ, BOTTOM, negY, TOP);
    copyCubeEdge(image, negX, BOTTOM, negY, LEFT);
    copyCubeEdge(image, negY, BOTTOM, negZ, BOTTOM);
    copyCubeEdge(image, negZ, BOTTOM, negY, BOTTOM);

    copyCubeEdge(image, posX, TOP, posY, RIGHT);
    copyCubeEdge(image, posY, TOP, negZ, TOP);
    copyCubeEdge(image, posZ, TOP, posY, BOTTOM);
    copyCubeEdge(image, negX, TOP, posY, LEFT);
    copyCubeEdge(image, negY, TOP, posZ, BOTTOM);
    copyCubeEdge(image, negZ, TOP, posY, TOP);

    // Copy left / right
    copyCubeEdge(image, posX, RIGHT, negZ, LEFT);
    copyCubeEdge(image, posY, RIGHT, posX, TOP);
    copyCubeEdge(image, posZ, RIGHT, posX, LEFT);
    copyCubeEdge(image, negX, RIGHT, posZ, LEFT);
    copyCubeEdge(image, negY, RIGHT, posX, BOTTOM);
    copyCubeEdge(image, negZ, RIGHT, negX, LEFT);

    copyCubeEdge(image, posX, LEFT, posZ, RIGHT);
    copyCubeEdge(image, posY, LEFT, negX, TOP);
    copyCubeEdge(image, posZ, LEFT, negX, RIGHT);
    copyCubeEdge(image, negX, LEFT, negZ, RIGHT);
    copyCubeEdge(image, negY, LEFT, negX, BOTTOM);
    copyCubeEdge(image, negZ, LEFT, posX, RIGHT);

    // Compute corner colors
    VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresource.aspectMask);
    vk::Format format = image->getFormat(aspect);
    VkSampleCountFlagBits samples = image->getSampleCountFlagBits();
    State state(format, format, samples, samples, Options{ 0xF });

    // Vulkan 1.2: "If samples is not VK_SAMPLE_COUNT_1_BIT, then imageType must be
    // VK_IMAGE_TYPE_2D, flags must not contain VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT"
    ASSERT(samples == VK_SAMPLE_COUNT_1_BIT);

    auto cornerUpdateRoutine = getCornerUpdateRoutine(state);
    if(!cornerUpdateRoutine)
    {
        return;
    }

    VkExtent3D extent = image->getMipLevelExtent(aspect, subresource.mipLevel);
    CubeBorderData data = {
        image->getTexelPointer({ 0, 0, 0 }, posX),
        assert_cast<uint32_t>(image->rowPitchBytes(aspect, subresource.mipLevel)),
        assert_cast<uint32_t>(image->getLayerSize(aspect)),
        extent.width
    };
    cornerUpdateRoutine(&data);
}

void Blitter::copyCubeEdge(const vk::Image *image,
                           const VkImageSubresource &dstSubresource, Edge dstEdge,
                           const VkImageSubresource &srcSubresource, Edge srcEdge)
{
    ASSERT(srcSubresource.aspectMask == dstSubresource.aspectMask);
    ASSERT(srcSubresource.mipLevel == dstSubresource.mipLevel);
    ASSERT(srcSubresource.arrayLayer != dstSubresource.arrayLayer);

    // Determine whether the edges should be copied in reverse order relative to
    // one another. The copy must be reversed whenever the same edges are
    // contiguous, or when copying top <-> right or bottom <-> left. This follows
    // from the cube layout, which is:
    //
    //        | +y |
    // | -x | +z | +x | -z |
    //        | -y |

    bool reverse = (srcEdge == dstEdge) ||
                   ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
                   ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
                   ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
                   ((srcEdge == LEFT) && (dstEdge == BOTTOM));

    VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(srcSubresource.aspectMask);
    int bytes = image->getFormat(aspect).bytes();
    int pitchB = image->rowPitchBytes(aspect, srcSubresource.mipLevel);

    VkExtent3D extent = image->getMipLevelExtent(aspect, srcSubresource.mipLevel);
    int w = extent.width;
    int h = extent.height;
    if(w != h)
    {
        UNSUPPORTED("Cube doesn't have square faces : (%d, %d)", w, h);
    }

    // Src is expressed in the regular [0, width-1], [0, height-1] space
    bool srcHorizontal = ((srcEdge == TOP) || (srcEdge == BOTTOM));
    int srcDelta = srcHorizontal ? bytes : pitchB;
    VkOffset3D srcOffset = { (srcEdge == RIGHT) ? (w - 1) : 0, (srcEdge == BOTTOM) ? (h - 1) : 0, 0 };

    // Dst contains borders, so it is expressed in the [-1, width], [-1, height] space
    bool dstHorizontal = ((dstEdge == TOP) || (dstEdge == BOTTOM));
    int dstDelta = (dstHorizontal ? bytes : pitchB) * (reverse ? -1 : 1);
    VkOffset3D dstOffset = { (dstEdge == RIGHT) ? w : -1, (dstEdge == BOTTOM) ? h : -1, 0 };

    // Don't write in the corners
    if(dstHorizontal)
    {
        dstOffset.x += reverse ? w : 1;
    }
    else
    {
        dstOffset.y += reverse ? h : 1;
    }

    const uint8_t *src = static_cast<const uint8_t *>(image->getTexelPointer(srcOffset, srcSubresource));
    uint8_t *dst = static_cast<uint8_t *>(image->getTexelPointer(dstOffset, dstSubresource));
    ASSERT((src < image->end()) && ((src + (w * srcDelta)) < image->end()));
    ASSERT((dst < image->end()) && ((dst + (w * dstDelta)) < image->end()));

    for(int i = 0; i < w; ++i, dst += dstDelta, src += srcDelta)
    {
        memcpy(dst, src, bytes);
    }
}

}  // namespace sw