#include <ATen/native/vulkan/api/Utils.h>
#include <ATen/native/vulkan/api/api.h>
#include <ATen/native/vulkan/impl/Common.h>
#include <ATen/native/vulkan/impl/Packing.h>

namespace at {
namespace native {
namespace vulkan {
namespace packing {

api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst) {
  if (v_dst.is_quantized()) {
    switch (v_dst.storage_type()) {
      case api::StorageType::TEXTURE_3D:
        switch (v_dst.dtype()) {
          case api::ScalarType::QUInt8:
            return VK_KERNEL(nchw_to_image_uint8);
          case api::ScalarType::QInt8:
            return VK_KERNEL(nchw_to_image_int8);
          case api::ScalarType::QInt32:
            return VK_KERNEL(nchw_to_image_int32);
          default:
            VK_THROW(
                "Vulkan quantization currently not supported for dtype ",
                v_dst.dtype());
        }
      case api::StorageType::TEXTURE_2D:
        switch (v_dst.dtype()) {
          case api::ScalarType::QUInt8:
            return VK_KERNEL(nchw_to_image2d_uint8);
          case api::ScalarType::QInt8:
            return VK_KERNEL(nchw_to_image2d_int8);
          case api::ScalarType::QInt32:
            return VK_KERNEL(nchw_to_image2d_int32);
          default:
            VK_THROW(
                "Vulkan quantization currently not supported for dtype ",
                v_dst.dtype());
        }
      default:
        VK_THROW("No kernel available!");
      case api::StorageType::BUFFER:
      case api::StorageType::UNKNOWN:
        VK_THROW("Requested storage type must be a texture type.");
    }
  }

  if (v_dst.dtype() == api::kFloat) {
    switch (v_dst.storage_type()) {
      case api::StorageType::TEXTURE_3D:
        return VK_KERNEL(nchw_to_image);
      case api::StorageType::TEXTURE_2D:
        return VK_KERNEL(nchw_to_image2d);
      default:
        VK_THROW("No kernel available!");
    }
  } else if (v_dst.dtype() == api::kBool) {
    switch (v_dst.storage_type()) {
      case api::StorageType::TEXTURE_3D:
        return VK_KERNEL(nchw_to_image_bool);
      default:
        VK_THROW("No kernel available!");
    }
  } else {
    VK_THROW("Unsupported dtype!");
  }
}

api::ShaderInfo get_image_to_nchw_shader(const vTensor& v_src) {
  if (v_src.is_quantized() || v_src.dtype() == api::kBool) {
    auto plane_size =
        dim_at<Dim4D::Height>(v_src) * dim_at<Dim4D::Width>(v_src);
    switch (v_src.storage_type()) {
      case api::StorageType::TEXTURE_3D:
        switch (v_src.dtype()) {
          case api::ScalarType::QUInt8:
          case api::ScalarType::QInt8:
          case api::kBool:
            return plane_size % 4 == 0
                ? VK_KERNEL(image_to_nchw_quantized_mul4)
                : VK_KERNEL(image_to_nchw_uint);
          case api::ScalarType::QInt32:
            return VK_KERNEL(image_to_nchw_int32);
          default:
            VK_THROW(
                "Vulkan quantization currently not supported for dtype ",
                v_src.dtype());
        }
      default:
        VK_THROW("No kernel available!");
      case api::StorageType::BUFFER:
      case api::StorageType::UNKNOWN:
        VK_THROW("Requested storage type must be a texture type.");
    }
  }

  if (v_src.dtype() == api::kFloat) {
    switch (v_src.storage_type()) {
      case api::StorageType::TEXTURE_3D:
        return VK_KERNEL(image_to_nchw);
      case api::StorageType::TEXTURE_2D:
        return VK_KERNEL(image2d_to_nchw);
      default:
        VK_THROW("No kernel available!");
    }
  } else {
    VK_THROW("Unsupported dtype!");
  }
}

struct ToFromTextureParams final {
  api::utils::ivec3 extents;
  int32_t planeSize;
  api::utils::ivec2 channelInfo;
};
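// Worked example (illustrative only, not used by this file): for a tensor with
// NCHW sizes {1, 3, 4, 6}, the record_* functions below fill
// ToFromTextureParams with planeSize = 4 * 6 = 24 and
// channelInfo = {div_up(3, 4), 3} = {1, 3}; assuming the default
// channels-packed layout, the image extents are {6, 4, 1}.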
void record_nchw_to_image_op(
    api::Context* const context,
    api::ShaderInfo& compute_shader,
    api::VulkanBuffer& src_buffer,
    vTensor& v_dst,
    api::PipelineBarrier pipeline_barrier,
    VkFence fence_handle) {
  api::utils::uvec3 global_size = v_dst.extents();
  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);

  int32_t height =
      api::utils::safe_downcast<int32_t>(dim_at<Dim4D::Height>(v_dst));
  int32_t width =
      api::utils::safe_downcast<int32_t>(dim_at<Dim4D::Width>(v_dst));
  int32_t channels =
      api::utils::safe_downcast<int32_t>(dim_at<Dim4D::Channel>(v_dst));

  int32_t plane_size = height * width;
  int32_t c_depth = api::utils::div_up(channels, 4);

  ToFromTextureParams block{
      api::utils::make_ivec3(v_dst.extents()),
      plane_size,
      {c_depth, channels},
  };

  api::UniformParamsBuffer params(context, block);
  context->submit_compute_job(
      // shader descriptor
      compute_shader,
      // pipeline barrier
      pipeline_barrier,
      // global work group size
      global_size,
      // local work group size
      local_size,
      // fence handle
      fence_handle,
      // shader arguments
      v_dst.image(
          pipeline_barrier,
          api::PipelineStage::COMPUTE,
          api::MemoryAccessType::WRITE),
      src_buffer,
      // params buffer
      params.buffer());
}

bool record_image_to_nchw_op(
    api::Context* const context,
    api::ShaderInfo& compute_shader,
    vTensor& v_src,
    api::VulkanBuffer& dst_buffer,
    api::PipelineBarrier pipeline_barrier,
    VkFence fence_handle) {
  api::utils::uvec3 global_size = v_src.extents();
  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);

  int32_t height =
      api::utils::safe_downcast<int32_t>(dim_at<Dim4D::Height>(v_src));
  int32_t width =
      api::utils::safe_downcast<int32_t>(dim_at<Dim4D::Width>(v_src));
  int32_t channels =
      api::utils::safe_downcast<int32_t>(dim_at<Dim4D::Channel>(v_src));

  int32_t plane_size = height * width;
  int32_t c_depth = api::utils::div_up(channels, 4);

  ToFromTextureParams block{
      api::utils::make_ivec3(v_src.extents()),
      plane_size,
      {c_depth, channels},
  };

  if (v_src.dtype() == api::ScalarType::QUInt8 ||
      v_src.dtype() == api::ScalarType::QInt8 || v_src.dtype() == api::kBool) {
    // Special case using the optimized shader, image_to_nchw_quantized_mul4.
    if (plane_size % 4 == 0) {
      global_size.data[0u] = plane_size / 4;
      global_size.data[1u] = 1;
      local_size.data[0u] *= local_size.data[1u];
      local_size.data[1u] = 1;
    }
    // Global and local sizes for the regular 1D buffer shader.
    else {
      uint32_t numel = v_src.numel();
      global_size = {api::utils::div_up(numel, uint32_t(4)), 1u, 1u};
      local_size = {64u, 1u, 1u};
    }
  }

  api::UniformParamsBuffer params(context, block);
  return context->submit_compute_job(
      // shader descriptor
      compute_shader,
      // pipeline barrier
      pipeline_barrier,
      // global work group size
      global_size,
      // local work group size
      local_size,
      // fence handle
      fence_handle,
      // shader arguments
      v_src.image(
          pipeline_barrier,
          api::PipelineStage::COMPUTE,
          api::MemoryAccessType::WRITE),
      dst_buffer,
      // params buffer
      params.buffer());
}
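// Illustrative only: a minimal caller-side sketch of how the shader lookup and
// record helpers above compose. `staging_buffer`, `v_dst`, `v_src`, and
// `fence_handle` are hypothetical; synchronization and error handling are
// omitted.
//
//   api::Context* const context = api::context();
//   api::PipelineBarrier pipeline_barrier{};
//
//   // Pack a host-ordered (NCHW) staging buffer into an image-backed tensor.
//   api::ShaderInfo pack_shader = get_nchw_to_image_shader(v_dst);
//   record_nchw_to_image_op(
//       context, pack_shader, staging_buffer, v_dst, pipeline_barrier,
//       VK_NULL_HANDLE);
//
//   // Read an image-backed tensor back into a host-ordered staging buffer.
//   api::ShaderInfo unpack_shader = get_image_to_nchw_shader(v_src);
//   record_image_to_nchw_op(
//       context, unpack_shader, v_src, staging_buffer, pipeline_barrier,
//       fence_handle);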
void record_nchw_to_buffer_op(
    api::Context* const context,
    api::VulkanBuffer& src_buffer,
    vTensor& v_dst,
    api::PipelineBarrier pipeline_barrier,
    VkFence fence_handle) {
  uint32_t gpu_buf_len = api::utils::safe_downcast<uint32_t>(v_dst.gpu_numel());

  api::utils::uvec3 global_size = {gpu_buf_len, 1u, 1u};
  api::utils::uvec3 local_size = {32u, 1u, 1u};

  api::UniformParamsBuffer cpu_buffer_metadata(
      context, v_dst.get_cpu_buffer_metadata());

  context->submit_compute_job(
      // shader descriptor
      VK_KERNEL(buffer_to_buffer),
      // pipeline barrier
      pipeline_barrier,
      // global work group size
      global_size,
      // local work group size
      local_size,
      // fence handle
      fence_handle,
      // shader arguments
      v_dst.buffer(
          pipeline_barrier,
          api::PipelineStage::COMPUTE,
          api::MemoryAccessType::WRITE),
      v_dst.buffer_metadata(),
      src_buffer,
      cpu_buffer_metadata.buffer());
}

bool record_buffer_to_nchw_op(
    api::Context* const context,
    vTensor& v_src,
    api::VulkanBuffer& dst_buffer,
    api::PipelineBarrier pipeline_barrier,
    VkFence fence_handle) {
  uint32_t buf_len = api::utils::safe_downcast<uint32_t>(v_src.numel());

  api::utils::uvec3 global_size = {buf_len, 1u, 1u};
  api::utils::uvec3 local_size = {4u, 1u, 1u};

  api::UniformParamsBuffer cpu_buffer_metadata(
      context, v_src.get_cpu_buffer_metadata());

  return context->submit_compute_job(
      // shader descriptor
      VK_KERNEL(buffer_to_buffer),
      // pipeline barrier
      pipeline_barrier,
      // global work group size
      global_size,
      // local work group size
      local_size,
      // fence handle
      fence_handle,
      // shader arguments
      dst_buffer,
      cpu_buffer_metadata.buffer(),
      v_src.buffer(
          pipeline_barrier,
          api::PipelineStage::COMPUTE,
          api::MemoryAccessType::WRITE),
      v_src.buffer_metadata());
}
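// Illustrative note: for BUFFER-storage tensors, the two helpers above form
// the analogous round trip, record_nchw_to_buffer_op for upload and
// record_buffer_to_nchw_op for readback; both dispatch the generic
// buffer_to_buffer kernel with one invocation per element.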
static vTensor channel_image_repacking(
    const vTensor& v_input,
    api::GPUMemoryLayout target_layout,
    const api::ShaderInfo& shader_descriptor) {
  api::Context* const context = api::context();

  vTensor v_output{
      context,
      v_input.sizes(),
      v_input.dtype(),
      v_input.storage_type(),
      target_layout,
  };

  // Required to determine how to insert memory barriers in the command buffer.
  api::PipelineBarrier pipeline_barrier{};

  // The shader assumes a 4d nchw to calculate the lookup coordinate.
  // If the input is not 4d, we need to pad it with 1's on the front.
  const struct Block final {
    api::utils::ivec4 sizes;
  } block{
      api::utils::make_ivec4_prepadded1(v_input.sizes()),
  };

  api::UniformParamsBuffer params(context, block);

  context->submit_compute_job(
      // shader descriptor
      // VK_KERNEL(packing_channel_to_height),
      shader_descriptor,
      // pipeline barrier
      pipeline_barrier,
      // global work group size
      v_output.extents(),
      // local work group size
      adaptive_work_group_size(v_output.extents()),
      // fence handle
      VK_NULL_HANDLE,
      // shader arguments
      v_output.image(
          pipeline_barrier,
          api::PipelineStage::COMPUTE,
          api::MemoryAccessType::WRITE),
      v_input.image(pipeline_barrier, api::PipelineStage::COMPUTE),
      // params buffer
      params.buffer());

  return v_output;
}

vTensor convert_image_channels_packed_to_height_packed(const vTensor& v_input) {
  return channel_image_repacking(
      v_input,
      api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED,
      VK_KERNEL(convert_channels_to_height_packed));
}

vTensor convert_image_channels_packed_to_width_packed(const vTensor& v_input) {
  return channel_image_repacking(
      v_input,
      api::GPUMemoryLayout::TENSOR_WIDTH_PACKED,
      VK_KERNEL(convert_channels_to_width_packed));
}

} // namespace packing
} // namespace vulkan
} // namespace native
} // namespace at
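// Illustrative only: the repacking entry points above are intended to be
// called from other ops in this backend. A caller-side sketch, with `v_in`
// hypothetical:
//
//   vTensor v_height_packed =
//       packing::convert_image_channels_packed_to_height_packed(v_in);
//   vTensor v_width_packed =
//       packing::convert_image_channels_packed_to_width_packed(v_in);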