#include <ATen/native/vulkan/ops/Common.h>
#include <ATen/native/vulkan/ops/Copy.h>
#include <ATen/native/vulkan/ops/Utils.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/cat.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/narrow.h>
#include <ATen/ops/zeros.h>
#endif

namespace at {
namespace native {
namespace vulkan {
namespace ops {
namespace utils {

using namespace api::utils;

/*
 * This function formats an input tensor in NCHW layout to NC4HW layout such
 * that the buffer of the formatted tensor can be directly copied into a GPU
 * texture. Conceptually, the formatting can be achieved via the following
 * steps:
 *
 * 1. Given that the src tensor has size {N,C,H,W}
 *
 * 2. Determine the amount of padding to add: the number of channels needed to
 *    align C to the next multiple of 4, denoted C_aligned
 *
 * 3. Add the padding channels to each batch; the shape of the tensor is now
 *    {N, C_aligned, H, W}
 *
 * 4. Combine the batch and channel dims and split the result into groups of 4
 *    channels by reshaping the tensor to size {N*C_aligned/4, 4, H, W}
 *
 * 5. The groups of 4 channels (dim 1) should be contiguous. Therefore, permute
 *    the dims of the tensor in the order {0, 2, 3, 1}
 *
 * 6. Finally, return a contiguous version of the tensor. The final shape of
 *    the tensor is {N*C_aligned/4, H, W, 4}
 */
Tensor nchw_to_nc4hw(const Tensor& src) {
  uint32_t N = get_dim<Dim4D::Batch>(src.sizes());
  uint32_t C = get_dim<Dim4D::Channel>(src.sizes());
  uint32_t H = get_dim<Dim4D::Height>(src.sizes());
  uint32_t W = get_dim<Dim4D::Width>(src.sizes());

  uint32_t C_aligned = api::utils::align_up(C, 4u);
  uint32_t NC4 = (N * C_aligned) / 4;

  // Add padding to the tensor so that the channel dim is a multiple of 4
  Tensor padding = at::zeros({N, C_aligned - C, H, W}, src.options());
  Tensor src_padded = at::cat({src.reshape({N, C, H, W}), padding}, 1);

  // Reshape to split the channels into groups of 4, then permute so that each
  // group of 4 ends up in the last dim, making the groups contiguous in memory
  Tensor src_NC4HW = src_padded.reshape({NC4, 4, H, W}).permute({0, 2, 3, 1});

  // Return a contiguous version of the tensor
  return src_NC4HW.contiguous();
}

/*
 * Creates a staging tensor into which texture data, which will be in NC4HW
 * format, can be copied directly. The shape of the staging tensor will be the
 * same as the tensor produced by a call to nchw_to_nc4hw().
 */
Tensor create_staging_tensor(const vTensor& v_in) {
  uint32_t N = get_dim<Dim4D::Batch>(v_in.sizes());
  uint32_t C = get_dim<Dim4D::Channel>(v_in.sizes());
  uint32_t H = get_dim<Dim4D::Height>(v_in.sizes());
  uint32_t W = get_dim<Dim4D::Width>(v_in.sizes());

  uint32_t NC4 = N * api::utils::div_up(C, 4u);

  // Note that the dtype corresponding with the texture format of the vTensor
  // is used instead of options().dtype(). This is to ensure the number of
  // bytes in the staging tensor matches the number of bytes in the image
  // texture. Refer to the comments for api::vk_format()
  return at::empty(
      {NC4, H, W, 4},
      at::device(at::kCPU).dtype(convert_dtype(v_in.texture_dtype())));
}

/*
 * After copying texture data, which will be in NC4HW format, to a staging
 * tensor created with create_staging_tensor(), this function reformats the
 * tensor to NCHW format. It essentially reverses the transformations made by
 * nchw_to_nc4hw().
 *
 * Note that the sizes of the original tensor must be passed in to fully
 * restore the properties of the original tensor.
 */
Tensor nc4hw_to_nchw(const Tensor& t_in, IntArrayRef sizes) {
  uint32_t N = get_dim<Dim4D::Batch>(sizes);
  uint32_t C = get_dim<Dim4D::Channel>(sizes);
  uint32_t H = get_dim<Dim4D::Height>(sizes);
  uint32_t W = get_dim<Dim4D::Width>(sizes);

  uint32_t C_aligned = api::utils::align_up(C, 4u);

  // Undo the permute step and channel grouping step
  Tensor t_in_padded =
      t_in.permute({0, 3, 1, 2}).reshape({N, C_aligned, H, W});
  // Remove the padding channels
  Tensor t_in_shaved =
      at::narrow(t_in_padded, /*dim=*/1, /*start=*/0, /*length=*/C);

  // Reshape to the original sizes and return a contiguous tensor
  return t_in_shaved.reshape(sizes).contiguous();
}
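/*
 * A minimal round-trip sketch of the helpers above (illustrative only; the
 * shapes are an arbitrary example). With N=2 and C=3, the channel dim of each
 * batch is padded to 4, so the packed tensor has shape {N*C_aligned/4, H, W, 4}
 * = {2, 8, 8, 4}:
 *
 *   at::Tensor cpu_src = at::rand({2, 3, 8, 8});       // NCHW, float
 *   at::Tensor packed = nchw_to_nc4hw(cpu_src);        // shape {2, 8, 8, 4}
 *   at::Tensor restored = nc4hw_to_nchw(packed, cpu_src.sizes());
 *   // restored is expected to match cpu_src element-wise
 */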
void copy_buffer_to_vtensor(
    api::VulkanBuffer& src_buffer,
    vTensor& v_dst,
    api::PipelineBarrier& pipeline_barrier) {
  api::Context* const context = api::context();

  TORCH_CHECK(
      src_buffer.mem_size() == v_dst.gpu_nbytes(),
      "Vulkan copy_buffer_to_vtensor: source buffer and destination texture "
      "do not have the same number of bytes");

  context->submit_copy<api::VulkanBuffer, api::VulkanImage>(
      // pipeline barrier
      pipeline_barrier,
      // resources
      src_buffer,
      v_dst.image(
          pipeline_barrier,
          api::PipelineStage::TRANSFER,
          api::MemoryAccessType::WRITE),
      // copy details
      v_dst.extents(),
      {0u, 0u, 0u},
      {0u, 0u, 0u},
      // fence handle
      VK_NULL_HANDLE);
}

void copy_buffer_to_buffer(
    api::Context* const context,
    api::StorageBuffer& src,
    api::StorageBuffer& dst,
    VkFence fence_handle) {
  api::PipelineBarrier pipeline_barrier{};

  context->submit_copy<api::VulkanBuffer, api::VulkanBuffer>(
      // pipeline barrier
      pipeline_barrier,
      // resources
      src.buffer(),
      dst.buffer(),
      // copy details
      {static_cast<uint32_t>(src.buffer().mem_size()), 0u, 0u},
      {0u, 0u, 0u},
      {0u, 0u, 0u},
      // fence handle
      fence_handle);
}

void copy_vtensor_to_buffer(
    vTensor& v_src,
    api::VulkanBuffer& dst_buffer,
    api::PipelineBarrier& pipeline_barrier,
    const VkFence fence_handle) {
  api::Context* const context = api::context();

  TORCH_CHECK(
      v_src.gpu_nbytes() == dst_buffer.mem_size(),
      "Vulkan copy_vtensor_to_buffer: source texture and destination buffer "
      "do not have the same number of bytes");

  context->submit_copy<api::VulkanImage, api::VulkanBuffer>(
      // pipeline barrier
      pipeline_barrier,
      // resources
      v_src.image(
          pipeline_barrier,
          api::PipelineStage::TRANSFER,
          api::MemoryAccessType::READ),
      dst_buffer,
      // copy details
      v_src.extents(),
      {0u, 0u, 0u},
      {0u, 0u, 0u},
      // fence handle
      fence_handle);
}

void pack_buffer_to_vtensor(
    api::VulkanBuffer& buffer,
    vTensor& v_self,
    api::PipelineBarrier& pipeline_barrier) {
  api::Context* const context = api::context();

  if (v_self.storage_type() == api::StorageType::BUFFER) {
    packing::record_nchw_to_buffer_op(
        context, buffer, v_self, pipeline_barrier, VK_NULL_HANDLE);
  } else {
    api::ShaderInfo compute_shader = packing::get_nchw_to_image_shader(v_self);
    packing::record_nchw_to_image_op(
        context,
        compute_shader,
        buffer,
        v_self,
        pipeline_barrier,
        VK_NULL_HANDLE);
  }
}

void pack_staging_to_vtensor(api::VulkanBuffer& staging, vTensor& v_self) {
  api::PipelineBarrier pipeline_barrier{};
  pack_buffer_to_vtensor(staging, v_self, pipeline_barrier);
}

bool pack_vtensor_to_staging(
    vTensor& v_self,
    api::VulkanBuffer& staging,
    const VkFence fence_handle) {
  api::Context* const context = api::context();
  api::PipelineBarrier pipeline_barrier{};

  if (v_self.storage_type() == api::StorageType::BUFFER) {
    return packing::record_buffer_to_nchw_op(
        context, v_self, staging, pipeline_barrier, fence_handle);
  } else {
    api::ShaderInfo compute_shader = packing::get_image_to_nchw_shader(v_self);
    return packing::record_image_to_nchw_op(
        context,
        compute_shader,
        v_self,
        staging,
        pipeline_barrier,
        fence_handle);
  }
}

/*
 * Broadcasting Utils
 */

// Check that two tensors are broadcastable; throws if they are not
void is_broadcastable(const Tensor& input1, const Tensor& input2) {
  TORCH_CHECK(
      input1.dim() <= 4 && input2.dim() <= 4,
      "Vulkan only supports tensors <= 4 dimensions");

  // Check if the shapes of the input tensors are broadcastable; see
  // https://pytorch.org/docs/stable/notes/broadcasting.html for broadcasting
  // semantics
  const std::string broadcast_error_msg = "Tensors are not broadcastable!";
  if (get_dim<Dim4D::Batch>(input1) != get_dim<Dim4D::Batch>(input2)) {
    TORCH_CHECK(
        get_dim<Dim4D::Batch>(input1) == 1 ||
            get_dim<Dim4D::Batch>(input2) == 1,
        broadcast_error_msg);
  }
  if (get_dim<Dim4D::Channel>(input1) != get_dim<Dim4D::Channel>(input2)) {
    TORCH_CHECK(
        get_dim<Dim4D::Channel>(input1) == 1 ||
            get_dim<Dim4D::Channel>(input2) == 1,
        broadcast_error_msg);
  }
  if (get_dim<Dim4D::Height>(input1) != get_dim<Dim4D::Height>(input2)) {
    TORCH_CHECK(
        get_dim<Dim4D::Height>(input1) == 1 ||
            get_dim<Dim4D::Height>(input2) == 1,
        broadcast_error_msg);
  }
  if (get_dim<Dim4D::Width>(input1) != get_dim<Dim4D::Width>(input2)) {
    TORCH_CHECK(
        get_dim<Dim4D::Width>(input1) == 1 ||
            get_dim<Dim4D::Width>(input2) == 1,
        broadcast_error_msg);
  }
}

// Compute the output shape obtained by broadcasting the shapes of t1 and t2
std::vector<int64_t> broadcast_size(const Tensor& t1, const Tensor& t2) {
  int64_t t1_size = t1.dim();
  int64_t t2_size = t2.dim();

  std::vector<int64_t> out;
  if (t1_size > t2_size) {
    for (int64_t i = 0; i < t1_size; i++) {
      out.push_back(t1.sizes()[i]);
    }
  } else {
    for (int64_t i = 0; i < t2_size; i++) {
      out.push_back(t2.sizes()[i]);
    }
  }

  if (!out.empty()) {
    out[out.size() - 1] =
        std::max(get_dim<Dim4D::Width>(t1), get_dim<Dim4D::Width>(t2));
  }
  if (out.size() > 1) {
    out[out.size() - 2] =
        std::max(get_dim<Dim4D::Height>(t1), get_dim<Dim4D::Height>(t2));
  }
  if (out.size() > 2) {
    out[out.size() - 3] =
        std::max(get_dim<Dim4D::Channel>(t1), get_dim<Dim4D::Channel>(t2));
  }
  if (out.size() > 3) {
    out[out.size() - 4] =
        std::max(get_dim<Dim4D::Batch>(t1), get_dim<Dim4D::Batch>(t2));
  }

  return out;
}
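/*
 * Worked example for the broadcasting helpers above (illustrative; the shapes
 * are arbitrary). Missing leading dims are treated as size 1, so {3, 1, 5}
 * and {4, 1} broadcast to {3, 4, 5}:
 *
 *   at::Tensor a = at::rand({3, 1, 5});
 *   at::Tensor b = at::rand({4, 1});
 *   is_broadcastable(a, b);                           // throws if incompatible
 *   std::vector<int64_t> out = broadcast_size(a, b);  // {3, 4, 5}
 */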
api::utils::vec4 extract_texel(const Tensor& input, const ivec3& pos) {
  api::Context* const context = api::context();

  TORCH_CHECK(input.is_vulkan());
  const vTensor& v_input = convert(input);

  api::PipelineBarrier pipeline_barrier{};

  std::vector<int64_t> output_size{1, 1, 1};

  // x, y, z and w are each written to a single-element tensor. We intend to
  // pull (0, 0, 0).x from each output tensor. This allows us to isolate the
  // effect of most packing mechanisms.
  api::ScalarType dtype = convert_dtype(input.scalar_type());
  vTensor v_outputs_x{context, output_size, dtype};
  vTensor v_outputs_y{context, output_size, dtype};
  vTensor v_outputs_z{context, output_size, dtype};
  vTensor v_outputs_w{context, output_size, dtype};

  const struct Block final {
    ivec3 pos;
  } block{
      pos,
  };

  api::UniformParamsBuffer params(context, block);

  context->submit_compute_job(
      VK_KERNEL(extract_texel),
      pipeline_barrier,
      {1, 1, 1},
      {1, 1, 1},
      VK_NULL_HANDLE,
      v_outputs_x.image(
          pipeline_barrier,
          api::PipelineStage::COMPUTE,
          api::MemoryAccessType::WRITE),
      v_outputs_y.image(
          pipeline_barrier,
          api::PipelineStage::COMPUTE,
          api::MemoryAccessType::WRITE),
      v_outputs_z.image(
          pipeline_barrier,
          api::PipelineStage::COMPUTE,
          api::MemoryAccessType::WRITE),
      v_outputs_w.image(
          pipeline_barrier,
          api::PipelineStage::COMPUTE,
          api::MemoryAccessType::WRITE),
      v_input.image(pipeline_barrier, api::PipelineStage::COMPUTE),
      params.buffer());

  vec4 rv = {
      convert(v_outputs_x).cpu().data_ptr<float>()[0],
      convert(v_outputs_y).cpu().data_ptr<float>()[0],
      convert(v_outputs_z).cpu().data_ptr<float>()[0],
      convert(v_outputs_w).cpu().data_ptr<float>()[0],
  };
  return rv;
}

} // namespace utils
} // namespace ops
} // namespace vulkan
} // namespace native
} // namespace at
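/*
 * extract_texel() above is a debugging aid: it dispatches a small shader that
 * reads the texel at `pos` from a Vulkan image and returns its four components
 * on the CPU. A minimal (assumed) call site, given a tensor `v` already on the
 * Vulkan backend:
 *
 *   api::utils::vec4 texel =
 *       at::native::vulkan::ops::utils::extract_texel(v, {0, 0, 0});
 */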