/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include
#include
#include
#include
#include

using namespace vkcompute;

//
// Operator Recording Functions
//

void record_nchw_to_buffer_op(
    api::Context* const context,
    vkapi::VulkanBuffer& src_buffer,
    api::vTensor& v_dst) {
  vkapi::PipelineBarrier pipeline_barrier{};
  context->submit_compute_job(
      get_nchw_to_tensor_shader(v_dst),
      pipeline_barrier,
      {uint32_t(v_dst.numel()), 1, 1},
      {64, 1, 1},
      {},
      VK_NULL_HANDLE,
      0,
      v_dst.buffer(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      src_buffer,
      v_dst.sizes_ubo(),
      v_dst.strides_ubo(),
      v_dst.numel_ubo());
}

void record_buffer_to_nchw_op(
    api::Context* const context,
    api::vTensor& v_src,
    vkapi::VulkanBuffer& dst_buffer) {
  vkapi::PipelineBarrier pipeline_barrier{};
  context->submit_compute_job(
      get_tensor_to_nchw_shader(v_src),
      pipeline_barrier,
      {uint32_t(v_src.numel()), 1, 1},
      {64, 1, 1},
      {},
      VK_NULL_HANDLE,
      0,
      dst_buffer,
      v_src.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      v_src.sizes_ubo(),
      v_src.strides_ubo(),
      v_src.numel_ubo());
}

void record_nchw_to_image_op(
    api::Context* const context,
    vkapi::VulkanBuffer& src_buffer,
    api::vTensor& v_dst) {
  vkapi::PipelineBarrier pipeline_barrier{};
  vkapi::SpecVarList specialization_constants = {v_dst.hashed_layout()};

  context->submit_compute_job(
      get_nchw_to_tensor_shader(
          v_dst, context->adapter_ptr()->has_full_int8_buffers_support()),
      pipeline_barrier,
      v_dst.logical_limits(),
      adaptive_work_group_size(v_dst.logical_limits()),
      specialization_constants,
      VK_NULL_HANDLE,
      0,
      v_dst.image(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      src_buffer,
      v_dst.sizes_ubo());
}

void record_image_to_nchw_op(
    api::Context* const context,
    api::vTensor& v_src,
    vkapi::VulkanBuffer& dst_buffer) {
  vkapi::PipelineBarrier pipeline_barrier{};
  vkapi::SpecVarList specialization_constants = {v_src.hashed_layout()};

  context->submit_compute_job(
      get_tensor_to_nchw_shader(v_src),
      pipeline_barrier,
      v_src.logical_limits(),
      adaptive_work_group_size(v_src.logical_limits()),
      specialization_constants,
      VK_NULL_HANDLE,
      0,
      dst_buffer,
      v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      v_src.sizes_ubo(),
      v_src.numel_ubo());
}

void record_bitw8_image_to_nchw_nobitw8buffer_op(
    api::Context* const context,
    api::vTensor& v_src,
    api::StagingBuffer& dst_buffer) {
  vkapi::PipelineBarrier pipeline_barrier{};
  // Four 8-bit values are packed into each 32-bit word of the staging buffer,
  // hence numel() / 4 work items.
  uint32_t buffer_len = utils::safe_downcast<uint32_t>(dst_buffer.numel() / 4);
  utils::uvec3 global_wg_size = {buffer_len, 1, 1};

  std::string kernel_name = "bitw8_image_to_nchw_nobitw8buffer";
  add_storage_type_suffix(kernel_name, v_src);
  add_dtype_suffix(kernel_name, v_src);

  context->submit_compute_job(
      VK_KERNEL_FROM_STR(kernel_name),
      pipeline_barrier,
      global_wg_size,
      adaptive_work_group_size(global_wg_size),
      {v_src.hashed_layout()},
      VK_NULL_HANDLE,
      0,
      dst_buffer.buffer(),
      v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      v_src.sizes_ubo(),
      v_src.numel_ubo());
}

void record_conv2d_prepack_weights_op(
    api::Context* const context,
    vkapi::VulkanBuffer& src_buffer,
    api::vTensor& v_dst,
    const std::vector<int64_t>& original_sizes,
    const bool transposed) {
  vkapi::PipelineBarrier pipeline_barrier{};

  std::string kernel_name;
  if (transposed) {
    kernel_name = "conv_transpose2d";
  } else {
    kernel_name = "conv2d";
  }
  kernel_name += "_prepack_weights";
  add_dtype_suffix(kernel_name, v_dst);
  vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);

  api::ParamsBuffer original_sizes_ubo(
      context, utils::make_ivec4(original_sizes, /*reverse = */ true));

  vkapi::SpecVarList specialization_constants = {};
  context->submit_compute_job(
      shader,
      pipeline_barrier,
      v_dst.logical_limits(),
      adaptive_work_group_size(v_dst.logical_limits()),
      specialization_constants,
      VK_NULL_HANDLE,
      0,
      v_dst.image(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      src_buffer,
      v_dst.sizes_ubo(),
      original_sizes_ubo.buffer());
}

void record_binary_op(
    api::Context* const context,
    const std::string& op_name,
    api::vTensor& v_in1,
    api::vTensor& v_in2,
    api::vTensor& v_dst) {
  std::string kernel_name = "binary_" + op_name + "_nobroadcast__test";
  add_dtype_suffix(kernel_name, v_dst);

  vkapi::PipelineBarrier pipeline_barrier{};
  vkapi::SpecVarList specialization_constants = {};
  context->submit_compute_job(
      VK_KERNEL_FROM_STR(kernel_name),
      pipeline_barrier,
      v_dst.logical_limits(),
      adaptive_work_group_size(v_dst.logical_limits()),
      specialization_constants,
      VK_NULL_HANDLE,
      0,
      v_dst.image(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      v_in1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      v_in2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      v_dst.sizes_ubo());
}

void execute_and_check_add(
    api::vTensor& a,
    api::vTensor& b,
    api::vTensor& c,
    float a_val,
    float b_val) {
  // Fill input tensors
  fill_vtensor(a, a_val);
  fill_vtensor(b, b_val);

  // a + b = c
  record_binary_op(api::context(), "add", a, b, c);

  // Extract output tensor
  std::vector<float> data_out = extract_vtensor(c);

  // Check output
  for (size_t i = 0; i < data_out.size(); ++i) {
    CHECK_VALUE(data_out, i, (a_val + b_val));
  }
}

void record_index_fill_buffer(api::Context* context, api::vTensor& v_ten) {
  std::string kernel_name("idx_fill_buffer");
  switch (v_ten.dtype()) {
    case vkapi::kFloat:
      kernel_name += "_float";
      break;
    case vkapi::kHalf:
      kernel_name += "_half";
      break;
    case vkapi::kQInt8:
      kernel_name += "_int8";
      break;
    case vkapi::kQUInt8:
      kernel_name += "_uint8";
      break;
    default:
      throw std::runtime_error("Unsupported dtype");
      break;
  }

  api::ParamsBuffer params(api::context(), int32_t(v_ten.numel()));

  {
    vkapi::PipelineBarrier pipeline_barrier{};
    vkapi::SpecVarList specialization_constants = {};
    api::context()->submit_compute_job(
        VK_KERNEL_FROM_STR(kernel_name),
        pipeline_barrier,
        {uint32_t(v_ten.numel()), 1, 1},
        {64, 1, 1},
        specialization_constants,
        VK_NULL_HANDLE,
        0,
        v_ten.buffer(
            pipeline_barrier,
            vkapi::PipelineStage::COMPUTE,
            vkapi::MemoryAccessType::READ),
        params.buffer());
  }
}

void record_scalar_add_buffer(
    api::Context* context,
    api::vTensor& v_ten,
    float offset) {
  vkapi::PipelineBarrier pipeline_barrier{};
  vkapi::SpecVarList specialization_constants = {SV(offset)};
  std::string kernel = "scalar_add_buffer";
  add_dtype_suffix(kernel, v_ten);

  api::context()->submit_compute_job(
      VK_KERNEL_FROM_STR(kernel),
      pipeline_barrier,
      {uint32_t(v_ten.numel()), 1, 1},
      {64, 1, 1},
      specialization_constants,
      VK_NULL_HANDLE,
      0,
      v_ten.buffer(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::READ | vkapi::MemoryAccessType::WRITE),
      v_ten.numel_ubo());
}

void record_reference_matmul(
    api::Context* context,
    api::vTensor& out,
    api::vTensor& mat1,
    api::vTensor& mat2) {
  vkapi::PipelineBarrier pipeline_barrier{};
  api::context()->submit_compute_job(
      VK_KERNEL(reference_matmul),
      pipeline_barrier,
      {uint32_t(out.size(1)), uint32_t(out.size(0)), 1},
      {64, 1, 1},
      {},
      VK_NULL_HANDLE,
      0,
      out.buffer(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      mat1.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      mat2.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      out.sizes_ubo(),
      out.strides_ubo(),
      mat1.sizes_ubo(),
      mat1.strides_ubo(),
      mat2.sizes_ubo(),
      mat2.strides_ubo());
}

void record_matmul_texture3d(
    api::Context* context,
    api::vTensor& out,
    api::vTensor& mat1,
    api::vTensor& mat2) {
  std::string kernel_name = "matmul_naive";
  kernel_name.reserve(kShaderNameReserve);
  add_storage_type_suffix(kernel_name, out.storage_type());
  add_dtype_suffix(kernel_name, out.dtype());

  utils::uvec3 global_wg_size = out.logical_limits();

  vkapi::PipelineBarrier pipeline_barrier{};
  api::context()->submit_compute_job(
      VK_KERNEL_FROM_STR(kernel_name),
      pipeline_barrier,
      global_wg_size,
      {8, 8, 1},
      {out.hashed_layout(), mat1.hashed_layout(), mat2.hashed_layout()},
      VK_NULL_HANDLE,
      0,
      out.image(
          pipeline_barrier,
          vkapi::PipelineStage::COMPUTE,
          vkapi::MemoryAccessType::WRITE),
      mat1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      mat2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
      out.sizes_ubo(),
      out.logical_limits_ubo(),
      mat1.sizes_ubo(),
      mat2.sizes_ubo());
}

//
// Input & Output Utilities
//

#define FORALL_SUPPORTED_TYPES(_) \
  _(uint8_t, Byte)                \
  _(int8_t, Char)                 \
  _(int32_t, Int)                 \
  _(executorch::aten::Half, Half) \
  _(float, Float)                 \
  _(int8_t, QInt8)

// Converts float source data to the tensor's dtype, copies it into a staging
// buffer, and records the NCHW-to-tensor transfer op.
void fill_vtensor(api::vTensor& vten, std::vector<float>& data) {
  api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size());

#define CASE(ctype, name)                                     \
  case vkapi::ScalarType::name: {                             \
    std::vector<ctype> data_converted;                        \
    data_converted.resize(data.size());                       \
    for (int i = 0; i < data.size(); ++i) {                   \
      data_converted[i] = ctype(data[i]);                     \
    }                                                         \
    staging_buffer.copy_from(                                 \
        data_converted.data(), vten.staging_buffer_nbytes()); \
  } break;

  switch (vten.dtype()) {
    FORALL_SUPPORTED_TYPES(CASE)
    default:
      VK_THROW("Unsupported dtype");
  }

#undef CASE

  if (vten.storage_type() == utils::StorageType::BUFFER) {
    record_nchw_to_buffer_op(api::context(), staging_buffer.buffer(), vten);
  } else {
    record_nchw_to_image_op(api::context(), staging_buffer.buffer(), vten);
  }
}

void fill_vtensor(api::vTensor& vten, float val, bool iota) {
  std::vector<float> vten_data(vten.staging_buffer_numel());
  if (iota) {
    std::iota(vten_data.begin(), vten_data.end(), val);
  } else {
    std::fill(vten_data.begin(), vten_data.end(), val);
  }

  fill_vtensor(vten, vten_data);
}

std::vector<float> create_random_float_buffer(
    const size_t numel,
    const float min,
    const float max) {
  std::vector<float> data(numel);
  std::default_random_engine rng;
  std::uniform_real_distribution<float> dist(min, max);
  for (size_t i = 0; i < data.size(); ++i) {
    data[i] = dist(rng);
  }
  return data;
}

std::vector<uint8_t> create_random_uint8_buffer(
    const size_t numel,
    const uint8_t min,
    const uint8_t max) {
  std::vector<uint8_t> data(numel);
  std::default_random_engine rng;
  std::uniform_real_distribution<float> dist(min, max);
  for (size_t i = 0; i < data.size(); ++i) {
    data[i] = (uint8_t)dist(rng);
  }
  return data;
}

void fill_vtensor(
    ComputeGraph& graph,
    const IOValueRef idx,
    float val,
    bool iota) {
  vTensorPtr t = graph.get_tensor(idx.value);
  std::vector<float> data(t->numel());
  if (t->storage_type() != utils::kBuffer) {
    data.resize(t->staging_buffer_numel());
  }
  if (iota) {
    std::iota(data.begin(), data.end(), val);
  } else {
    std::fill(data.begin(), data.end(), val);
  }

  graph.copy_into_staging(idx.staging, data.data(), data.size());
}

// Reads a vTensor back into a float vector via a staging buffer; blocks until
// the GPU has finished executing the recorded transfer.
void extract_vtensor(api::vTensor& vten, std::vector<float>& data) {
  api::StagingBuffer staging_buffer(
      api::context(),
      vten.dtype(),
      vten.staging_buffer_numel());

  if (vten.storage_type() == utils::StorageType::BUFFER) {
    record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer());
  } else {
    record_image_to_nchw_op(api::context(), vten, staging_buffer.buffer());
  }

  vkapi::VulkanFence fence = api::context()->fences().get_fence();
  api::context()->submit_cmd_to_gpu(fence.get_submit_handle());
  fence.wait();

#define CASE(ctype, name)                                     \
  case vkapi::ScalarType::name: {                             \
    std::vector<ctype> data_converted(data.size());           \
    staging_buffer.copy_to(                                   \
        data_converted.data(), vten.staging_buffer_nbytes()); \
    for (int i = 0; i < data.size(); ++i) {                   \
      data[i] = float(data_converted[i]);                     \
    }                                                         \
  } break;

  switch (vten.dtype()) {
    FORALL_SUPPORTED_TYPES(CASE)
    default:
      VK_THROW("Unsupported dtype");
  }

#undef CASE
}

//
// Context Management
//

// Submits the pending command buffer and blocks until the GPU signals the
// fence.
void submit_to_gpu() {
  vkapi::VulkanFence fence = api::context()->fences().get_fence();
  api::context()->submit_cmd_to_gpu(fence.get_submit_handle());
  fence.wait();
}

vkapi::Allocation allocate_memory_for(const api::vTensor& vten) {
  VmaAllocationCreateInfo alloc_create_info =
      api::context()->adapter_ptr()->vma().gpuonly_resource_create_info();
  return api::context()->adapter_ptr()->vma().create_allocation(
      vten.get_memory_requirements(), alloc_create_info);
}

VmaTotalStatistics get_vma_stats() {
  return api::context()->adapter_ptr()->vma().get_memory_statistics();
}

size_t get_vma_allocation_count() {
  return get_vma_stats().total.statistics.allocationCount;
}

//
// Graph Test Utilities
//

void execute_graph_and_check_output(
    ComputeGraph& graph,
    std::vector<float> input_vals,
    std::vector<float> expected_outputs) {
  assert(input_vals.size() == graph.inputs().size());
  assert(expected_outputs.size() == graph.outputs().size());

  for (size_t i = 0; i < graph.inputs().size(); ++i) {
    fill_vtensor(graph, graph.inputs().at(i), input_vals.at(i));
  }

  graph.execute();

  for (size_t i = 0; i < graph.outputs().size(); ++i) {
    IOValueRef out_ioval = graph.outputs().at(i);
    vTensorPtr t_out = graph.get_tensor(out_ioval.value);

    std::vector<float> output_data(t_out->staging_buffer_numel());
    graph.copy_from_staging(
        out_ioval.staging, output_data.data(), output_data.size());

    for (size_t j = 0; j < t_out->numel(); ++j) {
      CHECK_VALUE(output_data, j, expected_outputs.at(i));
    }
  }
}

// Returns true if a and b are within atol + rtol * max(|a|, |b|) of each
// other.
bool check_close(float a, float b, float atol, float rtol) {
  float max = std::max(std::abs(a), std::abs(b));
  float diff = std::abs(a - b);
  return diff <= (atol + rtol * max);
}
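
// Usage sketch (illustrative only, kept as a comment so it is not compiled):
// a typical test combines the helpers above roughly as follows. The vTensor
// constructor arguments and the EXPECT_TRUE macro below are assumptions for
// illustration, not definitions from this file.
//
//   api::vTensor a(api::context(), {2, 3}, vkapi::kFloat, utils::kBuffer);
//   fill_vtensor(a, 1.5f);                         // stage + upload NCHW data
//   record_scalar_add_buffer(api::context(), a, 2.0f);
//   std::vector<float> out(a.staging_buffer_numel());
//   extract_vtensor(a, out);                       // submits and waits on a fence
//   for (float v : out) {
//     EXPECT_TRUE(check_close(v, 3.5f, /*atol=*/1e-5f, /*rtol=*/1e-5f));
//   }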