/*
 * Copyright 2022 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "src/gpu/graphite/vk/VulkanBuffer.h"

#include "include/gpu/vk/VulkanMemoryAllocator.h"
#include "src/gpu/graphite/vk/VulkanCommandBuffer.h"
#include "src/gpu/graphite/vk/VulkanGraphiteUtils.h"
#include "src/gpu/vk/VulkanMemory.h"

namespace skgpu::graphite {

sk_sp<Buffer> VulkanBuffer::Make(const VulkanSharedContext* sharedContext,
                                 size_t size,
                                 BufferType type,
                                 AccessPattern accessPattern) {
    if (size <= 0) {
        return nullptr;
    }
    VkBuffer buffer;
    skgpu::VulkanAlloc alloc;

    // TODO (b/374749633): We can't use protected buffers in the vertex shader. The checks below
    // make sure we don't use it for vertex or index buffers. But we currently don't have a way to
    // check here if it is a uniform or storage buffer that is used in the vertex shader. If we hit
    // that issue and need those GpuOnly buffers, we'll need to pass in some information to the
    // factory to say what stage the buffer is for. Maybe expand AccessPattern to be
    // GpuOnly_NotVertex or some better name like that.
    bool isProtected = sharedContext->isProtected() == Protected::kYes &&
                       accessPattern == AccessPattern::kGpuOnly &&
                       type != BufferType::kVertex &&
                       type != BufferType::kIndex;

    // Protected memory _never_ uses mappable buffers.
    // Otherwise, the only time we don't require mappable buffers is when we're on a device
    // where gpu only memory has faster reads on the gpu than memory that is also mappable
    // on the cpu.
    bool requiresMappable = !isProtected &&
                            (accessPattern == AccessPattern::kHostVisible ||
                             !sharedContext->vulkanCaps().gpuOnlyBuffersMorePerformant());

    using BufferUsage = skgpu::VulkanMemoryAllocator::BufferUsage;

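    // Choose a memory-usage hint for the allocator based on the buffer type and mappability.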
    BufferUsage allocUsage;
    if (type == BufferType::kXferCpuToGpu) {
        allocUsage = BufferUsage::kTransfersFromCpuToGpu;
    } else if (type == BufferType::kXferGpuToCpu) {
        allocUsage = BufferUsage::kTransfersFromGpuToCpu;
    } else {
        // GPU-only buffers are preferred unless mappability is required.
        allocUsage = requiresMappable ? BufferUsage::kCpuWritesGpuReads : BufferUsage::kGpuOnly;
    }

    // Create the buffer object
    VkBufferCreateInfo bufInfo;
    memset(&bufInfo, 0, sizeof(VkBufferCreateInfo));
    bufInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
    bufInfo.flags = isProtected ? VK_BUFFER_CREATE_PROTECTED_BIT : 0;
    bufInfo.size = size;

    // To support SkMesh buffer updates we make Vertex and Index buffers capable of being transfer
    // dsts. To support rtAdjust uniform buffer updates, we make host-visible uniform buffers also
    // capable of being transfer dsts.
    switch (type) {
        case BufferType::kVertex:
            bufInfo.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
            break;
        case BufferType::kIndex:
            bufInfo.usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
            break;
        case BufferType::kStorage:
            bufInfo.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
            break;
        case BufferType::kQuery:
            SK_ABORT("Query buffers not supported on Vulkan");
            break;
        case BufferType::kIndirect:
            bufInfo.usage =
                    VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
            break;
        case BufferType::kVertexStorage:
            bufInfo.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
            break;
        case BufferType::kIndexStorage:
            bufInfo.usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
            break;
        case BufferType::kUniform:
            bufInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
            break;
        case BufferType::kXferCpuToGpu:
            bufInfo.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
            break;
        case BufferType::kXferGpuToCpu:
            bufInfo.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT;
            break;
    }
    // We may not always get a mappable buffer for non-dynamic-access buffers, so we also set the
    // transfer dst usage bit in case we need to write data via a copy. It doesn't really hurt to
    // set this extra usage flag, but we could narrow the set of buffers it is applied to more
    // precisely than just "not dynamic".
    if (!requiresMappable || accessPattern == AccessPattern::kGpuOnly) {
        bufInfo.usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
    }

    bufInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
    bufInfo.queueFamilyIndexCount = 0;
    bufInfo.pQueueFamilyIndices = nullptr;

    VkResult result;
    VULKAN_CALL_RESULT(sharedContext,
                       result,
                       CreateBuffer(sharedContext->device(),
                                    &bufInfo,
                                    nullptr, /*const VkAllocationCallbacks*/
                                    &buffer));
    if (result != VK_SUCCESS) {
        return nullptr;
    }

    auto allocator = sharedContext->memoryAllocator();
    bool shouldPersistentlyMapCpuToGpu =
            sharedContext->vulkanCaps().shouldPersistentlyMapCpuToGpuBuffers();
    // Allocate the backing memory for the buffer.
    auto checkResult = [](VkResult result) {
        return result == VK_SUCCESS;
    };
    if (!skgpu::VulkanMemory::AllocBufferMemory(allocator,
                                                buffer,
                                                skgpu::Protected(isProtected),
                                                allocUsage,
                                                shouldPersistentlyMapCpuToGpu,
                                                checkResult,
                                                &alloc)) {
        VULKAN_CALL(sharedContext->interface(),
                    DestroyBuffer(sharedContext->device(),
                                  buffer,
                                  /*const VkAllocationCallbacks*=*/nullptr));
        return nullptr;
    }

    // Bind buffer
    VULKAN_CALL_RESULT(
            sharedContext,
            result,
            BindBufferMemory(sharedContext->device(), buffer, alloc.fMemory, alloc.fOffset));
    if (result != VK_SUCCESS) {
        skgpu::VulkanMemory::FreeBufferMemory(allocator, alloc);
        VULKAN_CALL(sharedContext->interface(),
                    DestroyBuffer(sharedContext->device(),
                                  buffer,
                                  /*const VkAllocationCallbacks*=*/nullptr));
        return nullptr;
    }

    return sk_sp<Buffer>(new VulkanBuffer(
            sharedContext, size, type, accessPattern, std::move(buffer), alloc, bufInfo.usage,
            Protected(isProtected)));
}
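
// A minimal usage sketch (an assumption for illustration, not a call site that exists in this
// file): a caller holding a valid VulkanSharedContext could create a host-visible uniform buffer
// roughly like
//     sk_sp<Buffer> uniforms = VulkanBuffer::Make(sharedContext,
//                                                 /*size=*/256,
//                                                 BufferType::kUniform,
//                                                 AccessPattern::kHostVisible);
// Make() returns nullptr on any failure: zero size, VkBuffer creation, memory allocation, or
// memory binding.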

VulkanBuffer::VulkanBuffer(const VulkanSharedContext* sharedContext,
                           size_t size,
                           BufferType type,
                           AccessPattern accessPattern,
                           VkBuffer buffer,
                           const skgpu::VulkanAlloc& alloc,
                           const VkBufferUsageFlags usageFlags,
                           Protected isProtected)
        : Buffer(sharedContext, size, isProtected)
        , fBuffer(std::move(buffer))
        , fAlloc(alloc)
        , fBufferUsageFlags(usageFlags)
        // We assume a buffer is used for CPU reads only in the case of GPU->CPU transfer buffers.
        , fBufferUsedForCPURead(type == BufferType::kXferGpuToCpu) {}

void VulkanBuffer::freeGpuData() {
    if (fMapPtr) {
        this->internalUnmap(0, this->size());
        fMapPtr = nullptr;
    }

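    // Destroy the VkBuffer handle first, then free the memory that was bound to it.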
    const VulkanSharedContext* sharedContext =
            static_cast<const VulkanSharedContext*>(this->sharedContext());
    SkASSERT(fBuffer);
    SkASSERT(fAlloc.fMemory && fAlloc.fBackendMemory);
    VULKAN_CALL(sharedContext->interface(),
                DestroyBuffer(sharedContext->device(), fBuffer, nullptr));
    fBuffer = VK_NULL_HANDLE;

    skgpu::VulkanMemory::FreeBufferMemory(sharedContext->memoryAllocator(), fAlloc);
    fAlloc.fMemory = VK_NULL_HANDLE;
    fAlloc.fBackendMemory = 0;
}

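// Maps the buffer into host-visible memory. For buffers that will be read on the CPU, the range
// [readOffset, readOffset + readSize) is invalidated so that device writes there are visible to
// the host.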
void VulkanBuffer::internalMap(size_t readOffset, size_t readSize) {
    SkASSERT(!fMapPtr);
    if (this->isMappable()) {
        SkASSERT(fAlloc.fSize > 0);
        SkASSERT(fAlloc.fSize >= readOffset + readSize);

        const VulkanSharedContext* sharedContext = this->vulkanSharedContext();

        auto allocator = sharedContext->memoryAllocator();
        auto checkResult = [sharedContext](VkResult result) {
            VULKAN_LOG_IF_NOT_SUCCESS(sharedContext, result, "skgpu::VulkanMemory::MapAlloc");
            return sharedContext->checkVkResult(result);
        };
        fMapPtr = skgpu::VulkanMemory::MapAlloc(allocator, fAlloc, checkResult);
        if (fMapPtr && readSize != 0) {
            auto checkResult_invalidate = [sharedContext, readOffset, readSize](VkResult result) {
                VULKAN_LOG_IF_NOT_SUCCESS(sharedContext,
                                          result,
                                          "skgpu::VulkanMemory::InvalidateMappedAlloc "
                                          "(readOffset:%zu, readSize:%zu)",
                                          readOffset,
                                          readSize);
                return sharedContext->checkVkResult(result);
            };
222 // "Invalidate" here means make device writes visible to the host. That is, it makes
223 // sure any GPU writes are finished in the range we might read from.
            skgpu::VulkanMemory::InvalidateMappedAlloc(allocator,
                                                       fAlloc,
                                                       readOffset,
                                                       readSize,
                                                       checkResult_invalidate);
        }
    }
}

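// Flushes any host writes in [flushOffset, flushOffset + flushSize) to the device and then unmaps
// the buffer.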
void VulkanBuffer::internalUnmap(size_t flushOffset, size_t flushSize) {
    SkASSERT(fMapPtr && this->isMappable());

    SkASSERT(fAlloc.fSize > 0);
    SkASSERT(fAlloc.fSize >= flushOffset + flushSize);

    const VulkanSharedContext* sharedContext = this->vulkanSharedContext();
    auto checkResult = [sharedContext, flushOffset, flushSize](VkResult result) {
        VULKAN_LOG_IF_NOT_SUCCESS(sharedContext,
                                  result,
                                  "skgpu::VulkanMemory::FlushMappedAlloc "
                                  "(flushOffset:%zu, flushSize:%zu)",
                                  flushOffset,
                                  flushSize);
        return sharedContext->checkVkResult(result);
    };

    auto allocator = sharedContext->memoryAllocator();
    skgpu::VulkanMemory::FlushMappedAlloc(allocator, fAlloc, flushOffset, flushSize, checkResult);
    skgpu::VulkanMemory::UnmapAlloc(allocator, fAlloc);
}

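// GPU->CPU read-back buffers invalidate their full contents on map so CPU reads see the GPU's
// writes; all other mappable buffers are mapped for writing only, so no read range is needed.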
void VulkanBuffer::onMap() {
    SkASSERT(fBuffer);
    SkASSERT(!this->isMapped());

    this->internalMap(0, fBufferUsedForCPURead ? this->size() : 0);
}

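// Read-back buffers have no host writes to flush on unmap; every other mappable buffer flushes
// its full size.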
void VulkanBuffer::onUnmap() {
    SkASSERT(fBuffer);
    SkASSERT(this->isMapped());
    this->internalUnmap(0, fBufferUsedForCPURead ? 0 : this->size());
}

namespace {

VkPipelineStageFlags access_to_pipeline_srcStageFlags(const VkAccessFlags srcAccess) {
    // For now this function assumes the access flags equal a specific bit and don't act like true
    // flags (i.e. set of bits). If we ever start having buffer usages that have multiple accesses
    // in one usage we'll need to update this.
    switch (srcAccess) {
        case 0:
            return VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
        case (VK_ACCESS_TRANSFER_WRITE_BIT): // fallthrough
        case (VK_ACCESS_TRANSFER_READ_BIT):
            return VK_PIPELINE_STAGE_TRANSFER_BIT;
        case (VK_ACCESS_UNIFORM_READ_BIT):
            // TODO(b/307577875): It is possible that uniforms were only used in the vertex shader
            // and not the fragment shader, so indiscriminately using the fragment shader pipeline
            // stage bit is overkill. This function should be updated to check for and allow
            // selecting VK_PIPELINE_STAGE_VERTEX_SHADER_BIT when appropriate.
            return (VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
        case (VK_ACCESS_SHADER_WRITE_BIT):
            return VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
        case (VK_ACCESS_INDEX_READ_BIT): // fallthrough
        case (VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT):
            return VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
        case (VK_ACCESS_INDIRECT_COMMAND_READ_BIT):
            return VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
        case (VK_ACCESS_HOST_READ_BIT): // fallthrough
        case (VK_ACCESS_HOST_WRITE_BIT):
            return VK_PIPELINE_STAGE_HOST_BIT;
        default:
            SkUNREACHABLE;
    }
}

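// Returns true if the given access flag only reads the buffer (or is the initial, never-used
// state); write accesses return false.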
bool access_is_read_only(VkAccessFlags access) {
    switch (access) {
        case 0: // initialization state
        case (VK_ACCESS_TRANSFER_READ_BIT):
        case (VK_ACCESS_UNIFORM_READ_BIT):
        case (VK_ACCESS_INDEX_READ_BIT):
        case (VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT):
        case (VK_ACCESS_INDIRECT_COMMAND_READ_BIT):
        case (VK_ACCESS_HOST_READ_BIT):
            return true;
        case (VK_ACCESS_TRANSFER_WRITE_BIT):
        case (VK_ACCESS_SHADER_WRITE_BIT):
        case (VK_ACCESS_HOST_WRITE_BIT):
            return false;
        default:
            SkUNREACHABLE;
    }
}

} // anonymous namespace

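// Records a pipeline barrier, if one is needed, transitioning this buffer from its last tracked
// access to dstAccess at dstStageMask, and then updates the tracked access state.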
void VulkanBuffer::setBufferAccess(VulkanCommandBuffer* cmdBuffer,
                                   VkAccessFlags dstAccess,
                                   VkPipelineStageFlags dstStageMask) const {
    SkASSERT(dstAccess == VK_ACCESS_HOST_READ_BIT ||
             dstAccess == VK_ACCESS_TRANSFER_WRITE_BIT ||
             dstAccess == VK_ACCESS_TRANSFER_READ_BIT ||
             dstAccess == VK_ACCESS_UNIFORM_READ_BIT ||
             dstAccess == VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT ||
             dstAccess == VK_ACCESS_INDEX_READ_BIT);

    VkPipelineStageFlags srcStageMask = access_to_pipeline_srcStageFlags(fCurrentAccess);
    SkASSERT(srcStageMask);

    bool needsBarrier = true;

    // We don't need a barrier if we're going from one read access to another read access.
    if (access_is_read_only(fCurrentAccess) && access_is_read_only(dstAccess)) {
        // Currently all reads should be the same type of access. If we ever allow and need
        // different read usages for a buffer, then we'll need to update the logic in this file to
        // store all the read accesses in a mask. Additionally, we'll need to keep track of what
        // the last write was, since we will need to add a barrier for each new read access even
        // if we already added a barrier for a previous read. For example, take the sequence
        // Write_1, Read_Access1, Read_Access2. We will first add a barrier going from Write_1 to
        // Read_Access1. But with the current logic, when we add Read_Access2 it will think it is
        // going from a read to a read, so no barrier would be added. However, we do need another
        // barrier from Write_1 to Read_Access2 so that the write's changes become visible to it.
        SkASSERT(fCurrentAccess == dstAccess || fCurrentAccess == 0);
        needsBarrier = false;
    }

    // When the buffer was last used on the host, we don't need to add any barrier, as writes on
    // the CPU host are implicitly synchronized when new commands are submitted.
    if (srcStageMask == VK_PIPELINE_STAGE_HOST_BIT) {
        needsBarrier = false;
    }

    if (needsBarrier) {
        VkBufferMemoryBarrier bufferMemoryBarrier = {
                VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,  // sType
                nullptr,                                  // pNext
                fCurrentAccess,                           // srcAccessMask
                dstAccess,                                // dstAccessMask
                VK_QUEUE_FAMILY_IGNORED,                  // srcQueueFamilyIndex
                VK_QUEUE_FAMILY_IGNORED,                  // dstQueueFamilyIndex
                fBuffer,                                  // buffer
                0,                                        // offset
                this->size(),                             // size
        };
        cmdBuffer->addBufferMemoryBarrier(srcStageMask, dstStageMask, &bufferMemoryBarrier);
    }

    fCurrentAccess = dstAccess;
}

} // namespace skgpu::graphite