/*
 * Copyright 2024 Valve Corporation
 * SPDX-License-Identifier: MIT
 */
#include "compiler/libcl/libcl_vk.h"
#include "agx_pack.h"
#include "geometry.h"
#include "libagx_dgc.h"

/*
 * To implement drawIndirectCount generically, we dispatch a kernel to
 * clone-and-patch the indirect buffer, predicating out draws as appropriate.
 * The input stride is in 32-bit words; the output is tightly packed (5 words
 * per indexed draw, 4 otherwise).
 */
KERNEL(32)
libagx_predicate_indirect(global uint32_t *out, constant uint32_t *in,
                          constant uint32_t *draw_count, uint32_t stride_el,
                          uint indexed__2)
{
   uint draw = get_global_id(0);
   uint words = indexed__2 ? 5 : 4;
   bool enabled = draw < *draw_count;
   out += draw * words;
   in += draw * stride_el;

   /* Copy enabled draws, zero predicated draws. */
   for (uint i = 0; i < words; ++i) {
      out[i] = enabled ? in[i] : 0;
   }
}

/*
 * Indexing/offsetting is handled in software when necessary, so we strip all
 * indexing/offset information from the draw.
 */
KERNEL(1)
libagx_draw_without_adj(global VkDrawIndirectCommand *out,
                        global VkDrawIndirectCommand *in,
                        global struct agx_ia_state *ia, uint64_t index_buffer,
                        uint64_t index_buffer_range_el, int index_size_B,
                        enum mesa_prim prim)
{
   *out = (VkDrawIndirectCommand){
      .vertexCount = libagx_remap_adj_count(in->vertexCount, prim),
      .instanceCount = in->instanceCount,
   };

   /* TODO: Deduplicate */
   if (index_size_B) {
      /* For indexed draws, in aliases VkDrawIndexedIndirectCommand, whose
       * firstIndex field occupies the same word as firstVertex here.
       */
      uint offs = in->firstVertex;

      ia->index_buffer = libagx_index_buffer(
         index_buffer, index_buffer_range_el, offs, index_size_B, 0);

      ia->index_buffer_range_el =
         libagx_index_buffer_range_el(index_buffer_range_el, offs);
   }
}

/* Precondition: len must be at most the group size */
static void
libagx_memcpy_small(global uchar *dst, constant uchar *src, uint len, uint tid)
{
   if (tid < len) {
      dst[tid] = src[tid];
   }
}

/* len is in uint4 (16-byte) units; src and dst must be word-aligned */
static void
libagx_memcpy_aligned_uint4(global uint *dst, constant uint *src, uint len,
                            uint tid, uint group_size)
{
   for (uint i = tid; i < len; i += group_size) {
      vstore4(vload4(i, src), i, dst);
   }
}

static void
libagx_memcpy_to_aligned(global uint *dst, constant uchar *src, uint len,
                         uint tid, uint group_size)
{
   /* dst is word-aligned, so if src is not, the two can never be word-aligned
    * simultaneously. Fall back to a parallel bytewise copy in that case.
    */
   if (((uintptr_t)src) & 3) {
      for (uint i = tid; i < len; i += group_size) {
         ((global uchar *)dst)[i] = src[i];
      }
      return;
   }

   /* Copy the few unaligned bytes at the end so the bulk is 16-aligned */
   uint end_unaligned = len & 0xf;
   len -= end_unaligned;
   libagx_memcpy_small(((global uchar *)dst) + len, src + len, end_unaligned,
                       tid);

   /* Now both src and dst are word-aligned, and len is a multiple of 16 */
   libagx_memcpy_aligned_uint4(dst, (constant uint *)src, len / 16, tid,
                               group_size);
}

static void
libagx_memset_small(global uchar *dst, uchar b, uint len, uint tid,
                    uint group_size)
{
   for (uint i = tid; i < len; i += group_size) {
      dst[i] = b;
   }
}
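/*
 * Worked example for the helpers above, for illustration only: copying
 * len = 39 bytes from a word-aligned src with a 32-wide group splits as
 *
 *    end_unaligned = 39 & 0xf = 7   -> lanes 0..6 store one tail byte each
 *    len           = 39 - 7  = 32   -> 32 / 16 = 2 uint4 copies by lanes 0..1
 *
 * A src that is not word-aligned instead takes the bytewise fallback for all
 * 39 bytes, since dst is word-aligned and the two alignments cannot be
 * reconciled.
 */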
/*
 * AGX does not implement robustBufferAccess2 semantics for index buffers,
 * where out-of-bounds indices read as zero. When we dynamically detect an
 * index buffer overread (the if-statement below), we need to clone the index
 * buffer and zero-extend it to get robustness.
 *
 * We do this dynamically (generating a VDM draw to consume the result) to
 * avoid expensive allocations & memcpys in the happy path where no
 * out-of-bounds access occurs. Otherwise we could use a hardware indirect
 * draw, rather than generating VDM words directly in the shader.
 *
 * TODO: Handle multiple draws in parallel.
 */
KERNEL(32)
libagx_draw_robust_index(global uint32_t *vdm,
                         global struct agx_geometry_state *heap,
                         constant VkDrawIndexedIndirectCommand *cmd,
                         uint64_t in_buf_ptr, uint32_t in_buf_range_B,
                         ushort restart, enum agx_primitive topology,
                         enum agx_index_size index_size__3)
{
   uint tid = get_sub_group_local_id();
   bool first = tid == 0;
   enum agx_index_size index_size = index_size__3;

   struct agx_draw draw = agx_draw_indexed(
      cmd->indexCount, cmd->instanceCount, cmd->firstIndex, cmd->vertexOffset,
      cmd->firstInstance, in_buf_ptr, in_buf_range_B, index_size, restart);

   if (agx_direct_draw_overreads_indices(draw)) {
      constant void *in_buf = (constant void *)agx_draw_index_buffer(draw);
      uint in_size_el = agx_draw_index_range_el(draw);
      uint in_size_B = agx_indices_to_B(in_size_el, index_size);

      /* After a small number of zeroes at the end, extra zeroes cannot change
       * rendering since they will duplicate the same degenerate primitive many
       * times. Therefore we clamp the number of zeroes we need to extend with.
       * This makes the memset constant-time.
       */
      draw.b.count[0] = min(draw.b.count[0], in_size_el + 32);

      uint out_size_el = draw.b.count[0];
      uint out_size_B = agx_indices_to_B(out_size_el, index_size);

      /* Allocate memory for the shadow index buffer */
      global uchar *padded;
      if (first) {
         padded = agx_heap_alloc_nonatomic(heap, out_size_B);
      }
      padded = (global uchar *)sub_group_broadcast((uintptr_t)padded, 0);

      draw.index_buffer = (uintptr_t)padded;
      draw.index_buffer_range_B = out_size_B;
      draw.start = 0;

      /* Clone the index buffer. The destination is aligned as a post-condition
       * of agx_heap_alloc_nonatomic.
       */
      libagx_memcpy_to_aligned((global uint *)padded, in_buf, in_size_B, tid,
                               32);

      /* Zero-extend with up to 32 extra indices (at most 128 bytes) */
      libagx_memset_small(padded + in_size_B, 0, out_size_B - in_size_B, tid,
                          32);
   }

   if (first) {
      agx_vdm_draw(vdm, 0, draw, topology);
   }
}
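/*
 * Worked example of the clamping above, for illustration only: consider a
 * draw with indexCount = 1000 whose bound range holds just 100 u16 indices
 * (in_size_el = 100, in_size_B = 200). The draw overreads, so the count is
 * clamped to min(1000, 100 + 32) = 132 indices and out_size_B = 264. We
 * allocate 264 bytes from the heap, copy the 200 valid bytes, and zero the
 * trailing 64, so every index the hardware fetches is defined.
 */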