/*
 * Copyright 2024 Valve Corporation
 * SPDX-License-Identifier: MIT
 */
#include "compiler/libcl/libcl_vk.h"
#include "agx_pack.h"
#include "geometry.h"
#include "libagx_dgc.h"

/*
 * To implement drawIndirectCount generically, we dispatch a kernel to
 * clone-and-patch the indirect buffer, predicating out draws as appropriate.
 */
KERNEL(32)
libagx_predicate_indirect(global uint32_t *out, constant uint32_t *in,
                          constant uint32_t *draw_count, uint32_t stride_el,
                          uint indexed__2)
{
   uint id = get_global_id(0);
   uint len = indexed__2 ? 5 : 4;

   /* Draws past the app-supplied count are predicated out (zeroed). */
   bool live = id < *draw_count;

   global uint32_t *dst = out + (id * len);
   constant uint32_t *src = in + (id * stride_el);

   /* Copy live draws, zero the rest. len is always 4 or 5, so a do/while is
    * safe here.
    */
   uint i = 0;
   do {
      dst[i] = live ? src[i] : 0;
   } while (++i < len);
}

/*
 * Indexing/offseting is in software if necessary so we strip all
 * indexing/offset information.
 */
KERNEL(1)
libagx_draw_without_adj(global VkDrawIndirectCommand *out,
                        global VkDrawIndirectCommand *in,
                        global struct agx_ia_state *ia, uint64_t index_buffer,
                        uint64_t index_buffer_range_el, int index_size_B,
                        enum mesa_prim prim)
{
   /* Rewrite the draw: the vertex count is remapped for adjacency, while all
    * index/offset fields are cleared (handled in software instead).
    */
   out->vertexCount = libagx_remap_adj_count(in->vertexCount, prim);
   out->instanceCount = in->instanceCount;
   out->firstVertex = 0;
   out->firstInstance = 0;

   /* TODO: Deduplicate */
   if (index_size_B != 0) {
      uint base = in->firstVertex;

      ia->index_buffer = libagx_index_buffer(
         index_buffer, index_buffer_range_el, base, index_size_B, 0);

      ia->index_buffer_range_el =
         libagx_index_buffer_range_el(index_buffer_range_el, base);
   }
}

/* Cooperative byte copy: one byte per lane. Precondition: len must be < the
 * group size, so a single pass covers the whole range.
 */
static void
libagx_memcpy_small(global uchar *dst, constant uchar *src, uint len, uint tid)
{
   /* Lanes past the end have nothing to copy. */
   if (!(tid < len))
      return;

   dst[tid] = src[tid];
}

/* Cooperative vectorized copy of len uint4s (16 bytes each). Both pointers
 * must be word-aligned; lanes stride across the range group_size apart.
 */
static void
libagx_memcpy_aligned_uint4(global uint *dst, constant uint *src, uint len,
                            uint tid, uint group_size)
{
   uint i = tid;
   while (i < len) {
      vstore4(vload4(i, src), i, dst);
      i += group_size;
   }
}

/* Cooperative copy of len bytes from src to a word-aligned dst, preserving
 * byte offsets (dst[i] = src[i]). The bulk is copied 16 bytes at a time.
 *
 * dst is uint-aligned (post-condition of the heap allocator in the caller)
 * but src need not be. When src is not word-aligned, src and dst disagree
 * modulo 4, so the vectorized path cannot be used without shifting data:
 * advancing only src (as the previous code did) copied the remainder to the
 * wrong offsets and overwrote the head bytes just written, while advancing
 * dst as well would misalign the vstore4 destination. Fall back to a plain
 * byte copy in that case; it also avoids the len underflow the old head
 * handling had for len < 4.
 */
static void
libagx_memcpy_to_aligned(global uint *dst, constant uchar *src, uint len,
                         uint tid, uint group_size)
{
   if (((uintptr_t)src) & 3) {
      /* Unaligned source: correct (if slower) bytewise copy. */
      for (uint i = tid; i < len; i += group_size) {
         ((global uchar *)dst)[i] = src[i];
      }

      return;
   }

   /* Copy the ragged tail (up to 15 bytes, within the small-copy bound) */
   uint end_unaligned = len & 0xf;
   len -= end_unaligned;
   libagx_memcpy_small(((global uchar *)dst) + len, src + len, end_unaligned,
                       tid);

   /* Now both src and dst are word-aligned, and len is 16-aligned */
   libagx_memcpy_aligned_uint4(dst, (constant uint *)src, len / 16, tid,
                               group_size);
}

/* Cooperative memset: one byte per lane. Precondition: len must be < the
 * group size, so a single pass covers the whole range.
 */
static void
libagx_memset_small(global uchar *dst, uchar b, int len, uint tid)
{
   /* Lanes past the end write nothing. (Comparison deliberately matches the
    * original signed/unsigned form.)
    */
   if (!(tid < len))
      return;

   dst[tid] = b;
}

/*
 * AGX does not implement robustBufferAccess2 semantics for
 * index buffers, where out-of-bounds indices read as zero. When we
 * dynamically detect index buffer overread (this if-statement), we need
 * to clone the index buffer and zero-extend it to get robustness.
 *
 * We do this dynamically (generating a VDM draw to consume the result) to avoid
 * expensive allocations & memcpys in the happy path where no out-of-bounds
 * access occurs. Otherwise we could use a hardware indirect draw, rather than
 * generating VDM words directly in shader.
 *
 * TODO: Handle multiple draws in parallel.
 */
KERNEL(32)
libagx_draw_robust_index(global uint32_t *vdm,
                         global struct agx_geometry_state *heap,
                         constant VkDrawIndexedIndirectCommand *cmd,
                         uint64_t in_buf_ptr, uint32_t in_buf_range_B,
                         ushort restart, enum agx_primitive topology,
                         enum agx_index_size index_size__3)
{
   /* NOTE(review): tid is used as a per-lane index below (compared against 0
    * for "first", passed as the thread offset to the memcpy/memset helpers,
    * and paired with sub_group_broadcast from lane 0), but
    * get_sub_group_id() returns the subgroup's index within the workgroup,
    * not the lane within the subgroup — confirm this should not be
    * get_sub_group_local_id().
    */
   uint tid = get_sub_group_id();
   bool first = tid == 0;
   enum agx_index_size index_size = index_size__3;

   /* Reconstruct the draw described by the indirect command. */
   struct agx_draw draw = agx_draw_indexed(
      cmd->indexCount, cmd->instanceCount, cmd->firstIndex, cmd->vertexOffset,
      cmd->firstInstance, in_buf_ptr, in_buf_range_B, index_size, restart);

   if (agx_direct_draw_overreads_indices(draw)) {
      constant void *in_buf = (constant void *)agx_draw_index_buffer(draw);
      uint in_size_el = agx_draw_index_range_el(draw);
      uint in_size_B = agx_indices_to_B(in_size_el, index_size);

      /* After a small number of zeroes at the end, extra zeroes cannot change
       * rendering since they will duplicate the same degenerate primitive many
       * times. Therefore we clamp the number of zeroes we need to extend with.
       * This makes the memset constant time.
       */
      draw.b.count[0] = min(draw.b.count[0], in_size_el + 32);

      uint out_size_el = draw.b.count[0];
      uint out_size_B = agx_indices_to_B(out_size_el, index_size);

      /* Allocate memory for the shadow index buffer. Only the first lane
       * allocates; the broadcast below reads lane 0's value only, so the
       * other lanes' uninitialized `padded` is never consumed.
       */
      global uchar *padded;
      if (first) {
         padded = agx_heap_alloc_nonatomic(heap, out_size_B);
      }
      padded = (global uchar *)sub_group_broadcast((uintptr_t)padded, 0);

      /* Retarget the draw at the zero-extended shadow copy. */
      draw.index_buffer = (uintptr_t)padded;
      draw.index_buffer_range_B = out_size_B;
      draw.start = 0;

      /* Clone the index buffer. The destination is aligned as a post-condition
       * of agx_heap_alloc_nonatomic.
       */
      libagx_memcpy_to_aligned((global uint *)padded, in_buf, in_size_B, tid,
                               32);

      /* Extend with up to 32 zeroes with a small memset.
       *
       * NOTE(review): out_size_B - in_size_B is up to 32 indices, i.e. up to
       * 128 bytes for 32-bit indices, which exceeds libagx_memset_small's
       * "len < group size" precondition for this 32-thread group — confirm
       * the zero tail is fully written for 4-byte index buffers.
       */
      libagx_memset_small(padded + in_size_B, 0, out_size_B - in_size_B, tid);
   }

   /* Lane 0 emits the VDM draw words for the (possibly patched) draw. */
   if (first) {
      agx_vdm_draw(vdm, 0, draw, topology);
   }
}
