1/* 2 * Copyright 2023 Alyssa Rosenzweig 3 * Copyright 2023 Valve Corporation 4 * SPDX-License-Identifier: MIT 5 */ 6 7#include "geometry.h" 8 9static uint 10align(uint x, uint y) 11{ 12 return (x + y - 1) & ~(y - 1); 13} 14 15/* Compatible with util/u_math.h */ 16static inline uint 17util_logbase2_ceil(uint n) 18{ 19 if (n <= 1) 20 return 0; 21 else 22 return 32 - clz(n - 1); 23} 24 25/* Swap the two non-provoking vertices third vert in odd triangles. This 26 * generates a vertex ID list with a consistent winding order. 27 * 28 * With prim and flatshade_first, the map : [0, 1, 2] -> [0, 1, 2] is its own 29 * inverse. This lets us reuse it for both vertex fetch and transform feedback. 30 */ 31uint 32libagx_map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first) 33{ 34 unsigned pv = flatshade_first ? 0 : 2; 35 36 bool even = (prim & 1) == 0; 37 bool provoking = vert == pv; 38 39 return (provoking || even) ? vert : ((3 - pv) - vert); 40} 41 42uint64_t 43libagx_xfb_vertex_address(global struct agx_geometry_params *p, uint base_index, 44 uint vert, uint buffer, uint stride, 45 uint output_offset) 46{ 47 uint index = base_index + vert; 48 uint xfb_offset = (index * stride) + output_offset; 49 50 return (uintptr_t)(p->xfb_base[buffer]) + xfb_offset; 51} 52 53uint 54libagx_vertex_id_for_line_loop(uint prim, uint vert, uint num_prims) 55{ 56 /* (0, 1), (1, 2), (2, 0) */ 57 if (prim == (num_prims - 1) && vert == 1) 58 return 0; 59 else 60 return prim + vert; 61} 62 63uint 64libagx_vertex_id_for_line_class(enum mesa_prim mode, uint prim, uint vert, 65 uint num_prims) 66{ 67 /* Line list, line strip, or line loop */ 68 if (mode == MESA_PRIM_LINE_LOOP && prim == (num_prims - 1) && vert == 1) 69 return 0; 70 71 if (mode == MESA_PRIM_LINES) 72 prim *= 2; 73 74 return prim + vert; 75} 76 77uint 78libagx_vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first) 79{ 80 /* Vulkan spec section 20.1.7 gives (i + 1, i + 2, 0) for a provoking 81 * first. OpenGL instead wants (0, i + 1, i + 2) with a provoking last. 82 * Piglit clipflat expects us to switch between these orders depending on 83 * provoking vertex, to avoid trivializing the fan. 84 * 85 * Rotate accordingly. 86 */ 87 if (flatshade_first) { 88 vert = vert + 1; 89 vert = (vert == 2) ? 0 : vert; 90 } 91 92 /* The simpler form assuming last is provoking. */ 93 return (vert == 0) ? 0 : prim + vert; 94} 95 96uint 97libagx_vertex_id_for_tri_class(enum mesa_prim mode, uint prim, uint vert, 98 bool flatshade_first) 99{ 100 if (flatshade_first && mode == MESA_PRIM_TRIANGLE_FAN) { 101 vert = vert + 1; 102 vert = (vert == 3) ? 0 : vert; 103 } 104 105 if (mode == MESA_PRIM_TRIANGLE_FAN && vert == 0) 106 return 0; 107 108 if (mode == MESA_PRIM_TRIANGLES) 109 prim *= 3; 110 111 /* Triangle list, triangle strip, or triangle fan */ 112 if (mode == MESA_PRIM_TRIANGLE_STRIP) { 113 unsigned pv = flatshade_first ? 0 : 2; 114 115 bool even = (prim & 1) == 0; 116 bool provoking = vert == pv; 117 118 vert = ((provoking || even) ? vert : ((3 - pv) - vert)); 119 } 120 121 return prim + vert; 122} 123 124uint 125libagx_vertex_id_for_line_adj_class(enum mesa_prim mode, uint prim, uint vert) 126{ 127 /* Line list adj or line strip adj */ 128 if (mode == MESA_PRIM_LINES_ADJACENCY) 129 prim *= 4; 130 131 return prim + vert; 132} 133 134uint 135libagx_vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims, 136 bool flatshade_first) 137{ 138 /* See Vulkan spec section 20.1.11 "Triangle Strips With Adjancency". 139 * 140 * There are different cases for first/middle/last/only primitives and for 141 * odd/even primitives. Determine which case we're in. 142 */ 143 bool last = prim == (num_prims - 1); 144 bool first = prim == 0; 145 bool even = (prim & 1) == 0; 146 bool even_or_first = even || first; 147 148 /* When the last vertex is provoking, we rotate the primitives 149 * accordingly. This seems required for OpenGL. 150 */ 151 if (!flatshade_first && !even_or_first) { 152 vert = (vert + 4u) % 6u; 153 } 154 155 /* Offsets per the spec. The spec lists 6 cases with 6 offsets. Luckily, 156 * there are lots of patterns we can exploit, avoiding a full 6x6 LUT. 157 * 158 * Here we assume the first vertex is provoking, the Vulkan default. 159 */ 160 uint offsets[6] = { 161 0, 162 first ? 1 : (even ? -2 : 3), 163 even_or_first ? 2 : 4, 164 last ? 5 : 6, 165 even_or_first ? 4 : 2, 166 even_or_first ? 3 : -2, 167 }; 168 169 /* Ensure NIR can see thru the local array */ 170 uint offset = 0; 171 for (uint i = 1; i < 6; ++i) { 172 if (i == vert) 173 offset = offsets[i]; 174 } 175 176 /* Finally add to the base of the primitive */ 177 return (prim * 2) + offset; 178} 179 180uint 181libagx_vertex_id_for_tri_adj_class(enum mesa_prim mode, uint prim, uint vert, 182 uint nr, bool flatshade_first) 183{ 184 /* Tri adj list or tri adj strip */ 185 if (mode == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) { 186 return libagx_vertex_id_for_tri_strip_adj(prim, vert, nr, 187 flatshade_first); 188 } else { 189 return (6 * prim) + vert; 190 } 191} 192 193uint 194libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, 195 uint prim, uint vert, uint num_prims) 196{ 197 switch (mode) { 198 case MESA_PRIM_POINTS: 199 case MESA_PRIM_LINES: 200 case MESA_PRIM_TRIANGLES: 201 case MESA_PRIM_LINES_ADJACENCY: 202 case MESA_PRIM_TRIANGLES_ADJACENCY: 203 /* Regular primitive: every N vertices defines a primitive */ 204 return (prim * mesa_vertices_per_prim(mode)) + vert; 205 206 case MESA_PRIM_LINE_LOOP: 207 return libagx_vertex_id_for_line_loop(prim, vert, num_prims); 208 209 case MESA_PRIM_LINE_STRIP: 210 case MESA_PRIM_LINE_STRIP_ADJACENCY: 211 /* (i, i + 1) or (i, ..., i + 3) */ 212 return prim + vert; 213 214 case MESA_PRIM_TRIANGLE_STRIP: { 215 /* Order depends on the provoking vert. 216 * 217 * First: (0, 1, 2), (1, 3, 2), (2, 3, 4). 218 * Last: (0, 1, 2), (2, 1, 3), (2, 3, 4). 219 * 220 * Pull the (maybe swapped) vert from the corresponding primitive 221 */ 222 return prim + libagx_map_vertex_in_tri_strip(prim, vert, flatshade_first); 223 } 224 225 case MESA_PRIM_TRIANGLE_FAN: 226 return libagx_vertex_id_for_tri_fan(prim, vert, flatshade_first); 227 228 case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY: 229 return libagx_vertex_id_for_tri_strip_adj(prim, vert, num_prims, 230 flatshade_first); 231 232 default: 233 return 0; 234 } 235} 236 237/* 238 * When unrolling the index buffer for a draw, we translate the old indirect 239 * draws to new indirect draws. This routine allocates the new index buffer and 240 * sets up most of the new draw descriptor. 241 */ 242static global void * 243setup_unroll_for_draw(global struct agx_ia_state *ia, constant uint *in_draw, 244 uint draw, enum mesa_prim mode, uint index_size_B) 245{ 246 /* Determine an upper bound on the memory required for the index buffer. 247 * Restarts only decrease the unrolled index buffer size, so the maximum size 248 * is the unrolled size when the input has no restarts. 249 */ 250 uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]); 251 uint max_verts = max_prims * mesa_vertices_per_prim(mode); 252 uint alloc_size = max_verts * index_size_B; 253 254 /* Allocate memory from the heap for the unrolled index buffer. Use an atomic 255 * since multiple threads may be running to handle multidraw in parallel. 256 */ 257 global struct agx_geometry_state *heap = ia->heap; 258 uint old_heap_bottom = atomic_fetch_add( 259 (volatile atomic_uint *)(&heap->heap_bottom), align(alloc_size, 4)); 260 261 /* Regardless of the input stride, we use tightly packed output draws */ 262 global uint *out = &ia->out_draws[5 * draw]; 263 264 /* Setup most of the descriptor. Count will be determined after unroll. */ 265 out[1] = in_draw[1]; /* instance count */ 266 out[2] = old_heap_bottom / index_size_B; /* index offset */ 267 out[3] = in_draw[3]; /* index bias */ 268 out[4] = in_draw[4]; /* base instance */ 269 270 /* Return the index buffer we allocated */ 271 return (global uchar *)heap->heap + (old_heap_bottom * index_size_B); 272} 273 274#define UNROLL(INDEX, suffix) \ 275 void libagx_unroll_restart_##suffix(global struct agx_ia_state *ia, \ 276 enum mesa_prim mode, uint draw) \ 277 { \ 278 /* For an indirect multidraw, we are dispatched maxDraws times and \ 279 * terminate trailing invocations. \ 280 */ \ 281 if (ia->count && draw >= *(ia->count)) \ 282 return; \ 283 \ 284 constant uint *in_draw = \ 285 (constant uint *)(ia->draws + (draw * ia->draw_stride)); \ 286 \ 287 uint count = in_draw[0]; \ 288 constant INDEX *in = (constant INDEX *)ia->index_buffer; \ 289 in += in_draw[2]; \ 290 \ 291 global INDEX *out = \ 292 setup_unroll_for_draw(ia, in_draw, draw, mode, sizeof(INDEX)); \ 293 \ 294 uint out_prims = 0; \ 295 INDEX restart_idx = ia->restart_index; \ 296 bool flatshade_first = ia->flatshade_first; \ 297 uint in_size_el = ia->index_buffer_size_B / sizeof(INDEX); \ 298 \ 299 uint needle = 0; \ 300 uint per_prim = mesa_vertices_per_prim(mode); \ 301 while (needle < count) { \ 302 /* Search for next restart or the end */ \ 303 uint next_restart = needle; \ 304 while ((next_restart < count) && in[next_restart] != restart_idx) \ 305 ++next_restart; \ 306 \ 307 /* Emit up to the next restart */ \ 308 uint subcount = next_restart - needle; \ 309 uint subprims = u_decomposed_prims_for_vertices(mode, subcount); \ 310 for (uint i = 0; i < subprims; ++i) { \ 311 for (uint vtx = 0; vtx < per_prim; ++vtx) { \ 312 uint id = libagx_vertex_id_for_topology(mode, flatshade_first, \ 313 i, vtx, subprims); \ 314 uint offset = needle + id; \ 315 \ 316 out[(out_prims * per_prim) + vtx] = \ 317 offset < in_size_el ? in[offset] : 0; \ 318 } \ 319 \ 320 out_prims++; \ 321 } \ 322 \ 323 needle = next_restart + 1; \ 324 } \ 325 \ 326 ia->out_draws[(5 * draw) + 0] = out_prims * per_prim; \ 327 } 328 329UNROLL(uchar, u8) 330UNROLL(ushort, u16) 331UNROLL(uint, u32) 332 333uintptr_t 334libagx_index_buffer(constant struct agx_ia_state *p, uint id, 335 uint index_size) 336{ 337 return (uintptr_t)&p->index_buffer[id * index_size]; 338} 339 340uint 341libagx_setup_xfb_buffer(global struct agx_geometry_params *p, uint i) 342{ 343 global uint *off_ptr = p->xfb_offs_ptrs[i]; 344 if (!off_ptr) 345 return 0; 346 347 uint off = *off_ptr; 348 p->xfb_base[i] = p->xfb_base_original[i] + off; 349 return off; 350} 351 352/* 353 * Translate EndPrimitive for LINE_STRIP or TRIANGLE_STRIP output prims into 354 * writes into the 32-bit output index buffer. We write the sequence (b, b + 1, 355 * b + 2, ..., b + n - 1, -1), where b (base) is the first vertex in the prim, n 356 * (count) is the number of verts in the prims, and -1 is the prim restart index 357 * used to signal the end of the prim. 358 * 359 * For points, we write index buffers without restart, just as a sideband to 360 * pass data into the vertex shader. 361 */ 362void 363libagx_end_primitive(global int *index_buffer, uint total_verts, 364 uint verts_in_prim, uint total_prims, 365 uint invocation_vertex_base, uint invocation_prim_base, 366 uint geometry_base, bool restart) 367{ 368 /* Previous verts/prims are from previous invocations plus earlier 369 * prims in this invocation. For the intra-invocation counts, we 370 * subtract the count for this prim from the inclusive sum NIR gives us. 371 */ 372 uint previous_verts_in_invoc = (total_verts - verts_in_prim); 373 uint previous_verts = invocation_vertex_base + previous_verts_in_invoc; 374 uint previous_prims = restart ? invocation_prim_base + (total_prims - 1) : 0; 375 376 /* The indices are encoded as: (unrolled ID * output vertices) + vertex. */ 377 uint index_base = geometry_base + previous_verts_in_invoc; 378 379 /* Index buffer contains 1 index for each vertex and 1 for each prim */ 380 global int *out = &index_buffer[previous_verts + previous_prims]; 381 382 /* Write out indices for the strip */ 383 for (uint i = 0; i < verts_in_prim; ++i) { 384 out[i] = index_base + i; 385 } 386 387 if (restart) 388 out[verts_in_prim] = -1; 389} 390 391void 392libagx_build_gs_draw(global struct agx_geometry_params *p, bool indexed, 393 uint vertices, uint primitives) 394{ 395 global uint *descriptor = p->indirect_desc; 396 global struct agx_geometry_state *state = p->state; 397 398 /* Setup the indirect draw descriptor */ 399 if (indexed) { 400 uint indices = vertices + primitives; /* includes restart indices */ 401 402 /* Allocate the index buffer */ 403 uint index_buffer_offset_B = state->heap_bottom; 404 p->output_index_buffer = 405 (global uint *)(state->heap + index_buffer_offset_B); 406 state->heap_bottom += (indices * 4); 407 408 descriptor[0] = indices; /* count */ 409 descriptor[1] = 1; /* instance count */ 410 descriptor[2] = index_buffer_offset_B / 4; /* start */ 411 descriptor[3] = 0; /* index bias */ 412 descriptor[4] = 0; /* start instance */ 413 } else { 414 descriptor[0] = vertices; /* count */ 415 descriptor[1] = 1; /* instance count */ 416 descriptor[2] = 0; /* start */ 417 descriptor[3] = 0; /* start instance */ 418 } 419 420 if (state->heap_bottom > 1024 * 1024 * 128) { 421 global uint *foo = (global uint *)(uintptr_t)0xdeadbeef; 422 *foo = 0x1234; 423 } 424} 425 426void 427libagx_gs_setup_indirect(global struct agx_geometry_params *p, 428 global struct agx_ia_state *ia, enum mesa_prim mode, 429 uint local_id) 430{ 431 global uint *in_draw = (global uint *)ia->draws; 432 433 /* Determine the (primitives, instances) grid size. */ 434 uint vertex_count = in_draw[0]; 435 uint instance_count = in_draw[1]; 436 437 /* Calculate number of primitives input into the GS */ 438 uint prim_per_instance = u_decomposed_prims_for_vertices(mode, vertex_count); 439 p->input_primitives = prim_per_instance * instance_count; 440 p->input_vertices = vertex_count; 441 442 /* Invoke VS as (vertices, instances, 1); GS as (primitives, instances, 1) */ 443 p->vs_grid[0] = vertex_count; 444 p->vs_grid[1] = instance_count; 445 p->vs_grid[2] = 1; 446 447 p->gs_grid[0] = prim_per_instance; 448 p->gs_grid[1] = instance_count; 449 p->gs_grid[2] = 1; 450 451 p->primitives_log2 = util_logbase2_ceil(prim_per_instance); 452 453 /* If indexing is enabled, the third word is the offset into the index buffer 454 * in elements. Apply that offset now that we have it. For a hardware 455 * indirect draw, the hardware would do this for us, but for software input 456 * assembly we need to do it ourselves. 457 */ 458 if (ia->index_buffer) { 459 ia->index_buffer += ((constant uint *)ia->draws)[2] * ia->index_size_B; 460 } 461 462 /* We may need to allocate VS and GS count buffers, do so now */ 463 global struct agx_geometry_state *state = p->state; 464 465 uint vertex_buffer_size = 466 libagx_tcs_in_size(vertex_count * instance_count, p->vs_outputs); 467 468 p->count_buffer = (global uint *)(state->heap + state->heap_bottom); 469 state->heap_bottom += 470 align(p->input_primitives * p->count_buffer_stride, 16); 471 472 p->vertex_buffer = (global uint *)(state->heap + state->heap_bottom); 473 state->heap_bottom += align(vertex_buffer_size, 4); 474} 475 476void 477libagx_prefix_sum(global uint *buffer, uint len, uint words, uint2 local_id) 478{ 479 /* Main loop: complete subgroups processing 32 values at once 480 * 481 * TODO: Don't do a serial bottleneck! This is bad! 482 */ 483 uint i, count = 0; 484 uint len_remainder = len % 32; 485 uint len_rounded_down = len - len_remainder; 486 487 for (i = local_id.x; i < len_rounded_down; i += 32) { 488 global uint *ptr = &buffer[(i * words) + local_id.y]; 489 uint value = *ptr; 490 491 /* TODO: use inclusive once that's wired up */ 492 uint value_prefix_sum = sub_group_scan_exclusive_add(value) + value; 493 *ptr = count + value_prefix_sum; 494 495 /* Advance count by the reduction sum of all processed values. We already 496 * have that sum calculated in the last lane. We know that lane is active, 497 * since all control flow is uniform except in the last iteration. 498 */ 499 count += sub_group_broadcast(value_prefix_sum, 31); 500 } 501 502 /* The last iteration is special since we won't have a full subgroup unless 503 * the length is divisible by the subgroup size, and we don't advance count. 504 */ 505 if (local_id.x < len_remainder) { 506 global uint *ptr = &buffer[(i * words) + local_id.y]; 507 uint value = *ptr; 508 509 /* TODO: use inclusive once that's wired up */ 510 *ptr = count + sub_group_scan_exclusive_add(value) + value; 511 } 512} 513 514bool 515libagx_is_provoking_last(global struct agx_ia_state *ia) 516{ 517 return !ia->flatshade_first; 518} 519 520uintptr_t 521libagx_vertex_output_address(constant struct agx_geometry_params *p, uint vtx, 522 gl_varying_slot location, uint64_t vs_outputs) 523{ 524 return (uintptr_t)p->vertex_buffer + 525 libagx_tcs_in_offs(vtx, location, vs_outputs); 526} 527