1 /*
2 * Copyright © 2018 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "nir_xfb_info.h"
25
26 #include "util/u_dynarray.h"
27 #include <util/u_math.h>
28
29 static void
add_var_xfb_varying(nir_xfb_info * xfb,nir_xfb_varyings_info * varyings,unsigned buffer,unsigned offset,const struct glsl_type * type)30 add_var_xfb_varying(nir_xfb_info *xfb,
31 nir_xfb_varyings_info *varyings,
32 unsigned buffer,
33 unsigned offset,
34 const struct glsl_type *type)
35 {
36 if (varyings == NULL)
37 return;
38
39 nir_xfb_varying_info *varying = &varyings->varyings[varyings->varying_count++];
40
41 varying->type = type;
42 varying->buffer = buffer;
43 varying->offset = offset;
44 xfb->buffers[buffer].varying_count++;
45 }
46
47
48 static nir_xfb_info *
nir_xfb_info_create(void * mem_ctx,uint16_t output_count)49 nir_xfb_info_create(void *mem_ctx, uint16_t output_count)
50 {
51 return rzalloc_size(mem_ctx, nir_xfb_info_size(output_count));
52 }
53
54 static size_t
nir_xfb_varyings_info_size(uint16_t varying_count)55 nir_xfb_varyings_info_size(uint16_t varying_count)
56 {
57 return sizeof(nir_xfb_info) + sizeof(nir_xfb_varying_info) * varying_count;
58 }
59
60 static nir_xfb_varyings_info *
nir_xfb_varyings_info_create(void * mem_ctx,uint16_t varying_count)61 nir_xfb_varyings_info_create(void *mem_ctx, uint16_t varying_count)
62 {
63 return rzalloc_size(mem_ctx, nir_xfb_varyings_info_size(varying_count));
64 }
65
66 static void
add_var_xfb_outputs(nir_xfb_info * xfb,nir_xfb_varyings_info * varyings,nir_variable * var,unsigned buffer,unsigned * location,unsigned * offset,const struct glsl_type * type,bool varying_added)67 add_var_xfb_outputs(nir_xfb_info *xfb,
68 nir_xfb_varyings_info *varyings,
69 nir_variable *var,
70 unsigned buffer,
71 unsigned *location,
72 unsigned *offset,
73 const struct glsl_type *type,
74 bool varying_added)
75 {
76 /* If this type contains a 64-bit value, align to 8 bytes */
77 if (glsl_type_contains_64bit(type))
78 *offset = ALIGN_POT(*offset, 8);
79
80 if (glsl_type_is_array_or_matrix(type) && !var->data.compact) {
81 unsigned length = glsl_get_length(type);
82
83 const struct glsl_type *child_type = glsl_get_array_element(type);
84 if (!glsl_type_is_array(child_type) &&
85 !glsl_type_is_struct(child_type)) {
86
87 add_var_xfb_varying(xfb, varyings, buffer, *offset, type);
88 varying_added = true;
89 }
90
91 for (unsigned i = 0; i < length; i++)
92 add_var_xfb_outputs(xfb, varyings, var, buffer, location, offset,
93 child_type, varying_added);
94 } else if (glsl_type_is_struct_or_ifc(type)) {
95 unsigned length = glsl_get_length(type);
96 for (unsigned i = 0; i < length; i++) {
97 const struct glsl_type *child_type = glsl_get_struct_field(type, i);
98 add_var_xfb_outputs(xfb, varyings, var, buffer, location, offset,
99 child_type, varying_added);
100 }
101 } else {
102 assert(buffer < NIR_MAX_XFB_BUFFERS);
103 if (xfb->buffers_written & (1 << buffer)) {
104 assert(xfb->buffers[buffer].stride == var->data.xfb.stride);
105 assert(xfb->buffer_to_stream[buffer] == var->data.stream);
106 } else {
107 xfb->buffers_written |= (1 << buffer);
108 xfb->buffers[buffer].stride = var->data.xfb.stride;
109 xfb->buffer_to_stream[buffer] = var->data.stream;
110 }
111
112 assert(var->data.stream < NIR_MAX_XFB_STREAMS);
113 xfb->streams_written |= (1 << var->data.stream);
114
115 unsigned comp_slots;
116 if (var->data.compact) {
117 /* This only happens for clip/cull which are float arrays */
118 assert(glsl_without_array(type) == glsl_float_type());
119 assert(var->data.location == VARYING_SLOT_CLIP_DIST0 ||
120 var->data.location == VARYING_SLOT_CLIP_DIST1);
121 comp_slots = glsl_get_length(type);
122 } else {
123 comp_slots = glsl_get_component_slots(type);
124
125 UNUSED unsigned attrib_slots = DIV_ROUND_UP(comp_slots, 4);
126 assert(attrib_slots == glsl_count_attribute_slots(type, false));
127
128 /* Ensure that we don't have, for instance, a dvec2 with a
129 * location_frac of 2 which would make it crass a location boundary
130 * even though it fits in a single slot. However, you can have a
131 * dvec3 which crosses the slot boundary with a location_frac of 2.
132 */
133 assert(DIV_ROUND_UP(var->data.location_frac + comp_slots, 4) ==
134 attrib_slots);
135 }
136
137 assert(var->data.location_frac + comp_slots <= 8);
138 uint8_t comp_mask = ((1 << comp_slots) - 1) << var->data.location_frac;
139 unsigned comp_offset = var->data.location_frac;
140
141 if (!varying_added) {
142 add_var_xfb_varying(xfb, varyings, buffer, *offset, type);
143 }
144
145 while (comp_mask) {
146 nir_xfb_output_info *output = &xfb->outputs[xfb->output_count++];
147
148 output->buffer = buffer;
149 output->offset = *offset;
150 output->location = *location;
151 output->component_mask = comp_mask & 0xf;
152 output->component_offset = comp_offset;
153
154 *offset += util_bitcount(output->component_mask) * 4;
155 (*location)++;
156 comp_mask >>= 4;
157 comp_offset = 0;
158 }
159 }
160 }
161
162 static int
compare_xfb_varying_offsets(const void * _a,const void * _b)163 compare_xfb_varying_offsets(const void *_a, const void *_b)
164 {
165 const nir_xfb_varying_info *a = _a, *b = _b;
166
167 if (a->buffer != b->buffer)
168 return a->buffer - b->buffer;
169
170 return a->offset - b->offset;
171 }
172
173 static int
compare_xfb_output_offsets(const void * _a,const void * _b)174 compare_xfb_output_offsets(const void *_a, const void *_b)
175 {
176 const nir_xfb_output_info *a = _a, *b = _b;
177
178 return a->offset - b->offset;
179 }
180
181 void
nir_shader_gather_xfb_info(nir_shader * shader)182 nir_shader_gather_xfb_info(nir_shader *shader)
183 {
184 nir_gather_xfb_info_with_varyings(shader, NULL, NULL);
185 }
186
187 void
nir_gather_xfb_info_with_varyings(nir_shader * shader,void * mem_ctx,nir_xfb_varyings_info ** varyings_info_out)188 nir_gather_xfb_info_with_varyings(nir_shader *shader,
189 void *mem_ctx,
190 nir_xfb_varyings_info **varyings_info_out)
191 {
192 assert(shader->info.stage == MESA_SHADER_VERTEX ||
193 shader->info.stage == MESA_SHADER_TESS_EVAL ||
194 shader->info.stage == MESA_SHADER_GEOMETRY);
195
196 /* Compute the number of outputs we have. This is simply the number of
197 * cumulative locations consumed by all the variables. If a location is
198 * represented by multiple variables, then they each count separately in
199 * number of outputs. This is only an estimate as some variables may have
200 * an xfb_buffer but not an output so it may end up larger than we need but
201 * it should be good enough for allocation.
202 */
203 unsigned num_outputs = 0;
204 unsigned num_varyings = 0;
205 nir_xfb_varyings_info *varyings_info = NULL;
206 nir_foreach_shader_out_variable(var, shader) {
207 if (var->data.explicit_xfb_buffer) {
208 num_outputs += glsl_count_attribute_slots(var->type, false);
209 num_varyings += glsl_varying_count(var->type);
210 }
211 }
212 if (num_outputs == 0 || num_varyings == 0)
213 return;
214
215 nir_xfb_info *xfb = nir_xfb_info_create(shader, num_outputs);
216 if (varyings_info_out != NULL) {
217 *varyings_info_out = nir_xfb_varyings_info_create(mem_ctx, num_varyings);
218 varyings_info = *varyings_info_out;
219 }
220
221 /* Walk the list of outputs and add them to the array */
222 nir_foreach_shader_out_variable(var, shader) {
223 if (!var->data.explicit_xfb_buffer)
224 continue;
225
226 unsigned location = var->data.location;
227
228 /* In order to know if we have a array of blocks can't be done just by
229 * checking if we have an interface type and is an array, because due
230 * splitting we could end on a case were we received a split struct
231 * that contains an array.
232 */
233 bool is_array_block = var->interface_type != NULL &&
234 glsl_type_is_array(var->type) &&
235 glsl_without_array(var->type) == var->interface_type;
236
237 if (var->data.explicit_offset && !is_array_block) {
238 unsigned offset = var->data.offset;
239 add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb.buffer,
240 &location, &offset, var->type, false);
241 } else if (is_array_block) {
242 assert(glsl_type_is_struct_or_ifc(var->interface_type));
243
244 unsigned aoa_size = glsl_get_aoa_size(var->type);
245 const struct glsl_type *itype = var->interface_type;
246 unsigned nfields = glsl_get_length(itype);
247 for (unsigned b = 0; b < aoa_size; b++) {
248 for (unsigned f = 0; f < nfields; f++) {
249 int foffset = glsl_get_struct_field_offset(itype, f);
250 const struct glsl_type *ftype = glsl_get_struct_field(itype, f);
251 if (foffset < 0) {
252 location += glsl_count_attribute_slots(ftype, false);
253 continue;
254 }
255
256 unsigned offset = foffset;
257 add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb.buffer + b,
258 &location, &offset, ftype, false);
259 }
260 }
261 }
262 }
263
264 /* Everything is easier in the state setup code if outputs and varyings are
265 * sorted in order of output offset (and buffer for varyings).
266 */
267 qsort(xfb->outputs, xfb->output_count, sizeof(xfb->outputs[0]),
268 compare_xfb_output_offsets);
269
270 if (varyings_info != NULL) {
271 qsort(varyings_info->varyings, varyings_info->varying_count,
272 sizeof(varyings_info->varyings[0]),
273 compare_xfb_varying_offsets);
274 }
275
276 #ifndef NDEBUG
277 /* Finally, do a sanity check */
278 unsigned max_offset[NIR_MAX_XFB_BUFFERS] = {0};
279 for (unsigned i = 0; i < xfb->output_count; i++) {
280 assert(xfb->outputs[i].offset >= max_offset[xfb->outputs[i].buffer]);
281 assert(xfb->outputs[i].component_mask != 0);
282 unsigned slots = util_bitcount(xfb->outputs[i].component_mask);
283 max_offset[xfb->outputs[i].buffer] = xfb->outputs[i].offset + slots * 4;
284 }
285 #endif
286
287 ralloc_free(shader->xfb_info);
288 shader->xfb_info = xfb;
289 }
290
291 static int
get_xfb_out_sort_index(const nir_xfb_output_info * a)292 get_xfb_out_sort_index(const nir_xfb_output_info *a)
293 {
294 /* Return the maximum number to put dummy components at the end. */
295 if (!a->component_mask)
296 return MAX_XFB_BUFFERS << 26;
297
298 return ((uint32_t)a->buffer << 26) | /* 2 bits for the buffer */
299 /* 10 bits for the component location (256 * 4) */
300 (((uint32_t)a->location * 4 + a->component_offset) << 16) |
301 /* 16 bits for the offset */
302 a->offset;
303 }
304
305 static int
compare_xfb_out(const void * pa,const void * pb)306 compare_xfb_out(const void *pa, const void *pb)
307 {
308 const nir_xfb_output_info *a = (const nir_xfb_output_info *)pa;
309 const nir_xfb_output_info *b = (const nir_xfb_output_info *)pb;
310
311 return get_xfb_out_sort_index(a) - get_xfb_out_sort_index(b);
312 }
313
314 /**
315 * Gather transform feedback info from lowered IO intrinsics.
316 *
317 * Optionally return slot_to_register, an optional table to translate
318 * gl_varying_slot to "base" indices.
319 */
320 nir_xfb_info *
nir_gather_xfb_info_from_intrinsics(nir_shader * nir,int slot_to_register[NUM_TOTAL_VARYING_SLOTS])321 nir_gather_xfb_info_from_intrinsics(nir_shader *nir,
322 int slot_to_register[NUM_TOTAL_VARYING_SLOTS])
323 {
324 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
325 uint8_t buffer_to_stream[MAX_XFB_BUFFERS] = {0};
326 uint8_t buffer_mask = 0;
327 uint8_t stream_mask = 0;
328
329 if (slot_to_register) {
330 memset(slot_to_register, -1,
331 sizeof(slot_to_register[0] * NUM_TOTAL_VARYING_SLOTS));
332 }
333
334 /* Gather xfb outputs. */
335 struct util_dynarray array = {0};
336
337 nir_foreach_block(block, impl) {
338 nir_foreach_instr(instr, block) {
339 if (instr->type != nir_instr_type_intrinsic ||
340 !nir_instr_xfb_write_mask(nir_instr_as_intrinsic(instr)))
341 continue;
342
343 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
344
345 unsigned wr_mask = nir_intrinsic_write_mask(intr);
346
347 while (wr_mask) {
348 unsigned i = u_bit_scan(&wr_mask);
349 unsigned index = nir_intrinsic_component(intr) + i;
350 nir_io_xfb xfb = index < 2 ? nir_intrinsic_io_xfb(intr) :
351 nir_intrinsic_io_xfb2(intr);
352
353 if (xfb.out[index % 2].num_components) {
354 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
355 nir_xfb_output_info out;
356
357 out.component_offset = index;
358 out.component_mask =
359 BITFIELD_RANGE(index, xfb.out[index % 2].num_components);
360 out.location = sem.location;
361 out.buffer = xfb.out[index % 2].buffer;
362 out.offset = (uint32_t)xfb.out[index % 2].offset * 4;
363 util_dynarray_append(&array, nir_xfb_output_info, out);
364
365 uint8_t stream = (sem.gs_streams >> (i * 2)) & 0x3;
366 buffer_to_stream[out.buffer] = stream;
367 buffer_mask |= BITFIELD_BIT(out.buffer);
368 stream_mask |= BITFIELD_BIT(stream);
369
370 if (slot_to_register)
371 slot_to_register[sem.location] = nir_intrinsic_base(intr);
372
373 /* No elements before component_offset are allowed to be set. */
374 assert(!(out.component_mask & BITFIELD_MASK(out.component_offset)));
375 }
376 }
377 }
378 }
379
380 nir_xfb_output_info *outputs = (nir_xfb_output_info *)array.data;
381 int count = util_dynarray_num_elements(&array, nir_xfb_output_info);
382
383 if (!count)
384 return NULL;
385
386 if (count > 1) {
387 /* Sort outputs by buffer, location, and component. */
388 qsort(outputs, count, sizeof(nir_xfb_output_info), compare_xfb_out);
389
390 /* Merge outputs referencing the same slot. */
391 for (int i = 0; i < count - 1; i++) {
392 nir_xfb_output_info *cur = &outputs[i];
393
394 if (!cur->component_mask)
395 continue;
396
397 /* Outputs referencing the same buffer and location are contiguous. */
398 for (int j = i + 1;
399 j < count &&
400 cur->buffer == outputs[j].buffer &&
401 cur->location == outputs[j].location; j++) {
402 if (outputs[j].component_mask &&
403 outputs[j].offset - outputs[j].component_offset * 4 ==
404 cur->offset - cur->component_offset * 4) {
405 unsigned merged_offset = MIN2(cur->component_offset,
406 outputs[j].component_offset);
407 /* component_mask is relative to 0, not component_offset */
408 unsigned merged_mask = cur->component_mask | outputs[j].component_mask;
409
410 /* The component mask should have no holes after merging. */
411 if (util_is_power_of_two_nonzero((merged_mask >> merged_offset) + 1)) {
412 /* Merge outputs. */
413 cur->component_offset = merged_offset;
414 cur->component_mask = merged_mask;
415 cur->offset = (uint32_t)cur->offset -
416 (uint32_t)cur->component_offset * 4 +
417 (uint32_t)merged_offset * 4;
418 /* Disable the other output. */
419 outputs[j].component_mask = 0;
420 }
421 }
422 }
423 }
424
425 /* Sort outputs again to put disabled outputs at the end. */
426 qsort(outputs, count, sizeof(nir_xfb_output_info), compare_xfb_out);
427
428 /* Remove disabled outputs. */
429 for (int i = count - 1; i >= 0 && !outputs[i].component_mask; i--)
430 count = i;
431 }
432
433 for (unsigned i = 0; i < count; i++)
434 assert(outputs[i].component_mask);
435
436 /* Create nir_xfb_info. */
437 nir_xfb_info *info = calloc(1, nir_xfb_info_size(count));
438 if (!info) {
439 util_dynarray_fini(&array);
440 return NULL;
441 }
442
443 /* Fill nir_xfb_info. */
444 info->buffers_written = buffer_mask;
445 info->streams_written = stream_mask;
446 memcpy(info->buffer_to_stream, buffer_to_stream, sizeof(buffer_to_stream));
447 info->output_count = count;
448 memcpy(info->outputs, outputs, count * sizeof(outputs[0]));
449
450 /* Set strides. */
451 for (unsigned i = 0; i < MAX_XFB_BUFFERS; i++) {
452 if (buffer_mask & BITFIELD_BIT(i))
453 info->buffers[i].stride = nir->info.xfb_stride[i];
454 }
455
456 /* Set varying_count. */
457 for (unsigned i = 0; i < count; i++)
458 info->buffers[outputs[i].buffer].varying_count++;
459
460 util_dynarray_fini(&array);
461 return info;
462 }
463
464 void
nir_print_xfb_info(nir_xfb_info * info,FILE * fp)465 nir_print_xfb_info(nir_xfb_info *info, FILE *fp)
466 {
467 fprintf(fp, "buffers_written: 0x%x\n", info->buffers_written);
468 fprintf(fp, "streams_written: 0x%x\n", info->streams_written);
469
470 for (unsigned i = 0; i < NIR_MAX_XFB_BUFFERS; i++) {
471 if (BITFIELD_BIT(i) & info->buffers_written) {
472 fprintf(fp, "buffer%u: stride=%u varying_count=%u stream=%u\n", i,
473 info->buffers[i].stride,
474 info->buffers[i].varying_count,
475 info->buffer_to_stream[i]);
476 }
477 }
478
479 fprintf(fp, "output_count: %u\n", info->output_count);
480
481 for (unsigned i = 0; i < info->output_count; i++) {
482 fprintf(fp, "output%u: buffer=%u, offset=%u, location=%u, "
483 "component_offset=%u, component_mask=0x%x\n",
484 i, info->outputs[i].buffer,
485 info->outputs[i].offset,
486 info->outputs[i].location,
487 info->outputs[i].component_offset,
488 info->outputs[i].component_mask);
489 }
490 }
491