/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_xfb_info.h"

#include "util/u_dynarray.h"
#include "util/u_math.h"

static void
add_var_xfb_varying(nir_xfb_info *xfb,
                    nir_xfb_varyings_info *varyings,
                    unsigned buffer,
                    unsigned offset,
                    const struct glsl_type *type)
{
   if (varyings == NULL)
      return;

   nir_xfb_varying_info *varying = &varyings->varyings[varyings->varying_count++];

   varying->type = type;
   varying->buffer = buffer;
   varying->offset = offset;
   xfb->buffers[buffer].varying_count++;
}


static nir_xfb_info *
nir_xfb_info_create(void *mem_ctx, uint16_t output_count)
{
   return rzalloc_size(mem_ctx, nir_xfb_info_size(output_count));
}

static size_t
nir_xfb_varyings_info_size(uint16_t varying_count)
{
   return sizeof(nir_xfb_info) + sizeof(nir_xfb_varying_info) * varying_count;
}

static nir_xfb_varyings_info *
nir_xfb_varyings_info_create(void *mem_ctx, uint16_t varying_count)
{
   return rzalloc_size(mem_ctx, nir_xfb_varyings_info_size(varying_count));
}

static void
add_var_xfb_outputs(nir_xfb_info *xfb,
                    nir_xfb_varyings_info *varyings,
                    nir_variable *var,
                    unsigned buffer,
                    unsigned *location,
                    unsigned *offset,
                    const struct glsl_type *type,
                    bool varying_added)
{
   /* If this type contains a 64-bit value, align to 8 bytes */
   if (glsl_type_contains_64bit(type))
      *offset = ALIGN_POT(*offset, 8);
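      /* e.g. an accumulated offset of 20 bytes is bumped to 24 here. */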

   if (glsl_type_is_array_or_matrix(type) && !var->data.compact) {
      unsigned length = glsl_get_length(type);

      const struct glsl_type *child_type = glsl_get_array_element(type);
      if (!glsl_type_is_array(child_type) &&
          !glsl_type_is_struct(child_type)) {

         add_var_xfb_varying(xfb, varyings, buffer, *offset, type);
         varying_added = true;
      }

      for (unsigned i = 0; i < length; i++)
         add_var_xfb_outputs(xfb, varyings, var, buffer, location, offset,
                             child_type, varying_added);
   } else if (glsl_type_is_struct_or_ifc(type)) {
      unsigned length = glsl_get_length(type);
      for (unsigned i = 0; i < length; i++) {
         const struct glsl_type *child_type = glsl_get_struct_field(type, i);
         add_var_xfb_outputs(xfb, varyings, var, buffer, location, offset,
                             child_type, varying_added);
      }
   } else {
      assert(buffer < NIR_MAX_XFB_BUFFERS);
      if (xfb->buffers_written & (1 << buffer)) {
         assert(xfb->buffers[buffer].stride == var->data.xfb.stride);
         assert(xfb->buffer_to_stream[buffer] == var->data.stream);
      } else {
         xfb->buffers_written |= (1 << buffer);
         xfb->buffers[buffer].stride = var->data.xfb.stride;
         xfb->buffer_to_stream[buffer] = var->data.stream;
      }

      assert(var->data.stream < NIR_MAX_XFB_STREAMS);
      xfb->streams_written |= (1 << var->data.stream);

      unsigned comp_slots;
      if (var->data.compact) {
         /* This only happens for clip/cull which are float arrays */
         assert(glsl_without_array(type) == glsl_float_type());
         assert(var->data.location == VARYING_SLOT_CLIP_DIST0 ||
                var->data.location == VARYING_SLOT_CLIP_DIST1);
         comp_slots = glsl_get_length(type);
      } else {
         comp_slots = glsl_get_component_slots(type);

         UNUSED unsigned attrib_slots = DIV_ROUND_UP(comp_slots, 4);
         assert(attrib_slots == glsl_count_attribute_slots(type, false));

         /* Ensure that we don't have, for instance, a dvec2 with a
          * location_frac of 2 which would make it cross a location boundary
          * even though it fits in a single slot.  However, you can have a
          * dvec3 which crosses the slot boundary with a location_frac of 2.
          */
         assert(DIV_ROUND_UP(var->data.location_frac + comp_slots, 4) ==
                attrib_slots);
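         /* As a worked example, a dvec3 has comp_slots == 6, so with a
          * location_frac of 2 it spans DIV_ROUND_UP(2 + 6, 4) == 2 attribute
          * slots, matching the two slots reported by
          * glsl_count_attribute_slots().
          */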
      }

      assert(var->data.location_frac + comp_slots <= 8);
      uint8_t comp_mask = ((1 << comp_slots) - 1) << var->data.location_frac;
      unsigned comp_offset = var->data.location_frac;

      if (!varying_added) {
         add_var_xfb_varying(xfb, varyings, buffer, *offset, type);
      }

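      /* Emit one output record per vec4 slot the value touches.  For the
       * dvec3 example above (comp_mask == 0xfc), this produces two outputs:
       * one with component_mask 0xc at the current offset and one with
       * component_mask 0xf at offset + 8.
       */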
      while (comp_mask) {
         nir_xfb_output_info *output = &xfb->outputs[xfb->output_count++];

         output->buffer = buffer;
         output->offset = *offset;
         output->location = *location;
         output->component_mask = comp_mask & 0xf;
         output->component_offset = comp_offset;

         *offset += util_bitcount(output->component_mask) * 4;
         (*location)++;
         comp_mask >>= 4;
         comp_offset = 0;
      }
   }
}

static int
compare_xfb_varying_offsets(const void *_a, const void *_b)
{
   const nir_xfb_varying_info *a = _a, *b = _b;

   if (a->buffer != b->buffer)
      return a->buffer - b->buffer;

   return a->offset - b->offset;
}

static int
compare_xfb_output_offsets(const void *_a, const void *_b)
{
   const nir_xfb_output_info *a = _a, *b = _b;

   return a->offset - b->offset;
}

void
nir_shader_gather_xfb_info(nir_shader *shader)
{
   nir_gather_xfb_info_with_varyings(shader, NULL, NULL);
}
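
/* A minimal usage sketch: once the shader's output variables carry their
 * xfb_buffer/xfb_offset information, a driver can simply do
 *
 *    nir_shader_gather_xfb_info(shader);
 *    if (shader->xfb_info)
 *       setup_streamout_state(shader->xfb_info);
 *
 * where setup_streamout_state() stands in for driver-specific state setup;
 * the gathered info is stored in shader->xfb_info and owned by the shader.
 */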

void
nir_gather_xfb_info_with_varyings(nir_shader *shader,
                                  void *mem_ctx,
                                  nir_xfb_varyings_info **varyings_info_out)
{
   assert(shader->info.stage == MESA_SHADER_VERTEX ||
          shader->info.stage == MESA_SHADER_TESS_EVAL ||
          shader->info.stage == MESA_SHADER_GEOMETRY);

   /* Compute the number of outputs we have.  This is simply the cumulative
    * number of locations consumed by all the variables.  If a location is
    * represented by multiple variables, each of them counts separately in
    * the number of outputs.  This is only an estimate: some variables may
    * have an xfb_buffer but no output, so it may end up larger than we need,
    * but it should be good enough for allocation.
    */
   unsigned num_outputs = 0;
   unsigned num_varyings = 0;
   nir_xfb_varyings_info *varyings_info = NULL;
   nir_foreach_shader_out_variable(var, shader) {
      if (var->data.explicit_xfb_buffer) {
         num_outputs += glsl_count_attribute_slots(var->type, false);
         num_varyings += glsl_varying_count(var->type);
      }
   }
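   /* For example, a captured dvec4[2] output contributes 4 to num_outputs
    * above, since each dvec4 element occupies two attribute slots.
    */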
   if (num_outputs == 0 || num_varyings == 0)
      return;

   nir_xfb_info *xfb = nir_xfb_info_create(shader, num_outputs);
   if (varyings_info_out != NULL) {
      *varyings_info_out = nir_xfb_varyings_info_create(mem_ctx, num_varyings);
      varyings_info = *varyings_info_out;
   }

   /* Walk the list of outputs and add them to the array */
   nir_foreach_shader_out_variable(var, shader) {
      if (!var->data.explicit_xfb_buffer)
         continue;

      unsigned location = var->data.location;

      /* Whether we have an array of blocks can't be determined just by
       * checking for an interface type that is an array, because after
       * splitting we can end up with a split struct that itself contains an
       * array.
       */
      bool is_array_block = var->interface_type != NULL &&
         glsl_type_is_array(var->type) &&
         glsl_without_array(var->type) == var->interface_type;
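
      /* For example (a GLSL sketch), an output block array such as
       *
       *    layout(xfb_buffer = 0) out Blk { vec4 a; float b; } blk[2];
       *
       * matches this test, and each array element is captured into a
       * consecutive buffer (var->data.xfb.buffer + b) below.
       */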

      if (var->data.explicit_offset && !is_array_block) {
         unsigned offset = var->data.offset;
         add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb.buffer,
                             &location, &offset, var->type, false);
      } else if (is_array_block) {
         assert(glsl_type_is_struct_or_ifc(var->interface_type));

         unsigned aoa_size = glsl_get_aoa_size(var->type);
         const struct glsl_type *itype = var->interface_type;
         unsigned nfields = glsl_get_length(itype);
         for (unsigned b = 0; b < aoa_size; b++) {
            for (unsigned f = 0; f < nfields; f++) {
               int foffset = glsl_get_struct_field_offset(itype, f);
               const struct glsl_type *ftype = glsl_get_struct_field(itype, f);
               if (foffset < 0) {
                  location += glsl_count_attribute_slots(ftype, false);
                  continue;
               }

               unsigned offset = foffset;
               add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb.buffer + b,
                                   &location, &offset, ftype, false);
            }
         }
      }
   }

   /* Everything is easier in the state setup code if outputs and varyings are
    * sorted in order of output offset (and buffer for varyings).
    */
   qsort(xfb->outputs, xfb->output_count, sizeof(xfb->outputs[0]),
         compare_xfb_output_offsets);

   if (varyings_info != NULL) {
      qsort(varyings_info->varyings, varyings_info->varying_count,
            sizeof(varyings_info->varyings[0]),
            compare_xfb_varying_offsets);
   }

#ifndef NDEBUG
   /* Finally, do a sanity check */
   unsigned max_offset[NIR_MAX_XFB_BUFFERS] = {0};
   for (unsigned i = 0; i < xfb->output_count; i++) {
      assert(xfb->outputs[i].offset >= max_offset[xfb->outputs[i].buffer]);
      assert(xfb->outputs[i].component_mask != 0);
      unsigned slots = util_bitcount(xfb->outputs[i].component_mask);
      max_offset[xfb->outputs[i].buffer] = xfb->outputs[i].offset + slots * 4;
   }
#endif

   ralloc_free(shader->xfb_info);
   shader->xfb_info = xfb;
}

static int
get_xfb_out_sort_index(const nir_xfb_output_info *a)
{
   /* Return the maximum number to put dummy components at the end. */
   if (!a->component_mask)
      return MAX_XFB_BUFFERS << 26;

   return ((uint32_t)a->buffer << 26) | /* 2 bits for the buffer */
          /* 10 bits for the component location (256 * 4) */
          (((uint32_t)a->location * 4 + a->component_offset) << 16) |
          /* 16 bits for the offset */
          a->offset;
}
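
/* As a worked example of the key layout, an output with buffer == 1,
 * location == 3, component_offset == 2 and offset == 8 sorts with the key
 * (1 << 26) | ((3 * 4 + 2) << 16) | 8: buffer first, then component
 * location, then byte offset.
 */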

static int
compare_xfb_out(const void *pa, const void *pb)
{
   const nir_xfb_output_info *a = (const nir_xfb_output_info *)pa;
   const nir_xfb_output_info *b = (const nir_xfb_output_info *)pb;

   return get_xfb_out_sort_index(a) - get_xfb_out_sort_index(b);
}

/**
 * Gather transform feedback info from lowered IO intrinsics.
 *
 * Optionally fill slot_to_register, a table that translates gl_varying_slot
 * to "base" indices.
 */
nir_xfb_info *
nir_gather_xfb_info_from_intrinsics(nir_shader *nir,
                                    int slot_to_register[NUM_TOTAL_VARYING_SLOTS])
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   uint8_t buffer_to_stream[MAX_XFB_BUFFERS] = {0};
   uint8_t buffer_mask = 0;
   uint8_t stream_mask = 0;

   if (slot_to_register) {
      memset(slot_to_register, -1,
             sizeof(slot_to_register[0]) * NUM_TOTAL_VARYING_SLOTS);
   }

   /* Gather xfb outputs. */
   struct util_dynarray array = {0};

   nir_foreach_block(block, impl) {
      nir_foreach_instr(instr, block) {
         if (instr->type != nir_instr_type_intrinsic ||
             !nir_instr_xfb_write_mask(nir_instr_as_intrinsic(instr)))
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

         unsigned wr_mask = nir_intrinsic_write_mask(intr);

         while (wr_mask) {
            unsigned i = u_bit_scan(&wr_mask);
            unsigned index = nir_intrinsic_component(intr) + i;
            nir_io_xfb xfb = index < 2 ? nir_intrinsic_io_xfb(intr) :
                                         nir_intrinsic_io_xfb2(intr);
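            /* Components 0-1 are described by io_xfb and components 2-3 by
             * io_xfb2; "index % 2" below selects the matching half of
             * whichever struct applies.
             */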

            if (xfb.out[index % 2].num_components) {
               nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
               nir_xfb_output_info out;

               out.component_offset = index;
               out.component_mask =
                  BITFIELD_RANGE(index, xfb.out[index % 2].num_components);
               out.location = sem.location;
               out.buffer = xfb.out[index % 2].buffer;
               out.offset = (uint32_t)xfb.out[index % 2].offset * 4;
               util_dynarray_append(&array, nir_xfb_output_info, out);

               uint8_t stream = (sem.gs_streams >> (i * 2)) & 0x3;
               buffer_to_stream[out.buffer] = stream;
               buffer_mask |= BITFIELD_BIT(out.buffer);
               stream_mask |= BITFIELD_BIT(stream);

               if (slot_to_register)
                  slot_to_register[sem.location] = nir_intrinsic_base(intr);

               /* No elements before component_offset are allowed to be set. */
               assert(!(out.component_mask & BITFIELD_MASK(out.component_offset)));
            }
         }
      }
   }

   nir_xfb_output_info *outputs = (nir_xfb_output_info *)array.data;
   int count = util_dynarray_num_elements(&array, nir_xfb_output_info);

   if (!count)
      return NULL;

   if (count > 1) {
      /* Sort outputs by buffer, location, and component. */
      qsort(outputs, count, sizeof(nir_xfb_output_info), compare_xfb_out);

      /* Merge outputs referencing the same slot. */
      for (int i = 0; i < count - 1; i++) {
         nir_xfb_output_info *cur = &outputs[i];

         if (!cur->component_mask)
            continue;

         /* Outputs referencing the same buffer and location are contiguous. */
         for (int j = i + 1;
              j < count &&
              cur->buffer == outputs[j].buffer &&
              cur->location == outputs[j].location; j++) {
            if (outputs[j].component_mask &&
                outputs[j].offset - outputs[j].component_offset * 4 ==
                cur->offset - cur->component_offset * 4) {
               unsigned merged_offset = MIN2(cur->component_offset,
                                             outputs[j].component_offset);
               /* component_mask is relative to 0, not component_offset */
               unsigned merged_mask = cur->component_mask | outputs[j].component_mask;

               /* The component mask should have no holes after merging. */
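               /* e.g. masks 0x3 and 0xc merge into 0xf (no hole), while 0x1
                * and 0x4 would give 0x5, which fails the power-of-two test
                * below and is left unmerged.
                */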
               if (util_is_power_of_two_nonzero((merged_mask >> merged_offset) + 1)) {
                  /* Merge outputs. */
                  cur->component_offset = merged_offset;
                  cur->component_mask = merged_mask;
                  cur->offset = (uint32_t)cur->offset -
                                (uint32_t)cur->component_offset * 4 +
                                (uint32_t)merged_offset * 4;
                  /* Disable the other output. */
                  outputs[j].component_mask = 0;
               }
            }
         }
      }

      /* Sort outputs again to put disabled outputs at the end. */
      qsort(outputs, count, sizeof(nir_xfb_output_info), compare_xfb_out);

      /* Remove disabled outputs. */
      for (int i = count - 1; i >= 0 && !outputs[i].component_mask; i--)
         count = i;
   }

   for (unsigned i = 0; i < count; i++)
      assert(outputs[i].component_mask);

   /* Create nir_xfb_info. */
   nir_xfb_info *info = calloc(1, nir_xfb_info_size(count));
   if (!info) {
      util_dynarray_fini(&array);
      return NULL;
   }

   /* Fill nir_xfb_info. */
   info->buffers_written = buffer_mask;
   info->streams_written = stream_mask;
   memcpy(info->buffer_to_stream, buffer_to_stream, sizeof(buffer_to_stream));
   info->output_count = count;
   memcpy(info->outputs, outputs, count * sizeof(outputs[0]));

   /* Set strides. */
   for (unsigned i = 0; i < MAX_XFB_BUFFERS; i++) {
      if (buffer_mask & BITFIELD_BIT(i))
         info->buffers[i].stride = nir->info.xfb_stride[i];
   }

   /* Set varying_count. */
   for (unsigned i = 0; i < count; i++)
      info->buffers[outputs[i].buffer].varying_count++;

   util_dynarray_fini(&array);
   return info;
}
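
/* A minimal usage sketch, assuming IO has already been lowered to
 * intrinsics; the returned info is allocated with calloc() and is the
 * caller's responsibility to free:
 *
 *    nir_xfb_info *xfb = nir_gather_xfb_info_from_intrinsics(nir, NULL);
 *    if (xfb) {
 *       nir_print_xfb_info(xfb, stderr);
 *       free(xfb);
 *    }
 */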

void
nir_print_xfb_info(nir_xfb_info *info, FILE *fp)
{
   fprintf(fp, "buffers_written: 0x%x\n", info->buffers_written);
   fprintf(fp, "streams_written: 0x%x\n", info->streams_written);

   for (unsigned i = 0; i < NIR_MAX_XFB_BUFFERS; i++) {
      if (BITFIELD_BIT(i) & info->buffers_written) {
         fprintf(fp, "buffer%u: stride=%u varying_count=%u stream=%u\n", i,
                 info->buffers[i].stride,
                 info->buffers[i].varying_count,
                 info->buffer_to_stream[i]);
      }
   }

   fprintf(fp, "output_count: %u\n", info->output_count);

   for (unsigned i = 0; i < info->output_count; i++) {
      fprintf(fp, "output%u: buffer=%u, offset=%u, location=%u, "
                  "component_offset=%u, component_mask=0x%x\n",
              i, info->outputs[i].buffer,
              info->outputs[i].offset,
              info->outputs[i].location,
              info->outputs[i].component_offset,
              info->outputs[i].component_mask);
   }
}