1 /*
2 * Copyright 2010 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23 #include "pipe/p_defines.h"
24
25 #include "compiler/nir/nir.h"
26
27 #include "nv50/nv50_context.h"
28 #include "nv50/nv50_program.h"
29
30 #include "nv50_ir_driver.h"
31
/* Count the set bits in the low nibble of val; all higher bits are ignored. */
static inline unsigned
bitcount4(const uint32_t val)
{
   const uint32_t nibble = val & 0xf;

   return (nibble & 1) + ((nibble >> 1) & 1) +
          ((nibble >> 2) & 1) + ((nibble >> 3) & 1);
}
39
40 static int
nv50_vertprog_assign_slots(struct nv50_ir_prog_info_out * info)41 nv50_vertprog_assign_slots(struct nv50_ir_prog_info_out *info)
42 {
43 struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
44 unsigned i, n, c;
45
46 n = 0;
47 for (i = 0; i < info->numInputs; ++i) {
48 prog->in[i].id = i;
49 prog->in[i].sn = info->in[i].sn;
50 prog->in[i].si = info->in[i].si;
51 prog->in[i].hw = n;
52 prog->in[i].mask = info->in[i].mask;
53
54 prog->vp.attrs[(4 * i) / 32] |= info->in[i].mask << ((4 * i) % 32);
55
56 for (c = 0; c < 4; ++c)
57 if (info->in[i].mask & (1 << c))
58 info->in[i].slot[c] = n++;
59
60 if (info->in[i].sn == TGSI_SEMANTIC_PRIMID)
61 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID;
62 }
63 prog->in_nr = info->numInputs;
64
65 for (i = 0; i < info->numSysVals; ++i) {
66 switch (info->sv[i].sn) {
67 case TGSI_SEMANTIC_INSTANCEID:
68 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID;
69 continue;
70 case TGSI_SEMANTIC_VERTEXID:
71 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID;
72 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID_DRAW_ARRAYS_ADD_START;
73 continue;
74 case TGSI_SEMANTIC_PRIMID:
75 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID;
76 break;
77 default:
78 break;
79 }
80 }
81
82 /*
83 * Corner case: VP has no inputs, but we will still need to submit data to
84 * draw it. HW will shout at us and won't draw anything if we don't enable
85 * any input, so let's just pretend it's the first one.
86 */
87 if (prog->vp.attrs[0] == 0 &&
88 prog->vp.attrs[1] == 0 &&
89 prog->vp.attrs[2] == 0)
90 prog->vp.attrs[0] |= 0xf;
91
92 /* VertexID before InstanceID */
93 if (info->io.vertexId < info->numSysVals)
94 info->sv[info->io.vertexId].slot[0] = n++;
95 if (info->io.instanceId < info->numSysVals)
96 info->sv[info->io.instanceId].slot[0] = n++;
97
98 n = 0;
99 for (i = 0; i < info->numOutputs; ++i) {
100 switch (info->out[i].sn) {
101 case TGSI_SEMANTIC_PSIZE:
102 prog->vp.psiz = i;
103 break;
104 case TGSI_SEMANTIC_CLIPDIST:
105 prog->vp.clpd[info->out[i].si] = n;
106 break;
107 case TGSI_SEMANTIC_EDGEFLAG:
108 prog->vp.edgeflag = i;
109 break;
110 case TGSI_SEMANTIC_BCOLOR:
111 prog->vp.bfc[info->out[i].si] = i;
112 break;
113 case TGSI_SEMANTIC_LAYER:
114 prog->gp.has_layer = true;
115 prog->gp.layerid = n;
116 break;
117 case TGSI_SEMANTIC_VIEWPORT_INDEX:
118 prog->gp.has_viewport = true;
119 prog->gp.viewportid = n;
120 break;
121 default:
122 break;
123 }
124 prog->out[i].id = i;
125 prog->out[i].sn = info->out[i].sn;
126 prog->out[i].si = info->out[i].si;
127 prog->out[i].hw = n;
128 prog->out[i].mask = info->out[i].mask;
129
130 for (c = 0; c < 4; ++c)
131 if (info->out[i].mask & (1 << c))
132 info->out[i].slot[c] = n++;
133 }
134 prog->out_nr = info->numOutputs;
135 prog->max_out = n;
136 if (!prog->max_out)
137 prog->max_out = 1;
138
139 if (prog->vp.psiz < info->numOutputs)
140 prog->vp.psiz = prog->out[prog->vp.psiz].hw;
141
142 return 0;
143 }
144
145 static int
nv50_fragprog_assign_slots(struct nv50_ir_prog_info_out * info)146 nv50_fragprog_assign_slots(struct nv50_ir_prog_info_out *info)
147 {
148 struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
149 unsigned i, n, m, c;
150 unsigned nvary;
151 unsigned nflat;
152 unsigned nintp = 0;
153
154 /* count recorded non-flat inputs */
155 for (m = 0, i = 0; i < info->numInputs; ++i) {
156 switch (info->in[i].sn) {
157 case TGSI_SEMANTIC_POSITION:
158 continue;
159 default:
160 m += info->in[i].flat ? 0 : 1;
161 break;
162 }
163 }
164 /* careful: id may be != i in info->in[prog->in[i].id] */
165
166 /* Fill prog->in[] so that non-flat inputs are first and
167 * kick out special inputs that don't use the RESULT_MAP.
168 */
169 for (n = 0, i = 0; i < info->numInputs; ++i) {
170 if (info->in[i].sn == TGSI_SEMANTIC_POSITION) {
171 prog->fp.interp |= info->in[i].mask << 24;
172 for (c = 0; c < 4; ++c)
173 if (info->in[i].mask & (1 << c))
174 info->in[i].slot[c] = nintp++;
175 } else {
176 unsigned j = info->in[i].flat ? m++ : n++;
177
178 if (info->in[i].sn == TGSI_SEMANTIC_COLOR)
179 prog->vp.bfc[info->in[i].si] = j;
180 else if (info->in[i].sn == TGSI_SEMANTIC_PRIMID)
181 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID;
182
183 prog->in[j].id = i;
184 prog->in[j].mask = info->in[i].mask;
185 prog->in[j].sn = info->in[i].sn;
186 prog->in[j].si = info->in[i].si;
187 prog->in[j].linear = info->in[i].linear;
188
189 prog->in_nr++;
190 }
191 }
192 if (!(prog->fp.interp & (8 << 24))) {
193 ++nintp;
194 prog->fp.interp |= 8 << 24;
195 }
196
197 for (i = 0; i < prog->in_nr; ++i) {
198 int j = prog->in[i].id;
199
200 prog->in[i].hw = nintp;
201 for (c = 0; c < 4; ++c)
202 if (prog->in[i].mask & (1 << c))
203 info->in[j].slot[c] = nintp++;
204 }
205 /* (n == m) if m never increased, i.e. no flat inputs */
206 nflat = (n < m) ? (nintp - prog->in[n].hw) : 0;
207 nintp -= bitcount4(prog->fp.interp >> 24); /* subtract position inputs */
208 nvary = nintp - nflat;
209
210 prog->fp.interp |= nvary << NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT;
211 prog->fp.interp |= nintp << NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT;
212
213 /* put front/back colors right after HPOS */
214 prog->fp.colors = 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT;
215 for (i = 0; i < 2; ++i)
216 if (prog->vp.bfc[i] < 0xff)
217 prog->fp.colors += bitcount4(prog->in[prog->vp.bfc[i]].mask) << 16;
218
219 /* FP outputs */
220
221 if (info->prop.fp.numColourResults > 1)
222 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS;
223
224 for (i = 0; i < info->numOutputs; ++i) {
225 prog->out[i].id = i;
226 prog->out[i].sn = info->out[i].sn;
227 prog->out[i].si = info->out[i].si;
228 prog->out[i].mask = info->out[i].mask;
229
230 if (i == info->io.fragDepth || i == info->io.sampleMask)
231 continue;
232 prog->out[i].hw = info->out[i].si * 4;
233
234 for (c = 0; c < 4; ++c)
235 info->out[i].slot[c] = prog->out[i].hw + c;
236
237 prog->max_out = MAX2(prog->max_out, prog->out[i].hw + 4);
238 }
239
240 if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS) {
241 info->out[info->io.sampleMask].slot[0] = prog->max_out++;
242 prog->fp.has_samplemask = 1;
243 }
244
245 if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
246 info->out[info->io.fragDepth].slot[2] = prog->max_out++;
247
248 if (!prog->max_out)
249 prog->max_out = 4;
250
251 return 0;
252 }
253
254 static int
nv50_program_assign_varying_slots(struct nv50_ir_prog_info_out * info)255 nv50_program_assign_varying_slots(struct nv50_ir_prog_info_out *info)
256 {
257 switch (info->type) {
258 case PIPE_SHADER_VERTEX:
259 return nv50_vertprog_assign_slots(info);
260 case PIPE_SHADER_GEOMETRY:
261 return nv50_vertprog_assign_slots(info);
262 case PIPE_SHADER_FRAGMENT:
263 return nv50_fragprog_assign_slots(info);
264 case PIPE_SHADER_COMPUTE:
265 return 0;
266 default:
267 return -1;
268 }
269 }
270
271 static struct nv50_stream_output_state *
nv50_program_create_strmout_state(const struct nv50_ir_prog_info_out * info,const struct pipe_stream_output_info * pso)272 nv50_program_create_strmout_state(const struct nv50_ir_prog_info_out *info,
273 const struct pipe_stream_output_info *pso)
274 {
275 struct nv50_stream_output_state *so;
276 unsigned b, i, c;
277 unsigned base[4];
278
279 so = MALLOC_STRUCT(nv50_stream_output_state);
280 if (!so)
281 return NULL;
282 memset(so->map, 0xff, sizeof(so->map));
283
284 for (b = 0; b < 4; ++b)
285 so->num_attribs[b] = 0;
286 for (i = 0; i < pso->num_outputs; ++i) {
287 unsigned end = pso->output[i].dst_offset + pso->output[i].num_components;
288 b = pso->output[i].output_buffer;
289 assert(b < 4);
290 so->num_attribs[b] = MAX2(so->num_attribs[b], end);
291 }
292
293 so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED;
294
295 so->stride[0] = pso->stride[0] * 4;
296 base[0] = 0;
297 for (b = 1; b < 4; ++b) {
298 assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]);
299 so->stride[b] = so->num_attribs[b] * 4;
300 if (so->num_attribs[b])
301 so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT;
302 base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4);
303 }
304 if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) {
305 assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX);
306 so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT;
307 }
308
309 so->map_size = base[3] + so->num_attribs[3];
310
311 for (i = 0; i < pso->num_outputs; ++i) {
312 const unsigned s = pso->output[i].start_component;
313 const unsigned p = pso->output[i].dst_offset;
314 const unsigned r = pso->output[i].register_index;
315 b = pso->output[i].output_buffer;
316
317 if (r >= info->numOutputs)
318 continue;
319
320 for (c = 0; c < pso->output[i].num_components; ++c)
321 so->map[base[b] + p + c] = info->out[r].slot[s + c];
322 }
323
324 return so;
325 }
326
327 bool
nv50_program_translate(struct nv50_program * prog,uint16_t chipset,struct util_debug_callback * debug)328 nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
329 struct util_debug_callback *debug)
330 {
331 struct nv50_ir_prog_info *info;
332 struct nv50_ir_prog_info_out info_out = {};
333 int i, ret;
334 const uint8_t map_undef = (prog->type == PIPE_SHADER_VERTEX) ? 0x40 : 0x80;
335
336 info = CALLOC_STRUCT(nv50_ir_prog_info);
337 if (!info)
338 return false;
339
340 info->type = prog->type;
341 info->target = chipset;
342
343 info->bin.sourceRep = prog->pipe.type;
344 switch (prog->pipe.type) {
345 case PIPE_SHADER_IR_TGSI:
346 info->bin.source = (void *)prog->pipe.tokens;
347 break;
348 case PIPE_SHADER_IR_NIR:
349 info->bin.source = (void *)nir_shader_clone(NULL, prog->pipe.ir.nir);
350 break;
351 default:
352 assert(!"unsupported IR!");
353 free(info);
354 return false;
355 }
356
357 info->bin.smemSize = prog->cp.smem_size;
358 info->io.auxCBSlot = 15;
359 info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
360 info->io.genUserClip = prog->vp.clpd_nr;
361 if (prog->fp.alphatest)
362 info->io.alphaRefBase = NV50_CB_AUX_ALPHATEST_OFFSET;
363
364 info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
365 info->io.bufInfoBase = NV50_CB_AUX_BUF_INFO(0);
366 info->io.sampleInfoBase = NV50_CB_AUX_SAMPLE_OFFSET;
367 info->io.msInfoCBSlot = 15;
368 info->io.msInfoBase = NV50_CB_AUX_MS_OFFSET;
369
370 info->io.membarOffset = NV50_CB_AUX_MEMBAR_OFFSET;
371 info->io.gmemMembar = 15;
372
373 info->assignSlots = nv50_program_assign_varying_slots;
374
375 prog->vp.bfc[0] = 0xff;
376 prog->vp.bfc[1] = 0xff;
377 prog->vp.edgeflag = 0xff;
378 prog->vp.clpd[0] = map_undef;
379 prog->vp.clpd[1] = map_undef;
380 prog->vp.psiz = map_undef;
381 prog->gp.has_layer = 0;
382 prog->gp.has_viewport = 0;
383
384 if (prog->type == PIPE_SHADER_COMPUTE)
385 info->prop.cp.inputOffset = 0x14;
386
387 info_out.driverPriv = prog;
388
389 #ifndef NDEBUG
390 info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
391 info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
392 info->omitLineNum = debug_get_num_option("NV50_PROG_DEBUG_OMIT_LINENUM", 0);
393 #else
394 info->optLevel = 3;
395 #endif
396
397 ret = nv50_ir_generate_code(info, &info_out);
398 if (ret) {
399 NOUVEAU_ERR("shader translation failed: %i\n", ret);
400 goto out;
401 }
402
403 prog->code = info_out.bin.code;
404 prog->code_size = info_out.bin.codeSize;
405 prog->fixups = info_out.bin.relocData;
406 prog->interps = info_out.bin.fixupData;
407 prog->max_gpr = MAX2(4, (info_out.bin.maxGPR >> 1) + 1);
408 prog->tls_space = info_out.bin.tlsSpace;
409 prog->cp.smem_size = info_out.bin.smemSize;
410 prog->mul_zero_wins = info->io.mul_zero_wins;
411 prog->vp.need_vertex_id = info_out.io.vertexId < PIPE_MAX_SHADER_INPUTS;
412
413 prog->vp.clip_enable = (1 << info_out.io.clipDistances) - 1;
414 prog->vp.cull_enable =
415 ((1 << info_out.io.cullDistances) - 1) << info_out.io.clipDistances;
416 prog->vp.clip_mode = 0;
417 for (i = 0; i < info_out.io.cullDistances; ++i)
418 prog->vp.clip_mode |= 1 << ((info_out.io.clipDistances + i) * 4);
419
420 if (prog->type == PIPE_SHADER_FRAGMENT) {
421 if (info_out.prop.fp.writesDepth) {
422 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z;
423 prog->fp.flags[1] = 0x11;
424 }
425 if (info_out.prop.fp.usesDiscard)
426 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL;
427 } else
428 if (prog->type == PIPE_SHADER_GEOMETRY) {
429 switch (info_out.prop.gp.outputPrim) {
430 case PIPE_PRIM_LINE_STRIP:
431 prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP;
432 break;
433 case PIPE_PRIM_TRIANGLE_STRIP:
434 prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP;
435 break;
436 case PIPE_PRIM_POINTS:
437 default:
438 assert(info_out.prop.gp.outputPrim == PIPE_PRIM_POINTS);
439 prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_POINTS;
440 break;
441 }
442 prog->gp.vert_count = CLAMP(info_out.prop.gp.maxVertices, 1, 1024);
443 } else
444 if (prog->type == PIPE_SHADER_COMPUTE) {
445 for (i = 0; i < NV50_MAX_GLOBALS; i++) {
446 prog->cp.gmem[i] = (struct nv50_gmem_state){
447 .valid = info_out.prop.cp.gmem[i].valid,
448 .image = info_out.prop.cp.gmem[i].image,
449 .slot = info_out.prop.cp.gmem[i].slot
450 };
451 }
452 }
453
454 if (prog->pipe.stream_output.num_outputs)
455 prog->so = nv50_program_create_strmout_state(&info_out,
456 &prog->pipe.stream_output);
457
458 util_debug_message(debug, SHADER_INFO,
459 "type: %d, local: %d, shared: %d, gpr: %d, inst: %d, loops: %d, bytes: %d",
460 prog->type, info_out.bin.tlsSpace, info_out.bin.smemSize,
461 prog->max_gpr, info_out.bin.instructions, info_out.loops,
462 info_out.bin.codeSize);
463
464 out:
465 if (info->bin.sourceRep == PIPE_SHADER_IR_NIR)
466 ralloc_free((void *)info->bin.source);
467 FREE(info);
468 return !ret;
469 }
470
471 bool
nv50_program_upload_code(struct nv50_context * nv50,struct nv50_program * prog)472 nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
473 {
474 struct nouveau_heap *heap;
475 int ret;
476 uint32_t size = align(prog->code_size, 0x40);
477 uint8_t prog_type;
478
479 switch (prog->type) {
480 case PIPE_SHADER_VERTEX: heap = nv50->screen->vp_code_heap; break;
481 case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break;
482 case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break;
483 case PIPE_SHADER_COMPUTE: heap = nv50->screen->fp_code_heap; break;
484 default:
485 assert(!"invalid program type");
486 return false;
487 }
488
489 ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
490 if (ret) {
491 /* Out of space: evict everything to compactify the code segment, hoping
492 * the working set is much smaller and drifts slowly. Improve me !
493 */
494 while (heap->next) {
495 struct nv50_program *evict = heap->next->priv;
496 if (evict)
497 nouveau_heap_free(&evict->mem);
498 }
499 debug_printf("WARNING: out of code space, evicting all shaders.\n");
500 ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
501 if (ret) {
502 NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
503 return false;
504 }
505 }
506
507 if (prog->type == PIPE_SHADER_COMPUTE) {
508 /* CP code must be uploaded in FP code segment. */
509 prog_type = 1;
510 } else {
511 prog->code_base = prog->mem->start;
512 prog_type = prog->type;
513 }
514
515 ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
516 if (ret < 0) {
517 nouveau_heap_free(&prog->mem);
518 return false;
519 }
520 if (ret > 0)
521 nv50->state.new_tls_space = true;
522
523 if (prog->fixups)
524 nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);
525 if (prog->interps)
526 nv50_ir_apply_fixups(prog->interps, prog->code,
527 prog->fp.force_persample_interp,
528 false /* flatshade */,
529 prog->fp.alphatest - 1,
530 false /* msaa */);
531
532 nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
533 (prog_type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
534 NOUVEAU_BO_VRAM, prog->code_size, prog->code);
535
536 BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
537 PUSH_DATA (nv50->base.pushbuf, 0);
538
539 return true;
540 }
541
542 void
nv50_program_destroy(struct nv50_context * nv50,struct nv50_program * p)543 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
544 {
545 const struct pipe_shader_state pipe = p->pipe;
546 const ubyte type = p->type;
547
548 if (p->mem)
549 nouveau_heap_free(&p->mem);
550
551 FREE(p->code);
552
553 FREE(p->fixups);
554 FREE(p->interps);
555 FREE(p->so);
556
557 memset(p, 0, sizeof(*p));
558
559 p->pipe = pipe;
560 p->type = type;
561 }
562