1 /*
2 * Copyright 2010 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23 #include "nv50_program.h"
24 #include "nv50_context.h"
25
26 #include "codegen/nv50_ir_driver.h"
27
/**
 * Count the set bits in the low four bits of \p val.
 * Bits above the low nibble are ignored.
 */
static INLINE unsigned
bitcount4(const uint32_t val)
{
   const uint32_t nibble = val & 0xf;

   /* Sum the four bits individually; nibble >> 3 is already 0 or 1. */
   return (nibble & 1) + ((nibble >> 1) & 1) + ((nibble >> 2) & 1) +
          (nibble >> 3);
}
35
36 static int
nv50_vertprog_assign_slots(struct nv50_ir_prog_info * info)37 nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info)
38 {
39 struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
40 unsigned i, n, c;
41
42 n = 0;
43 for (i = 0; i < info->numInputs; ++i) {
44 prog->in[i].id = i;
45 prog->in[i].sn = info->in[i].sn;
46 prog->in[i].si = info->in[i].si;
47 prog->in[i].hw = n;
48 prog->in[i].mask = info->in[i].mask;
49
50 prog->vp.attrs[(4 * i) / 32] |= info->in[i].mask << ((4 * i) % 32);
51
52 for (c = 0; c < 4; ++c)
53 if (info->in[i].mask & (1 << c))
54 info->in[i].slot[c] = n++;
55 }
56 prog->in_nr = info->numInputs;
57
58 for (i = 0; i < info->numSysVals; ++i) {
59 switch (info->sv[i].sn) {
60 case TGSI_SEMANTIC_INSTANCEID:
61 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID;
62 continue;
63 case TGSI_SEMANTIC_VERTEXID:
64 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID;
65 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_UNK12;
66 continue;
67 default:
68 break;
69 }
70 }
71
72 /*
73 * Corner case: VP has no inputs, but we will still need to submit data to
74 * draw it. HW will shout at us and won't draw anything if we don't enable
75 * any input, so let's just pretend it's the first one.
76 */
77 if (prog->vp.attrs[0] == 0 &&
78 prog->vp.attrs[1] == 0 &&
79 prog->vp.attrs[2] == 0)
80 prog->vp.attrs[0] |= 0xf;
81
82 /* VertexID before InstanceID */
83 if (info->io.vertexId < info->numSysVals)
84 info->sv[info->io.vertexId].slot[0] = n++;
85 if (info->io.instanceId < info->numSysVals)
86 info->sv[info->io.instanceId].slot[0] = n++;
87
88 n = 0;
89 for (i = 0; i < info->numOutputs; ++i) {
90 switch (info->out[i].sn) {
91 case TGSI_SEMANTIC_PSIZE:
92 prog->vp.psiz = i;
93 break;
94 case TGSI_SEMANTIC_CLIPDIST:
95 prog->vp.clpd[info->out[i].si] = n;
96 break;
97 case TGSI_SEMANTIC_EDGEFLAG:
98 prog->vp.edgeflag = i;
99 break;
100 case TGSI_SEMANTIC_BCOLOR:
101 prog->vp.bfc[info->out[i].si] = i;
102 break;
103 default:
104 break;
105 }
106 prog->out[i].id = i;
107 prog->out[i].sn = info->out[i].sn;
108 prog->out[i].si = info->out[i].si;
109 prog->out[i].hw = n;
110 prog->out[i].mask = info->out[i].mask;
111
112 for (c = 0; c < 4; ++c)
113 if (info->out[i].mask & (1 << c))
114 info->out[i].slot[c] = n++;
115 }
116 prog->out_nr = info->numOutputs;
117 prog->max_out = n;
118
119 if (prog->vp.psiz < info->numOutputs)
120 prog->vp.psiz = prog->out[prog->vp.psiz].hw;
121
122 return 0;
123 }
124
125 static int
nv50_fragprog_assign_slots(struct nv50_ir_prog_info * info)126 nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
127 {
128 struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
129 unsigned i, n, m, c;
130 unsigned nvary;
131 unsigned nflat;
132 unsigned nintp = 0;
133
134 /* count recorded non-flat inputs */
135 for (m = 0, i = 0; i < info->numInputs; ++i) {
136 switch (info->in[i].sn) {
137 case TGSI_SEMANTIC_POSITION:
138 case TGSI_SEMANTIC_FACE:
139 continue;
140 default:
141 m += info->in[i].flat ? 0 : 1;
142 break;
143 }
144 }
145 /* careful: id may be != i in info->in[prog->in[i].id] */
146
147 /* Fill prog->in[] so that non-flat inputs are first and
148 * kick out special inputs that don't use the RESULT_MAP.
149 */
150 for (n = 0, i = 0; i < info->numInputs; ++i) {
151 if (info->in[i].sn == TGSI_SEMANTIC_POSITION) {
152 prog->fp.interp |= info->in[i].mask << 24;
153 for (c = 0; c < 4; ++c)
154 if (info->in[i].mask & (1 << c))
155 info->in[i].slot[c] = nintp++;
156 } else
157 if (info->in[i].sn == TGSI_SEMANTIC_FACE) {
158 info->in[i].slot[0] = 255;
159 } else {
160 unsigned j = info->in[i].flat ? m++ : n++;
161
162 if (info->in[i].sn == TGSI_SEMANTIC_COLOR)
163 prog->vp.bfc[info->in[i].si] = j;
164
165 prog->in[j].id = i;
166 prog->in[j].mask = info->in[i].mask;
167 prog->in[j].sn = info->in[i].sn;
168 prog->in[j].si = info->in[i].si;
169 prog->in[j].linear = info->in[i].linear;
170
171 prog->in_nr++;
172 }
173 }
174 if (!(prog->fp.interp & (8 << 24))) {
175 ++nintp;
176 prog->fp.interp |= 8 << 24;
177 }
178
179 for (i = 0; i < prog->in_nr; ++i) {
180 int j = prog->in[i].id;
181
182 prog->in[i].hw = nintp;
183 for (c = 0; c < 4; ++c)
184 if (prog->in[i].mask & (1 << c))
185 info->in[j].slot[c] = nintp++;
186 }
187 /* (n == m) if m never increased, i.e. no flat inputs */
188 nflat = (n < m) ? (nintp - prog->in[n].hw) : 0;
189 nintp -= bitcount4(prog->fp.interp >> 24); /* subtract position inputs */
190 nvary = nintp - nflat;
191
192 prog->fp.interp |= nvary << NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT;
193 prog->fp.interp |= nintp << NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT;
194
195 /* put front/back colors right after HPOS */
196 prog->fp.colors = 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT;
197 for (i = 0; i < 2; ++i)
198 if (prog->vp.bfc[i] < 0xff)
199 prog->fp.colors += bitcount4(prog->in[prog->vp.bfc[i]].mask) << 16;
200
201 /* FP outputs */
202
203 if (info->prop.fp.numColourResults > 1)
204 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS;
205
206 for (i = 0; i < info->numOutputs; ++i) {
207 prog->out[i].id = i;
208 prog->out[i].sn = info->out[i].sn;
209 prog->out[i].si = info->out[i].si;
210 prog->out[i].mask = info->out[i].mask;
211
212 if (i == info->io.fragDepth || i == info->io.sampleMask)
213 continue;
214 prog->out[i].hw = info->out[i].si * 4;
215
216 for (c = 0; c < 4; ++c)
217 info->out[i].slot[c] = prog->out[i].hw + c;
218
219 prog->max_out = MAX2(prog->max_out, prog->out[i].hw + 4);
220 }
221
222 if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)
223 info->out[info->io.sampleMask].slot[0] = prog->max_out++;
224
225 if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
226 info->out[info->io.fragDepth].slot[2] = prog->max_out++;
227
228 if (!prog->max_out)
229 prog->max_out = 4;
230
231 return 0;
232 }
233
234 static int
nv50_program_assign_varying_slots(struct nv50_ir_prog_info * info)235 nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
236 {
237 switch (info->type) {
238 case PIPE_SHADER_VERTEX:
239 return nv50_vertprog_assign_slots(info);
240 case PIPE_SHADER_GEOMETRY:
241 return nv50_vertprog_assign_slots(info);
242 case PIPE_SHADER_FRAGMENT:
243 return nv50_fragprog_assign_slots(info);
244 default:
245 return -1;
246 }
247 }
248
249 static struct nv50_stream_output_state *
nv50_program_create_strmout_state(const struct nv50_ir_prog_info * info,const struct pipe_stream_output_info * pso)250 nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
251 const struct pipe_stream_output_info *pso)
252 {
253 struct nv50_stream_output_state *so;
254 unsigned b, i, c;
255 unsigned base[4];
256
257 so = MALLOC_STRUCT(nv50_stream_output_state);
258 if (!so)
259 return NULL;
260 memset(so->map, 0xff, sizeof(so->map));
261
262 for (b = 0; b < 4; ++b)
263 so->num_attribs[b] = 0;
264 for (i = 0; i < pso->num_outputs; ++i) {
265 unsigned end = pso->output[i].dst_offset + pso->output[i].num_components;
266 b = pso->output[i].output_buffer;
267 assert(b < 4);
268 so->num_attribs[b] = MAX2(so->num_attribs[b], end);
269 }
270
271 so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED;
272
273 so->stride[0] = pso->stride[0] * 4;
274 base[0] = 0;
275 for (b = 1; b < 4; ++b) {
276 assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]);
277 so->stride[b] = so->num_attribs[b] * 4;
278 if (so->num_attribs[b])
279 so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT;
280 base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4);
281 }
282 if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) {
283 assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX);
284 so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT;
285 }
286
287 so->map_size = base[3] + so->num_attribs[3];
288
289 for (i = 0; i < pso->num_outputs; ++i) {
290 const unsigned s = pso->output[i].start_component;
291 const unsigned p = pso->output[i].dst_offset;
292 const unsigned r = pso->output[i].register_index;
293 b = pso->output[i].output_buffer;
294
295 for (c = 0; c < pso->output[i].num_components; ++c)
296 so->map[base[b] + p + c] = info->out[r].slot[s + c];
297 }
298
299 return so;
300 }
301
302 boolean
nv50_program_translate(struct nv50_program * prog,uint16_t chipset)303 nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
304 {
305 struct nv50_ir_prog_info *info;
306 int ret;
307 const uint8_t map_undef = (prog->type == PIPE_SHADER_VERTEX) ? 0x40 : 0x80;
308
309 info = CALLOC_STRUCT(nv50_ir_prog_info);
310 if (!info)
311 return FALSE;
312
313 info->type = prog->type;
314 info->target = chipset;
315 info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
316 info->bin.source = (void *)prog->pipe.tokens;
317
318 info->io.ucpBinding = 15;
319 info->io.ucpBase = 0;
320 info->io.genUserClip = prog->vp.clpd_nr;
321
322 info->assignSlots = nv50_program_assign_varying_slots;
323
324 prog->vp.bfc[0] = 0xff;
325 prog->vp.bfc[1] = 0xff;
326 prog->vp.edgeflag = 0xff;
327 prog->vp.clpd[0] = map_undef;
328 prog->vp.clpd[1] = map_undef;
329 prog->vp.psiz = map_undef;
330 prog->gp.primid = 0x80;
331
332 info->driverPriv = prog;
333
334 #ifdef DEBUG
335 info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
336 info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
337 #else
338 info->optLevel = 3;
339 #endif
340
341 ret = nv50_ir_generate_code(info);
342 if (ret) {
343 NOUVEAU_ERR("shader translation failed: %i\n", ret);
344 goto out;
345 }
346 if (info->bin.syms) /* we don't need them yet */
347 FREE(info->bin.syms);
348
349 prog->code = info->bin.code;
350 prog->code_size = info->bin.codeSize;
351 prog->fixups = info->bin.relocData;
352 prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
353 prog->tls_space = info->bin.tlsSpace;
354
355 if (prog->type == PIPE_SHADER_FRAGMENT) {
356 if (info->prop.fp.writesDepth) {
357 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z;
358 prog->fp.flags[1] = 0x11;
359 }
360 if (info->prop.fp.usesDiscard)
361 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL;
362 }
363
364 if (prog->pipe.stream_output.num_outputs)
365 prog->so = nv50_program_create_strmout_state(info,
366 &prog->pipe.stream_output);
367
368 out:
369 FREE(info);
370 return !ret;
371 }
372
373 boolean
nv50_program_upload_code(struct nv50_context * nv50,struct nv50_program * prog)374 nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
375 {
376 struct nouveau_heap *heap;
377 int ret;
378 uint32_t size = align(prog->code_size, 0x40);
379
380 switch (prog->type) {
381 case PIPE_SHADER_VERTEX: heap = nv50->screen->vp_code_heap; break;
382 case PIPE_SHADER_GEOMETRY: heap = nv50->screen->fp_code_heap; break;
383 case PIPE_SHADER_FRAGMENT: heap = nv50->screen->gp_code_heap; break;
384 default:
385 assert(!"invalid program type");
386 return FALSE;
387 }
388
389 ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
390 if (ret) {
391 /* Out of space: evict everything to compactify the code segment, hoping
392 * the working set is much smaller and drifts slowly. Improve me !
393 */
394 while (heap->next) {
395 struct nv50_program *evict = heap->next->priv;
396 if (evict)
397 nouveau_heap_free(&evict->mem);
398 }
399 debug_printf("WARNING: out of code space, evicting all shaders.\n");
400 }
401 prog->code_base = prog->mem->start;
402
403 ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
404 if (ret < 0)
405 return FALSE;
406 if (ret > 0)
407 nv50->state.new_tls_space = TRUE;
408
409 if (prog->fixups)
410 nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);
411
412 nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
413 (prog->type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
414 NOUVEAU_BO_VRAM, prog->code_size, prog->code);
415
416 BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
417 PUSH_DATA (nv50->base.pushbuf, 0);
418
419 return TRUE;
420 }
421
422 void
nv50_program_destroy(struct nv50_context * nv50,struct nv50_program * p)423 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
424 {
425 const struct pipe_shader_state pipe = p->pipe;
426 const ubyte type = p->type;
427
428 if (p->mem)
429 nouveau_heap_free(&p->mem);
430
431 if (p->code)
432 FREE(p->code);
433
434 if (p->fixups)
435 FREE(p->fixups);
436
437 if (p->so)
438 FREE(p->so);
439
440 memset(p, 0, sizeof(*p));
441
442 p->pipe = pipe;
443 p->type = type;
444 }
445