/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */

#include <pthread.h>
#include "main/imports.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_to_nir.h"
#include "program/program.h"
#include "program/programopt.h"
#include "tnl/tnl.h"
#include "util/ralloc.h"
#include "compiler/glsl/ir.h"
#include "compiler/glsl/glsl_to_nir.h"
#include "compiler/nir/nir_serialize.h"

#include "brw_program.h"
#include "brw_context.h"
#include "compiler/brw_nir.h"
#include "brw_defines.h"
#include "intel_batchbuffer.h"

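/* Assign locations to the uniform variables and lower uniform access to
 * explicit byte offsets, using whichever layout (scalar or vec4) matches the
 * backend that will consume this shader.
 */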
static bool
brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
{
   if (is_scalar) {
      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
                               type_size_scalar_bytes);
      return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
   } else {
      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
                               type_size_vec4_bytes);
      return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
   }
}

nir_shader *
brw_create_nir(struct brw_context *brw,
               const struct gl_shader_program *shader_prog,
               struct gl_program *prog,
               gl_shader_stage stage,
               bool is_scalar)
{
   struct gl_context *ctx = &brw->ctx;
   const nir_shader_compiler_options *options =
      ctx->Const.ShaderCompilerOptions[stage].NirOptions;
   nir_shader *nir;

   /* First, lower the GLSL IR or Mesa IR to NIR */
   if (shader_prog) {
      nir = glsl_to_nir(shader_prog, stage, options);
      nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out);
      nir_lower_returns(nir);
      nir_validate_shader(nir);
      NIR_PASS_V(nir, nir_lower_io_to_temporaries,
                 nir_shader_get_entrypoint(nir), true, false);
   } else {
      nir = prog_to_nir(prog, options);
      NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
   }
   nir_validate_shader(nir);

   /* Lower PatchVerticesIn from a system value to a uniform.  This needs to
    * happen before brw_preprocess_nir, since that pass lowers system values
    * to intrinsics.
    *
    * We only do this for TES if no TCS is present, since otherwise we know
    * the number of vertices in the patch at link time and can lower it
    * directly to a constant.  That constant lowering is done below with
    * nir_lower_tes_patch_vertices, which needs to run after
    * brw_preprocess_nir has turned the system values into intrinsics.
    */
   const bool lower_patch_vertices_in_to_uniform =
      (stage == MESA_SHADER_TESS_CTRL && brw->screen->devinfo.gen >= 8) ||
      (stage == MESA_SHADER_TESS_EVAL &&
       !shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]);

   if (lower_patch_vertices_in_to_uniform)
      brw_nir_lower_patch_vertices_in_to_uniform(nir);

   nir = brw_preprocess_nir(brw->screen->compiler, nir);

   if (stage == MESA_SHADER_TESS_EVAL && !lower_patch_vertices_in_to_uniform) {
      assert(shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]);
      struct gl_linked_shader *linked_tcs =
         shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
      uint32_t patch_vertices = linked_tcs->Program->info.tess.tcs_vertices_out;
      nir_lower_tes_patch_vertices(nir, patch_vertices);
   }

   if (stage == MESA_SHADER_FRAGMENT) {
      static const struct nir_lower_wpos_ytransform_options wpos_options = {
         .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
         .fs_coord_pixel_center_integer = 1,
         .fs_coord_origin_upper_left = 1,
      };

      bool progress = false;
      NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
      if (progress) {
         _mesa_add_state_reference(prog->Parameters,
                                   (gl_state_index *) wpos_options.state_tokens);
      }
   }

   NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);

   return nir;
}

void
brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
{
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* Copy the info we just generated back into the gl_program */
   const char *prog_name = prog->info.name;
   const char *prog_label = prog->info.label;
   prog->info = nir->info;
   prog->info.name = prog_name;
   prog->info.label = prog_label;
}

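/* Program IDs come from a per-screen atomic counter, so every program gets a
 * distinct ID even across contexts that share the same screen.
 */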
static unsigned
get_new_program_id(struct intel_screen *screen)
{
   return p_atomic_inc_return(&screen->program_id);
}

static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
                                        GLuint id, bool is_arb_asm)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_program *prog = rzalloc(NULL, struct brw_program);

   if (prog) {
      prog->id = get_new_program_id(brw->screen);

      return _mesa_init_gl_program(&prog->program, target, id, is_arb_asm);
   }

   return NULL;
}

static void brwDeleteProgram( struct gl_context *ctx,
                              struct gl_program *prog )
{
   struct brw_context *brw = brw_context(ctx);

   /* Beware!  prog's refcount has reached zero, and it's about to be freed.
    *
    * In brw_upload_pipeline_state(), we compare brw->programs[i] to
    * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
    * pointer has changed.
    *
    * We cannot leave brw->programs[i] as a dangling pointer to the dead
    * program.  malloc() may allocate the same memory for a new gl_program,
    * causing us to see matching pointers...but totally different programs.
    *
    * We cannot set brw->programs[i] to NULL, either.  If we've deleted the
    * active program, Mesa may set ctx->FooProgram._Current to NULL.  That
    * would cause us to see matching pointers (NULL == NULL), and fail to
    * detect that a program has changed since our last draw.
    *
    * So, set it to a bogus gl_program pointer that will never match,
    * causing us to properly reevaluate the state on our next draw.
    *
    * Getting this wrong causes heisenbugs which are very hard to catch,
    * as you need a very specific allocation pattern to hit the problem.
    */
   static const struct gl_program deleted_program;

   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
      if (brw->programs[i] == prog)
         brw->programs[i] = (struct gl_program *) &deleted_program;
   }

   _mesa_delete_program( ctx, prog );
}


static GLboolean
brwProgramStringNotify(struct gl_context *ctx,
                       GLenum target,
                       struct gl_program *prog)
{
   assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant);

   struct brw_context *brw = brw_context(ctx);
   const struct brw_compiler *compiler = brw->screen->compiler;

   switch (target) {
   case GL_FRAGMENT_PROGRAM_ARB: {
      struct brw_program *newFP = brw_program(prog);
      const struct brw_program *curFP =
         brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]);

      if (newFP == curFP)
         brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
      newFP->id = get_new_program_id(brw->screen);

      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);

      brw_shader_gather_info(prog->nir, prog);

      brw_fs_precompile(ctx, prog);
      break;
   }
   case GL_VERTEX_PROGRAM_ARB: {
      struct brw_program *newVP = brw_program(prog);
      const struct brw_program *curVP =
         brw_program_const(brw->programs[MESA_SHADER_VERTEX]);

      if (newVP == curVP)
         brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM;
      if (newVP->program.arb.IsPositionInvariant) {
         _mesa_insert_mvp_code(ctx, &newVP->program);
      }
      newVP->id = get_new_program_id(brw->screen);

      /* Also tell tnl about it:
       */
      _tnl_program_string(ctx, target, prog);

      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
                                 compiler->scalar_stage[MESA_SHADER_VERTEX]);

      brw_shader_gather_info(prog->nir, prog);

      brw_vs_precompile(ctx, prog);
      break;
   }
   default:
      /*
       * driver->ProgramStringNotify is only called for ARB programs, fixed
       * function vertex programs, and ir_to_mesa (which isn't used by the
       * i965 back-end).  Therefore, even after geometry shaders are added,
       * this function should only ever be called with a target of
       * GL_VERTEX_PROGRAM_ARB or GL_FRAGMENT_PROGRAM_ARB.
       */
      unreachable("Unexpected target in brwProgramStringNotify");
   }

   return true;
}

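/* Translate the GL_*_BARRIER_BIT flags from glMemoryBarrier() into the
 * PIPE_CONTROL cache flushes and invalidates the hardware needs.
 */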
static void
brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned bits = (PIPE_CONTROL_DATA_CACHE_FLUSH |
                    PIPE_CONTROL_NO_WRITE |
                    PIPE_CONTROL_CS_STALL);
   assert(devinfo->gen >= 7 && devinfo->gen <= 10);

   if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
                   GL_ELEMENT_ARRAY_BARRIER_BIT |
                   GL_COMMAND_BARRIER_BIT))
      bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;

   if (barriers & GL_UNIFORM_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_CONST_CACHE_INVALIDATE);

   if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
      bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;

   if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT |
                   GL_PIXEL_BUFFER_BARRIER_BIT))
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   /* Typed surface messages are handled by the render cache on IVB, so we
    * need to flush it too.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell)
      bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;

   brw_emit_pipe_control_flush(brw, bits);
}

static void
brw_blend_barrier(struct gl_context *ctx)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   if (!ctx->Extensions.MESA_shader_framebuffer_fetch) {
      if (devinfo->gen >= 6) {
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                     PIPE_CONTROL_CS_STALL);
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
      } else {
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_RENDER_TARGET_FLUSH);
      }
   }
}

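/**
 * Make sure \p *scratch_bo points at a buffer of at least \p size bytes,
 * replacing any existing buffer that is too small.
 */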
void
brw_get_scratch_bo(struct brw_context *brw,
                   struct brw_bo **scratch_bo, int size)
{
   struct brw_bo *old_bo = *scratch_bo;

   if (old_bo && old_bo->size < size) {
      brw_bo_unreference(old_bo);
      old_bo = NULL;
   }

   if (!old_bo) {
      *scratch_bo = brw_bo_alloc(brw->bufmgr, "scratch bo", size, 4096);
   }
}

/**
 * Reserve enough scratch space for the given stage to hold \p per_thread_size
 * bytes for each thread the stage can have in flight.
 */
void
brw_alloc_stage_scratch(struct brw_context *brw,
                        struct brw_stage_state *stage_state,
                        unsigned per_thread_size)
{
   if (stage_state->per_thread_scratch >= per_thread_size)
      return;

   stage_state->per_thread_scratch = per_thread_size;

   if (stage_state->scratch_bo)
      brw_bo_unreference(stage_state->scratch_bo);

   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned thread_count;
   switch (stage_state->stage) {
   case MESA_SHADER_VERTEX:
      thread_count = devinfo->max_vs_threads;
      break;
   case MESA_SHADER_TESS_CTRL:
      thread_count = devinfo->max_tcs_threads;
      break;
   case MESA_SHADER_TESS_EVAL:
      thread_count = devinfo->max_tes_threads;
      break;
   case MESA_SHADER_GEOMETRY:
      thread_count = devinfo->max_gs_threads;
      break;
   case MESA_SHADER_FRAGMENT:
      thread_count = devinfo->max_wm_threads;
      break;
   case MESA_SHADER_COMPUTE: {
      unsigned subslices = MAX2(brw->screen->subslice_total, 1);

      /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
       *
       *    "Scratch Space per slice is computed based on 4 sub-slices.  SW
       *     must allocate scratch space enough so that each slice has 4
       *     slices allowed."
       *
       * According to the other driver team, this applies to compute shaders
       * as well.  This is not currently documented at all.
       *
       * brw->screen->subslice_total is the TOTAL number of subslices, but we
       * want to treat each slice as if it had 4 subslices, regardless of the
       * actual subslice count.
       */
      if (devinfo->gen >= 9)
         subslices = 4 * brw->screen->devinfo.num_slices;

      unsigned scratch_ids_per_subslice;
      if (devinfo->is_haswell) {
         /* WaCSScratchSize:hsw
          *
          * Haswell's scratch space address calculation appears to be sparse
          * rather than tightly packed.  The Thread ID has bits indicating
          * which subslice, EU within a subslice, and thread within an EU it
          * is.  There's a maximum of two slices and two subslices, so these
          * can be stored with a single bit.  Even though there are only 10
          * EUs per subslice, this is stored in 4 bits, so there's an
          * effective maximum value of 16 EUs.  Similarly, although there are
          * only 7 threads per EU, this is stored in a 3 bit number, giving
          * an effective maximum value of 8 threads per EU.
          *
          * This means that we need to use 16 * 8 instead of 10 * 7 for the
          * number of threads per subslice.
          */
         scratch_ids_per_subslice = 16 * 8;
      } else if (devinfo->is_cherryview) {
         /* Cherryview devices have either 6 or 8 EUs per subslice, and each
          * EU has 7 threads.  The 6 EU devices appear to calculate thread
          * IDs as if they had 8 EUs.
          */
         scratch_ids_per_subslice = 8 * 7;
      } else {
         scratch_ids_per_subslice = devinfo->max_cs_threads;
      }

      thread_count = scratch_ids_per_subslice * subslices;
      break;
   }
   default:
      unreachable("Unsupported stage!");
   }

   stage_state->scratch_bo =
      brw_bo_alloc(brw->bufmgr, "shader scratch space",
                   per_thread_size * thread_count, 4096);
}

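/* Plug the i965 program management hooks into the core Mesa driver function
 * table.
 */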
void brwInitFragProgFuncs( struct dd_function_table *functions )
{
   assert(functions->ProgramStringNotify == _tnl_program_string);

   functions->NewProgram = brwNewProgram;
   functions->DeleteProgram = brwDeleteProgram;
   functions->ProgramStringNotify = brwProgramStringNotify;

   functions->LinkShader = brw_link_shader;

   functions->MemoryBarrier = brw_memory_barrier;
   functions->BlendBarrier = brw_blend_barrier;
}

struct shader_times {
   uint64_t time;
   uint64_t written;
   uint64_t reset;
};

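/* The shader_time BO holds three BRW_SHADER_TIME_STRIDE-sized slots per
 * entry: accumulated cycles, the number of times the value was written, and
 * the number of times it was reset (see brw_collect_shader_time).
 */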
void
brw_init_shader_time(struct brw_context *brw)
{
   const int max_entries = 2048;
   brw->shader_time.bo =
      brw_bo_alloc(brw->bufmgr, "shader time",
                   max_entries * BRW_SHADER_TIME_STRIDE * 3, 4096);
   brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
   brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
   brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
                                          max_entries);
   brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
                                               max_entries);
   brw->shader_time.max_entries = max_entries;
}

static int
compare_time(const void *a, const void *b)
{
   uint64_t * const *a_val = a;
   uint64_t * const *b_val = b;

   /* We don't just subtract because we're turning the value to an int. */
   if (**a_val < **b_val)
      return -1;
   else if (**a_val == **b_val)
      return 0;
   else
      return 1;
}

static void
print_shader_time_line(const char *stage, const char *name,
                       int shader_num, uint64_t time, uint64_t total)
{
   fprintf(stderr, "%-6s%-18s", stage, name);

   if (shader_num != 0)
      fprintf(stderr, "%4d: ", shader_num);
   else
      fprintf(stderr, "    : ");

   fprintf(stderr, "%16lld (%7.2f Gcycles) %4.1f%%\n",
           (long long)time,
           (double)time / 1000000000.0,
           (double)time / total * 100.0);
}

static void
brw_report_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo || !brw->shader_time.num_entries)
      return;

   uint64_t scaled[brw->shader_time.num_entries];
   uint64_t *sorted[brw->shader_time.num_entries];
   uint64_t total_by_type[ST_CS + 1];
   memset(total_by_type, 0, sizeof(total_by_type));
   double total = 0;
   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      uint64_t written = 0, reset = 0;
      enum shader_time_shader_type type = brw->shader_time.types[i];

      sorted[i] = &scaled[i];

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_CS:
         written = brw->shader_time.cumulative[i].written;
         reset = brw->shader_time.cumulative[i].reset;
         break;

      default:
         /* I sometimes want to print things that aren't the 3 shader times.
          * Just print the sum in that case.
          */
         written = 1;
         reset = 0;
         break;
      }

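      /* Only "written" samples contributed to the accumulated time, so scale
       * by (written + reset) / written to estimate the time the discarded
       * ("reset") samples would have added.
       */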
      uint64_t time = brw->shader_time.cumulative[i].time;
      if (written) {
         scaled[i] = time / written * (written + reset);
      } else {
         scaled[i] = time;
      }

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_CS:
         total_by_type[type] += scaled[i];
         break;
      default:
         break;
      }

      total += scaled[i];
   }

   if (total == 0) {
      fprintf(stderr, "No shader time collected yet\n");
      return;
   }

   qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);

   fprintf(stderr, "\n");
   fprintf(stderr, "type ID cycles spent %% of total\n");
   for (int s = 0; s < brw->shader_time.num_entries; s++) {
      const char *stage;
      /* Work back from the sorted pointer to an index into scaled[]. */
      int i = sorted[s] - scaled;

      if (scaled[i] == 0)
         continue;

      int shader_num = brw->shader_time.ids[i];
      const char *shader_name = brw->shader_time.names[i];

      switch (brw->shader_time.types[i]) {
      case ST_VS:
         stage = "vs";
         break;
      case ST_TCS:
         stage = "tcs";
         break;
      case ST_TES:
         stage = "tes";
         break;
      case ST_GS:
         stage = "gs";
         break;
      case ST_FS8:
         stage = "fs8";
         break;
      case ST_FS16:
         stage = "fs16";
         break;
      case ST_CS:
         stage = "cs";
         break;
      default:
         stage = "other";
         break;
      }

      print_shader_time_line(stage, shader_name, shader_num,
                             scaled[i], total);
   }

   fprintf(stderr, "\n");
   print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
   print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total);
   print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total);
   print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
   print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
   print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
   print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
}

static void
brw_collect_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo)
      return;

   /* This probably stalls on the last rendering.  We could fix that by
    * delaying reading the reports, but it doesn't look like it's a big
    * overhead compared to the cost of tracking the time in the first place.
    */
   void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);

   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;

      brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4];
      brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4];
      brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4];
   }

   /* Zero the BO out to clear it out for our next collection.
    */
   memset(bo_map, 0, brw->shader_time.bo->size);
   brw_bo_unmap(brw->shader_time.bo);
}

void
brw_collect_and_report_shader_time(struct brw_context *brw)
{
   brw_collect_shader_time(brw);

   if (brw->shader_time.report_time == 0 ||
       get_time() - brw->shader_time.report_time >= 1.0) {
      brw_report_shader_time(brw);
      brw->shader_time.report_time = get_time();
   }
}

/**
 * Chooses an index in the shader_time buffer and sets up tracking information
 * for our printouts.
 *
 * Note that this holds on to references to the underlying programs, which may
 * change their lifetimes compared to normal operation.
 */
int
brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog,
                          enum shader_time_shader_type type, bool is_glsl_sh)
{
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;

   const char *name;
   if (prog->Id == 0) {
      name = "ff";
   } else if (is_glsl_sh) {
      name = prog->info.label ?
         ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl";
   } else {
      name = "prog";
   }

   brw->shader_time.names[shader_time_index] = name;
   brw->shader_time.ids[shader_time_index] = prog->Id;

   return shader_time_index;
}

void
brw_destroy_shader_time(struct brw_context *brw)
{
   brw_bo_unreference(brw->shader_time.bo);
   brw->shader_time.bo = NULL;
}

void
brw_stage_prog_data_free(const void *p)
{
   struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;

   ralloc_free(prog_data->param);
   ralloc_free(prog_data->pull_param);
}

void
brw_dump_arb_asm(const char *stage, struct gl_program *prog)
{
   fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
           stage, prog->Id, stage);
   _mesa_print_program(prog);
}

void
brw_setup_tex_for_precompile(struct brw_context *brw,
                             struct brw_sampler_prog_key_data *tex,
                             struct gl_program *prog)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8;
   unsigned sampler_count = util_last_bit(prog->SamplersUsed);
   for (unsigned i = 0; i < sampler_count; i++) {
      if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         tex->swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         tex->swizzles[i] = SWIZZLE_XYZW;
      }
   }
}

/**
 * Sets up the starting offsets for the groups of binding table entries
 * common to all pipeline stages.
 *
 * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're
 * unused but also make sure that addition of small offsets to them will
 * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES.
 */
uint32_t
brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
                                        const struct gl_program *prog,
                                        struct brw_stage_prog_data *stage_prog_data,
                                        uint32_t next_binding_table_offset)
{
   int num_textures = util_last_bit(prog->SamplersUsed);

   stage_prog_data->binding_table.texture_start = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   if (prog->info.num_ubos) {
      assert(prog->info.num_ubos <= BRW_MAX_UBO);
      stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_ubos;
   } else {
      stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
   }

   if (prog->info.num_ssbos || prog->info.num_abos) {
      assert(prog->info.num_abos <= BRW_MAX_ABO);
      assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
      stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
   } else {
      stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
   }

   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      stage_prog_data->binding_table.shader_time_start = next_binding_table_offset;
      next_binding_table_offset++;
   } else {
      stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
   }

   if (prog->info.uses_texture_gather) {
      if (devinfo->gen >= 8) {
         stage_prog_data->binding_table.gather_texture_start =
            stage_prog_data->binding_table.texture_start;
      } else {
         stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset;
         next_binding_table_offset += num_textures;
      }
   } else {
      stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
   }

   if (prog->info.num_images) {
      stage_prog_data->binding_table.image_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_images;
   } else {
      stage_prog_data->binding_table.image_start = 0xd0d0d0d0;
   }

   /* This may or may not be used depending on how the compile goes. */
   stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset;
   next_binding_table_offset++;

   /* Plane 0 is just the regular texture section */
   stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start;

   stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   /* prog_data->base.binding_table.size will be set by brw_mark_surface_used. */

   assert(next_binding_table_offset <= BRW_MAX_SURFACES);
   return next_binding_table_offset;
}

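/* Serialize the program's NIR into prog->driver_cache_blob so that it can be
 * written to the driver's shader cache and restored later by
 * brw_program_deserialize_nir().
 */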
void
brw_program_serialize_nir(struct gl_context *ctx, struct gl_program *prog)
{
   struct blob writer;
   blob_init(&writer);
   nir_serialize(&writer, prog->nir);
   prog->driver_cache_blob = ralloc_size(NULL, writer.size);
   memcpy(prog->driver_cache_blob, writer.data, writer.size);
   prog->driver_cache_blob_size = writer.size;
   blob_finish(&writer);
}

void
brw_program_deserialize_nir(struct gl_context *ctx, struct gl_program *prog,
                            gl_shader_stage stage)
{
   if (!prog->nir) {
      assert(prog->driver_cache_blob && prog->driver_cache_blob_size > 0);
      const struct nir_shader_compiler_options *options =
         ctx->Const.ShaderCompilerOptions[stage].NirOptions;
      struct blob_reader reader;
      blob_reader_init(&reader, prog->driver_cache_blob,
                       prog->driver_cache_blob_size);
      prog->nir = nir_deserialize(NULL, options, &reader);
   }

   if (prog->driver_cache_blob) {
      ralloc_free(prog->driver_cache_blob);
      prog->driver_cache_blob = NULL;
      prog->driver_cache_blob_size = 0;
   }
}