1 /*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Adam Rak <adam.rak@streamnovation.com>
25 */
26
27 #ifdef HAVE_OPENCL
28 #include <gelf.h>
29 #include <libelf.h>
30 #endif
31 #include <stdio.h>
32 #include <errno.h>
33 #include "pipe/p_defines.h"
34 #include "pipe/p_state.h"
35 #include "pipe/p_context.h"
36 #include "util/u_blitter.h"
37 #include "util/list.h"
38 #include "util/u_transfer.h"
39 #include "util/u_surface.h"
40 #include "util/u_pack_color.h"
41 #include "util/u_memory.h"
42 #include "util/u_inlines.h"
43 #include "util/u_framebuffer.h"
44 #include "pipebuffer/pb_buffer.h"
45 #include "evergreend.h"
46 #include "r600_shader.h"
47 #include "r600_pipe.h"
48 #include "r600_formats.h"
49 #include "evergreen_compute.h"
50 #include "evergreen_compute_internal.h"
51 #include "compute_memory_pool.h"
52 #include <inttypes.h>
53
54 /**
55 RAT0 is for global binding write
56 VTX1 is for global binding read
57
58 for writing images RAT1...
59 for reading images TEX2...
60 TEX2-RAT1 is paired
61
62 TEX2... consumes the same fetch resources that VTX2... would consume
63
64 CONST0 and VTX0 are for parameters
65 CONST0 binds the smaller input parameter buffer, is used for constant indexing,
66 and is also constant cached
67 VTX0 is for indirect/non-constant indexing, or if the input is bigger than
68 the constant cache can handle
69
70 RATs are limited to 12, so we can bind at most 11 textures for writing,
71 because we reserve RAT0 for global bindings. With byte addressing enabled,
72 we should reserve another one too => at most 10 image bindings for writing.
73
74 from Nvidia OpenCL:
75 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128
76 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8
77
78 so 10 for writing is enough. 176 is the max for reading according to the docs
79
80 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
81 writable images will consume TEX slots, and VTX slots too, because of linear indexing
82
83 */
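/* Illustrative sketch of the id mapping described above (an assumption drawn
 * from the comment, not code used by the driver): a writable image with
 * index i is bound as RAT(i + 1) for stores and TEX(i + 2) for loads, while
 * RAT0/VTX1 stay reserved for the global memory pool and CONST0/VTX0 for the
 * kernel parameters.
 *
 *	writable image i  ->  RAT(i + 1)   (RAT0 reserved for global writes)
 *	image i (read)    ->  TEX(i + 2)   (TEX2 pairs with RAT1)
 */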
84
85 #ifdef HAVE_OPENCL
86 static void radeon_shader_binary_init(struct r600_shader_binary *b)
87 {
88 memset(b, 0, sizeof(*b));
89 }
90
91 static void radeon_shader_binary_clean(struct r600_shader_binary *b)
92 {
93 if (!b)
94 return;
95 FREE(b->code);
96 FREE(b->config);
97 FREE(b->rodata);
98 FREE(b->global_symbol_offsets);
99 FREE(b->relocs);
100 FREE(b->disasm_string);
101 }
102 #endif
103
104 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
105 unsigned size)
106 {
107 struct pipe_resource *buffer = NULL;
108 assert(size);
109
110 buffer = pipe_buffer_create((struct pipe_screen*) screen,
111 0, PIPE_USAGE_IMMUTABLE, size);
112
113 return (struct r600_resource *)buffer;
114 }
115
116
117 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
118 unsigned id,
119 struct r600_resource *bo,
120 int start,
121 int size)
122 {
123 struct pipe_surface rat_templ;
124 struct r600_surface *surf = NULL;
125 struct r600_context *rctx = NULL;
126
127 assert(id < 12);
128 assert((size & 3) == 0);
129 assert((start & 0xFF) == 0);
130
131 rctx = pipe->ctx;
132
133 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
134
135 /* Create the RAT surface */
136 memset(&rat_templ, 0, sizeof(rat_templ));
137 rat_templ.format = PIPE_FORMAT_R32_UINT;
138 rat_templ.u.tex.level = 0;
139 rat_templ.u.tex.first_layer = 0;
140 rat_templ.u.tex.last_layer = 0;
141
142 /* Add the RAT to the list of color buffers. Drop the old buffer first. */
143 pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
144 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
145 (struct pipe_context *)pipe->ctx,
146 (struct pipe_resource *)bo, &rat_templ);
147
148 /* Update the number of color buffers */
149 pipe->ctx->framebuffer.state.nr_cbufs =
150 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
151
152 /* Update the cb_target_mask
153 * XXX: I think this is a potential spot for bugs once we start doing
154 * GL interop. cb_target_mask may be modified in the 3D sections
155 * of this driver. */
156 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
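	/* For example (numbers are illustrative): id = 2 sets bits [11:8] of the
	 * mask, enabling all four channels of color buffer 2 for RAT exports. */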
157
158 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
159 evergreen_init_color_surface_rat(rctx, surf);
160 }
161
162 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
163 unsigned vb_index,
164 unsigned offset,
165 struct pipe_resource *buffer)
166 {
167 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
168 struct pipe_vertex_buffer *vb = &state->vb[vb_index];
169 vb->buffer_offset = offset;
170 vb->buffer.resource = buffer;
171 vb->is_user_buffer = false;
172
173 /* The vertex instructions in the compute shaders use the texture cache,
174 * so we need to invalidate it. */
175 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
176 state->enabled_mask |= 1 << vb_index;
177 state->dirty_mask |= 1 << vb_index;
178 r600_mark_atom_dirty(rctx, &state->atom);
179 }
180
181 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
182 unsigned cb_index,
183 unsigned offset,
184 unsigned size,
185 struct pipe_resource *buffer)
186 {
187 struct pipe_constant_buffer cb;
188 cb.buffer_size = size;
189 cb.buffer_offset = offset;
190 cb.buffer = buffer;
191 cb.user_buffer = NULL;
192
193 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, false, &cb);
194 }
195
196 /* We need to define these R600 registers here, because we can't include
197 * both evergreend.h and r600d.h.
198 */
199 #define R_028868_SQ_PGM_RESOURCES_VS 0x028868
200 #define R_028850_SQ_PGM_RESOURCES_PS 0x028850
201
202 #ifdef HAVE_OPENCL
203 static void parse_symbol_table(Elf_Data *symbol_table_data,
204 const GElf_Shdr *symbol_table_header,
205 struct r600_shader_binary *binary)
206 {
207 GElf_Sym symbol;
208 unsigned i = 0;
209 unsigned symbol_count =
210 symbol_table_header->sh_size / symbol_table_header->sh_entsize;
211
212 /* We are over allocating this list, because symbol_count gives the
213 * total number of symbols, and we will only be filling the list
214 * with offsets of global symbols. The memory savings from
215 * allocating the correct size of this list will be small, and
216 * I don't think it is worth the cost of pre-computing the number
217 * of global symbols.
218 */
219 binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
220
221 while (gelf_getsym(symbol_table_data, i++, &symbol)) {
222 unsigned i;
223 if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
224 symbol.st_shndx == 0 /* Undefined symbol */) {
225 continue;
226 }
227
228 binary->global_symbol_offsets[binary->global_symbol_count] =
229 symbol.st_value;
230
231 /* Sort the list using bubble sort. This list will usually
232 * be small. */
233 for (i = binary->global_symbol_count; i > 0; --i) {
234 uint64_t lhs = binary->global_symbol_offsets[i - 1];
235 uint64_t rhs = binary->global_symbol_offsets[i];
236 if (lhs < rhs) {
237 break;
238 }
239 binary->global_symbol_offsets[i] = lhs;
240 binary->global_symbol_offsets[i - 1] = rhs;
241 }
242 ++binary->global_symbol_count;
243 }
244 }
245
246
247 static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
248 unsigned symbol_sh_link,
249 struct r600_shader_binary *binary)
250 {
251 unsigned i;
252
253 if (!relocs || !symbols || !binary->reloc_count) {
254 return;
255 }
256 binary->relocs = CALLOC(binary->reloc_count,
257 sizeof(struct r600_shader_reloc));
258 for (i = 0; i < binary->reloc_count; i++) {
259 GElf_Sym symbol;
260 GElf_Rel rel;
261 char *symbol_name;
262 struct r600_shader_reloc *reloc = &binary->relocs[i];
263
264 gelf_getrel(relocs, i, &rel);
265 gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
266 symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
267
268 reloc->offset = rel.r_offset;
269 strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
270 reloc->name[sizeof(reloc->name)-1] = 0;
271 }
272 }
273
274 static void r600_elf_read(const char *elf_data, unsigned elf_size,
275 struct r600_shader_binary *binary)
276 {
277 char *elf_buffer;
278 Elf *elf;
279 Elf_Scn *section = NULL;
280 Elf_Data *symbols = NULL, *relocs = NULL;
281 size_t section_str_index;
282 unsigned symbol_sh_link = 0;
283
284 /* One of the libelf implementations
285 * (http://www.mr511.de/software/english.htm) requires calling
286 * elf_version() before elf_memory().
287 */
288 elf_version(EV_CURRENT);
289 elf_buffer = MALLOC(elf_size);
290 memcpy(elf_buffer, elf_data, elf_size);
291
292 elf = elf_memory(elf_buffer, elf_size);
293
294 elf_getshdrstrndx(elf, &section_str_index);
295
296 while ((section = elf_nextscn(elf, section))) {
297 const char *name;
298 Elf_Data *section_data = NULL;
299 GElf_Shdr section_header;
300 if (gelf_getshdr(section, &section_header) != &section_header) {
301 fprintf(stderr, "Failed to read ELF section header\n");
302 return;
303 }
304 name = elf_strptr(elf, section_str_index, section_header.sh_name);
305 if (!strcmp(name, ".text")) {
306 section_data = elf_getdata(section, section_data);
307 binary->code_size = section_data->d_size;
308 binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
309 memcpy(binary->code, section_data->d_buf, binary->code_size);
310 } else if (!strcmp(name, ".AMDGPU.config")) {
311 section_data = elf_getdata(section, section_data);
312 binary->config_size = section_data->d_size;
313 binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
314 memcpy(binary->config, section_data->d_buf, binary->config_size);
315 } else if (!strcmp(name, ".AMDGPU.disasm")) {
316 /* Always read disassembly if it's available. */
317 section_data = elf_getdata(section, section_data);
318 binary->disasm_string = strndup(section_data->d_buf,
319 section_data->d_size);
320 } else if (!strncmp(name, ".rodata", 7)) {
321 section_data = elf_getdata(section, section_data);
322 binary->rodata_size = section_data->d_size;
323 binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
324 memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
325 } else if (!strncmp(name, ".symtab", 7)) {
326 symbols = elf_getdata(section, section_data);
327 symbol_sh_link = section_header.sh_link;
328 parse_symbol_table(symbols, &section_header, binary);
329 } else if (!strcmp(name, ".rel.text")) {
330 relocs = elf_getdata(section, section_data);
331 binary->reloc_count = section_header.sh_size /
332 section_header.sh_entsize;
333 }
334 }
335
336 parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
337
338 if (elf){
339 elf_end(elf);
340 }
341 FREE(elf_buffer);
342
343 /* Cache the config size per symbol */
344 if (binary->global_symbol_count) {
345 binary->config_size_per_symbol =
346 binary->config_size / binary->global_symbol_count;
347 } else {
348 binary->global_symbol_count = 1;
349 binary->config_size_per_symbol = binary->config_size;
350 }
351 }
352
353 static const unsigned char *r600_shader_binary_config_start(
354 const struct r600_shader_binary *binary,
355 uint64_t symbol_offset)
356 {
357 unsigned i;
358 for (i = 0; i < binary->global_symbol_count; ++i) {
359 if (binary->global_symbol_offsets[i] == symbol_offset) {
360 unsigned offset = i * binary->config_size_per_symbol;
361 return binary->config + offset;
362 }
363 }
364 return binary->config;
365 }
366
367 static void r600_shader_binary_read_config(const struct r600_shader_binary *binary,
368 struct r600_bytecode *bc,
369 uint64_t symbol_offset,
370 bool *use_kill)
371 {
372 unsigned i;
373 const unsigned char *config =
374 r600_shader_binary_config_start(binary, symbol_offset);
375
376 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
377 unsigned reg =
378 util_le32_to_cpu(*(uint32_t*)(config + i));
379 unsigned value =
380 util_le32_to_cpu(*(uint32_t*)(config + i + 4));
381 switch (reg) {
382 /* R600 / R700 */
383 case R_028850_SQ_PGM_RESOURCES_PS:
384 case R_028868_SQ_PGM_RESOURCES_VS:
385 /* Evergreen / Northern Islands */
386 case R_028844_SQ_PGM_RESOURCES_PS:
387 case R_028860_SQ_PGM_RESOURCES_VS:
388 case R_0288D4_SQ_PGM_RESOURCES_LS:
389 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
390 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
391 break;
392 case R_02880C_DB_SHADER_CONTROL:
393 *use_kill = G_02880C_KILL_ENABLE(value);
394 break;
395 case R_0288E8_SQ_LDS_ALLOC:
396 bc->nlds_dw = value;
397 break;
398 }
399 }
400 }
401
402 static unsigned r600_create_shader(struct r600_bytecode *bc,
403 const struct r600_shader_binary *binary,
404 bool *use_kill)
405
406 {
407 assert(binary->code_size % 4 == 0);
408 bc->bytecode = CALLOC(1, binary->code_size);
409 memcpy(bc->bytecode, binary->code, binary->code_size);
410 bc->ndw = binary->code_size / 4;
411
412 r600_shader_binary_read_config(binary, bc, 0, use_kill);
413 return 0;
414 }
415
416 #endif
417
418 static void r600_destroy_shader(struct r600_bytecode *bc)
419 {
420 FREE(bc->bytecode);
421 }
422
423 static void *evergreen_create_compute_state(struct pipe_context *ctx,
424 const struct pipe_compute_state *cso)
425 {
426 struct r600_context *rctx = (struct r600_context *)ctx;
427 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
428 #ifdef HAVE_OPENCL
429 const struct pipe_binary_program_header *header;
430 void *p;
431 bool use_kill;
432 #endif
433
434 shader->ctx = rctx;
435 shader->local_size = cso->static_shared_mem;
436 shader->input_size = cso->req_input_mem;
437
438 shader->ir_type = cso->ir_type;
439
440 if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
441 shader->ir_type == PIPE_SHADER_IR_NIR) {
442 shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, cso->ir_type, PIPE_SHADER_COMPUTE);
443
444 /* Precompile the shader with the expected shader key, to reduce jank at
445 * draw time. Also produces output for shader-db.
446 */
447 bool dirty;
448 r600_shader_select(ctx, shader->sel, &dirty, true);
449
450 return shader;
451 }
452 #ifdef HAVE_OPENCL
453 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
454 header = cso->prog;
455 radeon_shader_binary_init(&shader->binary);
456 r600_elf_read(header->blob, header->num_bytes, &shader->binary);
457 r600_create_shader(&shader->bc, &shader->binary, &use_kill);
458
459 /* Upload code + ROdata */
460 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
461 shader->bc.ndw * 4);
462 p = r600_buffer_map_sync_with_rings(
463 &rctx->b, shader->code_bo,
464 PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
465 //TODO: use util_memcpy_cpu_to_le32 ?
466 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
467 rctx->b.ws->buffer_unmap(rctx->b.ws, shader->code_bo->buf);
468 #endif
469
470 return shader;
471 }
472
473 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
474 {
475 struct r600_context *rctx = (struct r600_context *)ctx;
476 struct r600_pipe_compute *shader = state;
477
478 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
479
480 if (!shader)
481 return;
482
483 if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
484 shader->ir_type == PIPE_SHADER_IR_NIR) {
485 r600_delete_shader_selector(ctx, shader->sel);
486 } else {
487 #ifdef HAVE_OPENCL
488 radeon_shader_binary_clean(&shader->binary);
489 pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL);
490 pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL);
491 #endif
492 r600_destroy_shader(&shader->bc);
493 }
494 FREE(shader);
495 }
496
497 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
498 {
499 struct r600_context *rctx = (struct r600_context *)ctx;
500 struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
501 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
502
503 if (!state) {
504 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
505 return;
506 }
507
508 if (cstate->ir_type == PIPE_SHADER_IR_TGSI ||
509 cstate->ir_type == PIPE_SHADER_IR_NIR) {
510 bool compute_dirty;
511 if (r600_shader_select(ctx, cstate->sel, &compute_dirty, false))
512 R600_ERR("Failed to select compute shader\n");
513 }
514
515 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
516 }
517
518 /* The kernel parameters are stored in a vertex buffer (ID=0). Besides the
519 * explicit kernel parameters, there are implicit parameters that need to be stored
520 * in the vertex buffer as well. Here is how these parameters are organized in
521 * the buffer:
522 *
523 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
524 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
525 * DWORDS 6-8: Number of work items within each work group in each dimension
526 * (x,y,z)
527 * DWORDS 9+ : Kernel parameters
528 */
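/* Viewed as a C struct, the buffer written below would look roughly like this
 * (an illustrative sketch only; the driver fills the dwords by hand):
 *
 *	struct implicit_kernel_input {
 *		uint32_t num_work_groups[3];	// DWORDS 0-2
 *		uint32_t global_size[3];	// DWORDS 3-5
 *		uint32_t local_size[3];		// DWORDS 6-8
 *		uint32_t kernel_params[];	// DWORDS 9+
 *	};
 */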
529 static void evergreen_compute_upload_input(struct pipe_context *ctx,
530 const struct pipe_grid_info *info)
531 {
532 struct r600_context *rctx = (struct r600_context *)ctx;
533 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
534 unsigned i;
535 /* We need to reserve 9 dwords (36 bytes) for implicit kernel
536 * parameters.
537 */
538 unsigned input_size;
539 uint32_t *num_work_groups_start;
540 uint32_t *global_size_start;
541 uint32_t *local_size_start;
542 uint32_t *kernel_parameters_start;
543 struct pipe_box box;
544 struct pipe_transfer *transfer = NULL;
545
546 if (!shader)
547 return;
548 if (shader->input_size == 0) {
549 return;
550 }
551 input_size = shader->input_size + 36;
552 if (!shader->kernel_param) {
553 /* Add space for the grid dimensions */
554 shader->kernel_param = (struct r600_resource *)
555 pipe_buffer_create(ctx->screen, 0,
556 PIPE_USAGE_IMMUTABLE, input_size);
557 }
558
559 u_box_1d(0, input_size, &box);
560 num_work_groups_start = ctx->buffer_map(ctx,
561 (struct pipe_resource*)shader->kernel_param,
562 0, PIPE_MAP_WRITE | PIPE_MAP_DISCARD_RANGE,
563 &box, &transfer);
564 global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
565 local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
566 kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
567
568 /* Copy the work group size */
569 memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
570
571 /* Copy the global size */
572 for (i = 0; i < 3; i++) {
573 global_size_start[i] = info->grid[i] * info->block[i];
574 }
575
576 /* Copy the local dimensions */
577 memcpy(local_size_start, info->block, 3 * sizeof(uint));
578
579 /* Copy the kernel inputs */
580 memcpy(kernel_parameters_start, info->input, shader->input_size);
581
582 for (i = 0; i < (input_size / 4); i++) {
583 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
584 ((unsigned*)num_work_groups_start)[i]);
585 }
586
587 ctx->buffer_unmap(ctx, transfer);
588
589 /* ID=0 and ID=3 are reserved for the parameters.
590 * LLVM will preferably use ID=0, but it does not work for dynamic
591 * indices. */
592 evergreen_cs_set_vertex_buffer(rctx, 3, 0,
593 (struct pipe_resource*)shader->kernel_param);
594 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
595 (struct pipe_resource*)shader->kernel_param);
596 }
597
598 static void evergreen_emit_dispatch(struct r600_context *rctx,
599 const struct pipe_grid_info *info,
600 uint32_t indirect_grid[3])
601 {
602 int i;
603 struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
604 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
605 bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
606 unsigned num_waves;
607 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
608 unsigned wave_divisor = (16 * num_pipes);
609 int group_size = 1;
610 unsigned lds_size = (shader->local_size + info->variable_shared_mem) / 4;
611
612 if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
613 shader->ir_type != PIPE_SHADER_IR_NIR)
614 lds_size += shader->bc.nlds_dw;
615
616 /* Calculate group_size */
617 for (i = 0; i < 3; i++) {
618 group_size *= info->block[i];
619 }
620
621 /* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
622 num_waves = (info->block[0] * info->block[1] * info->block[2] +
623 wave_divisor - 1) / wave_divisor;
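	/* Worked example (numbers are illustrative): with num_pipes = 8 the
	 * divisor is 128, so a 16x16x1 block (256 work items) needs
	 * ceil(256 / 128) = 2 wavefronts. */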
624
625 COMPUTE_DBG(rctx->screen, "Using %u pipes, "
626 "%u wavefronts per thread block, "
627 "allocating %u dwords lds.\n",
628 num_pipes, num_waves, lds_size);
629
630 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
631
632 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
633 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
634 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
635 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
636
637 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
638 group_size);
639
640 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
641 radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
642 radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
643 radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
644
645 if (rctx->b.gfx_level < CAYMAN) {
646 assert(lds_size <= 8192);
647 } else {
648 /* Cayman appears to have a slightly smaller limit, see the
649 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
650 assert(lds_size <= 8160);
651 }
652
653 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
654 lds_size | (num_waves << 14));
655
656 if (info->indirect) {
657 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
658 radeon_emit(cs, indirect_grid[0]);
659 radeon_emit(cs, indirect_grid[1]);
660 radeon_emit(cs, indirect_grid[2]);
661 radeon_emit(cs, 1);
662 } else {
663 /* Dispatch packet */
664 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
665 radeon_emit(cs, info->grid[0]);
666 radeon_emit(cs, info->grid[1]);
667 radeon_emit(cs, info->grid[2]);
668 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
669 radeon_emit(cs, 1);
670 }
671
672 if (rctx->is_debug)
673 eg_trace_emit(rctx);
674 }
675
676 static void compute_setup_cbs(struct r600_context *rctx)
677 {
678 struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
679 unsigned i;
680
681 /* Emit colorbuffers. */
682 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
683 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
684 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
685 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
686 (struct r600_resource*)cb->base.texture,
687 RADEON_USAGE_READWRITE |
688 RADEON_PRIO_SHADER_RW_BUFFER);
689
690 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
691 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */
692 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */
693 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */
694 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */
695 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
696 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
697 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */
698
699 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
700 radeon_emit(cs, reloc);
701
702 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
703 radeon_emit(cs, reloc);
704 }
705 for (; i < 8 ; i++)
706 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
707 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
708 for (; i < 12; i++)
709 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
710 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
711
712 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */
713 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
714 rctx->compute_cb_target_mask);
715 }
716
717 static void compute_emit_cs(struct r600_context *rctx,
718 const struct pipe_grid_info *info)
719 {
720 struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
721 bool compute_dirty = false;
722 struct r600_pipe_shader *current;
723 struct r600_shader_atomic combined_atomics[8];
724 uint8_t atomic_used_mask;
725 uint32_t indirect_grid[3] = { 0, 0, 0 };
726
727 /* make sure that the gfx ring is the only one active */
728 if (radeon_emitted(&rctx->b.dma.cs, 0)) {
729 rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
730 }
731
732 r600_update_compressed_resource_state(rctx, true);
733
734 if (!rctx->cmd_buf_is_compute) {
735 rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
736 rctx->cmd_buf_is_compute = true;
737 }
738
739 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI||
740 rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
741 if (r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty, false)) {
742 R600_ERR("Failed to select compute shader\n");
743 return;
744 }
745
746 current = rctx->cs_shader_state.shader->sel->current;
747 if (compute_dirty) {
748 rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
749 r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
750 r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
751 }
752
753 bool need_buf_const = current->shader.uses_tex_buffers ||
754 current->shader.has_txq_cube_array_z_comp;
755
756 if (info->indirect) {
757 struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
758 unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_MAP_READ);
759 unsigned offset = info->indirect_offset / 4;
760 indirect_grid[0] = data[offset];
761 indirect_grid[1] = data[offset + 1];
762 indirect_grid[2] = data[offset + 2];
763 }
764 for (int i = 0; i < 3; i++) {
765 rctx->cs_block_grid_sizes[i] = info->block[i];
766 rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
767 }
768 rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
769 rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;
770
771 evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
772 r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));
773
774 if (need_buf_const) {
775 eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
776 }
777 r600_update_driver_const_buffers(rctx, true);
778
779 evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
780 if (atomic_used_mask) {
781 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
782 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
783 }
784 } else
785 r600_need_cs_space(rctx, 0, true, 0);
786
787 /* Initialize all the compute-related registers.
788 *
789 * See evergreen_init_atom_start_compute_cs() in this file for the list
790 * of registers initialized by the start_compute_cs_cmd atom.
791 */
792 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
793
794 /* emit config state */
795 if (rctx->b.gfx_level == EVERGREEN) {
796 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI||
797 rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
798 radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
799 radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
800 radeon_emit(cs, 0);
801 radeon_emit(cs, 0);
802 radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
803 } else
804 r600_emit_atom(rctx, &rctx->config_state.atom);
805 }
806
807 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
808 r600_flush_emit(rctx);
809
810 if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI &&
811 rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_NIR) {
812
813 compute_setup_cbs(rctx);
814
815 /* Emit vertex buffer state */
816 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
817 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
818 } else {
819 uint32_t rat_mask;
820
821 rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
822 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
823 rat_mask);
824 }
825
826 r600_emit_atom(rctx, &rctx->b.render_cond_atom);
827
828 /* Emit constant buffer state */
829 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
830
831 /* Emit sampler state */
832 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
833
834 /* Emit sampler view (texture resource) state */
835 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
836
837 /* Emit images state */
838 r600_emit_atom(rctx, &rctx->compute_images.atom);
839
840 /* Emit buffers state */
841 r600_emit_atom(rctx, &rctx->compute_buffers.atom);
842
843 /* Emit shader state */
844 r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
845
846 /* Emit dispatch state and dispatch packet */
847 evergreen_emit_dispatch(rctx, info, indirect_grid);
848
849 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
850 */
851 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
852 R600_CONTEXT_INV_VERTEX_CACHE |
853 R600_CONTEXT_INV_TEX_CACHE;
854 r600_flush_emit(rctx);
855 rctx->b.flags = 0;
856
857 if (rctx->b.gfx_level >= CAYMAN) {
858 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
859 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
860 /* DEALLOC_STATE prevents the GPU from hanging when a
861 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
862 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
863 */
864 radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
865 radeon_emit(cs, 0);
866 }
867 if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
868 rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR)
869 evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);
870
871 #if 0
872 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
873 for (i = 0; i < cs->cdw; i++) {
874 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
875 }
876 #endif
877
878 }
879
880
881 /**
882 * Emit function for r600_cs_shader_state atom
883 */
884 void evergreen_emit_cs_shader(struct r600_context *rctx,
885 struct r600_atom *atom)
886 {
887 struct r600_cs_shader_state *state =
888 (struct r600_cs_shader_state*)atom;
889 struct r600_pipe_compute *shader = state->shader;
890 struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
891 uint64_t va;
892 struct r600_resource *code_bo;
893 unsigned ngpr, nstack;
894
895 if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
896 shader->ir_type == PIPE_SHADER_IR_NIR) {
897 code_bo = shader->sel->current->bo;
898 va = shader->sel->current->bo->gpu_address;
899 ngpr = shader->sel->current->shader.bc.ngpr;
900 nstack = shader->sel->current->shader.bc.nstack;
901 } else {
902 code_bo = shader->code_bo;
903 va = shader->code_bo->gpu_address + state->pc;
904 ngpr = shader->bc.ngpr;
905 nstack = shader->bc.nstack;
906 }
907
908 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
909 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
910 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */
911 S_0288D4_NUM_GPRS(ngpr) |
912 S_0288D4_DX10_CLAMP(1) |
913 S_0288D4_STACK_SIZE(nstack));
914 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
915
916 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
917 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
918 code_bo, RADEON_USAGE_READ |
919 RADEON_PRIO_SHADER_BINARY));
920 }
921
922 static void evergreen_launch_grid(struct pipe_context *ctx,
923 const struct pipe_grid_info *info)
924 {
925 struct r600_context *rctx = (struct r600_context *)ctx;
926 #ifdef HAVE_OPENCL
927 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
928 bool use_kill;
929
930 if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
931 shader->ir_type != PIPE_SHADER_IR_NIR) {
932 rctx->cs_shader_state.pc = info->pc;
933 /* Get the config information for this kernel. */
934 r600_shader_binary_read_config(&shader->binary, &shader->bc,
935 info->pc, &use_kill);
936 } else {
937 use_kill = false;
938 rctx->cs_shader_state.pc = 0;
939 }
940 #endif
941
942 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
943
944
945 evergreen_compute_upload_input(ctx, info);
946 compute_emit_cs(rctx, info);
947 }
948
949 static void evergreen_set_compute_resources(struct pipe_context *ctx,
950 unsigned start, unsigned count,
951 struct pipe_surface **surfaces)
952 {
953 struct r600_context *rctx = (struct r600_context *)ctx;
954 struct r600_surface **resources = (struct r600_surface **)surfaces;
955
956 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
957 start, count);
958
959 for (unsigned i = 0; i < count; i++) {
960 /* The first four vertex buffers are reserved for parameters and
961 * global buffers. */
962 unsigned vtx_id = 4 + i;
963 if (resources[i]) {
964 struct r600_resource_global *buffer =
965 (struct r600_resource_global*)
966 resources[i]->base.texture;
967 if (resources[i]->base.writable) {
968 assert(i+1 < 12);
969
970 evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
971 (struct r600_resource *)resources[i]->base.texture,
972 buffer->chunk->start_in_dw*4,
973 resources[i]->base.texture->width0);
974 }
975
976 evergreen_cs_set_vertex_buffer(rctx, vtx_id,
977 buffer->chunk->start_in_dw * 4,
978 resources[i]->base.texture);
979 }
980 }
981 }
982
983 static void evergreen_set_global_binding(struct pipe_context *ctx,
984 unsigned first, unsigned n,
985 struct pipe_resource **resources,
986 uint32_t **handles)
987 {
988 struct r600_context *rctx = (struct r600_context *)ctx;
989 struct compute_memory_pool *pool = rctx->screen->global_pool;
990 struct r600_resource_global **buffers =
991 (struct r600_resource_global **)resources;
992 unsigned i;
993
994 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
995 first, n);
996
997 if (!resources) {
998 /* XXX: Unset */
999 return;
1000 }
1001
1002 /* We mark these items for promotion to the pool if they
1003 * aren't already there */
1004 for (i = first; i < first + n; i++) {
1005 struct compute_memory_item *item = buffers[i]->chunk;
1006
1007 if (!is_item_in_pool(item))
1008 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
1009 }
1010
1011 if (compute_memory_finalize_pending(pool, ctx) == -1) {
1012 /* XXX: Unset */
1013 return;
1014 }
1015
1016 for (i = first; i < first + n; i++)
1017 {
1018 uint32_t buffer_offset;
1019 uint32_t handle;
1020 assert(resources[i]->target == PIPE_BUFFER);
1021 assert(resources[i]->bind & PIPE_BIND_GLOBAL);
1022
1023 buffer_offset = util_le32_to_cpu(*(handles[i]));
1024 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
1025
1026 *(handles[i]) = util_cpu_to_le32(handle);
1027 }
1028
1029 /* globals for writing */
1030 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
1031 /* globals for reading */
1032 evergreen_cs_set_vertex_buffer(rctx, 1, 0,
1033 (struct pipe_resource*)pool->bo);
1034
1035 /* constants for reading; LLVM puts them in the text segment */
1036 evergreen_cs_set_vertex_buffer(rctx, 2, 0,
1037 (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
1038 }
1039
1040 /**
1041 * This function initializes all the compute specific registers that need to
1042 * be initialized for each compute command stream. Registers that are common
1043 * to both compute and 3D will be initialized at the beginning of each compute
1044 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
1045 * packet requires that the shader type bit be set, we must initialize all
1046 * context registers needed for compute in this function. The registers
1047 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
1048 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
1049 * on the GPU family.
1050 */
1051 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
1052 {
1053 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
1054 int num_threads;
1055 int num_stack_entries;
1056
1057 /* since all required registers are initialized in the
1058 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
1059 */
1060 r600_init_command_buffer(cb, 256);
1061 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
1062
1063 /* We're setting config registers here. */
1064 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
1065 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
1066
1067 switch (rctx->b.family) {
1068 case CHIP_CEDAR:
1069 default:
1070 num_threads = 128;
1071 num_stack_entries = 256;
1072 break;
1073 case CHIP_REDWOOD:
1074 num_threads = 128;
1075 num_stack_entries = 256;
1076 break;
1077 case CHIP_JUNIPER:
1078 num_threads = 128;
1079 num_stack_entries = 512;
1080 break;
1081 case CHIP_CYPRESS:
1082 case CHIP_HEMLOCK:
1083 num_threads = 128;
1084 num_stack_entries = 512;
1085 break;
1086 case CHIP_PALM:
1087 num_threads = 128;
1088 num_stack_entries = 256;
1089 break;
1090 case CHIP_SUMO:
1091 num_threads = 128;
1092 num_stack_entries = 256;
1093 break;
1094 case CHIP_SUMO2:
1095 num_threads = 128;
1096 num_stack_entries = 512;
1097 break;
1098 case CHIP_BARTS:
1099 num_threads = 128;
1100 num_stack_entries = 512;
1101 break;
1102 case CHIP_TURKS:
1103 num_threads = 128;
1104 num_stack_entries = 256;
1105 break;
1106 case CHIP_CAICOS:
1107 num_threads = 128;
1108 num_stack_entries = 256;
1109 break;
1110 }
1111
1112 /* The primitive type always needs to be POINTLIST for compute. */
1113 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
1114 V_008958_DI_PT_POINTLIST);
1115
1116 if (rctx->b.gfx_level < CAYMAN) {
1117
1118 /* These registers control which simds can be used by each stage.
1119 * The default for these registers is 0xffffffff, which means
1120 * all simds are available for each stage. It's possible we may
1121 * want to play around with these in the future, but for now
1122 * the default value is fine.
1123 *
1124 * R_008E20_SQ_STATIC_THREAD_MGMT1
1125 * R_008E24_SQ_STATIC_THREAD_MGMT2
1126 * R_008E28_SQ_STATIC_THREAD_MGMT3
1127 */
1128
1129 /* XXX: We may need to adjust the thread and stack resource
1130 * values for 3D/compute interop */
1131
1132 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
1133
1134 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
1135 * Set the number of threads used by the PS/VS/GS/ES stage to
1136 * 0.
1137 */
1138 r600_store_value(cb, 0);
1139
1140 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
1141 * Set the number of threads used by the CS (aka LS) stage to
1142 * the maximum number of threads and set the number of threads
1143 * for the HS stage to 0. */
1144 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
1145
1146 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1
1147 * Set the Control Flow stack entries to 0 for PS/VS stages */
1148 r600_store_value(cb, 0);
1149
1150 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2
1151 * Set the Control Flow stack entries to 0 for GS/ES stages */
1152 r600_store_value(cb, 0);
1153
1154 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3
1155 * Set the Control Flow stack entries to 0 for the HS stage, and
1156 * set it to the maximum value for the CS (aka LS) stage. */
1157 r600_store_value(cb,
1158 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1159 }
1160 /* Give the compute shader all the available LDS space.
1161 * NOTE: This only sets the maximum number of dwords that a compute
1162 * shader can allocate. When a shader is executed, we still need to
1163 * allocate the appropriate amount of LDS dwords using the
1164 * CM_R_0288E8_SQ_LDS_ALLOC register.
1165 */
1166 if (rctx->b.gfx_level < CAYMAN) {
1167 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1168 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1169 } else {
1170 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1171 S_0286FC_NUM_PS_LDS(0) |
1172 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1173 }
1174
1175 /* Context Registers */
1176
1177 if (rctx->b.gfx_level < CAYMAN) {
1178 /* workaround for hw issues with dyn gpr - must set all limits
1179 * to 240 instead of 0, 0x1e == 240 / 8
1180 */
1181 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1182 S_028838_PS_GPRS(0x1e) |
1183 S_028838_VS_GPRS(0x1e) |
1184 S_028838_GS_GPRS(0x1e) |
1185 S_028838_ES_GPRS(0x1e) |
1186 S_028838_HS_GPRS(0x1e) |
1187 S_028838_LS_GPRS(0x1e));
1188 }
1189
1190 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1191 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1192 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1193
1194 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1195
1196 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
1197 S_0286E8_TID_IN_GROUP_ENA(1) |
1198 S_0286E8_TGID_ENA(1) |
1199 S_0286E8_DISABLE_INDEX_PACK(1));
1200
1201 /* The LOOP_CONST registers are an optimization for loops that allows
1202 * you to store the initial counter, increment value, and maximum
1203 * counter value in a register so that hardware can calculate the
1204 * correct number of iterations for the loop, so that you don't need
1205 * to have the loop counter in your shader code. We don't currently use
1206 * this optimization, so we must keep track of the counter in the
1207 * shader and use a break instruction to exit loops. However, the
1208 * hardware will still use this register to determine when to exit a
1209 * loop, so we need to initialize the counter to 0, set the increment
1210 * value to 1 and the maximum counter value to 4095 (0xfff), which
1211 * is the maximum value allowed. This gives us a maximum of 4096
1212 * iterations for our loops, but hopefully our break instruction will
1213 * execute some time before the 4096th iteration.
1214 */
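	/* 0x1000FFF presumably packs the three fields described above:
	 * bits [11:0] = 0xfff (maximum count), bits [23:12] = 0 (initial
	 * value) and bits [31:24] = 1 (increment). */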
1215 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
1216 }
1217
1218
1219 static void evergreen_get_compute_state_info(struct pipe_context *ctx, void *state,
1220 struct pipe_compute_state_object_info *info)
1221 {
1222 struct r600_context *rctx = (struct r600_context*)ctx;
1223 struct r600_pipe_compute *shader = state;
1224
1225 /* This is more or less copied from RadeonSI, but in truth it is not
1226 * more than an educated guess. */
1227 uint8_t wave_size = r600_wavefront_size(rctx->b.screen->family);
1228 info->private_memory = shader->sel->current->scratch_space_needed;
1229 info->preferred_simd_size = wave_size;
1230 info->simd_sizes = wave_size;
1231 info->max_threads = 128;
1232 }
1233
1234 void evergreen_init_compute_state_functions(struct r600_context *rctx)
1235 {
1236 rctx->b.b.create_compute_state = evergreen_create_compute_state;
1237 rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
1238 rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
1239 // rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
1240 rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
1241 rctx->b.b.set_global_binding = evergreen_set_global_binding;
1242 rctx->b.b.launch_grid = evergreen_launch_grid;
1243 rctx->b.b.get_compute_state_info = evergreen_get_compute_state_info;
1244 }
1245
1246 void *r600_compute_global_transfer_map(struct pipe_context *ctx,
1247 struct pipe_resource *resource,
1248 unsigned level,
1249 unsigned usage,
1250 const struct pipe_box *box,
1251 struct pipe_transfer **ptransfer)
1252 {
1253 struct r600_context *rctx = (struct r600_context*)ctx;
1254 struct compute_memory_pool *pool = rctx->screen->global_pool;
1255 struct r600_resource_global* buffer =
1256 (struct r600_resource_global*)resource;
1257
1258 struct compute_memory_item *item = buffer->chunk;
1259 struct pipe_resource *dst = NULL;
1260 unsigned offset = box->x;
1261
1262 if (usage & PIPE_MAP_READ)
1263 buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1264
1265 if (usage & PIPE_MAP_WRITE)
1266 buffer->chunk->status |= ITEM_MAPPED_FOR_WRITING;
1267
1268 if (is_item_in_pool(item)) {
1269 compute_memory_demote_item(pool, item, ctx);
1270 }
1271 else {
1272 if (item->real_buffer == NULL) {
1273 item->real_buffer =
1274 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1275 }
1276 }
1277
1278 dst = (struct pipe_resource*)item->real_buffer;
1279
1280 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1281 "level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1282 "width = %u, height = %u, depth = %u)\n", level, usage,
1283 box->x, box->y, box->z, box->width, box->height,
1284 box->depth);
1285 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1286 "%u (box.x)\n", item->id, box->x);
1287
1288
1289 assert(resource->target == PIPE_BUFFER);
1290 assert(resource->bind & PIPE_BIND_GLOBAL);
1291 assert(box->x >= 0);
1292 assert(box->y == 0);
1293 assert(box->z == 0);
1294
1295 if (buffer->base.b.is_user_ptr)
1296 return NULL;
1297
1298 ///TODO: do it better, mapping is not possible if the pool is too big
1299 return pipe_buffer_map_range(ctx, dst,
1300 offset, box->width, usage & ~PIPE_MAP_READ, ptransfer);
1301 }
1302
1303 void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
1304 struct pipe_transfer *transfer)
1305 {
1306 /* struct r600_resource_global are not real resources, they just map
1307 * to an offset within the compute memory pool. The function
1308 * r600_compute_global_transfer_map() maps the memory pool
1309 * resource rather than the struct r600_resource_global passed to
1310 * it as an argument and then initializes ptransfer->resource with
1311 * the memory pool resource (via pipe_buffer_map_range).
1312 * When transfer_unmap is called it uses the memory pool's
1313 * vtable which calls r600_buffer_transfer_unmap() rather than
1314 * this function.
1315 */
1316 assert (!"This function should not be called");
1317 }
1318
1319 void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
1320 struct pipe_resource *res)
1321 {
1322 struct r600_resource_global* buffer = NULL;
1323 struct r600_screen* rscreen = NULL;
1324
1325 assert(res->target == PIPE_BUFFER);
1326 assert(res->bind & PIPE_BIND_GLOBAL);
1327
1328 buffer = (struct r600_resource_global*)res;
1329 rscreen = (struct r600_screen*)screen;
1330
1331 compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1332 buffer->chunk = NULL;
1333
1334 if (buffer->base.b.is_user_ptr)
1335 r600_buffer_destroy(screen, res);
1336 else
1337 free(res);
1338 }
1339
1340 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1341 const struct pipe_resource *templ)
1342 {
1343 struct r600_resource_global* result = NULL;
1344 struct r600_screen* rscreen = NULL;
1345 int size_in_dw = 0;
1346
1347 assert(templ->target == PIPE_BUFFER);
1348 assert(templ->bind & PIPE_BIND_GLOBAL);
1349 assert(templ->array_size == 1 || templ->array_size == 0);
1350 assert(templ->depth0 == 1 || templ->depth0 == 0);
1351 assert(templ->height0 == 1 || templ->height0 == 0);
1352
1353 result = (struct r600_resource_global*)
1354 CALLOC(sizeof(struct r600_resource_global), 1);
1355 rscreen = (struct r600_screen*)screen;
1356
1357 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1358 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1359 templ->array_size);
1360
1361 result->base.b.b = *templ;
1362 result->base.b.b.screen = screen;
1363 result->base.compute_global_bo = true;
1364 pipe_reference_init(&result->base.b.b.reference, 1);
1365
1366 size_in_dw = (templ->width0+3) / 4;
1367
1368 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1369
1370 if (result->chunk == NULL)
1371 {
1372 free(result);
1373 return NULL;
1374 }
1375
1376 return &result->base.b.b;
1377 }
1378