1 /*
2  * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * on the rights to use, copy, modify, merge, publish, distribute, sub
8  * license, and/or sell copies of the Software, and to permit persons to whom
9  * the Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
22  *
23  * Authors:
24  *      Adam Rak <adam.rak@streamnovation.com>
25  */
26 
27 #ifdef HAVE_OPENCL
28 #include <gelf.h>
29 #include <libelf.h>
30 #endif
31 #include <stdio.h>
32 #include <errno.h>
33 #include "pipe/p_defines.h"
34 #include "pipe/p_state.h"
35 #include "pipe/p_context.h"
36 #include "util/u_blitter.h"
37 #include "util/list.h"
38 #include "util/u_transfer.h"
39 #include "util/u_surface.h"
40 #include "util/u_pack_color.h"
41 #include "util/u_memory.h"
42 #include "util/u_inlines.h"
43 #include "util/u_framebuffer.h"
44 #include "pipebuffer/pb_buffer.h"
45 #include "evergreend.h"
46 #include "r600_shader.h"
47 #include "r600_pipe.h"
48 #include "r600_formats.h"
49 #include "evergreen_compute.h"
50 #include "evergreen_compute_internal.h"
51 #include "compute_memory_pool.h"
52 #include <inttypes.h>
53 
54 /**
55 RAT0 is for global binding write
56 VTX1 is for global binding read
57 
58 for writing images RAT1...
59 for reading images TEX2...
60   TEX2-RAT1 is paired
61 
62 TEX2... consumes the same fetch resources that VTX2... would consume
63 
64 CONST0 and VTX0 are for parameters
65   CONST0 binds the smaller input parameter buffer and is used for constant
66   indexing; it is also constant cached
67   VTX0 is for indirect/non-constant indexing, or if the input is bigger than
68   the constant cache can handle
69 
70 RATs are limited to 12, so we can bind at most 11 textures for writing,
71 because we reserve RAT0 for global bindings. With byte addressing enabled,
72 we should reserve another one too => at most 10 image bindings for writing.
73 
74 from Nvidia OpenCL:
75   CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
76   CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
77 
78 so 10 for writing is enough. 176 is the max for reading according to the docs
79 
80 writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
81 writable images consume TEX slots, and VTX slots too, because of linear indexing
82 
83 */
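/* A minimal sketch of the numbering implied above (illustration only; the
 * helper below is hypothetical and not part of the driver): a writable image
 * with index img, where 0 <= img < 10, ends up bound as RAT(img + 1), since
 * RAT0 stays reserved for the global binding:
 *
 *   static unsigned writable_image_to_rat(unsigned img)
 *   {
 *           assert(img < 10);
 *           return img + 1;
 *   }
 */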
84 
85 #ifdef HAVE_OPENCL
86 static void radeon_shader_binary_init(struct r600_shader_binary *b)
87 {
88 	memset(b, 0, sizeof(*b));
89 }
90 
91 static void radeon_shader_binary_clean(struct r600_shader_binary *b)
92 {
93 	if (!b)
94 		return;
95 	FREE(b->code);
96 	FREE(b->config);
97 	FREE(b->rodata);
98 	FREE(b->global_symbol_offsets);
99 	FREE(b->relocs);
100 	FREE(b->disasm_string);
101 }
102 #endif
103 
104 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
105 						     unsigned size)
106 {
107 	struct pipe_resource *buffer = NULL;
108 	assert(size);
109 
110 	buffer = pipe_buffer_create((struct pipe_screen*) screen,
111 				    0, PIPE_USAGE_IMMUTABLE, size);
112 
113 	return (struct r600_resource *)buffer;
114 }
115 
116 
117 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
118 			      unsigned id,
119 			      struct r600_resource *bo,
120 			      int start,
121 			      int size)
122 {
123 	struct pipe_surface rat_templ;
124 	struct r600_surface *surf = NULL;
125 	struct r600_context *rctx = NULL;
126 
127 	assert(id < 12);
128 	assert((size & 3) == 0);
129 	assert((start & 0xFF) == 0);
130 
131 	rctx = pipe->ctx;
132 
133 	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
134 
135 	/* Create the RAT surface */
136 	memset(&rat_templ, 0, sizeof(rat_templ));
137 	rat_templ.format = PIPE_FORMAT_R32_UINT;
138 	rat_templ.u.tex.level = 0;
139 	rat_templ.u.tex.first_layer = 0;
140 	rat_templ.u.tex.last_layer = 0;
141 
142 	/* Add the RAT to the list of color buffers. Drop the old buffer first. */
143 	pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
144 	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
145 		(struct pipe_context *)pipe->ctx,
146 		(struct pipe_resource *)bo, &rat_templ);
147 
148 	/* Update the number of color buffers */
149 	pipe->ctx->framebuffer.state.nr_cbufs =
150 		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
151 
152 	/* Update the cb_target_mask
153 	 * XXX: I think this is a potential spot for bugs once we start doing
154 	 * GL interop.  cb_target_mask may be modified in the 3D sections
155 	 * of this driver. */
156 	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
157 
158 	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
159 	evergreen_init_color_surface_rat(rctx, surf);
160 }
161 
162 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
163 					   unsigned vb_index,
164 					   unsigned offset,
165 					   struct pipe_resource *buffer)
166 {
167 	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
168 	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
169 	vb->buffer_offset = offset;
170 	vb->buffer.resource = buffer;
171 	vb->is_user_buffer = false;
172 
173 	/* The vertex instructions in the compute shaders use the texture cache,
174 	 * so we need to invalidate it. */
175 	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
176 	state->enabled_mask |= 1 << vb_index;
177 	state->dirty_mask |= 1 << vb_index;
178 	r600_mark_atom_dirty(rctx, &state->atom);
179 }
180 
181 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
182 					     unsigned cb_index,
183 					     unsigned offset,
184 					     unsigned size,
185 					     struct pipe_resource *buffer)
186 {
187 	struct pipe_constant_buffer cb;
188 	cb.buffer_size = size;
189 	cb.buffer_offset = offset;
190 	cb.buffer = buffer;
191 	cb.user_buffer = NULL;
192 
193 	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, false, &cb);
194 }
195 
196 /* We need to define these R600 registers here, because we can't include
197  * r600d.h together with evergreend.h.
198  */
199 #define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
200 #define R_028850_SQ_PGM_RESOURCES_PS                 0x028850
201 
202 #ifdef HAVE_OPENCL
203 static void parse_symbol_table(Elf_Data *symbol_table_data,
204 				const GElf_Shdr *symbol_table_header,
205 				struct r600_shader_binary *binary)
206 {
207 	GElf_Sym symbol;
208 	unsigned i = 0;
209 	unsigned symbol_count =
210 		symbol_table_header->sh_size / symbol_table_header->sh_entsize;
211 
212 	/* We are over allocating this list, because symbol_count gives the
213 	 * total number of symbols, and we will only be filling the list
214 	 * with offsets of global symbols.  The memory savings from
215 	 * allocating the correct size of this list will be small, and
216 	 * I don't think it is worth the cost of pre-computing the number
217 	 * of global symbols.
218 	 */
219 	binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
220 
221 	while (gelf_getsym(symbol_table_data, i++, &symbol)) {
222 		unsigned i;
223 		if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
224 		    symbol.st_shndx == 0 /* Undefined symbol */) {
225 			continue;
226 		}
227 
228 		binary->global_symbol_offsets[binary->global_symbol_count] =
229 					symbol.st_value;
230 
231 		/* Keep the list sorted by swapping the new entry into place
232 		 * (a simple insertion step); the list will usually be small. */
233 		for (i = binary->global_symbol_count; i > 0; --i) {
234 			uint64_t lhs = binary->global_symbol_offsets[i - 1];
235 			uint64_t rhs = binary->global_symbol_offsets[i];
236 			if (lhs < rhs) {
237 				break;
238 			}
239 			binary->global_symbol_offsets[i] = lhs;
240 			binary->global_symbol_offsets[i - 1] = rhs;
241 		}
242 		++binary->global_symbol_count;
243 	}
244 }
245 
246 
247 static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
248 			unsigned symbol_sh_link,
249 			struct r600_shader_binary *binary)
250 {
251 	unsigned i;
252 
253 	if (!relocs || !symbols || !binary->reloc_count) {
254 		return;
255 	}
256 	binary->relocs = CALLOC(binary->reloc_count,
257 			sizeof(struct r600_shader_reloc));
258 	for (i = 0; i < binary->reloc_count; i++) {
259 		GElf_Sym symbol;
260 		GElf_Rel rel;
261 		char *symbol_name;
262 		struct r600_shader_reloc *reloc = &binary->relocs[i];
263 
264 		gelf_getrel(relocs, i, &rel);
265 		gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
266 		symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
267 
268 		reloc->offset = rel.r_offset;
269 		strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
270 		reloc->name[sizeof(reloc->name)-1] = 0;
271 	}
272 }
273 
274 static void r600_elf_read(const char *elf_data, unsigned elf_size,
275 		 struct r600_shader_binary *binary)
276 {
277 	char *elf_buffer;
278 	Elf *elf;
279 	Elf_Scn *section = NULL;
280 	Elf_Data *symbols = NULL, *relocs = NULL;
281 	size_t section_str_index;
282 	unsigned symbol_sh_link = 0;
283 
284 	/* One of the libelf implementations
285 	 * (http://www.mr511.de/software/english.htm) requires calling
286 	 * elf_version() before elf_memory().
287 	 */
288 	elf_version(EV_CURRENT);
289 	elf_buffer = MALLOC(elf_size);
290 	memcpy(elf_buffer, elf_data, elf_size);
291 
292 	elf = elf_memory(elf_buffer, elf_size);
293 
294 	elf_getshdrstrndx(elf, &section_str_index);
295 
296 	while ((section = elf_nextscn(elf, section))) {
297 		const char *name;
298 		Elf_Data *section_data = NULL;
299 		GElf_Shdr section_header;
300 		if (gelf_getshdr(section, &section_header) != &section_header) {
301 			fprintf(stderr, "Failed to read ELF section header\n");
302 			return;
303 		}
304 		name = elf_strptr(elf, section_str_index, section_header.sh_name);
305 		if (!strcmp(name, ".text")) {
306 			section_data = elf_getdata(section, section_data);
307 			binary->code_size = section_data->d_size;
308 			binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
309 			memcpy(binary->code, section_data->d_buf, binary->code_size);
310 		} else if (!strcmp(name, ".AMDGPU.config")) {
311 			section_data = elf_getdata(section, section_data);
312 			binary->config_size = section_data->d_size;
313 			binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
314 			memcpy(binary->config, section_data->d_buf, binary->config_size);
315 		} else if (!strcmp(name, ".AMDGPU.disasm")) {
316 			/* Always read disassembly if it's available. */
317 			section_data = elf_getdata(section, section_data);
318 			binary->disasm_string = strndup(section_data->d_buf,
319 							section_data->d_size);
320 		} else if (!strncmp(name, ".rodata", 7)) {
321 			section_data = elf_getdata(section, section_data);
322 			binary->rodata_size = section_data->d_size;
323 			binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
324 			memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
325 		} else if (!strncmp(name, ".symtab", 7)) {
326 			symbols = elf_getdata(section, section_data);
327 			symbol_sh_link = section_header.sh_link;
328 			parse_symbol_table(symbols, &section_header, binary);
329 		} else if (!strcmp(name, ".rel.text")) {
330 			relocs = elf_getdata(section, section_data);
331 			binary->reloc_count = section_header.sh_size /
332 					section_header.sh_entsize;
333 		}
334 	}
335 
336 	parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
337 
338 	if (elf){
339 		elf_end(elf);
340 	}
341 	FREE(elf_buffer);
342 
343 	/* Cache the config size per symbol */
344 	if (binary->global_symbol_count) {
345 		binary->config_size_per_symbol =
346 			binary->config_size / binary->global_symbol_count;
347 	} else {
348 		binary->global_symbol_count = 1;
349 		binary->config_size_per_symbol = binary->config_size;
350 	}
351 }
352 
353 static const unsigned char *r600_shader_binary_config_start(
354 	const struct r600_shader_binary *binary,
355 	uint64_t symbol_offset)
356 {
357 	unsigned i;
358 	for (i = 0; i < binary->global_symbol_count; ++i) {
359 		if (binary->global_symbol_offsets[i] == symbol_offset) {
360 			unsigned offset = i * binary->config_size_per_symbol;
361 			return binary->config + offset;
362 		}
363 	}
364 	return binary->config;
365 }
366 
367 static void r600_shader_binary_read_config(const struct r600_shader_binary *binary,
368 					   struct r600_bytecode *bc,
369 					   uint64_t symbol_offset,
370 					   bool *use_kill)
371 {
372        unsigned i;
373        const unsigned char *config =
374                r600_shader_binary_config_start(binary, symbol_offset);
375 
376        for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
377                unsigned reg =
378                        util_le32_to_cpu(*(uint32_t*)(config + i));
379                unsigned value =
380                        util_le32_to_cpu(*(uint32_t*)(config + i + 4));
381                switch (reg) {
382                /* R600 / R700 */
383                case R_028850_SQ_PGM_RESOURCES_PS:
384                case R_028868_SQ_PGM_RESOURCES_VS:
385                /* Evergreen / Northern Islands */
386                case R_028844_SQ_PGM_RESOURCES_PS:
387                case R_028860_SQ_PGM_RESOURCES_VS:
388                case R_0288D4_SQ_PGM_RESOURCES_LS:
389                        bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
390                        bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
391                        break;
392                case R_02880C_DB_SHADER_CONTROL:
393                        *use_kill = G_02880C_KILL_ENABLE(value);
394                        break;
395                case R_0288E8_SQ_LDS_ALLOC:
396                        bc->nlds_dw = value;
397                        break;
398                }
399        }
400 }
401 
402 static unsigned r600_create_shader(struct r600_bytecode *bc,
403 				   const struct r600_shader_binary *binary,
404 				   bool *use_kill)
405 
406 {
407 	assert(binary->code_size % 4 == 0);
408 	bc->bytecode = CALLOC(1, binary->code_size);
409 	memcpy(bc->bytecode, binary->code, binary->code_size);
410 	bc->ndw = binary->code_size / 4;
411 
412 	r600_shader_binary_read_config(binary, bc, 0, use_kill);
413 	return 0;
414 }
415 
416 #endif
417 
418 static void r600_destroy_shader(struct r600_bytecode *bc)
419 {
420 	FREE(bc->bytecode);
421 }
422 
423 static void *evergreen_create_compute_state(struct pipe_context *ctx,
424 					    const struct pipe_compute_state *cso)
425 {
426 	struct r600_context *rctx = (struct r600_context *)ctx;
427 	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
428 #ifdef HAVE_OPENCL
429 	const struct pipe_binary_program_header *header;
430 	void *p;
431 	bool use_kill;
432 #endif
433 
434 	shader->ctx = rctx;
435 	shader->local_size = cso->static_shared_mem;
436 	shader->input_size = cso->req_input_mem;
437 
438 	shader->ir_type = cso->ir_type;
439 
440 	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
441 	    shader->ir_type == PIPE_SHADER_IR_NIR) {
442 		shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, cso->ir_type, PIPE_SHADER_COMPUTE);
443 
444 		/* Precompile the shader with the expected shader key, to reduce jank at
445 		 * draw time. Also produces output for shader-db.
446 		 */
447 		bool dirty;
448 		r600_shader_select(ctx, shader->sel, &dirty, true);
449 
450 		return shader;
451 	}
452 #ifdef HAVE_OPENCL
453 	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
454 	header = cso->prog;
455 	radeon_shader_binary_init(&shader->binary);
456 	r600_elf_read(header->blob, header->num_bytes, &shader->binary);
457 	r600_create_shader(&shader->bc, &shader->binary, &use_kill);
458 
459 	/* Upload code + ROdata */
460 	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
461 							shader->bc.ndw * 4);
462 	p = r600_buffer_map_sync_with_rings(
463 		&rctx->b, shader->code_bo,
464 		PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
465 	//TODO: use util_memcpy_cpu_to_le32 ?
466 	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
467 	rctx->b.ws->buffer_unmap(rctx->b.ws, shader->code_bo->buf);
468 #endif
469 
470 	return shader;
471 }
472 
473 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
474 {
475 	struct r600_context *rctx = (struct r600_context *)ctx;
476 	struct r600_pipe_compute *shader = state;
477 
478 	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
479 
480 	if (!shader)
481 		return;
482 
483 	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
484 	    shader->ir_type == PIPE_SHADER_IR_NIR) {
485 		r600_delete_shader_selector(ctx, shader->sel);
486 	} else {
487 #ifdef HAVE_OPENCL
488 		radeon_shader_binary_clean(&shader->binary);
489 		pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL);
490 		pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL);
491 #endif
492 		r600_destroy_shader(&shader->bc);
493 	}
494 	FREE(shader);
495 }
496 
497 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
498 {
499 	struct r600_context *rctx = (struct r600_context *)ctx;
500 	struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
501 	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
502 
503 	if (!state) {
504 		rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
505 		return;
506 	}
507 
508 	if (cstate->ir_type == PIPE_SHADER_IR_TGSI ||
509 	    cstate->ir_type == PIPE_SHADER_IR_NIR) {
510 		bool compute_dirty;
511 		if (r600_shader_select(ctx, cstate->sel, &compute_dirty, false))
512 			R600_ERR("Failed to select compute shader\n");
513 	}
514 
515 	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
516 }
517 
518 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
519  * kernel parameters, there are implicit parameters that need to be stored
520  * in the vertex buffer as well.  Here is how these parameters are organized in
521  * the buffer:
522  *
523  * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
524  * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
525  * DWORDS 6-8: Number of work items within each work group in each dimension
526  *             (x,y,z)
527  * DWORDS 9+ : Kernel parameters
528  */
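/* For illustration only (these byte offsets follow from the dword layout above
 * and from evergreen_compute_upload_input() below; no additional state is
 * implied):
 *
 *   bytes  0..11 : number of work groups per dimension   (info->grid)
 *   bytes 12..23 : global work size per dimension        (grid[i] * block[i])
 *   bytes 24..35 : work-group (local) size per dimension (info->block)
 *   bytes 36..   : explicit kernel arguments             (shader->input_size bytes)
 */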
529 static void evergreen_compute_upload_input(struct pipe_context *ctx,
530 					   const struct pipe_grid_info *info)
531 {
532 	struct r600_context *rctx = (struct r600_context *)ctx;
533 	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
534 	unsigned i;
535 	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
536 	 * parameters.
537 	 */
538 	unsigned input_size;
539 	uint32_t *num_work_groups_start;
540 	uint32_t *global_size_start;
541 	uint32_t *local_size_start;
542 	uint32_t *kernel_parameters_start;
543 	struct pipe_box box;
544 	struct pipe_transfer *transfer = NULL;
545 
546 	if (!shader)
547 		return;
548 	if (shader->input_size == 0) {
549 		return;
550 	}
551 	input_size = shader->input_size + 36;
552 	if (!shader->kernel_param) {
553 		/* Add space for the grid dimensions */
554 		shader->kernel_param = (struct r600_resource *)
555 			pipe_buffer_create(ctx->screen, 0,
556 					PIPE_USAGE_IMMUTABLE, input_size);
557 	}
558 
559 	u_box_1d(0, input_size, &box);
560 	num_work_groups_start = ctx->buffer_map(ctx,
561 			(struct pipe_resource*)shader->kernel_param,
562 			0, PIPE_MAP_WRITE | PIPE_MAP_DISCARD_RANGE,
563 			&box, &transfer);
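	/* Each implicit parameter group described above is 3 dwords (12 bytes),
	 * so the pointers below advance in steps of 3 uint32_t's. */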
564 	global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
565 	local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
566 	kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
567 
568 	/* Copy the work group size */
569 	memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
570 
571 	/* Copy the global size */
572 	for (i = 0; i < 3; i++) {
573 		global_size_start[i] = info->grid[i] * info->block[i];
574 	}
575 
576 	/* Copy the local dimensions */
577 	memcpy(local_size_start, info->block, 3 * sizeof(uint));
578 
579 	/* Copy the kernel inputs */
580 	memcpy(kernel_parameters_start, info->input, shader->input_size);
581 
582 	for (i = 0; i < (input_size / 4); i++) {
583 		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
584 			((unsigned*)num_work_groups_start)[i]);
585 	}
586 
587 	ctx->buffer_unmap(ctx, transfer);
588 
589 	/* ID=0 and ID=3 are reserved for the parameters.
590 	 * LLVM will preferably use ID=0, but it does not work for dynamic
591 	 * indices. */
592 	evergreen_cs_set_vertex_buffer(rctx, 3, 0,
593 			(struct pipe_resource*)shader->kernel_param);
594 	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
595 			(struct pipe_resource*)shader->kernel_param);
596 }
597 
598 static void evergreen_emit_dispatch(struct r600_context *rctx,
599 				    const struct pipe_grid_info *info,
600 				    uint32_t indirect_grid[3])
601 {
602 	int i;
603 	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
604 	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
605 	bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
606 	unsigned num_waves;
607 	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
608 	unsigned wave_divisor = (16 * num_pipes);
609 	int group_size = 1;
610 	unsigned lds_size = (shader->local_size + info->variable_shared_mem) / 4;
611 
612 	if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
613 	    shader->ir_type != PIPE_SHADER_IR_NIR)
614 		lds_size += shader->bc.nlds_dw;
615 
616 	/* Calculate group_size */
617 	for (i = 0; i < 3; i++) {
618 		group_size *= info->block[i];
619 	}
620 
621 	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
622 	num_waves = (info->block[0] * info->block[1] * info->block[2] +
623 			wave_divisor - 1) / wave_divisor;
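	/* Worked example (numbers for illustration only): a 16x16x1 block is 256
	 * work items; with num_pipes = 8 the divisor is 128, so
	 * num_waves = ceil(256 / 128) = 2. */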
624 
625 	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
626 				"%u wavefronts per thread block, "
627 				"allocating %u dwords lds.\n",
628 				num_pipes, num_waves, lds_size);
629 
630 	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
631 
632 	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
633 	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
634 	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
635 	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
636 
637 	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
638 								group_size);
639 
640 	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
641 	radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
642 	radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
643 	radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
644 
645 	if (rctx->b.gfx_level < CAYMAN) {
646 		assert(lds_size <= 8192);
647 	} else {
648 		/* Cayman appears to have a slightly smaller limit, see the
649 		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
650 		assert(lds_size <= 8160);
651 	}
652 
653 	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
654 					lds_size | (num_waves << 14));
655 
656 	if (info->indirect) {
657 		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
658 		radeon_emit(cs, indirect_grid[0]);
659 		radeon_emit(cs, indirect_grid[1]);
660 		radeon_emit(cs, indirect_grid[2]);
661 		radeon_emit(cs, 1);
662 	} else {
663 		/* Dispatch packet */
664 		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
665 		radeon_emit(cs, info->grid[0]);
666 		radeon_emit(cs, info->grid[1]);
667 		radeon_emit(cs, info->grid[2]);
668 		/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
669 		radeon_emit(cs, 1);
670 	}
671 
672 	if (rctx->is_debug)
673 		eg_trace_emit(rctx);
674 }
675 
676 static void compute_setup_cbs(struct r600_context *rctx)
677 {
678 	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
679 	unsigned i;
680 
681 	/* Emit colorbuffers. */
682 	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
683 	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
684 		struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
685 		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
686 						       (struct r600_resource*)cb->base.texture,
687 						       RADEON_USAGE_READWRITE |
688 						       RADEON_PRIO_SHADER_RW_BUFFER);
689 
690 		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
691 		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
692 		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
693 		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
694 		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
695 		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
696 		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
697 		radeon_emit(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */
698 
699 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
700 		radeon_emit(cs, reloc);
701 
702 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
703 		radeon_emit(cs, reloc);
704 	}
705 	for (; i < 8 ; i++)
706 		radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
707 					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
708 	for (; i < 12; i++)
709 		radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
710 					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
711 
712 	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
713 	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
714 				       rctx->compute_cb_target_mask);
715 }
716 
717 static void compute_emit_cs(struct r600_context *rctx,
718 			    const struct pipe_grid_info *info)
719 {
720 	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
721 	bool compute_dirty = false;
722 	struct r600_pipe_shader *current;
723 	struct r600_shader_atomic combined_atomics[8];
724 	uint8_t atomic_used_mask;
725 	uint32_t indirect_grid[3] = { 0, 0, 0 };
726 
727 	/* make sure that the gfx ring is the only active one */
728 	if (radeon_emitted(&rctx->b.dma.cs, 0)) {
729 		rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
730 	}
731 
732 	r600_update_compressed_resource_state(rctx, true);
733 
734 	if (!rctx->cmd_buf_is_compute) {
735 		rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
736 		rctx->cmd_buf_is_compute = true;
737 	}
738 
739 	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
740 	    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
741 		if (r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty, false)) {
742 			R600_ERR("Failed to select compute shader\n");
743 			return;
744 		}
745 
746 		current = rctx->cs_shader_state.shader->sel->current;
747 		if (compute_dirty) {
748 			rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
749 			r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
750 			r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
751 		}
752 
753 		bool need_buf_const = current->shader.uses_tex_buffers ||
754 			current->shader.has_txq_cube_array_z_comp;
755 
756 		if (info->indirect) {
757 			struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
758 			unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_MAP_READ);
759 			unsigned offset = info->indirect_offset / 4;
760 			indirect_grid[0] = data[offset];
761 			indirect_grid[1] = data[offset + 1];
762 			indirect_grid[2] = data[offset + 2];
763 		}
764 		for (int i = 0; i < 3; i++) {
765 			rctx->cs_block_grid_sizes[i] = info->block[i];
766 			rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
767 		}
768 		rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
769 		rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;
770 
771 		evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
772 		r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));
773 
774 		if (need_buf_const) {
775 			eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
776 		}
777 		r600_update_driver_const_buffers(rctx, true);
778 
779 		evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
780 		if (atomic_used_mask) {
781 			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
782 			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
783 		}
784 	} else
785 		r600_need_cs_space(rctx, 0, true, 0);
786 
787 	/* Initialize all the compute-related registers.
788 	 *
789 	 * See evergreen_init_atom_start_compute_cs() in this file for the list
790 	 * of registers initialized by the start_compute_cs_cmd atom.
791 	 */
792 	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
793 
794 	/* emit config state */
795 	if (rctx->b.gfx_level == EVERGREEN) {
796 		if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
797 		    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
798 			radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
799 			radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
800 			radeon_emit(cs, 0);
801 			radeon_emit(cs, 0);
802 			radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
803 		} else
804 			r600_emit_atom(rctx, &rctx->config_state.atom);
805 	}
806 
807 	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
808 	r600_flush_emit(rctx);
809 
810 	if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI &&
811 	    rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_NIR) {
812 
813 		compute_setup_cbs(rctx);
814 
815 		/* Emit vertex buffer state */
816 		rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
817 		r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
818 	} else {
819 		uint32_t rat_mask;
820 
821 		rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
822 		radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
823 					       rat_mask);
824 	}
825 
826 	r600_emit_atom(rctx, &rctx->b.render_cond_atom);
827 
828 	/* Emit constant buffer state */
829 	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
830 
831 	/* Emit sampler state */
832 	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
833 
834 	/* Emit sampler view (texture resource) state */
835 	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
836 
837 	/* Emit images state */
838 	r600_emit_atom(rctx, &rctx->compute_images.atom);
839 
840 	/* Emit buffers state */
841 	r600_emit_atom(rctx, &rctx->compute_buffers.atom);
842 
843 	/* Emit shader state */
844 	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
845 
846 	/* Emit dispatch state and dispatch packet */
847 	evergreen_emit_dispatch(rctx, info, indirect_grid);
848 
849 	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
850 	 */
851 	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
852 		      R600_CONTEXT_INV_VERTEX_CACHE |
853 	              R600_CONTEXT_INV_TEX_CACHE;
854 	r600_flush_emit(rctx);
855 	rctx->b.flags = 0;
856 
857 	if (rctx->b.gfx_level >= CAYMAN) {
858 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
859 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
860 		/* DEALLOC_STATE prevents the GPU from hanging when a
861 		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
862 		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
863 		 */
864 		radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
865 		radeon_emit(cs, 0);
866 	}
867 	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
868 	    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR)
869 		evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);
870 
871 #if 0
872 	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
873 	for (i = 0; i < cs->cdw; i++) {
874 		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
875 	}
876 #endif
877 
878 }
879 
880 
881 /**
882  * Emit function for r600_cs_shader_state atom
883  */
884 void evergreen_emit_cs_shader(struct r600_context *rctx,
885 			      struct r600_atom *atom)
886 {
887 	struct r600_cs_shader_state *state =
888 					(struct r600_cs_shader_state*)atom;
889 	struct r600_pipe_compute *shader = state->shader;
890 	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
891 	uint64_t va;
892 	struct r600_resource *code_bo;
893 	unsigned ngpr, nstack;
894 
895 	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
896 	    shader->ir_type == PIPE_SHADER_IR_NIR) {
897 		code_bo = shader->sel->current->bo;
898 		va = shader->sel->current->bo->gpu_address;
899 		ngpr = shader->sel->current->shader.bc.ngpr;
900 		nstack = shader->sel->current->shader.bc.nstack;
901 	} else {
902 		code_bo = shader->code_bo;
903 		va = shader->code_bo->gpu_address + state->pc;
904 		ngpr = shader->bc.ngpr;
905 		nstack = shader->bc.nstack;
906 	}
907 
908 	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
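	/* The program start address is programmed in 256-byte units, hence the
	 * va >> 8 below. */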
909 	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
910 	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
911 			S_0288D4_NUM_GPRS(ngpr) |
912 			S_0288D4_DX10_CLAMP(1) |
913 			S_0288D4_STACK_SIZE(nstack));
914 	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
915 
916 	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
917 	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
918 					      code_bo, RADEON_USAGE_READ |
919 					      RADEON_PRIO_SHADER_BINARY));
920 }
921 
922 static void evergreen_launch_grid(struct pipe_context *ctx,
923 				  const struct pipe_grid_info *info)
924 {
925 	struct r600_context *rctx = (struct r600_context *)ctx;
926 #ifdef HAVE_OPENCL
927 	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
928 	bool use_kill;
929 
930 	if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
931 	    shader->ir_type != PIPE_SHADER_IR_NIR) {
932 		rctx->cs_shader_state.pc = info->pc;
933 		/* Get the config information for this kernel. */
934 		r600_shader_binary_read_config(&shader->binary, &shader->bc,
935 					       info->pc, &use_kill);
936 	} else {
937 		use_kill = false;
938 		rctx->cs_shader_state.pc = 0;
939 	}
940 #endif
941 
942 	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
943 
944 
945 	evergreen_compute_upload_input(ctx, info);
946 	compute_emit_cs(rctx, info);
947 }
948 
949 static void evergreen_set_compute_resources(struct pipe_context *ctx,
950 					    unsigned start, unsigned count,
951 					    struct pipe_surface **surfaces)
952 {
953 	struct r600_context *rctx = (struct r600_context *)ctx;
954 	struct r600_surface **resources = (struct r600_surface **)surfaces;
955 
956 	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
957 			start, count);
958 
959 	for (unsigned i = 0; i < count; i++) {
960 		/* The first four vertex buffers are reserved for parameters and
961 		 * global buffers. */
962 		unsigned vtx_id = 4 + i;
963 		if (resources[i]) {
964 			struct r600_resource_global *buffer =
965 				(struct r600_resource_global*)
966 				resources[i]->base.texture;
967 			if (resources[i]->base.writable) {
968 				assert(i+1 < 12);
969 
970 				evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
971 				(struct r600_resource *)resources[i]->base.texture,
972 				buffer->chunk->start_in_dw*4,
973 				resources[i]->base.texture->width0);
974 			}
975 
976 			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
977 					buffer->chunk->start_in_dw * 4,
978 					resources[i]->base.texture);
979 		}
980 	}
981 }
982 
983 static void evergreen_set_global_binding(struct pipe_context *ctx,
984 					 unsigned first, unsigned n,
985 					 struct pipe_resource **resources,
986 					 uint32_t **handles)
987 {
988 	struct r600_context *rctx = (struct r600_context *)ctx;
989 	struct compute_memory_pool *pool = rctx->screen->global_pool;
990 	struct r600_resource_global **buffers =
991 		(struct r600_resource_global **)resources;
992 	unsigned i;
993 
994 	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
995 			first, n);
996 
997 	if (!resources) {
998 		/* XXX: Unset */
999 		return;
1000 	}
1001 
1002 	/* We mark these items for promotion to the pool if they
1003 	 * aren't already there */
1004 	for (i = first; i < first + n; i++) {
1005 		struct compute_memory_item *item = buffers[i]->chunk;
1006 
1007 		if (!is_item_in_pool(item))
1008 			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
1009 	}
1010 
1011 	if (compute_memory_finalize_pending(pool, ctx) == -1) {
1012 		/* XXX: Unset */
1013 		return;
1014 	}
1015 
1016 	for (i = first; i < first + n; i++)
1017 	{
1018 		uint32_t buffer_offset;
1019 		uint32_t handle;
1020 		assert(resources[i]->target == PIPE_BUFFER);
1021 		assert(resources[i]->bind & PIPE_BIND_GLOBAL);
1022 
1023 		buffer_offset = util_le32_to_cpu(*(handles[i]));
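		/* The handle written back to the caller is a byte offset into the
		 * global memory pool: the buffer's start within the pool plus
		 * whatever value the caller had already stored in the handle. */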
1024 		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
1025 
1026 		*(handles[i]) = util_cpu_to_le32(handle);
1027 	}
1028 
1029 	/* globals for writing */
1030 	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
1031 	/* globals for reading */
1032 	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
1033 				(struct pipe_resource*)pool->bo);
1034 
1035 	/* constants for reading; LLVM puts them in the text segment */
1036 	evergreen_cs_set_vertex_buffer(rctx, 2, 0,
1037 				(struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
1038 }
1039 
1040 /**
1041  * This function initializes all the compute specific registers that need to
1042  * be initialized for each compute command stream.  Registers that are common
1043  * to both compute and 3D will be initialized at the beginning of each compute
1044  * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
1045  * packet requires that the shader type bit be set, we must initialize all
1046  * context registers needed for compute in this function.  The registers
1047  * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
1048  * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
1049  * on the GPU family.
1050  */
1051 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
1052 {
1053 	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
1054 	int num_threads;
1055 	int num_stack_entries;
1056 
1057 	/* since all required registers are initialized in the
1058 	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
1059 	 */
1060 	r600_init_command_buffer(cb, 256);
1061 	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
1062 
1063 	/* We're setting config registers here. */
1064 	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
1065 	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
1066 
1067 	switch (rctx->b.family) {
1068 	case CHIP_CEDAR:
1069 	default:
1070 		num_threads = 128;
1071 		num_stack_entries = 256;
1072 		break;
1073 	case CHIP_REDWOOD:
1074 		num_threads = 128;
1075 		num_stack_entries = 256;
1076 		break;
1077 	case CHIP_JUNIPER:
1078 		num_threads = 128;
1079 		num_stack_entries = 512;
1080 		break;
1081 	case CHIP_CYPRESS:
1082 	case CHIP_HEMLOCK:
1083 		num_threads = 128;
1084 		num_stack_entries = 512;
1085 		break;
1086 	case CHIP_PALM:
1087 		num_threads = 128;
1088 		num_stack_entries = 256;
1089 		break;
1090 	case CHIP_SUMO:
1091 		num_threads = 128;
1092 		num_stack_entries = 256;
1093 		break;
1094 	case CHIP_SUMO2:
1095 		num_threads = 128;
1096 		num_stack_entries = 512;
1097 		break;
1098 	case CHIP_BARTS:
1099 		num_threads = 128;
1100 		num_stack_entries = 512;
1101 		break;
1102 	case CHIP_TURKS:
1103 		num_threads = 128;
1104 		num_stack_entries = 256;
1105 		break;
1106 	case CHIP_CAICOS:
1107 		num_threads = 128;
1108 		num_stack_entries = 256;
1109 		break;
1110 	}
1111 
1112 	/* The primitive type always needs to be POINTLIST for compute. */
1113 	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
1114 						V_008958_DI_PT_POINTLIST);
1115 
1116 	if (rctx->b.gfx_level < CAYMAN) {
1117 
1118 		/* These registers control which simds can be used by each stage.
1119 		 * The default for these registers is 0xffffffff, which means
1120 		 * all simds are available for each stage.  It's possible we may
1121 		 * want to play around with these in the future, but for now
1122 		 * the default value is fine.
1123 		 *
1124 		 * R_008E20_SQ_STATIC_THREAD_MGMT1
1125 		 * R_008E24_SQ_STATIC_THREAD_MGMT2
1126 		 * R_008E28_SQ_STATIC_THREAD_MGMT3
1127 		 */
1128 
1129 		/* XXX: We may need to adjust the thread and stack resource
1130 		 * values for 3D/compute interop */
1131 
1132 		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
1133 
1134 		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
1135 		 * Set the number of threads used by the PS/VS/GS/ES stage to
1136 		 * 0.
1137 		 */
1138 		r600_store_value(cb, 0);
1139 
1140 		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
1141 		 * Set the number of threads used by the CS (aka LS) stage to
1142 		 * the maximum number of threads and set the number of threads
1143 		 * for the HS stage to 0. */
1144 		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
1145 
1146 		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
1147 		 * Set the Control Flow stack entries to 0 for PS/VS stages */
1148 		r600_store_value(cb, 0);
1149 
1150 		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
1151 		 * Set the Control Flow stack entries to 0 for GS/ES stages */
1152 		r600_store_value(cb, 0);
1153 
1154 		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
1155 		 * Set the Control Flow stack entries to 0 for the HS stage, and
1156 		 * set it to the maximum value for the CS (aka LS) stage. */
1157 		r600_store_value(cb,
1158 			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1159 	}
1160 	/* Give the compute shader all the available LDS space.
1161 	 * NOTE: This only sets the maximum number of dwords that a compute
1162 	 * shader can allocate.  When a shader is executed, we still need to
1163 	 * allocate the appropriate amount of LDS dwords using the
1164 	 * CM_R_0288E8_SQ_LDS_ALLOC register.
1165 	 */
1166 	if (rctx->b.gfx_level < CAYMAN) {
1167 		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1168 			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1169 	} else {
1170 		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1171 			S_0286FC_NUM_PS_LDS(0) |
1172 			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1173 	}
1174 
1175 	/* Context Registers */
1176 
1177 	if (rctx->b.gfx_level < CAYMAN) {
1178 		/* workaround for hw issues with dyn gpr - must set all limits
1179 		 * to 240 instead of 0, 0x1e == 240 / 8
1180 		 */
1181 		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1182 				S_028838_PS_GPRS(0x1e) |
1183 				S_028838_VS_GPRS(0x1e) |
1184 				S_028838_GS_GPRS(0x1e) |
1185 				S_028838_ES_GPRS(0x1e) |
1186 				S_028838_HS_GPRS(0x1e) |
1187 				S_028838_LS_GPRS(0x1e));
1188 	}
1189 
1190 	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1191 	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1192 		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1193 
1194 	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1195 
1196 	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
1197 			       S_0286E8_TID_IN_GROUP_ENA(1) |
1198 			       S_0286E8_TGID_ENA(1) |
1199 			       S_0286E8_DISABLE_INDEX_PACK(1));
1200 
1201 	/* The LOOP_CONST registers are an optimization for loops that allows
1202 	 * you to store the initial counter, increment value, and maximum
1203 	 * counter value in a register so that the hardware can calculate the
1204 	 * correct number of iterations for the loop, so that you don't need
1205 	 * to keep the loop counter in your shader code.  We don't currently use
1206 	 * this optimization, so we must keep track of the counter in the
1207 	 * shader and use a break instruction to exit loops.  However, the
1208 	 * hardware will still use this register to determine when to exit a
1209 	 * loop, so we need to initialize the counter to 0, set the increment
1210 	 * value to 1 and the maximum counter value to 4095 (0xfff), which
1211 	 * is the maximum value allowed.  This gives us a maximum of 4096
1212 	 * iterations for our loops, but hopefully our break instruction will
1213 	 * execute some time before the 4096th iteration.
1214 	 */
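	/* Presumably, per the description above, 0x1000FFF packs an increment of 1
	 * in the top byte, an initial counter of 0, and a maximum count of 0xfff
	 * in the low 12 bits. */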
1215 	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
1216 }
1217 
1218 
1219 static void evergreen_get_compute_state_info(struct pipe_context *ctx, void *state,
1220                                              struct pipe_compute_state_object_info *info)
1221 {
1222 	struct r600_context *rctx = (struct r600_context*)ctx;
1223 	struct r600_pipe_compute *shader = state;
1224 
1225 	/* This is more or less copied from RadeonSI, but in truth it is no more
1226 	 * than an educated guess. */
1227 	uint8_t wave_size = r600_wavefront_size(rctx->b.screen->family);
1228 	info->private_memory = shader->sel->current->scratch_space_needed;
1229 	info->preferred_simd_size = wave_size;
1230 	info->simd_sizes = wave_size;
1231 	info->max_threads = 128;
1232 }
1233 
1234 void evergreen_init_compute_state_functions(struct r600_context *rctx)
1235 {
1236 	rctx->b.b.create_compute_state = evergreen_create_compute_state;
1237 	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
1238 	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
1239 //	 rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
1240 	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
1241 	rctx->b.b.set_global_binding = evergreen_set_global_binding;
1242 	rctx->b.b.launch_grid = evergreen_launch_grid;
1243 	rctx->b.b.get_compute_state_info = evergreen_get_compute_state_info;
1244 }
1245 
1246 void *r600_compute_global_transfer_map(struct pipe_context *ctx,
1247 				      struct pipe_resource *resource,
1248 				      unsigned level,
1249 				      unsigned usage,
1250 				      const struct pipe_box *box,
1251 				      struct pipe_transfer **ptransfer)
1252 {
1253 	struct r600_context *rctx = (struct r600_context*)ctx;
1254 	struct compute_memory_pool *pool = rctx->screen->global_pool;
1255 	struct r600_resource_global* buffer =
1256 		(struct r600_resource_global*)resource;
1257 
1258 	struct compute_memory_item *item = buffer->chunk;
1259 	struct pipe_resource *dst = NULL;
1260 	unsigned offset = box->x;
1261 
1262 	if (usage & PIPE_MAP_READ)
1263 		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1264 
1265 	if (usage & PIPE_MAP_WRITE)
1266 		buffer->chunk->status |= ITEM_MAPPED_FOR_WRITING;
1267 
1268 	if (is_item_in_pool(item)) {
1269 		compute_memory_demote_item(pool, item, ctx);
1270 	}
1271 	else {
1272 		if (item->real_buffer == NULL) {
1273 			item->real_buffer =
1274 					r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1275 		}
1276 	}
1277 
1278 	dst = (struct pipe_resource*)item->real_buffer;
1279 
1280 	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1281 			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1282 			"width = %u, height = %u, depth = %u)\n", level, usage,
1283 			box->x, box->y, box->z, box->width, box->height,
1284 			box->depth);
1285 	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1286 		"%u (box.x)\n", item->id, box->x);
1287 
1288 
1289 	assert(resource->target == PIPE_BUFFER);
1290 	assert(resource->bind & PIPE_BIND_GLOBAL);
1291 	assert(box->x >= 0);
1292 	assert(box->y == 0);
1293 	assert(box->z == 0);
1294 
1295 	if (buffer->base.b.is_user_ptr)
1296 		return NULL;
1297 
1298 	///TODO: do it better, mapping is not possible if the pool is too big
1299 	return pipe_buffer_map_range(ctx, dst,
1300 			offset, box->width, usage & ~PIPE_MAP_READ, ptransfer);
1301 }
1302 
1303 void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
1304 					struct pipe_transfer *transfer)
1305 {
1306 	/* struct r600_resource_global are not real resources, they just map
1307 	 * to an offset within the compute memory pool.  The function
1308 	 * r600_compute_global_transfer_map() maps the memory pool
1309 	 * resource rather than the struct r600_resource_global passed to
1310 	 * it as an argument and then initializes ptransfer->resource with
1311 	 * the memory pool resource (via pipe_buffer_map_range).
1312 	 * When transfer_unmap is called, it uses the memory pool's
1313 	 * vtable, which calls r600_buffer_transfer_unmap() rather than
1314 	 * this function.
1315 	 */
1316 	assert (!"This function should not be called");
1317 }
1318 
1319 void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
1320 					struct pipe_resource *res)
1321 {
1322 	struct r600_resource_global* buffer = NULL;
1323 	struct r600_screen* rscreen = NULL;
1324 
1325 	assert(res->target == PIPE_BUFFER);
1326 	assert(res->bind & PIPE_BIND_GLOBAL);
1327 
1328 	buffer = (struct r600_resource_global*)res;
1329 	rscreen = (struct r600_screen*)screen;
1330 
1331 	compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1332 	buffer->chunk = NULL;
1333 
1334 	if (buffer->base.b.is_user_ptr)
1335 		r600_buffer_destroy(screen, res);
1336 	else
1337 		free(res);
1338 }
1339 
1340 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1341 							const struct pipe_resource *templ)
1342 {
1343 	struct r600_resource_global* result = NULL;
1344 	struct r600_screen* rscreen = NULL;
1345 	int size_in_dw = 0;
1346 
1347 	assert(templ->target == PIPE_BUFFER);
1348 	assert(templ->bind & PIPE_BIND_GLOBAL);
1349 	assert(templ->array_size == 1 || templ->array_size == 0);
1350 	assert(templ->depth0 == 1 || templ->depth0 == 0);
1351 	assert(templ->height0 == 1 || templ->height0 == 0);
1352 
1353 	result = (struct r600_resource_global*)
1354 	CALLOC(sizeof(struct r600_resource_global), 1);
1355 	rscreen = (struct r600_screen*)screen;
1356 
1357 	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1358 	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1359 			templ->array_size);
1360 
1361 	result->base.b.b = *templ;
1362 	result->base.b.b.screen = screen;
1363 	result->base.compute_global_bo = true;
1364 	pipe_reference_init(&result->base.b.b.reference, 1);
1365 
1366 	size_in_dw = (templ->width0+3) / 4;
1367 
1368 	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1369 
1370 	if (result->chunk == NULL)
1371 	{
1372 		free(result);
1373 		return NULL;
1374 	}
1375 
1376 	return &result->base.b.b;
1377 }
1378