/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#include "sb/sb_public.h"
#include "radeon/radeon_elf_util.h"
#include <inttypes.h>

/**
RAT0 is for global binding write
VTX1 is for global binding read

for writing images RAT1...
for reading images TEX2...
  TEX2-RAT1 is paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
  CONST0 binds the smaller input parameter buffer, and is used for constant
  indexing; it is also constant cached
  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  the constant cache can handle

RATs are limited to 12, so we can bind at most 11 textures for writing
because we reserve RAT0 for global bindings. With byte addressing enabled,
we should reserve another one too => 10 image bindings for writing max.

from Nvidia OpenCL:
  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8

so 10 for writing is enough. 176 is the max for reading according to the docs

writable images should be listed first (id < 10), so their id corresponds to
RAT(id+1)
writable images will consume TEX slots, and VTX slots too, because of linear
indexing

*/

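/* A compact summary of the binding plan described above (illustrative):
 *
 *   slot       purpose
 *   --------   --------------------------------------------
 *   CONST0     kernel arguments, small/constant-indexed path
 *   VTX0       kernel arguments, indirect or large path
 *   RAT0       global memory pool, writes
 *   VTX1       global memory pool, reads
 *   RAT1...    writable images (image id i maps to RAT(i+1))
 *   TEX2...    readable images, paired with the RATs
 */
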
struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
						     unsigned size)
{
	struct pipe_resource *buffer = NULL;
	assert(size);

	buffer = pipe_buffer_create((struct pipe_screen*) screen,
				    0, PIPE_USAGE_IMMUTABLE, size);

	return (struct r600_resource *)buffer;
}

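/* Bind a buffer as a RAT (Random Access Target) by adding it to the
 * framebuffer state as a color buffer; compute shaders then write to it
 * through the CB hardware. */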
static void evergreen_set_rat(struct r600_pipe_compute *pipe,
			      unsigned id,
			      struct r600_resource *bo,
			      int start,
			      int size)
{
	struct pipe_surface rat_templ;
	struct r600_surface *surf = NULL;
	struct r600_context *rctx = NULL;

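	/* Hardware limits, as checked below: at most 12 RATs, a size that is
	 * a multiple of 4 bytes, and a start offset aligned to 256 bytes. */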
	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	rctx = pipe->ctx;

	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);

	/* Create the RAT surface */
	memset(&rat_templ, 0, sizeof(rat_templ));
	rat_templ.format = PIPE_FORMAT_R32_UINT;
	rat_templ.u.tex.level = 0;
	rat_templ.u.tex.first_layer = 0;
	rat_templ.u.tex.last_layer = 0;

	/* Add the RAT to the list of color buffers */
	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
		(struct pipe_context *)pipe->ctx,
		(struct pipe_resource *)bo, &rat_templ);

	/* Update the number of color buffers */
	pipe->ctx->framebuffer.state.nr_cbufs =
		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);

	/* Update the cb_target_mask
	 * XXX: I think this is a potential spot for bugs once we start doing
	 * GL interop.  cb_target_mask may be modified in the 3D sections
	 * of this driver. */
	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));

	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
	evergreen_init_color_surface_rat(rctx, surf);
}

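/* Point one of the compute-shader vertex buffer slots at an existing buffer.
 * The data is fetched with VTX instructions; the stride of 1 makes the
 * fetches byte-addressed. */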
static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
					   unsigned vb_index,
					   unsigned offset,
					   struct pipe_resource *buffer)
{
	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
	vb->stride = 1;
	vb->buffer_offset = offset;
	vb->buffer = buffer;
	vb->user_buffer = NULL;

	/* The vertex instructions in the compute shaders use the texture cache,
	 * so we need to invalidate it. */
	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
	state->enabled_mask |= 1 << vb_index;
	state->dirty_mask |= 1 << vb_index;
	r600_mark_atom_dirty(rctx, &state->atom);
}

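/* Bind a range of an existing buffer as a compute constant buffer
 * (see CONST0 in the binding overview at the top of this file). */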
static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
					     unsigned cb_index,
					     unsigned offset,
					     unsigned size,
					     struct pipe_resource *buffer)
{
	struct pipe_constant_buffer cb;
	cb.buffer_size = size;
	cb.buffer_offset = offset;
	cb.buffer = buffer;
	cb.user_buffer = NULL;

	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
}

/* We need to define these R600 registers here, because we can't include
 * both evergreend.h and r600d.h.
 */
#define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
#define R_028850_SQ_PGM_RESOURCES_PS                 0x028850

#ifdef HAVE_OPENCL

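/* The config section of the binary is a list of (register, value) pairs of
 * little-endian dwords; fold the resource registers we care about into the
 * bytecode's GPR, stack and LDS requirements. */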
static void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
					   struct r600_bytecode *bc,
					   uint64_t symbol_offset,
					   boolean *use_kill)
{
	unsigned i;
	const unsigned char *config =
		radeon_shader_binary_config_start(binary, symbol_offset);

	for (i = 0; i < binary->config_size_per_symbol; i += 8) {
		unsigned reg =
			util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value =
			util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		/* R600 / R700 */
		case R_028850_SQ_PGM_RESOURCES_PS:
		case R_028868_SQ_PGM_RESOURCES_VS:
		/* Evergreen / Northern Islands */
		case R_028844_SQ_PGM_RESOURCES_PS:
		case R_028860_SQ_PGM_RESOURCES_VS:
		case R_0288D4_SQ_PGM_RESOURCES_LS:
			bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
			bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
			break;
		case R_02880C_DB_SHADER_CONTROL:
			*use_kill = G_02880C_KILL_ENABLE(value);
			break;
		case R_0288E8_SQ_LDS_ALLOC:
			bc->nlds_dw = value;
			break;
		}
	}
}

static unsigned r600_create_shader(struct r600_bytecode *bc,
				   const struct radeon_shader_binary *binary,
				   boolean *use_kill)
{
	assert(binary->code_size % 4 == 0);
	bc->bytecode = CALLOC(1, binary->code_size);
	memcpy(bc->bytecode, binary->code, binary->code_size);
	bc->ndw = binary->code_size / 4;

	r600_shader_binary_read_config(binary, bc, 0, use_kill);
	return 0;
}

#endif

static void r600_destroy_shader(struct r600_bytecode *bc)
{
	FREE(bc->bytecode);
}

static void *evergreen_create_compute_state(struct pipe_context *ctx,
					    const struct pipe_compute_state *cso)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header *header;
	const char *code;
	void *p;
	boolean use_kill;

	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
	radeon_shader_binary_init(&shader->binary);
	radeon_elf_read(code, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	/* Upload code + ROdata */
	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
							shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
	//TODO: use util_memcpy_cpu_to_le32 ?
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	rctx->b.ws->buffer_unmap(shader->code_bo->buf);
#endif

	shader->ctx = rctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	return shader;
}

static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = state;

	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");

	if (!shader)
		return;

	radeon_shader_binary_clean(&shader->binary);
	r600_destroy_shader(&shader->bc);

	/* TODO: destroy shader->code_bo, shader->const_bo;
	 * we'll need something like r600_buffer_free */
	FREE(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;

	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");

	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}

/* The kernel parameters are stored in a vtx buffer (ID=0).  Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well.  Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
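/* For example (illustrative): a launch with grid = (4, 2, 1) work groups of
 * block = (8, 8, 1) work items would fill the buffer as:
 *
 *   DWORDS 0-2: 4, 2, 1      (number of work groups)
 *   DWORDS 3-5: 32, 16, 1    (global work items, grid * block)
 *   DWORDS 6-8: 8, 8, 1      (work items per work group)
 *   DWORDS 9+ : the kernel arguments from info->input
 */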
static void evergreen_compute_upload_input(struct pipe_context *ctx,
					   const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned i;
	/* We need to reserve 9 dwords (36 bytes) for the implicit kernel
	 * parameters.
	 */
	unsigned input_size = shader->input_size + 36;
	uint32_t *num_work_groups_start;
	uint32_t *global_size_start;
	uint32_t *local_size_start;
	uint32_t *kernel_parameters_start;
	struct pipe_box box;
	struct pipe_transfer *transfer = NULL;

	if (shader->input_size == 0) {
		return;
	}

	if (!shader->kernel_param) {
		/* Add space for the grid dimensions */
		shader->kernel_param = (struct r600_resource *)
			pipe_buffer_create(ctx->screen, 0,
					PIPE_USAGE_IMMUTABLE, input_size);
	}

	u_box_1d(0, input_size, &box);
	num_work_groups_start = ctx->transfer_map(ctx,
			(struct pipe_resource*)shader->kernel_param,
			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
			&box, &transfer);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));

	/* Copy the number of work groups in each dimension */
	memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));

	/* Compute and store the global work size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = info->grid[i] * info->block[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, info->block, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, info->input, shader->input_size);

	for (i = 0; i < (input_size / 4); i++) {
		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->transfer_unmap(ctx, transfer);

	/* ID=0 and ID=3 are reserved for the parameters.
	 * LLVM will preferably use ID=0, but it does not work for dynamic
	 * indices. */
	evergreen_cs_set_vertex_buffer(rctx, 3, 0,
			(struct pipe_resource*)shader->kernel_param);
	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
			(struct pipe_resource*)shader->kernel_param);
}

static void evergreen_emit_dispatch(struct r600_context *rctx,
				    const struct pipe_grid_info *info)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	unsigned lds_size = shader->local_size / 4 +
		shader->bc.nlds_dw;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= info->block[i];
	}

	for (i = 0; i < 3; i++)	{
		grid_size *= info->grid[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (info->block[0] * info->block[1] * info->block[2] +
			wave_divisor - 1) / wave_divisor;
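	/* For example (illustrative): block = (16, 16, 1) with num_pipes = 8
	 * gives ceil(256 / 128) = 2 wavefronts per thread block. */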

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords lds.\n",
				num_pipes, num_waves, lds_size);

	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	/* Dispatch packet */
	radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	radeon_emit(cs, info->grid[0]);
	radeon_emit(cs, info->grid[1]);
	radeon_emit(cs, info->grid[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	radeon_emit(cs, 1);
}

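/* Build the command stream for one dispatch: start-of-CS state, the bound
 * RATs (emitted as color buffers), vertex/constant/sampler state, the
 * shader itself, and finally the DISPATCH_DIRECT packet. */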
static void compute_emit_cs(struct r600_context *rctx,
			    const struct pipe_grid_info *info)
{
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	unsigned i;

	/* Make sure the gfx ring is the only one active by flushing any
	 * pending DMA work. */
	if (radeon_emitted(rctx->b.dma.cs, 0)) {
		rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);

	/* emit config state */
	if (rctx->b.chip_class == EVERGREEN)
		r600_emit_atom(rctx, &rctx->config_state.atom);

	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(rctx);

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE,
						       RADEON_PRIO_SHADER_RW_BUFFER);

		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);	/* R_028C78_CB_COLOR0_DIM */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	for (; i < 8 ; i++)
		radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
	for (; i < 12; i++)
		radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
					rctx->compute_cb_target_mask);

	/* Emit vertex buffer state */
	rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);

	/* Emit constant buffer state */
	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit sampler state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);

	/* Emit sampler view (texture resource) state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);

	/* Emit compute shader state */
	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_dispatch(rctx, info);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
	 */
	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			 R600_CONTEXT_INV_VERTEX_CACHE |
			 R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(rctx);
	rctx->b.flags = 0;

	if (rctx->b.chip_class >= CAYMAN) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
		radeon_emit(cs, 0);
	}

#if 0
	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif
}

/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(struct r600_context *rctx,
			      struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
					(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	uint64_t va;
	struct r600_resource *code_bo;
	unsigned ngpr, nstack;

	code_bo = shader->code_bo;
	va = shader->code_bo->gpu_address + state->pc;
	ngpr = shader->bc.ngpr;
	nstack = shader->bc.nstack;

	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(ngpr)
			| S_0288D4_STACK_SIZE(nstack));
	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
					      code_bo, RADEON_USAGE_READ,
					      RADEON_PRIO_SHADER_BINARY));
}

static void evergreen_launch_grid(struct pipe_context *ctx,
				  const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
#ifdef HAVE_OPENCL
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	boolean use_kill;

	rctx->cs_shader_state.pc = info->pc;
	/* Get the config information for this kernel. */
	r600_shader_binary_read_config(&shader->binary, &shader->bc,
				       info->pc, &use_kill);
#endif

	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);

	evergreen_compute_upload_input(ctx, info);
	compute_emit_cs(rctx, info);
}

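/* Bind the given pipe_surfaces as compute resources: writable ones get a
 * RAT (at id+1, RAT0 being reserved for the global pool) and every one gets
 * a vertex buffer slot starting at index 4. */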
static void evergreen_set_compute_resources(struct pipe_context *ctx,
					    unsigned start, unsigned count,
					    struct pipe_surface **surfaces)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
			start, count);

	for (unsigned i = 0; i < count; i++) {
		/* The first four vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 4 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
					(struct r600_resource *)resources[i]->base.texture,
					buffer->chunk->start_in_dw*4,
					resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}

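/* Promote the given buffers into the global memory pool and write each
 * handle back as the buffer's byte offset within the pool, which is what
 * the kernel uses as its device address. */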
static void evergreen_set_global_binding(struct pipe_context *ctx,
					 unsigned first, unsigned n,
					 struct pipe_resource **resources,
					 uint32_t **handles)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;
	unsigned i;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	/* We mark these items for promotion to the pool if they
	 * aren't already there. */
	for (i = first; i < first + n; i++) {
		struct compute_memory_item *item = buffers[i]->chunk;

		if (!is_item_in_pool(item))
			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
	}

	if (compute_memory_finalize_pending(pool, ctx) == -1) {
		/* XXX: Unset */
		return;
	}

	for (i = first; i < first + n; i++) {
		uint32_t buffer_offset;
		uint32_t handle;
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		buffer_offset = util_le32_to_cpu(*(handles[i]));
		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;

		*(handles[i]) = util_cpu_to_le32(handle);
	}

	/* globals for writing */
	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	/* globals for reading */
	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
				(struct pipe_resource*)pool->bo);

	/* constants for reading; LLVM puts them in the text segment */
	evergreen_cs_set_vertex_buffer(rctx, 2, 0,
				(struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
}

/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream.  Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function.  The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
{
	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* since all required registers are initialized in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	/* This must be first. */
	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
	r600_store_value(cb, 0x80000000);
	r600_store_value(cb, 0x80000000);

	/* We're setting config registers here. */
	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
	switch (rctx->b.family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* Config Registers */
	if (rctx->b.chip_class < CAYMAN)
		evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
					   rctx->screen->b.info.drm_minor);
	else
		cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
					rctx->screen->b.info.drm_minor);

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
						V_008958_DI_PT_POINTLIST);

	if (rctx->b.chip_class < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage.  It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}
	/* Give the compute shader all the available LDS space.
	 * NOTE: This only sets the maximum number of dwords that a compute
	 * shader can allocate.  When a shader is executed, we still need to
	 * allocate the appropriate amount of LDS dwords using the
	 * CM_R_0288E8_SQ_LDS_ALLOC register.
	 */
	if (rctx->b.chip_class < CAYMAN) {
		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
	} else {
		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
			S_0286FC_NUM_PS_LDS(0) |
			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
	}

	/* Context Registers */

	if (rctx->b.chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
				S_0286E8_TID_IN_GROUP_ENA |
				S_0286E8_TGID_ENA |
				S_0286E8_DISABLE_INDEX_PACK);

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code.  We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops.  However, the
	 * hardware will still use this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1, and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed.  This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
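	/* Breakdown of the value below, assuming the usual SQ_LOOP_CONST
	 * field layout (count in [11:0], init in [23:12], increment in
	 * [31:24]): 0x1000FFF = increment 1, init 0, count 0xfff. */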
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}

void evergreen_init_compute_state_functions(struct r600_context *rctx)
{
	rctx->b.b.create_compute_state = evergreen_create_compute_state;
	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
//	rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
	rctx->b.b.set_global_binding = evergreen_set_global_binding;
	rctx->b.b.launch_grid = evergreen_launch_grid;
}

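/* Map a global buffer for CPU access.  The item is demoted out of the
 * memory pool into its own buffer (allocating one if it never had one), and
 * that buffer is what actually gets mapped. */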
static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
					      struct pipe_resource *resource,
					      unsigned level,
					      unsigned usage,
					      const struct pipe_box *box,
					      struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)resource;

	struct compute_memory_item *item = buffer->chunk;
	struct pipe_resource *dst = NULL;
	unsigned offset = box->x;

	if (is_item_in_pool(item)) {
		compute_memory_demote_item(pool, item, ctx);
	} else {
		if (item->real_buffer == NULL) {
			item->real_buffer =
				r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
		}
	}

	dst = (struct pipe_resource*)item->real_buffer;

	if (usage & PIPE_TRANSFER_READ)
		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);
	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
		"%u (box.x)\n", item->id, box->x);

	assert(resource->target == PIPE_BUFFER);
	assert(resource->bind & PIPE_BIND_GLOBAL);
	assert(box->x >= 0);
	assert(box->y == 0);
	assert(box->z == 0);

	//TODO: do this better; mapping is not possible if the pool is too big
	return pipe_buffer_map_range(ctx, dst,
			offset, box->width, usage, ptransfer);
}

static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
					       struct pipe_transfer *transfer)
{
	/* struct r600_resource_global are not real resources, they just map
	 * to an offset within the compute memory pool.  The function
	 * r600_compute_global_transfer_map() maps the memory pool
	 * resource rather than the struct r600_resource_global passed to
	 * it as an argument and then initializes ptransfer->resource with
	 * the memory pool resource (via pipe_buffer_map_range).
	 * When transfer_unmap is called it uses the memory pool's
	 * vtable, which calls r600_buffer_transfer_unmap() rather than
	 * this function.
	 */
	assert (!"This function should not be called");
}

static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
						      struct pipe_transfer *transfer,
						      const struct pipe_box *box)
{
	assert(0 && "TODO");
}

static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
					       struct pipe_resource *res)
{
	struct r600_resource_global* buffer = NULL;
	struct r600_screen* rscreen = NULL;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global*)res;
	rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

static const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle,             /* get_handle */
	r600_compute_global_buffer_destroy,        /* resource_destroy */
	r600_compute_global_transfer_map,          /* transfer_map */
	r600_compute_global_transfer_flush_region, /* transfer_flush_region */
	r600_compute_global_transfer_unmap,        /* transfer_unmap */
};

struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
							const struct pipe_resource *templ)
{
	struct r600_resource_global* result = NULL;
	struct r600_screen* rscreen = NULL;
	int size_in_dw = 0;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b = *templ;
	result->base.b.b.screen = screen;
	pipe_reference_init(&result->base.b.b.reference, 1);

	size_in_dw = (templ->width0 + 3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL) {
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}