/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 * SPDX-License-Identifier: MIT
 */

#include "nir_serialize.h"
#include "pipe/p_defines.h"
#include "r600_asm.h"
#include "r600_isa.h"
#include "r600_sq.h"
#include "r600_formats.h"
#include "r600_opcodes.h"
#include "r600_sfn.h"
#include "r600_shader.h"
#include "r600_dump.h"
#include "r600d.h"
#include "sfn/sfn_nir.h"

#include "pipe/p_shader_tokens.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_scan.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_from_mesa.h"
#include "nir/tgsi_to_nir.h"
#include "nir/nir_to_tgsi_info.h"
#include "compiler/nir/nir.h"
#include "util/macros.h"
#include "util/u_bitcast.h"
#include "util/u_dump.h"
#include "util/u_endian.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include <assert.h>
#include <stdio.h>
#include <errno.h>

/* CAYMAN notes
Why CAYMAN got loops for lots of instructions is explained here.

-These 8xx t-slot only ops are implemented in all vector slots.
MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
These 8xx t-slot only opcodes become vector ops, with all four
slots expecting the arguments on sources a and b. Result is
broadcast to all channels.
MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
These 8xx t-slot only opcodes become vector ops in the z, y, and
x slots.
EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
SQRT_IEEE/_64
SIN/COS
The w slot may have an independent co-issued operation, or if the
result is required to be in the w slot, the opcode above may be
issued in the w slot as well.
The compiler must issue the source argument to slots z, y, and x
*/

/* Contents of r0 on entry to various shaders

 VS - .x = VertexID
      .y = RelVertexID (??)
      .w = InstanceID

 GS - r0.xyw, r1.xyz = per-vertex offsets
      r0.z = PrimitiveID

 TCS - .x = PatchID
       .y = RelPatchID (??)
       .z = InvocationID
       .w = tess factor base.

 TES - .x = TessCoord.x
     - .y = TessCoord.y
     - .z = RelPatchID (??)
     - .w = PrimitiveID

 PS - face_gpr.z = SampleMask
      face_gpr.w = SampleID
*/

static void r600_dump_streamout(struct pipe_stream_output_info *so)
{
	unsigned i;

	fprintf(stderr, "STREAMOUT\n");
	for (i = 0; i < so->num_outputs; i++) {
		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
				so->output[i].start_component;
		fprintf(stderr, "  %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
			i,
			so->output[i].stream,
			so->output[i].output_buffer,
			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
			so->output[i].register_index,
			mask & 1 ? "x" : "",
			mask & 2 ? "y" : "",
			mask & 4 ? "z" : "",
			mask & 8 ? "w" : "",
			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
	}
}

static int store_shader(struct pipe_context *ctx,
			struct r600_pipe_shader *shader)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	uint32_t *ptr, i;

	if (shader->bo == NULL) {
		shader->bo = (struct r600_resource*)
			pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
		if (shader->bo == NULL) {
			return -ENOMEM;
		}
		ptr = r600_buffer_map_sync_with_rings(
			&rctx->b, shader->bo,
			PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
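		/* The bytecode is uploaded as little-endian dwords; on big-endian
		 * hosts each word is byte-swapped first (the GPU presumably reads
		 * the program as little-endian). */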
		if (UTIL_ARCH_BIG_ENDIAN) {
			for (i = 0; i < shader->shader.bc.ndw; ++i) {
				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
			}
		} else {
			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
		}
		rctx->b.ws->buffer_unmap(rctx->b.ws, shader->bo->buf);
	}

	return 0;
}

extern const struct nir_shader_compiler_options r600_nir_options;
static int nshader = 0;
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	const nir_shader_compiler_options *nir_options =
		(const nir_shader_compiler_options *)
			ctx->screen->get_compiler_options(ctx->screen,
		                                     PIPE_SHADER_IR_NIR,
		                                     shader->shader.processor_type);
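	/* Non-TGSI selectors keep their NIR serialized in a blob between
	 * compiles; if the in-memory NIR was already freed after a previous
	 * compile, rebuild it from the blob before compiling this variant. */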
	if (!sel->nir && !(sel->ir_type == PIPE_SHADER_IR_TGSI)) {
		assert(sel->nir_blob);
		struct blob_reader blob_reader;
		blob_reader_init(&blob_reader, sel->nir_blob, sel->nir_blob_size);
		sel->nir = nir_deserialize(NULL, nir_options, &blob_reader);
	}

	int processor = sel->ir_type == PIPE_SHADER_IR_TGSI ?
		tgsi_get_processor_type(sel->tokens):
		pipe_shader_type_from_mesa(sel->nir->info.stage);

	bool dump = r600_can_dump_shader(&rctx->screen->b, processor);

	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	{
		glsl_type_singleton_init_or_ref();
		if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
			if (sel->nir)
				ralloc_free(sel->nir);
			if (sel->nir_blob) {
				free(sel->nir_blob);
				sel->nir_blob = NULL;
			}
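			/* The backend only compiles from NIR, so translate TGSI
			 * input to NIR first. */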
			sel->nir = tgsi_to_nir(sel->tokens, ctx->screen, true);
			/* Lower int64 ops because we have some r600 built-in shaders that use it */
			if (nir_options->lower_int64_options) {
				NIR_PASS_V(sel->nir, nir_lower_alu_to_scalar, r600_lower_to_scalar_instr_filter, NULL);
				NIR_PASS_V(sel->nir, nir_lower_int64);
			}
			NIR_PASS_V(sel->nir, nir_lower_flrp, ~0, false);
		}
		nir_tgsi_scan_shader(sel->nir, &sel->info, true);

		r = r600_shader_from_nir(rctx, shader, &key);

		glsl_type_singleton_decref();

		if (r) {
			fprintf(stderr, "--Failed shader--------------------------------------------------\n");

			if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
				fprintf(stderr, "--TGSI--------------------------------------------------------\n");
				tgsi_dump(sel->tokens, 0);
			}

			fprintf(stderr, "--NIR --------------------------------------------------------\n");
			nir_print_shader(sel->nir, stderr);

			R600_ERR("translation from NIR failed !\n");
			goto error;
		}
	}

	if (dump) {
		if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
			fprintf(stderr, "--TGSI--------------------------------------------------------\n");
			tgsi_dump(sel->tokens, 0);
		}

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}

	/* Check if the bytecode has already been built. */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");

		print_shader_info(stderr, nshader++, &shader->shader);
		print_pipe_info(stderr, &sel->info);
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r600_bytecode_disasm(&shader->gs_copy_shader->shader.bc);
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case PIPE_SHADER_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case PIPE_SHADER_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case PIPE_SHADER_GEOMETRY:
		if (rctx->b.gfx_level >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case PIPE_SHADER_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.gfx_level >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_FRAGMENT:
		if (rctx->b.gfx_level >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_COMPUTE:
		evergreen_update_ls_state(ctx, shader);
		break;
	default:
		r = -EINVAL;
		goto error;
	}

	util_debug_message(&rctx->b.debug, SHADER_INFO, "%s shader: %d dw, %d gprs, %d alu_groups, %d loops, %d cf, %d stack",
		           _mesa_shader_stage_to_abbrev(tgsi_processor_to_shader_stage(processor)),
	                   shader->shader.bc.ndw,
	                   shader->shader.bc.ngpr,
			   shader->shader.bc.nalu_groups,
			   shader->shader.num_loops,
			   shader->shader.bc.ncf,
			   shader->shader.bc.nstack);

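	/* Cache the NIR in serialized form on the selector and drop the
	 * in-memory copy below; it is deserialized again (see the top of this
	 * function) when another variant needs to be compiled. */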
	if (!sel->nir_blob && sel->nir && sel->ir_type != PIPE_SHADER_IR_TGSI) {
		struct blob blob;
		blob_init(&blob);
		nir_serialize(&blob, sel->nir, false);
		sel->nir_blob = malloc(blob.size);
		memcpy(sel->nir_blob, blob.data, blob.size);
		sel->nir_blob_size = blob.size;
		blob_finish(&blob);
	}
	ralloc_free(sel->nir);
	sel->nir = NULL;

	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}

void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
{
	r600_resource_reference(&shader->bo, NULL);
	if (list_is_linked(&shader->shader.bc.cf))
		r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);

	if (shader->shader.arrays)
		free(shader->shader.arrays);
}

struct r600_shader_ctx {
	unsigned				type;
	unsigned				temp_reg;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	uint32_t				max_driver_temp_used;
	unsigned				enabled_stream_buffers_mask;
};

void *r600_create_vertex_fetch_shader(struct pipe_context *ctx,
				      unsigned count,
				      const struct pipe_vertex_element *elements)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_bytecode bc;
	struct r600_bytecode_vtx vtx;
	const struct util_format_description *desc;
	unsigned fetch_resource_start = rctx->b.gfx_level >= EVERGREEN ? 0 : 160;
	unsigned format, num_format, format_comp, endian;
	uint32_t *bytecode;
	int i, j, r, fs_size;
	uint32_t buffer_mask = 0;
	struct r600_fetch_shader *shader;

	assert(count < 32);

	/* Allocate the CSO. */
	shader = CALLOC_STRUCT(r600_fetch_shader);
	if (unlikely(!shader))
		return NULL;

	memset(&bc, 0, sizeof(bc));
	r600_bytecode_init(&bc, rctx->b.gfx_level, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	bc.isa = rctx->isa;

	for (i = 0; i < count; i++) {
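		/* For instanced elements with divisor > 1, InstanceID / divisor is
		 * computed with a fixed-point reciprocal: MULHI_UINT(InstanceID,
		 * 2^32 / divisor + 1) keeps the high 32 bits of the product, which
		 * effectively is the quotient. InstanceID arrives in R0.w (see the
		 * r0 notes at the top of this file). */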
		if (elements[i].instance_divisor > 1) {
			if (rctx->b.gfx_level == CAYMAN) {
				for (j = 0; j < 4; j++) {
					struct r600_bytecode_alu alu;
					memset(&alu, 0, sizeof(alu));
					alu.op = ALU_OP2_MULHI_UINT;
					alu.src[0].sel = 0;
					alu.src[0].chan = 3;
					alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
					alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
					alu.dst.sel = i + 1;
					alu.dst.chan = j;
					alu.dst.write = j == 3;
					alu.last = j == 3;
					if (unlikely(r = r600_bytecode_add_alu(&bc, &alu)))
						goto fail;
				}
			} else {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(alu));
				alu.op = ALU_OP2_MULHI_UINT;
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;
				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
				alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
				alu.dst.sel = i + 1;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;
				if (unlikely(r = r600_bytecode_add_alu(&bc, &alu)))
					goto fail;
			}
		}
		shader->strides[elements[i].vertex_buffer_index] = elements[i].src_stride;
		buffer_mask |= BITFIELD_BIT(elements[i].vertex_buffer_index);
	}

	for (i = 0; i < count; i++) {
		r600_vertex_data_type(elements[i].src_format,
				      &format, &num_format, &format_comp, &endian);

		desc = util_format_description(elements[i].src_format);

		if (unlikely(elements[i].src_offset > 65535)) {
			R600_ERR("too big src_offset: %u\n", elements[i].src_offset);
			goto fail;
		}

		memset(&vtx, 0, sizeof(vtx));
		vtx.buffer_id = elements[i].vertex_buffer_index + fetch_resource_start;
		vtx.fetch_type = elements[i].instance_divisor ? SQ_VTX_FETCH_INSTANCE_DATA : SQ_VTX_FETCH_VERTEX_DATA;
		vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0;
		vtx.src_sel_x = elements[i].instance_divisor ? 3 : 0;
		vtx.mega_fetch_count = 0x1F;
		vtx.dst_gpr = i + 1;
		vtx.dst_sel_x = desc->swizzle[0];
		vtx.dst_sel_y = desc->swizzle[1];
		vtx.dst_sel_z = desc->swizzle[2];
		vtx.dst_sel_w = desc->swizzle[3];
		vtx.data_format = format;
		vtx.num_format_all = num_format;
		vtx.format_comp_all = format_comp;
		vtx.offset = elements[i].src_offset;
		vtx.endian = endian;

		if (unlikely(r = r600_bytecode_add_vtx(&bc, &vtx)))
			goto fail;

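		/* Assumption: these 3-component elements are fetched through the
		 * corresponding 4-component hardware format, so remember how many
		 * extra bytes the fetch covers (1 for 8-bit, 2 for 16-bit channels)
		 * as a width correction for the affected vertex buffer. */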
		if (unlikely(rctx->b.gfx_level >= EVERGREEN &&
			     desc->nr_channels == 3 &&
			     (format == FMT_8_8_8_8 ||
			      format == FMT_16_16_16_16 ||
			      format == FMT_16_16_16_16_FLOAT))) {
			if (format == FMT_8_8_8_8)
				shader->width_correction[elements[i].vertex_buffer_index] = 4 - 3;
			else
				shader->width_correction[elements[i].vertex_buffer_index] = 8 - 6;
		}
	}

	r600_bytecode_add_cfinst(&bc, CF_OP_RET);

	if (unlikely(r = r600_bytecode_build(&bc)))
		goto fail;

	if (rctx->screen->b.debug_flags & DBG_FS) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		fprintf(stderr, "Vertex elements state:\n");
		for (i = 0; i < count; i++) {
			fprintf(stderr, "   ");
			util_dump_vertex_element(stderr, elements+i);
			fprintf(stderr, "\n");
		}

		r600_bytecode_disasm(&bc);
	}

	fs_size = bc.ndw*4;

	shader->buffer_mask = buffer_mask;

	u_suballocator_alloc(&rctx->allocator_fetch_shader, fs_size, 256,
			     &shader->offset,
			     (struct pipe_resource**)&shader->buffer);
	if (unlikely(!shader->buffer))
		goto fail;

	bytecode = r600_buffer_map_sync_with_rings
		(&rctx->b, shader->buffer,
		PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
	bytecode += shader->offset / 4;

	if (UTIL_ARCH_BIG_ENDIAN) {
		for (i = 0; i < fs_size / 4; ++i) {
			bytecode[i] = util_cpu_to_le32(bc.bytecode[i]);
		}
	} else {
		memcpy(bytecode, bc.bytecode, fs_size);
	}
	rctx->b.ws->buffer_unmap(rctx->b.ws, shader->buffer->buf);

	r600_bytecode_clear(&bc);
	return shader;

 fail:
	r600_bytecode_clear(&bc);
	FREE(shader);
	return NULL;
}

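/* Map an interpolation mode/location pair to one of the six interpolator
 * slots: indices 0-2 are perspective (sample, center, centroid) and 3-5 the
 * linear variants; other modes (e.g. constant) return -1. */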
int eg_get_interpolator_index(unsigned interpolate, unsigned location)
{
	if (interpolate == TGSI_INTERPOLATE_COLOR ||
		interpolate == TGSI_INTERPOLATE_LINEAR ||
		interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
	{
		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
		int loc;

		switch(location) {
		case TGSI_INTERPOLATE_LOC_CENTER:
			loc = 1;
			break;
		case TGSI_INTERPOLATE_LOC_CENTROID:
			loc = 2;
			break;
		case TGSI_INTERPOLATE_LOC_SAMPLE:
		default:
			loc = 0; break;
		}

		return is_linear * 3 + loc;
	}

	return -1;
}

int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return 1;
	case TGSI_SEMANTIC_CLIPDIST:
		assert(index <= 1);
		return 2 + index;
	case TGSI_SEMANTIC_TEXCOORD:
		return 4 + index;
	case TGSI_SEMANTIC_COLOR:
		return 12 + index;
	case TGSI_SEMANTIC_BCOLOR:
		return 14 + index;
	case TGSI_SEMANTIC_CLIPVERTEX:
		return 16;
	case TGSI_SEMANTIC_GENERIC:
		if (index <= 63-17)
			return 17 + index;
		else
			/* same explanation as in the default statement,
			 * the only user hitting this is st/nine.
			 */
			return 0;

	/* patch indices are completely separate and thus start from 0 */
	case TGSI_SEMANTIC_TESSOUTER:
		return 0;
	case TGSI_SEMANTIC_TESSINNER:
		return 1;
	case TGSI_SEMANTIC_PATCH:
		return 2 + index;

	default:
		/* Don't fail here. The result of this function is only used
		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
		 * occur, but this function is called for all vertex shaders
		 * before it's known whether LS will be compiled or not.
		 */
		return 0;
	}
}

static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
                          int stream, unsigned *stream_item_size UNUSED)
{
	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
	int j, r;
	unsigned i;

	/* Sanity checking. */
	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
		r = -EINVAL;
		goto out_err;
	}
	for (i = 0; i < so->num_outputs; i++) {
		if (so->output[i].output_buffer >= 4) {
			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
				 so->output[i].output_buffer);
			r = -EINVAL;
			goto out_err;
		}
	}

	if (so->num_outputs && ctx->bc->cf_last->op != CF_OP_ALU &&
	    ctx->bc->cf_last->op != CF_OP_ALU_PUSH_BEFORE)
		ctx->bc->force_add_cf = 1;
	/* Initialize locations where the outputs are stored. */
	for (i = 0; i < so->num_outputs; i++) {

		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
		start_comp[i] = so->output[i].start_component;
		/* Lower outputs with dst_offset < start_component.
		 *
		 * We can only output 4D vectors with a write mask, e.g. we can
		 * only output the W component at offset 3, etc. If we want
		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
		 * to move it to X and output X. */
		if (so->output[i].dst_offset < so->output[i].start_component) {
			unsigned tmp = ctx->temp_reg + ctx->max_driver_temp_used++;

			for (j = 0; j < so->output[i].num_components; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = so_gpr[i];
				alu.src[0].chan = so->output[i].start_component + j;

				alu.dst.sel = tmp;
				alu.dst.chan = j;
				alu.dst.write = 1;
				if (j == so->output[i].num_components - 1)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			start_comp[i] = 0;
			so_gpr[i] = tmp;
		}
	}

	/* Write outputs to buffers. */
	for (i = 0; i < so->num_outputs; i++) {
		struct r600_bytecode_output output;

		if (stream != -1 && stream != so->output[i].stream)
			continue;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = so_gpr[i];
		output.elem_size = so->output[i].num_components - 1;
		if (output.elem_size == 2)
			output.elem_size = 3; // 3 not supported, write 4 with junk at end
		output.array_base = so->output[i].dst_offset - start_comp[i];
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
		output.burst_count = 1;
		/* array_size is an upper limit for the burst_count
		 * with MEM_STREAM instructions */
		output.array_size = 0xFFF;
		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];

		if (ctx->bc->gfx_level >= EVERGREEN) {
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0_BUF0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM0_BUF1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM0_BUF2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM0_BUF3;
				break;
			}
			output.op += so->output[i].stream * 4;
			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
		} else {
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM3;
				break;
			}
			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
		}
		r = r600_bytecode_add_output(ctx->bc, &output);
		if (r)
			goto out_err;
	}
	return 0;
out_err:
	return r;
}

int generate_gs_copy_shader(struct r600_context *rctx,
                            struct r600_pipe_shader *gs,
                            struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	unsigned ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	int next_clip_pos = 61, next_param = 0;
	unsigned i, j;
	int ring;
	bool only_ring_0 = true;
	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;

	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.gfx_level, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));
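
	/* R0.x apparently packs the GSVS ring read offset in its low 30 bits and
	 * the vertex stream ID in the top two bits; split it into R0.x (offset)
	 * and R0.y (stream) for the per-stream blocks emitted below. */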
	/* R0.x = R0.x & 0x3fffffff */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30 */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		out->gpr = i + 1;
		out->ring_offset = i * 16;

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.gfx_level >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	ctx.temp_reg = i + 1;
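
	/* Emit one predicated block per vertex stream, from ring 3 down to 0:
	 * PRED_SETE_INT compares R0.y with the ring index and the JUMP skips the
	 * block when it does not match; the jump target is patched to the
	 * matching POP once the next block (or the tail below) is emitted.
	 * Stream 0 falls through to the vertex exports that follow this loop. */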
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				if (ring > 0)
					only_ring_0 = false;
				break;
			}
		}
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		ctx.bc->force_add_cf = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* bc adds nops - copy it */
	if (ctx.bc->gfx_level == R600) {
		ctx.bc->force_add_cf = 1;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP0_NOP;
		alu.last = 1;
		r600_bytecode_add_alu(ctx.bc, &alu);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		/* The actual parameter export indices will be calculated here, ignore the copied ones. */
		out->export_param = -1;
		bool instream0 = true;
		if (out->varying_slot == VARYING_SLOT_CLIP_VERTEX)
			continue;

		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		switch (out->varying_slot) {
		case VARYING_SLOT_POS:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case VARYING_SLOT_PSIZ:
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case VARYING_SLOT_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				out->export_param = output.array_base;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case VARYING_SLOT_VIEWPORT:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				out->export_param = output.array_base;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case VARYING_SLOT_CLIP_DIST0:
		case VARYING_SLOT_CLIP_DIST1:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
			ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				out->export_param = output.array_base;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case VARYING_SLOT_FOGC:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			break;
		}
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM) {
			output.array_base = next_param++;
			out->export_param = output.array_base;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	assert(next_param > 0);
	cshader->shader.highest_export_param = next_param - 1;

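	/* Patch the last block's JUMP (the stream 0 block) to land on the
	 * trailing POP. */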
	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	if (ctx.bc->gfx_level == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}