/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "nir_serialize.h"
#include "pipe/p_defines.h"
#include "r600_asm.h"
#include "r600_isa.h"
#include "r600_sq.h"
#include "r600_formats.h"
#include "r600_opcodes.h"
#include "r600_sfn.h"
#include "r600_shader.h"
#include "r600_dump.h"
#include "r600d.h"
#include "sfn/sfn_nir.h"

#include "pipe/p_shader_tokens.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_scan.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_from_mesa.h"
#include "nir/tgsi_to_nir.h"
#include "nir/nir_to_tgsi_info.h"
#include "compiler/nir/nir.h"
#include "util/macros.h"
#include "util/u_bitcast.h"
#include "util/u_dump.h"
#include "util/u_endian.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include <assert.h>
#include <stdio.h>
#include <errno.h>
/* CAYMAN notes
Why CAYMAN uses loops for lots of instructions is explained here.

These 8xx t-slot only ops are implemented in all vector slots:
MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
These 8xx t-slot only opcodes become vector ops, with all four
slots expecting the arguments on sources a and b. The result is
broadcast to all channels.
MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
These 8xx t-slot only opcodes become vector ops in the z, y, and
x slots.
EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
SQRT_IEEE/_64
SIN/COS
The w slot may have an independent co-issued operation, or if the
result is required to be in the w slot, the opcode above may be
issued in the w slot as well.
The compiler must issue the source argument to slots z, y, and x.
*/

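/* A minimal illustrative sketch (not used by the driver) of the replication
 * described above: a t-slot-only op is issued once per vector slot, writing
 * only the channel whose result is consumed, with the last slot closing the
 * ALU group. The helper name and the last_slot parameter are assumptions
 * made for this example only.
 */
static inline int cayman_replicate_alu_example(struct r600_bytecode *bc,
					       const struct r600_bytecode_alu *tmpl,
					       int last_slot)
{
	int r;
	for (int slot = 0; slot <= last_slot; ++slot) {
		struct r600_bytecode_alu alu = *tmpl;
		alu.dst.chan = slot;
		/* Only the requested destination channel commits its result. */
		alu.dst.write = (slot == tmpl->dst.chan);
		/* The final slot carries the "last" bit that ends the group. */
		alu.last = (slot == last_slot);
		if ((r = r600_bytecode_add_alu(bc, &alu)))
			return r;
	}
	return 0;
}
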
/* Contents of r0 on entry to various shaders

 VS - .x = VertexID
      .y = RelVertexID (??)
      .w = InstanceID

 GS - r0.xyw, r1.xyz = per-vertex offsets
      r0.z = PrimitiveID

 TCS - .x = PatchID
       .y = RelPatchID (??)
       .z = InvocationID
       .w = tess factor base.

 TES - .x = TessCoord.x
     - .y = TessCoord.y
     - .z = RelPatchID (??)
     - .w = PrimitiveID

 PS - face_gpr.z = SampleMask
      face_gpr.w = SampleID
*/

static void r600_dump_streamout(struct pipe_stream_output_info *so)
{
	unsigned i;

	fprintf(stderr, "STREAMOUT\n");
	for (i = 0; i < so->num_outputs; i++) {
		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
				so->output[i].start_component;
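		/* e.g. num_components = 2, start_component = 1 gives
		 * mask = 0x6, printed below as "yz", for instance:
		 *   0: MEM_STREAM0_BUF0[1..2] <- OUT[1].yz
		 */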
		fprintf(stderr, "  %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
			i,
			so->output[i].stream,
			so->output[i].output_buffer,
			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
			so->output[i].register_index,
			mask & 1 ? "x" : "",
			mask & 2 ? "y" : "",
			mask & 4 ? "z" : "",
			mask & 8 ? "w" : "",
			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
	}
}

static int store_shader(struct pipe_context *ctx,
			struct r600_pipe_shader *shader)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	uint32_t *ptr, i;

	if (shader->bo == NULL) {
		shader->bo = (struct r600_resource*)
			pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
		if (shader->bo == NULL) {
			return -ENOMEM;
		}
		ptr = r600_buffer_map_sync_with_rings(
			&rctx->b, shader->bo,
			PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
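		/* The GPU consumes the bytecode as little-endian dwords,
		 * so byteswap when running on a big-endian host. */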
		if (UTIL_ARCH_BIG_ENDIAN) {
			for (i = 0; i < shader->shader.bc.ndw; ++i) {
				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
			}
		} else {
			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
		}
		rctx->b.ws->buffer_unmap(rctx->b.ws, shader->bo->buf);
	}

	return 0;
}

extern const struct nir_shader_compiler_options r600_nir_options;
static int nshader = 0;
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	const nir_shader_compiler_options *nir_options =
		(const nir_shader_compiler_options *)
			ctx->screen->get_compiler_options(ctx->screen,
							  PIPE_SHADER_IR_NIR,
							  shader->shader.processor_type);
	if (!sel->nir && sel->ir_type != PIPE_SHADER_IR_TGSI) {
		assert(sel->nir_blob);
		struct blob_reader blob_reader;
		blob_reader_init(&blob_reader, sel->nir_blob, sel->nir_blob_size);
		sel->nir = nir_deserialize(NULL, nir_options, &blob_reader);
	}

	int processor = sel->ir_type == PIPE_SHADER_IR_TGSI ?
		tgsi_get_processor_type(sel->tokens) :
		pipe_shader_type_from_mesa(sel->nir->info.stage);

	bool dump = r600_can_dump_shader(&rctx->screen->b, processor);

	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	{
		glsl_type_singleton_init_or_ref();
		if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
			if (sel->nir)
				ralloc_free(sel->nir);
			if (sel->nir_blob) {
				free(sel->nir_blob);
				sel->nir_blob = NULL;
			}
			sel->nir = tgsi_to_nir(sel->tokens, ctx->screen, true);
			/* Lower int64 ops because some r600 built-in shaders use them. */
			if (nir_options->lower_int64_options) {
				NIR_PASS_V(sel->nir, nir_lower_alu_to_scalar, r600_lower_to_scalar_instr_filter, NULL);
				NIR_PASS_V(sel->nir, nir_lower_int64);
			}
			NIR_PASS_V(sel->nir, nir_lower_flrp, ~0, false);
		}
		nir_tgsi_scan_shader(sel->nir, &sel->info, true);

		r = r600_shader_from_nir(rctx, shader, &key);

		glsl_type_singleton_decref();

		if (r) {
			fprintf(stderr, "--Failed shader--------------------------------------------------\n");

			if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
				fprintf(stderr, "--TGSI--------------------------------------------------------\n");
				tgsi_dump(sel->tokens, 0);
			}

			fprintf(stderr, "--NIR --------------------------------------------------------\n");
			nir_print_shader(sel->nir, stderr);

			R600_ERR("translation from NIR failed!\n");
			goto error;
		}
	}

	if (dump) {
		if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
			fprintf(stderr, "--TGSI--------------------------------------------------------\n");
			tgsi_dump(sel->tokens, 0);
		}

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}

	/* Check if the bytecode has already been built. */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed!\n");
			goto error;
		}
	}

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");

		print_shader_info(stderr, nshader++, &shader->shader);
		print_pipe_info(stderr, &sel->info);
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r600_bytecode_disasm(&shader->gs_copy_shader->shader.bc);
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case PIPE_SHADER_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case PIPE_SHADER_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case PIPE_SHADER_GEOMETRY:
		if (rctx->b.gfx_level >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case PIPE_SHADER_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.gfx_level >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_FRAGMENT:
		if (rctx->b.gfx_level >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_COMPUTE:
		evergreen_update_ls_state(ctx, shader);
		break;
	default:
		r = -EINVAL;
		goto error;
	}

	util_debug_message(&rctx->b.debug, SHADER_INFO, "%s shader: %d dw, %d gprs, %d alu_groups, %d loops, %d cf, %d stack",
			   _mesa_shader_stage_to_abbrev(tgsi_processor_to_shader_stage(processor)),
			   shader->shader.bc.ndw,
			   shader->shader.bc.ngpr,
			   shader->shader.bc.nalu_groups,
			   shader->shader.num_loops,
			   shader->shader.bc.ncf,
			   shader->shader.bc.nstack);

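	/* Cache the NIR back into the selector as a serialized blob so a
	 * later compile of the same selector can deserialize it again
	 * (see the blob_reader path at the top of this function). */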
	if (!sel->nir_blob && sel->nir && sel->ir_type != PIPE_SHADER_IR_TGSI) {
		struct blob blob;
		blob_init(&blob);
		nir_serialize(&blob, sel->nir, false);
		sel->nir_blob = malloc(blob.size);
		memcpy(sel->nir_blob, blob.data, blob.size);
		sel->nir_blob_size = blob.size;
		blob_finish(&blob);
	}
	ralloc_free(sel->nir);
	sel->nir = NULL;

	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}

void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
{
	r600_resource_reference(&shader->bo, NULL);
	if (list_is_linked(&shader->shader.bc.cf))
		r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);

	if (shader->shader.arrays)
		free(shader->shader.arrays);
}

struct r600_shader_src {
	unsigned				sel;
	unsigned				swizzle[4];
	unsigned				neg;
	unsigned				abs;
	unsigned				rel;
	unsigned				kc_bank;
	bool					kc_rel; /* true if cache bank is indexed */
	uint32_t				value[4];
};

struct eg_interp {
	bool					enabled;
	unsigned				ij_index;
};

struct r600_shader_ctx {
	unsigned				type;
	unsigned				temp_reg;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	uint32_t				max_driver_temp_used;
	unsigned				enabled_stream_buffers_mask;
};

void *r600_create_vertex_fetch_shader(struct pipe_context *ctx,
				      unsigned count,
				      const struct pipe_vertex_element *elements)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_bytecode bc;
	struct r600_bytecode_vtx vtx;
	const struct util_format_description *desc;
	unsigned fetch_resource_start = rctx->b.gfx_level >= EVERGREEN ? 0 : 160;
	unsigned format, num_format, format_comp, endian;
	uint32_t *bytecode;
	int i, j, r, fs_size;
	uint32_t buffer_mask = 0;
	struct r600_fetch_shader *shader;
	unsigned strides[PIPE_MAX_ATTRIBS];

	assert(count < 32);

	memset(&bc, 0, sizeof(bc));
	r600_bytecode_init(&bc, rctx->b.gfx_level, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	bc.isa = rctx->isa;

	for (i = 0; i < count; i++) {
		if (elements[i].instance_divisor > 1) {
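			/* The instance id (r0.w) is divided by the divisor via
			 * a multiply-high with the precomputed reciprocal
			 * (1 << 32) / d + 1, i.e. MULHI_UINT(instance, magic)
			 * yields instance / d for the index ranges used here. */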
			if (rctx->b.gfx_level == CAYMAN) {
				for (j = 0; j < 4; j++) {
					struct r600_bytecode_alu alu;
					memset(&alu, 0, sizeof(alu));
					alu.op = ALU_OP2_MULHI_UINT;
					alu.src[0].sel = 0;
					alu.src[0].chan = 3;
					alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
					alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
					alu.dst.sel = i + 1;
					alu.dst.chan = j;
					alu.dst.write = j == 3;
					alu.last = j == 3;
					if ((r = r600_bytecode_add_alu(&bc, &alu))) {
						r600_bytecode_clear(&bc);
						return NULL;
					}
				}
			} else {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(alu));
				alu.op = ALU_OP2_MULHI_UINT;
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;
				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
				alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
				alu.dst.sel = i + 1;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;
				if ((r = r600_bytecode_add_alu(&bc, &alu))) {
					r600_bytecode_clear(&bc);
					return NULL;
				}
			}
		}
		strides[elements[i].vertex_buffer_index] = elements[i].src_stride;
		buffer_mask |= BITFIELD_BIT(elements[i].vertex_buffer_index);
	}

	for (i = 0; i < count; i++) {
		r600_vertex_data_type(elements[i].src_format,
				      &format, &num_format, &format_comp, &endian);

		desc = util_format_description(elements[i].src_format);

		if (elements[i].src_offset > 65535) {
			r600_bytecode_clear(&bc);
			R600_ERR("too big src_offset: %u\n", elements[i].src_offset);
			return NULL;
		}

		memset(&vtx, 0, sizeof(vtx));
		vtx.buffer_id = elements[i].vertex_buffer_index + fetch_resource_start;
		vtx.fetch_type = elements[i].instance_divisor ? SQ_VTX_FETCH_INSTANCE_DATA : SQ_VTX_FETCH_VERTEX_DATA;
		vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0;
		vtx.src_sel_x = elements[i].instance_divisor ? 3 : 0;
		vtx.mega_fetch_count = 0x1F;
		vtx.dst_gpr = i + 1;
		vtx.dst_sel_x = desc->swizzle[0];
		vtx.dst_sel_y = desc->swizzle[1];
		vtx.dst_sel_z = desc->swizzle[2];
		vtx.dst_sel_w = desc->swizzle[3];
		vtx.data_format = format;
		vtx.num_format_all = num_format;
		vtx.format_comp_all = format_comp;
		vtx.offset = elements[i].src_offset;
		vtx.endian = endian;

		if ((r = r600_bytecode_add_vtx(&bc, &vtx))) {
			r600_bytecode_clear(&bc);
			return NULL;
		}
	}

	r600_bytecode_add_cfinst(&bc, CF_OP_RET);

	if ((r = r600_bytecode_build(&bc))) {
		r600_bytecode_clear(&bc);
		return NULL;
	}

	if (rctx->screen->b.debug_flags & DBG_FS) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		fprintf(stderr, "Vertex elements state:\n");
		for (i = 0; i < count; i++) {
			fprintf(stderr, "   ");
			util_dump_vertex_element(stderr, elements + i);
			fprintf(stderr, "\n");
		}

		r600_bytecode_disasm(&bc);
	}

	fs_size = bc.ndw * 4;

	/* Allocate the CSO. */
	shader = CALLOC_STRUCT(r600_fetch_shader);
	if (!shader) {
		r600_bytecode_clear(&bc);
		return NULL;
	}
	memcpy(shader->strides, strides, sizeof(strides));
	shader->buffer_mask = buffer_mask;

	u_suballocator_alloc(&rctx->allocator_fetch_shader, fs_size, 256,
			     &shader->offset,
			     (struct pipe_resource**)&shader->buffer);
	if (!shader->buffer) {
		r600_bytecode_clear(&bc);
		FREE(shader);
		return NULL;
	}

	bytecode = r600_buffer_map_sync_with_rings
		(&rctx->b, shader->buffer,
		PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
	bytecode += shader->offset / 4;

	if (UTIL_ARCH_BIG_ENDIAN) {
		for (i = 0; i < fs_size / 4; ++i) {
			bytecode[i] = util_cpu_to_le32(bc.bytecode[i]);
		}
	} else {
		memcpy(bytecode, bc.bytecode, fs_size);
	}
	rctx->b.ws->buffer_unmap(rctx->b.ws, shader->buffer->buf);

	r600_bytecode_clear(&bc);
	return shader;
}

int eg_get_interpolator_index(unsigned interpolate, unsigned location)
{
	if (interpolate == TGSI_INTERPOLATE_COLOR ||
	    interpolate == TGSI_INTERPOLATE_LINEAR ||
	    interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
	{
		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
		int loc;

		switch (location) {
		case TGSI_INTERPOLATE_LOC_CENTER:
			loc = 1;
			break;
		case TGSI_INTERPOLATE_LOC_CENTROID:
			loc = 2;
			break;
		case TGSI_INTERPOLATE_LOC_SAMPLE:
		default:
			loc = 0;
			break;
		}

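		/* ij indices 0..2 are perspective sample/center/centroid,
		 * 3..5 are linear sample/center/centroid. */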
		return is_linear * 3 + loc;
	}

	return -1;
}

int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return 1;
	case TGSI_SEMANTIC_CLIPDIST:
		assert(index <= 1);
		return 2 + index;
	case TGSI_SEMANTIC_TEXCOORD:
		return 4 + index;
	case TGSI_SEMANTIC_COLOR:
		return 12 + index;
	case TGSI_SEMANTIC_BCOLOR:
		return 14 + index;
	case TGSI_SEMANTIC_CLIPVERTEX:
		return 16;
	case TGSI_SEMANTIC_GENERIC:
		if (index <= 63 - 17)
			return 17 + index;
		else
			/* same explanation as in the default statement,
			 * the only user hitting this is st/nine.
			 */
			return 0;

	/* patch indices are completely separate and thus start from 0 */
	case TGSI_SEMANTIC_TESSOUTER:
		return 0;
	case TGSI_SEMANTIC_TESSINNER:
		return 1;
	case TGSI_SEMANTIC_PATCH:
		return 2 + index;

	default:
		/* Don't fail here. The result of this function is only used
		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
		 * occur, but this function is called for all vertex shaders
		 * before it's known whether LS will be compiled or not.
		 */
		return 0;
	}
}

static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
			  int stream, unsigned *stream_item_size UNUSED)
{
	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
	int j, r;
	unsigned i;

	/* Sanity checking. */
	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
		r = -EINVAL;
		goto out_err;
	}
	for (i = 0; i < so->num_outputs; i++) {
		if (so->output[i].output_buffer >= 4) {
			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
				 so->output[i].output_buffer);
			r = -EINVAL;
			goto out_err;
		}
	}

	if (so->num_outputs && ctx->bc->cf_last->op != CF_OP_ALU &&
	    ctx->bc->cf_last->op != CF_OP_ALU_PUSH_BEFORE)
		ctx->bc->force_add_cf = 1;
	/* Initialize locations where the outputs are stored. */
	for (i = 0; i < so->num_outputs; i++) {
		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
		start_comp[i] = so->output[i].start_component;
		/* Lower outputs with dst_offset < start_component.
		 *
		 * We can only output 4D vectors with a write mask, e.g. we can
		 * only output the W component at offset 3, etc. If we want
		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
		 * to move it to X and output X. */
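		/* Example: a lone W component with dst_offset 0 is MOVed from
		 * .w of the source GPR to .x of a temp below, and start_comp
		 * is reset to 0 so it exports from channel X. */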
		if (so->output[i].dst_offset < so->output[i].start_component) {
			unsigned tmp = ctx->temp_reg + ctx->max_driver_temp_used++;

			for (j = 0; j < so->output[i].num_components; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = so_gpr[i];
				alu.src[0].chan = so->output[i].start_component + j;

				alu.dst.sel = tmp;
				alu.dst.chan = j;
				alu.dst.write = 1;
				if (j == so->output[i].num_components - 1)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			start_comp[i] = 0;
			so_gpr[i] = tmp;
		}
	}

	/* Write outputs to buffers. */
	for (i = 0; i < so->num_outputs; i++) {
		struct r600_bytecode_output output;

		if (stream != -1 && stream != so->output[i].stream)
			continue;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = so_gpr[i];
		output.elem_size = so->output[i].num_components - 1;
		if (output.elem_size == 2)
			output.elem_size = 3; // an elem_size of 2 (3 components) is unsupported; write 4 with junk at the end
		output.array_base = so->output[i].dst_offset - start_comp[i];
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
		output.burst_count = 1;
		/* array_size is an upper limit for the burst_count
		 * with MEM_STREAM instructions */
		output.array_size = 0xFFF;
		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];

		if (ctx->bc->gfx_level >= EVERGREEN) {
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0_BUF0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM0_BUF1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM0_BUF2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM0_BUF3;
				break;
			}
			output.op += so->output[i].stream * 4;
			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
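			/* Four buffer-enable bits are kept per stream. */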
			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
		} else {
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM3;
				break;
			}
			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
		}
		r = r600_bytecode_add_output(ctx->bc, &output);
		if (r)
			goto out_err;
	}
	return 0;
out_err:
	return r;
}

int generate_gs_copy_shader(struct r600_context *rctx,
			    struct r600_pipe_shader *gs,
			    struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	unsigned ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	int next_clip_pos = 61, next_param = 0;
	unsigned i, j;
	int ring;
	bool only_ring_0 = true;

	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;

	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.gfx_level, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

	/* R0.x = R0.x & 0x3fffffff  (clear the stream ID bits) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30  (the top two bits hold the stream ID) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		out->gpr = i + 1;
		out->ring_offset = i * 16;

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.gfx_level >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	ctx.temp_reg = i + 1;
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				if (ring > 0)
					only_ring_0 = false;
				break;
			}
		}
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		ctx.bc->force_add_cf = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* bc adds nops - copy it */
	if (ctx.bc->gfx_level == R600) {
		ctx.bc->force_add_cf = 1;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP0_NOP;
		alu.last = 1;
		r600_bytecode_add_alu(ctx.bc, &alu);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		/* The actual parameter export indices will be calculated here, ignore the copied ones. */
		out->export_param = -1;
		bool instream0 = true;
		if (out->varying_slot == VARYING_SLOT_CLIP_VERTEX)
			continue;

		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		switch (out->varying_slot) {
		case VARYING_SLOT_POS:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case VARYING_SLOT_PSIZ:
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case VARYING_SLOT_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				out->export_param = output.array_base;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case VARYING_SLOT_VIEWPORT:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				out->export_param = output.array_base;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case VARYING_SLOT_CLIP_DIST0:
		case VARYING_SLOT_CLIP_DIST1:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
			ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				out->export_param = output.array_base;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case VARYING_SLOT_FOGC:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			break;
		}
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM) {
			output.array_base = next_param++;
			out->export_param = output.array_base;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	assert(next_param > 0);
	cshader->shader.highest_export_param = next_param - 1;

	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	if (ctx.bc->gfx_level == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

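	/* One stack level is enough for the single PUSH_BEFORE/POP
	 * predicate nesting built above. */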
	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}