/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "r600_sq.h"
#include "r600_formats.h"
#include "r600_opcodes.h"
#include "r600_shader.h"
#include "r600_dump.h"
#include "r600d.h"
#include "sfn/sfn_nir.h"

#include "sb/sb_public.h"

#include "pipe/p_shader_tokens.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_scan.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_from_mesa.h"
#include "nir/tgsi_to_nir.h"
#include "nir/nir_to_tgsi_info.h"
#include "compiler/nir/nir.h"
#include "util/u_bitcast.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include <stdio.h>
#include <errno.h>
/* CAYMAN notes
Why CAYMAN got loops for lots of instructions is explained here.

-These 8xx t-slot only ops are implemented in all vector slots.
MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
These 8xx t-slot only opcodes become vector ops, with all four
slots expecting the arguments on sources a and b. Result is
broadcast to all channels.
MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
These 8xx t-slot only opcodes become vector ops in the z, y, and
x slots.
EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
SQRT_IEEE/_64
SIN/COS
The w slot may have an independent co-issued operation, or if the
result is required to be in the w slot, the opcode above may be
issued in the w slot as well.
The compiler must issue the source argument to slots z, y, and x
*/

/* Contents of r0 on entry to various shaders

 VS - .x = VertexID
      .y = RelVertexID (??)
      .w = InstanceID

 GS - r0.xyw, r1.xyz = per-vertex offsets
      r0.z = PrimitiveID

 TCS - .x = PatchID
       .y = RelPatchID (??)
       .z = InvocationID
       .w = tess factor base.

 TES - .x = TessCoord.x
     - .y = TessCoord.y
     - .z = RelPatchID (??)
     - .w = PrimitiveID

 PS - face_gpr.z = SampleMask
      face_gpr.w = SampleID
*/
#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
static int r600_shader_from_tgsi(struct r600_context *rctx,
				 struct r600_pipe_shader *pipeshader,
				 union r600_shader_key key);

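/* Track a GPR range that is addressed indirectly as an array so later passes
 * keep it contiguous; the bookkeeping list grows in chunks of 64 entries. */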
static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
			       int size, unsigned comp_mask) {

	if (!size)
		return;

	if (ps->num_arrays == ps->max_arrays) {
		ps->max_arrays += 64;
		ps->arrays = realloc(ps->arrays, ps->max_arrays *
				     sizeof(struct r600_shader_array));
	}

	int n = ps->num_arrays;
	++ps->num_arrays;

	ps->arrays[n].comp_mask = comp_mask;
	ps->arrays[n].gpr_start = start_gpr;
	ps->arrays[n].gpr_count = size;
}

static void r600_dump_streamout(struct pipe_stream_output_info *so)
{
	unsigned i;

	fprintf(stderr, "STREAMOUT\n");
	for (i = 0; i < so->num_outputs; i++) {
		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
				so->output[i].start_component;
		fprintf(stderr, " %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
			i,
			so->output[i].stream,
			so->output[i].output_buffer,
			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
			so->output[i].register_index,
			mask & 1 ? "x" : "",
			mask & 2 ? "y" : "",
			mask & 4 ? "z" : "",
			mask & 8 ? "w" : "",
			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
	}
}

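/* Upload the finished bytecode into an immutable buffer object, swapping it
 * to little endian on big-endian hosts. */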
static int store_shader(struct pipe_context *ctx,
			struct r600_pipe_shader *shader)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	uint32_t *ptr, i;

	if (shader->bo == NULL) {
		shader->bo = (struct r600_resource*)
			pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
		if (shader->bo == NULL) {
			return -ENOMEM;
		}
		ptr = r600_buffer_map_sync_with_rings(
			&rctx->b, shader->bo,
			PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
		if (R600_BIG_ENDIAN) {
			for (i = 0; i < shader->shader.bc.ndw; ++i) {
				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
			}
		} else {
			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
		}
		rctx->b.ws->buffer_unmap(rctx->b.ws, shader->bo->buf);
	}

	return 0;
}

extern const struct nir_shader_compiler_options r600_nir_options;
static int nshader = 0;
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	struct r600_screen *rscreen = (struct r600_screen *)ctx->screen;

	int processor = sel->ir_type == PIPE_SHADER_IR_TGSI ?
		tgsi_get_processor_type(sel->tokens):
		pipe_shader_type_from_mesa(sel->nir->info.stage);

	bool dump = r600_can_dump_shader(&rctx->screen->b, processor);
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB) ||
			  (rctx->screen->b.debug_flags & DBG_NIR_SB);
	unsigned sb_disasm;
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (!(rscreen->b.debug_flags & DBG_NIR_PREFERRED)) {
		assert(sel->ir_type == PIPE_SHADER_IR_TGSI);
		r = r600_shader_from_tgsi(rctx, shader, key);
		if (r) {
			R600_ERR("translation from TGSI failed !\n");
			goto error;
		}
	} else {
		if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
			if (sel->nir)
				ralloc_free(sel->nir);
			sel->nir = tgsi_to_nir(sel->tokens, ctx->screen, true);
			const nir_shader_compiler_options *nir_options =
				(const nir_shader_compiler_options *)
				ctx->screen->get_compiler_options(ctx->screen,
								  PIPE_SHADER_IR_NIR,
								  shader->shader.processor_type);
			/* Lower int64 ops because we have some r600 built-in shaders that use them */
			if (nir_options->lower_int64_options) {
				NIR_PASS_V(sel->nir, nir_lower_regs_to_ssa);
				NIR_PASS_V(sel->nir, nir_lower_alu_to_scalar, NULL, NULL);
				NIR_PASS_V(sel->nir, nir_lower_int64);
				NIR_PASS_V(sel->nir, nir_opt_vectorize, NULL, NULL);
			}
			NIR_PASS_V(sel->nir, nir_lower_flrp, ~0, false);
		}
		nir_tgsi_scan_shader(sel->nir, &sel->info, true);

		r = r600_shader_from_nir(rctx, shader, &key);
		if (r) {
			fprintf(stderr, "--Failed shader--------------------------------------------------\n");

			if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
				fprintf(stderr, "--TGSI--------------------------------------------------------\n");
				tgsi_dump(sel->tokens, 0);
			}

			if (rscreen->b.debug_flags & (DBG_NIR_PREFERRED)) {
				fprintf(stderr, "--NIR --------------------------------------------------------\n");
				nir_print_shader(sel->nir, stderr);
			}

			R600_ERR("translation from NIR failed !\n");
			goto error;
		}
	}

	if (dump) {
		if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
			fprintf(stderr, "--TGSI--------------------------------------------------------\n");
			tgsi_dump(sel->tokens, 0);
		}

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}

	if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
		/* only disable for vertex shaders in tess paths */
		if (key.vs.as_ls)
			use_sb = 0;
	}
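	/* sb (the optimizing shader backend) does not support the stages and
	 * features below, so mask it off case by case. */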
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE);

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	use_sb &= !shader->shader.uses_atomics;
	use_sb &= !shader->shader.uses_images;
	use_sb &= !shader->shader.uses_helper_invocation;

	/* SB can't handle READ_SCRATCH properly */
	use_sb &= !(shader->shader.needs_scratch_space && rscreen->b.gfx_level < R700);

	/* sb has bugs in array reg allocation
	 * (dEQP-GLES2.functional.shaders.struct.local.struct_array_dynamic_index_fragment
	 * with NTT)
	 */
	use_sb &= !(shader->shader.indirect_files & (1 << TGSI_FILE_TEMPORARY));
	use_sb &= !(shader->shader.indirect_files & (1 << TGSI_FILE_CONSTANT));

	/* sb has scheduling assertion fails with interpolate_at. */
	use_sb &= !shader->shader.uses_interpolate_at_sample;

	/* Check if the bytecode has already been built. */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
					     dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	if (dump) {
		print_shader_info(stderr, nshader++, &shader->shader);
		print_pipe_info(stderr, &sel->info);
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case PIPE_SHADER_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case PIPE_SHADER_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case PIPE_SHADER_GEOMETRY:
		if (rctx->b.gfx_level >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case PIPE_SHADER_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.gfx_level >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_FRAGMENT:
		if (rctx->b.gfx_level >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_COMPUTE:
		evergreen_update_ls_state(ctx, shader);
		break;
	default:
		r = -EINVAL;
		goto error;
	}

	util_debug_message(&rctx->b.debug, SHADER_INFO, "%s shader: %d dw, %d gprs, %d alu_groups, %d loops, %d cf, %d stack",
			   _mesa_shader_stage_to_abbrev(tgsi_processor_to_shader_stage(processor)),
			   shader->shader.bc.ndw,
			   shader->shader.bc.ngpr,
			   shader->shader.bc.nalu_groups,
			   shader->shader.num_loops,
			   shader->shader.bc.ncf,
			   shader->shader.bc.nstack);

	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}

void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
{
	r600_resource_reference(&shader->bo, NULL);
	if (list_is_linked(&shader->shader.bc.cf))
		r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}

/*
 * tgsi -> r600 shader
 */
struct r600_shader_tgsi_instruction;

struct r600_shader_src {
	unsigned sel;
	unsigned swizzle[4];
	unsigned neg;
	unsigned abs;
	unsigned rel;
	unsigned kc_bank;
	boolean kc_rel; /* true if cache bank is indexed */
	uint32_t value[4];
};

struct eg_interp {
	boolean enabled;
	unsigned ij_index;
};

struct r600_shader_ctx {
	struct tgsi_shader_info info;
	struct tgsi_array_info *array_infos;
	/* flag for each tgsi temp array if it's been spilled or not */
	bool *spilled_arrays;
	struct tgsi_parse_context parse;
	const struct tgsi_token *tokens;
	unsigned type;
	unsigned file_offset[TGSI_FILE_COUNT];
	unsigned temp_reg;
	const struct r600_shader_tgsi_instruction *inst_info;
	struct r600_bytecode *bc;
	struct r600_shader *shader;
	struct r600_shader_src src[4];
	uint32_t *literals;
	uint32_t nliterals;
	uint32_t max_driver_temp_used;
	/* needed for evergreen interpolation */
	struct eg_interp eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int fixed_pt_position_gpr;
	int colors_used;
	boolean clip_vertex_write;
	unsigned cv_output;
	unsigned edgeflag_output;
	int helper_invoc_reg;
	int cs_block_size_reg;
	int cs_grid_size_reg;
	bool cs_block_size_loaded, cs_grid_size_loaded;
	int fragcoord_input;
	int next_ring_offset;
	int gs_out_ring_offset;
	int gs_next_vertex;
	struct r600_shader *gs_for_vs;
	int gs_export_gpr_tregs[4];
	int gs_rotated_input[2];
	const struct pipe_stream_output_info *gs_stream_output_info;
	unsigned enabled_stream_buffers_mask;
	unsigned tess_input_info; /* temp with tess input offsets */
	unsigned tess_output_info; /* temp with tess output offsets */
	unsigned thread_id_gpr; /* temp with thread id calculated for images */
};

struct r600_shader_tgsi_instruction {
	unsigned op;
	int (*process)(struct r600_shader_ctx *ctx);
};

static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
static int tgsi_else(struct r600_shader_ctx *ctx);
static int tgsi_endif(struct r600_shader_ctx *ctx);
static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
static int tgsi_endloop(struct r600_shader_ctx *ctx);
static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
				unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
				unsigned int dst_reg);
static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
			      const struct r600_shader_src *shader_src,
			      unsigned chan);
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
			       unsigned dst_reg, unsigned mask);

static bool ctx_needs_stack_workaround_8xx(struct r600_shader_ctx *ctx)
{
	if (ctx->bc->family == CHIP_HEMLOCK ||
	    ctx->bc->family == CHIP_CYPRESS ||
	    ctx->bc->family == CHIP_JUNIPER)
		return false;
	return true;
}

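/* Return the index of the highest channel set in a TGSI writemask. */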
static int tgsi_last_instruction(unsigned writemask)
{
	int i, lasti = 0;

	for (i = 0; i < 4; i++) {
		if (writemask & (1 << i)) {
			lasti = i;
		}
	}
	return lasti;
}

static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	unsigned j;

	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
			switch (i->Src[j].Register.File) {
			case TGSI_FILE_CONSTANT:
			case TGSI_FILE_HW_ATOMIC:
				break;
			case TGSI_FILE_INPUT:
				if (ctx->type == PIPE_SHADER_GEOMETRY ||
				    ctx->type == PIPE_SHADER_TESS_CTRL ||
				    ctx->type == PIPE_SHADER_TESS_EVAL)
					break;
				FALLTHROUGH;
			case TGSI_FILE_OUTPUT:
				if (ctx->type == PIPE_SHADER_TESS_CTRL)
					break;
				FALLTHROUGH;
			default:
				R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
					 i->Src[j].Register.File,
					 i->Src[j].Register.Dimension);
				return -EINVAL;
			}
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			if (ctx->type == PIPE_SHADER_TESS_CTRL)
				continue;
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}

int eg_get_interpolator_index(unsigned interpolate, unsigned location)
{
	if (interpolate == TGSI_INTERPOLATE_COLOR ||
	    interpolate == TGSI_INTERPOLATE_LINEAR ||
	    interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
	{
		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
		int loc;

		switch(location) {
		case TGSI_INTERPOLATE_LOC_CENTER:
			loc = 1;
			break;
		case TGSI_INTERPOLATE_LOC_CENTROID:
			loc = 2;
			break;
		case TGSI_INTERPOLATE_LOC_SAMPLE:
		default:
			loc = 0; break;
		}

		return is_linear * 3 + loc;
	}

	return -1;
}

static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
					     int input)
{
	int i = eg_get_interpolator_index(
		ctx->shader->input[input].interpolate,
		ctx->shader->input[input].interpolate_location);
	assert(i >= 0);
	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
}

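/* Emit the INTERP_ZW/INTERP_XY pairs that interpolate one input. Two
 * barycentric (i,j) pairs share one GPR, which is what the gpr/base_chan
 * arithmetic below unpacks. */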
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_INTERP_LOAD_P0;

		alu.dst.sel = ctx->shader->input[input].gpr;
		alu.dst.write = 1;

		alu.dst.chan = i;

		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[0].chan = i;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/*
 * Special export handling in shaders
 *
 * shader export ARRAY_BASE for EXPORT_POS:
 * 60 is position
 * 61 is misc vector
 * 62, 63 are clip distance vectors
 *
 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
 * exclusive from render target index)
 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
 *
 *
 * shader export ARRAY_BASE for EXPORT_PIXEL:
 * 0-7 CB targets
 * 61 computed Z vector
 *
 * The use of the values exported in the computed Z vector are controlled
 * by DB_SHADER_CONTROL:
 * Z_EXPORT_ENABLE - Z as a float in RED
 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
 * DB_SOURCE_FORMAT - export control restrictions
 *
 */


/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
static int r600_spi_sid(struct r600_shader_io * io)
{
	int index, name = io->name;

	/* These params are handled differently, they don't need
	 * semantic indices, so we'll use 0 for them.
	 */
	if (name == TGSI_SEMANTIC_POSITION ||
	    name == TGSI_SEMANTIC_PSIZE ||
	    name == TGSI_SEMANTIC_EDGEFLAG ||
	    name == TGSI_SEMANTIC_FACE ||
	    name == TGSI_SEMANTIC_SAMPLEMASK)
		index = 0;
	else {
		if (name == TGSI_SEMANTIC_GENERIC) {
			/* For generic params simply use sid from tgsi */
			index = 9 + io->sid;
		} else if (name == TGSI_SEMANTIC_TEXCOORD) {
			index = io->sid;
		} else {
			/* For non-generic params - pack name and sid into 8 bits */
			index = 0x80 | (name<<3) | (io->sid);
		}

		/* Make sure that all really used indices have nonzero value, so
		 * we can just compare it to 0 later instead of comparing the name
		 * with different values to detect special cases. */
		index++;
	}

	return index;
};

/* we need this to get a common lds index for vs/tcs/tes input/outputs */
int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return 1;
	case TGSI_SEMANTIC_CLIPDIST:
		assert(index <= 1);
		return 2 + index;
	case TGSI_SEMANTIC_TEXCOORD:
		return 4 + index;
	case TGSI_SEMANTIC_COLOR:
		return 12 + index;
	case TGSI_SEMANTIC_BCOLOR:
		return 14 + index;
	case TGSI_SEMANTIC_CLIPVERTEX:
		return 16;
	case TGSI_SEMANTIC_GENERIC:
		if (index <= 63-17)
			return 17 + index;
		else
			/* same explanation as in the default statement,
			 * the only user hitting this is st/nine.
			 */
			return 0;

	/* patch indices are completely separate and thus start from 0 */
	case TGSI_SEMANTIC_TESSOUTER:
		return 0;
	case TGSI_SEMANTIC_TESSINNER:
		return 1;
	case TGSI_SEMANTIC_PATCH:
		return 2 + index;

	default:
		/* Don't fail here. The result of this function is only used
		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
		 * occur, but this function is called for all vertex shaders
		 * before it's known whether LS will be compiled or not.
		 */
		return 0;
	}
}

/* turn input into interpolate on EG */
static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
{
	int r = 0;

	if (ctx->shader->input[index].spi_sid) {
		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
		if (ctx->shader->input[index].interpolate > 0) {
			evergreen_interp_assign_ij_index(ctx, index);
			r = evergreen_interp_alu(ctx, index);
		} else {
			r = evergreen_interp_flat(ctx, index);
		}
	}
	return r;
}

static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
{
	struct r600_bytecode_alu alu;
	int i, r;
	int gpr_front = ctx->shader->input[front].gpr;
	int gpr_back = ctx->shader->input[back].gpr;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;
		alu.dst.write = 1;
		alu.dst.sel = gpr_front;
		alu.src[0].sel = ctx->face_gpr;
		alu.src[1].sel = gpr_front;
		alu.src[2].sel = gpr_back;

		alu.dst.chan = i;
		alu.src[1].chan = i;
		alu.src[2].chan = i;
		alu.last = (i==3);

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;
	}

	return 0;
}

/* execute a single slot ALU calculation */
static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val)
{
	struct r600_bytecode_alu alu;
	int r, i;

	if (ctx->bc->gfx_level == CAYMAN && op == ALU_OP2_MULLO_INT) {
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = op;
			alu.src[0].sel = src0_sel;
			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[0].value = src0_chan_val;
			else
				alu.src[0].chan = src0_chan_val;
			alu.src[1].sel = src1_sel;
			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[1].value = src1_chan_val;
			else
				alu.src[1].chan = src1_chan_val;
			alu.dst.sel = dst_sel;
			alu.dst.chan = i;
			alu.dst.write = i == dst_chan;
			alu.last = (i == 3);
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* execute a single slot ALU calculation */
static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val,
			  int src2_sel, unsigned src2_chan_val)
{
	struct r600_bytecode_alu alu;
	int r;

	/* validate this for other ops */
	assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT || op == ALU_OP3_BFE_UINT);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.src[2].sel = src2_sel;
	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[2].value = src2_chan_val;
	else
		alu.src[2].chan = src2_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.is_op3 = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* put it in temp_reg.x */
static int get_lds_offset0(struct r600_shader_ctx *ctx,
			   int rel_patch_chan,
			   int temp_reg, bool is_patch_var)
{
	int r;

	/* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
	/* ADD
	   Dimension - patch0_offset (input_vals.z),
	   Non-dim - patch0_data_offset (input_vals.w)
	*/
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   ctx->tess_output_info, 0,
			   0, rel_patch_chan,
			   ctx->tess_output_info, is_patch_var ? 3 : 2);
	if (r)
		return r;
	return 0;
}

static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
{
	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
}

static int r600_get_temp(struct r600_shader_ctx *ctx)
{
	return ctx->temp_reg + ctx->max_driver_temp_used++;
}

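/* Append a PRIMID output (channel z of GPR0) so the VS can pass the
 * primitive id through to later stages. */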
static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
{
	int i;
	i = ctx->shader->noutput++;
	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
	ctx->shader->output[i].sid = 0;
	ctx->shader->output[i].gpr = 0;
	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
	ctx->shader->output[i].write_mask = 0x4;
	ctx->shader->output[i].spi_sid = prim_id_sid;

	return 0;
}

static int tgsi_barrier(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.last = 1;

	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* XXX: Need to implement GWS ops to sync across wavefronts */

	return 0;
}

static int tgsi_membar(struct r600_shader_ctx *ctx)
{
	/* Wait for any SSBO/image stores to land. */
	return r600_bytecode_wait_acks(ctx->bc);
}

static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed)
{
	// pick largest array and spill it, repeat until the number of temps is under limit or we run out of arrays
	unsigned n = ctx->info.array_max[TGSI_FILE_TEMPORARY];
	unsigned narrays_left = n;
	bool *spilled = ctx->spilled_arrays; // assumed calloc'ed

	*scratch_space_needed = 0;
	while (*regno > 124 && narrays_left) {
		unsigned i;
		unsigned largest = 0;
		unsigned largest_index = 0;

		for (i = 0; i < n; i++) {
			unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
			if (!spilled[i] && size > largest) {
				largest = size;
				largest_index = i;
			}
		}

		spilled[largest_index] = true;
		*regno -= largest;
		*scratch_space_needed += largest;

		narrays_left --;
	}

	if (narrays_left == 0) {
		ctx->info.indirect_files &= ~(1 << TGSI_FILE_TEMPORARY);
	}
}

/* Take spilled temp arrays into account when translating tgsi register
 * indexes into r600 gprs if spilled is false, or scratch array offset if
 * spilled is true */
static int map_tgsi_reg_index_to_r600_gpr(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index, bool *spilled)
{
	unsigned i;
	unsigned spilled_size = 0;

	for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
		if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
			if (ctx->spilled_arrays[i]) {
				/* vec4 index into spilled scratch memory */
				*spilled = true;
				return tgsi_reg_index - ctx->array_infos[i].range.First + spilled_size;
			}
			else {
				/* regular GPR array */
				*spilled = false;
				return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
			}
		}

		if (tgsi_reg_index < ctx->array_infos[i].range.First)
			break;
		if (ctx->spilled_arrays[i]) {
			spilled_size += ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
		}
	}

	/* regular GPR index, minus the holes from spilled arrays */
	*spilled = false;

	return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
}

/* look up spill area base offset and array size for a spilled temp array */
static void get_spilled_array_base_and_size(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index,
					    unsigned *array_base, unsigned *array_size)
{
	unsigned i;
	unsigned offset = 0;

	for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
		if (ctx->spilled_arrays[i]) {
			unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;

			if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
				*array_base = offset;
				*array_size = size - 1; /* hw counts from 1 */

				return;
			}

			offset += size;
		}
	}
}

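/* Translate one TGSI declaration token: record inputs/outputs/atomics and
 * emit any prolog code that system values need. */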
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < ARRAY_SIZE(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == PIPE_SHADER_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				if (ctx->bc->gfx_level >= EVERGREEN) {
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < ARRAY_SIZE(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == PIPE_SHADER_VERTEX ||
			    ctx->type == PIPE_SHADER_GEOMETRY ||
			    ctx->type == PIPE_SHADER_TESS_EVAL) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == PIPE_SHADER_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				bool spilled;
				unsigned idx = map_tgsi_reg_index_to_r600_gpr(ctx,
					d->Range.First,
					&spilled);

				if (!spilled) {
					r600_add_gpr_array(ctx->shader, idx,
						d->Range.Last - d->Range.First + 1, 0x0F);
				}
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
	case TGSI_FILE_BUFFER:
	case TGSI_FILE_IMAGE:
	case TGSI_FILE_MEMORY:
		break;

	case TGSI_FILE_HW_ATOMIC:
		i = ctx->shader->nhwatomic_ranges;
		ctx->shader->atomics[i].start = d->Range.First;
		ctx->shader->atomics[i].end = d->Range.Last;
		ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
		ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
		ctx->shader->nhwatomic_ranges++;
		ctx->shader->nhwatomic += count;
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
		    d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
		    d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
			 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
			int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
			int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
			unsigned temp_reg = r600_get_temp(ctx);

			r = get_lds_offset0(ctx, 2, temp_reg, true);
			if (r)
				return r;

			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 0,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;

			do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
		}
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
			/* MOV r1.x, r0.x;
			   MOV r1.y, r0.y;
			*/
			for (i = 0; i < 2; i++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = 0;
				alu.src[0].chan = 0 + i;
				alu.dst.sel = 1;
				alu.dst.chan = 0 + i;
				alu.dst.write = 1;
				alu.last = (i == 1) ? 1 : 0;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			/* ADD r1.z, 1.0f, -r0.x */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = V_SQ_ALU_SRC_1;
			alu.src[1].sel = 1;
			alu.src[1].chan = 0;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* ADD r1.z, r1.z, -r1.y */
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = 1;
			alu.src[0].chan = 2;
			alu.src[1].sel = 1;
			alu.src[1].chan = 1;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
			break;
		}
		break;
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}

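/* Scan the TGSI once up front for system values and interpolateAt* usage so
 * dedicated input GPRs (face/sample mask, fixed-point position) can be
 * reserved after the barycentric GPRs. */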
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;
		int *reg;
		unsigned name, alternate_name;
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int num_regs = 0;
	unsigned k, i;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
			    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				if (k >= 0)
					ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				for (k = 0; k < ARRAY_SIZE(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
					    d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	if (ctx->info.reads_samplemask &&
	    (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) {
		inputs[1].enabled = true;
	}

	if (ctx->bc->gfx_level >= EVERGREEN) {
		int num_baryc = 0;
		/* assign gpr to each interpolator according to priority */
		for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
			if (ctx->eg_interpolators[i].enabled) {
				ctx->eg_interpolators[i].ij_index = num_baryc;
				num_baryc++;
			}
		}
		num_baryc = (num_baryc + 1) >> 1;
		gpr_offset += num_baryc;
	}

	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;
			ctx->shader->nsys_inputs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}

/*
 * for evergreen we need to scan the shader to find the number of GPRs we need to
 * reserve for interpolation and system values
 *
 * we need to know if we are going to emit any sample or centroid inputs
 * if perspective and linear are required
 */
static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
{
	unsigned i;

	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));

	/*
	 * Could get this information from the shader info. But right now
	 * we interpolate all declared inputs, whereas the shader info will
	 * only contain the bits if the inputs are actually used, so it might
	 * not be safe...
	 */
	for (i = 0; i < ctx->info.num_inputs; i++) {
		int k;
		/* skip position/face/mask/sampleid */
		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
			continue;

		k = eg_get_interpolator_index(
			ctx->info.input_interpolate[i],
			ctx->info.input_interpolate_loc[i]);
		if (k >= 0)
			ctx->eg_interpolators[k].enabled = TRUE;
	}

	/* XXX PULL MODEL and LINE STIPPLE */

	return allocate_system_value_inputs(ctx, 0);
}

/* sample_id_sel == NULL means fetch for current sample */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	t1 = r600_get_temp(ctx);

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		assert(ctx->fixed_pt_position_gpr != -1);

		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 0;
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	return t1;
}

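/* Helper-invocation detection on evergreen: preload the register with ~0,
 * then overwrite it with 0 via a fetch executed in valid-pixel mode, so only
 * helper invocations keep the ~0 value. */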
static int eg_load_helper_invocation(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	/* do a vtx fetch with wqm set on the vtx fetch */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.sel = ctx->helper_invoc_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* do a vtx fetch in VPM mode */
	struct r600_bytecode_vtx vtx;
	memset(&vtx, 0, sizeof(vtx));
	vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = 0;
	vtx.mega_fetch_count = 16; /* no idea here really... */
	vtx.dst_gpr = ctx->helper_invoc_reg;
	vtx.dst_sel_x = 4;
	vtx.dst_sel_y = 7; /* SEL_Y */
	vtx.dst_sel_z = 7; /* SEL_Z */
	vtx.dst_sel_w = 7; /* SEL_W */
	vtx.data_format = FMT_32;
	if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
		return r;
	ctx->bc->cf_last->vpm = 1;
	return 0;
}

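/* Cayman variant of the same trick: the second write of 0 is an ALU clause
 * executed in valid-pixel mode instead of a fetch. */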
static int cm_load_helper_invocation(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.sel = ctx->helper_invoc_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.sel = ctx->helper_invoc_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_0;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE);
	if (r)
		return r;

	return ctx->helper_invoc_reg;
}

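/* Fetch the compute block or grid size from the driver constant buffer; the
 * result register is cached so later reads are free. */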
static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	if (ctx->cs_block_size_loaded)
		return ctx->cs_block_size_reg;
	if (ctx->cs_grid_size_loaded)
		return ctx->cs_grid_size_reg;

	t1 = load_block ? ctx->cs_block_size_reg : ctx->cs_grid_size_reg;
	struct r600_bytecode_alu alu;
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_0;
	alu.dst.sel = t1;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = t1;
	vtx.src_sel_x = 0;

	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 7;
	vtx.data_format = FMT_32_32_32_32;
	vtx.num_format_all = 1;
	vtx.format_comp_all = 0;
	vtx.use_const_fields = 0;
	vtx.offset = load_block ? 0 : 16; // first element is size of buffer
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	if (load_block)
		ctx->cs_block_size_loaded = true;
	else
		ctx->cs_grid_size_loaded = true;
	return t1;
}

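/* Translate a TGSI source operand into an r600_shader_src: apply swizzles
 * and modifiers, emit scratch reads for spilled temp arrays, and remap
 * system values to their dedicated registers. */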
static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_TEMPORARY) {
		bool spilled;
		unsigned idx;

		idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_src->Register.Index, &spilled);

		if (spilled) {
			int reg = r600_get_temp(ctx);
			int r;

			r600_src->sel = reg;

			if (ctx->bc->gfx_level < R700) {
				struct r600_bytecode_output cf;

				memset(&cf, 0, sizeof(struct r600_bytecode_output));
				cf.op = CF_OP_MEM_SCRATCH;
				cf.elem_size = 3;
				cf.gpr = reg;
				cf.comp_mask = 0xF;
				cf.swizzle_x = 0;
				cf.swizzle_y = 1;
				cf.swizzle_z = 2;
				cf.swizzle_w = 3;
				cf.burst_count = 1;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&cf.array_base, &cf.array_size);

				if (tgsi_src->Register.Indirect) {
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
					cf.index_gpr = ctx->bc->ar_reg;
				}
				else {
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ;
					cf.array_base += idx;
					cf.array_size = 0;
				}

				r = r600_bytecode_add_output(ctx->bc, &cf);
			}
			else {
				struct r600_bytecode_vtx vtx;

				r600_bytecode_wait_acks(ctx->bc);

				memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
				vtx.op = FETCH_OP_READ_SCRATCH;
				vtx.dst_gpr = reg;
				vtx.uncached = 1; // Must bypass cache since prior spill written in same invocation
				vtx.elem_size = 3;
				vtx.data_format = FMT_32_32_32_32;
				vtx.num_format_all = V_038010_SQ_NUM_FORMAT_INT;
				vtx.dst_sel_x = tgsi_src->Register.SwizzleX;
				vtx.dst_sel_y = tgsi_src->Register.SwizzleY;
				vtx.dst_sel_z = tgsi_src->Register.SwizzleZ;
				vtx.dst_sel_w = tgsi_src->Register.SwizzleW;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&vtx.array_base, &vtx.array_size);

				if (tgsi_src->Register.Indirect) {
					vtx.indexed = 1;
					vtx.src_gpr = ctx->bc->ar_reg;
				}
				else {
					vtx.array_base += idx;
					vtx.array_size = 0;
				}

				r = r600_bytecode_add_vtx(ctx->bc, &vtx);
			}

			if (r)
				return;
		}
		else {
			if (tgsi_src->Register.Indirect)
				r600_src->rel = V_SQ_REL_RELATIVE;

			r600_src->sel = idx;
		}

		return;
	}

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
		    (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
		    (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			r600_src->swizzle[0] = 2; // Z value
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
			r600_src->sel = 1;
		} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
			r600_src->sel = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
			r600_src->sel = 2;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
			r600_src->sel = ctx->tess_input_info;
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, false);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
1797 r600_src->sel = load_block_grid_size(ctx, true);
1798 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) {
1799 r600_src->sel = ctx->helper_invoc_reg;
1800 r600_src->swizzle[0] = 0;
1801 r600_src->swizzle[1] = 0;
1802 r600_src->swizzle[2] = 0;
1803 r600_src->swizzle[3] = 0;
1804 }
1805 } else {
1806 if (tgsi_src->Register.Indirect)
1807 r600_src->rel = V_SQ_REL_RELATIVE;
1808 r600_src->sel = tgsi_src->Register.Index;
1809 r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
1810 }
1811 if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
1812 if (tgsi_src->Register.Dimension) {
1813 r600_src->kc_bank = tgsi_src->Dimension.Index;
1814 if (tgsi_src->Dimension.Indirect) {
1815 r600_src->kc_rel = 1;
1816 }
1817 }
1818 }
1819 }
1820
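/* A relatively (AR-) indexed constant can't be encoded as a plain kcache
 * operand, so fetch it from the constant buffer with a vertex fetch
 * instead, optionally adding a static offset to the AR value first. */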
1821 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
1822 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
1823 unsigned int dst_reg)
1824 {
1825 struct r600_bytecode_vtx vtx;
1826 unsigned int ar_reg;
1827 int r;
1828
1829 if (offset) {
1830 struct r600_bytecode_alu alu;
1831
1832 memset(&alu, 0, sizeof(alu));
1833
1834 alu.op = ALU_OP2_ADD_INT;
1835 alu.src[0].sel = ctx->bc->ar_reg;
1836 alu.src[0].chan = ar_chan;
1837
1838 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1839 alu.src[1].value = offset;
1840
1841 alu.dst.sel = dst_reg;
1842 alu.dst.chan = ar_chan;
1843 alu.dst.write = 1;
1844 alu.last = 1;
1845
1846 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1847 return r;
1848
1849 ar_reg = dst_reg;
1850 } else {
1851 ar_reg = ctx->bc->ar_reg;
1852 }
1853
1854 memset(&vtx, 0, sizeof(vtx));
1855 vtx.buffer_id = cb_idx;
1856 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1857 vtx.src_gpr = ar_reg;
1858 vtx.src_sel_x = ar_chan;
1859 vtx.mega_fetch_count = 16;
1860 vtx.dst_gpr = dst_reg;
1861 vtx.dst_sel_x = 0; /* SEL_X */
1862 vtx.dst_sel_y = 1; /* SEL_Y */
1863 vtx.dst_sel_z = 2; /* SEL_Z */
1864 vtx.dst_sel_w = 3; /* SEL_W */
1865 vtx.data_format = FMT_32_32_32_32_FLOAT;
1866 vtx.num_format_all = 2; /* NUM_FORMAT_SCALED */
1867 vtx.format_comp_all = 1; /* FORMAT_COMP_SIGNED */
1868 vtx.endian = r600_endian_swap(32);
1869 vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;
1870
1871 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1872 return r;
1873
1874 return 0;
1875 }
1876
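/* Fetch one per-vertex GS input from the ESGS ring: resolve the (possibly
 * indirect) vertex index to a ring offset held in a GPR channel, then read
 * the vector with a VFETCH from the GS ring constant buffer into dst_reg. */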
1877 static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1878 {
1879 struct r600_bytecode_vtx vtx;
1880 int r;
1881 unsigned index = src->Register.Index;
1882 unsigned vtx_id = src->Dimension.Index;
1883 int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
1884 int offset_chan = vtx_id % 3;
1885 int t2 = 0;
1886
1887 /* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
1888 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
1889
1890 if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
1891 offset_chan = 3;
1892
1893 if (src->Dimension.Indirect || src->Register.Indirect)
1894 t2 = r600_get_temp(ctx);
1895
1896 if (src->Dimension.Indirect) {
1897 int treg[3];
1898 struct r600_bytecode_alu alu;
1899 int r, i;
1900 unsigned addr_reg;
1901 addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
1902 if (src->DimIndirect.Index > 0) {
1903 r = single_alu_op2(ctx, ALU_OP1_MOV,
1904 ctx->bc->ar_reg, 0,
1905 addr_reg, 0,
1906 0, 0);
1907 if (r)
1908 return r;
1909 }
1910 /*
1911 	   we have to put R0.x/y/w into Rt.x, Rt+1.x, Rt+2.x and then index a reg from Rt;
1912 	   at least this is what fglrx seems to do. */
1913 for (i = 0; i < 3; i++) {
1914 treg[i] = r600_get_temp(ctx);
1915 }
1916 r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);
1917
1918 for (i = 0; i < 3; i++) {
1919 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1920 alu.op = ALU_OP1_MOV;
1921 alu.src[0].sel = ctx->gs_rotated_input[0];
1922 alu.src[0].chan = i == 2 ? 3 : i;
1923 alu.dst.sel = treg[i];
1924 alu.dst.chan = 0;
1925 alu.dst.write = 1;
1926 alu.last = 1;
1927 r = r600_bytecode_add_alu(ctx->bc, &alu);
1928 if (r)
1929 return r;
1930 }
1931 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1932 alu.op = ALU_OP1_MOV;
1933 alu.src[0].sel = treg[0];
1934 alu.src[0].rel = 1;
1935 alu.dst.sel = t2;
1936 alu.dst.write = 1;
1937 alu.last = 1;
1938 r = r600_bytecode_add_alu(ctx->bc, &alu);
1939 if (r)
1940 return r;
1941 offset_reg = t2;
1942 offset_chan = 0;
1943 }
1944
1945 if (src->Register.Indirect) {
1946 int addr_reg;
1947 unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];
1948
1949 addr_reg = get_address_file_reg(ctx, src->Indirect.Index);
1950
1951 /* pull the value from index_reg */
1952 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1953 t2, 1,
1954 addr_reg, 0,
1955 V_SQ_ALU_SRC_LITERAL, first);
1956 if (r)
1957 return r;
1958 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
1959 t2, 0,
1960 t2, 1,
1961 V_SQ_ALU_SRC_LITERAL, 4,
1962 offset_reg, offset_chan);
1963 if (r)
1964 return r;
1965 offset_reg = t2;
1966 offset_chan = 0;
1967 index = src->Register.Index - first;
1968 }
1969
1970 memset(&vtx, 0, sizeof(vtx));
1971 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1972 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1973 vtx.src_gpr = offset_reg;
1974 vtx.src_sel_x = offset_chan;
1975 vtx.offset = index * 16; /*bytes*/
1976 vtx.mega_fetch_count = 16;
1977 vtx.dst_gpr = dst_reg;
1978 vtx.dst_sel_x = 0; /* SEL_X */
1979 vtx.dst_sel_y = 1; /* SEL_Y */
1980 vtx.dst_sel_z = 2; /* SEL_Z */
1981 vtx.dst_sel_w = 3; /* SEL_W */
1982 if (ctx->bc->gfx_level >= EVERGREEN) {
1983 vtx.use_const_fields = 1;
1984 } else {
1985 vtx.data_format = FMT_32_32_32_32_FLOAT;
1986 }
1987
1988 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1989 return r;
1990
1991 return 0;
1992 }
1993
1994 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1995 {
1996 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1997 unsigned i;
1998
1999 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2000 struct tgsi_full_src_register *src = &inst->Src[i];
2001
2002 if (src->Register.File == TGSI_FILE_INPUT) {
2003 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
2004 /* primitive id is in R0.z */
2005 ctx->src[i].sel = 0;
2006 ctx->src[i].swizzle[0] = 2;
2007 }
2008 }
2009 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
2010 int treg = r600_get_temp(ctx);
2011
2012 fetch_gs_input(ctx, src, treg);
2013 ctx->src[i].sel = treg;
2014 ctx->src[i].rel = 0;
2015 }
2016 }
2017 return 0;
2018 }
2019
2020
2021 /* Tessellation shaders pass outputs to the next shader using LDS.
2022 *
2023 * LS outputs = TCS(HS) inputs
2024 * TCS(HS) outputs = TES(DS) inputs
2025 *
2026 * The LDS layout is:
2027 * - TCS inputs for patch 0
2028 * - TCS inputs for patch 1
2029 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
2030 * - ...
2031 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
2032 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
2033 * - TCS outputs for patch 1
2034 * - Per-patch TCS outputs for patch 1
2035 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
2036 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
2037 * - ...
2038 *
2039 * All three shaders VS(LS), TCS, TES share the same LDS space.
2040 */
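/* A worked example with made-up sizes: with 4 vertices per patch and
 * 8 input vectors per vertex, one patch's TCS input block is
 * 4 * 8 * 16 = 512 bytes, so patch 2's inputs start 1024 bytes into LDS;
 * the TCS output blocks only begin after the input blocks of all patches
 * in the threadgroup. */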
2041 /* this will return with the dword address in temp_reg.x */
2042 static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
2043 const struct tgsi_full_dst_register *dst,
2044 const struct tgsi_full_src_register *src,
2045 int stride_bytes_reg, int stride_bytes_chan)
2046 {
2047 struct tgsi_full_dst_register reg;
2048 ubyte *name, *index, *array_first;
2049 int r;
2050 int param;
2051 struct tgsi_shader_info *info = &ctx->info;
2052 /* Set the register description. The address computation is the same
2053 * for sources and destinations. */
2054 if (src) {
2055 reg.Register.File = src->Register.File;
2056 reg.Register.Index = src->Register.Index;
2057 reg.Register.Indirect = src->Register.Indirect;
2058 reg.Register.Dimension = src->Register.Dimension;
2059 reg.Indirect = src->Indirect;
2060 reg.Dimension = src->Dimension;
2061 reg.DimIndirect = src->DimIndirect;
2062 } else
2063 reg = *dst;
2064
2065 /* If the register is 2-dimensional (e.g. an array of vertices
2066 * in a primitive), calculate the base address of the vertex. */
2067 if (reg.Register.Dimension) {
2068 int sel, chan;
2069 if (reg.Dimension.Indirect) {
2070 unsigned addr_reg;
2071 assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);
2072
2073 addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
2074 /* pull the value from index_reg */
2075 sel = addr_reg;
2076 chan = 0;
2077 } else {
2078 sel = V_SQ_ALU_SRC_LITERAL;
2079 chan = reg.Dimension.Index;
2080 }
2081
2082 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
2083 temp_reg, 0,
2084 stride_bytes_reg, stride_bytes_chan,
2085 sel, chan,
2086 temp_reg, 0);
2087 if (r)
2088 return r;
2089 }
2090
2091 if (reg.Register.File == TGSI_FILE_INPUT) {
2092 name = info->input_semantic_name;
2093 index = info->input_semantic_index;
2094 array_first = info->input_array_first;
2095 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
2096 name = info->output_semantic_name;
2097 index = info->output_semantic_index;
2098 array_first = info->output_array_first;
2099 } else {
2100 assert(0);
2101 return -1;
2102 }
2103 if (reg.Register.Indirect) {
2104 int addr_reg;
2105 int first;
2106 /* Add the relative address of the element. */
2107 if (reg.Indirect.ArrayID)
2108 first = array_first[reg.Indirect.ArrayID];
2109 else
2110 first = reg.Register.Index;
2111
2112 addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);
2113
2114 /* pull the value from index_reg */
2115 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
2116 temp_reg, 0,
2117 V_SQ_ALU_SRC_LITERAL, 16,
2118 addr_reg, 0,
2119 temp_reg, 0);
2120 if (r)
2121 return r;
2122
2123 param = r600_get_lds_unique_index(name[first],
2124 index[first]);
2125
2126 } else {
2127 param = r600_get_lds_unique_index(name[reg.Register.Index],
2128 index[reg.Register.Index]);
2129 }
2130
2131 /* add to base_addr - passed in temp_reg.x */
2132 if (param) {
2133 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2134 temp_reg, 0,
2135 temp_reg, 0,
2136 V_SQ_ALU_SRC_LITERAL, param * 16);
2137 if (r)
2138 return r;
2139
2140 }
2141 return 0;
2142 }
2143
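/* Read up to four dwords from LDS: first compute a byte address per enabled
 * channel, then queue one LDS_READ_RET per channel, and finally pop the
 * results from the LDS output queue (OQ_A) into dst_reg. */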
2144 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
2145 unsigned dst_reg, unsigned mask)
2146 {
2147 struct r600_bytecode_alu alu;
2148 int r, i, lasti;
2149
2150 if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
2151 ctx->bc->force_add_cf = 1;
2152
2153 lasti = tgsi_last_instruction(mask);
2154 for (i = 1; i <= lasti; i++) {
2155 if (!(mask & (1 << i)))
2156 continue;
2157
2158 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2159 temp_reg, i,
2160 temp_reg, 0,
2161 V_SQ_ALU_SRC_LITERAL, 4 * i);
2162 if (r)
2163 return r;
2164 }
2165 for (i = 0; i <= lasti; i++) {
2166 if (!(mask & (1 << i)))
2167 continue;
2168
2169 /* emit an LDS_READ_RET */
2170 memset(&alu, 0, sizeof(alu));
2171 alu.op = LDS_OP1_LDS_READ_RET;
2172 alu.src[0].sel = temp_reg;
2173 alu.src[0].chan = i;
2174 alu.src[1].sel = V_SQ_ALU_SRC_0;
2175 alu.src[2].sel = V_SQ_ALU_SRC_0;
2176 alu.dst.chan = 0;
2177 alu.is_lds_idx_op = true;
2178 alu.last = 1;
2179 r = r600_bytecode_add_alu(ctx->bc, &alu);
2180 if (r)
2181 return r;
2182 }
2183 for (i = 0; i <= lasti; i++) {
2184 if (!(mask & (1 << i)))
2185 continue;
2186
2187 /* then read from LDS_OQ_A_POP */
2188 memset(&alu, 0, sizeof(alu));
2189
2190 alu.op = ALU_OP1_MOV;
2191 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
2192 alu.src[0].chan = 0;
2193 alu.dst.sel = dst_reg;
2194 alu.dst.chan = i;
2195 alu.dst.write = 1;
2196 alu.last = 1;
2197 r = r600_bytecode_add_alu(ctx->bc, &alu);
2198 if (r)
2199 return r;
2200 }
2201 return 0;
2202 }
2203
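/* Build a channel read mask from a source swizzle; e.g. a source swizzled
 * .xxxy touches only X and Y and yields mask 0x3, so only those two
 * channels need to be fetched from LDS. */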
2204 static int fetch_mask(struct tgsi_src_register *reg)
2205 {
2206 int mask = 0;
2207 mask |= 1 << reg->SwizzleX;
2208 mask |= 1 << reg->SwizzleY;
2209 mask |= 1 << reg->SwizzleZ;
2210 mask |= 1 << reg->SwizzleW;
2211 return mask;
2212 }
2213
2214 static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2215 {
2216 int r;
2217 unsigned temp_reg = r600_get_temp(ctx);
2218
2219 r = get_lds_offset0(ctx, 2, temp_reg,
2220 src->Register.Dimension ? false : true);
2221 if (r)
2222 return r;
2223
2224 /* the base address is now in temp.x */
2225 r = r600_get_byte_address(ctx, temp_reg,
2226 NULL, src, ctx->tess_output_info, 1);
2227 if (r)
2228 return r;
2229
2230 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2231 if (r)
2232 return r;
2233 return 0;
2234 }
2235
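/* TCS inputs live in the LS output region of LDS: the base address is
 * presumably input-patch-stride (tess_input_info.x) times RelPatchID
 * (r0.y), followed by the common byte-address computation. */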
2236 static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2237 {
2238 int r;
2239 unsigned temp_reg = r600_get_temp(ctx);
2240
2241 /* t.x = ips * r0.y */
2242 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
2243 temp_reg, 0,
2244 ctx->tess_input_info, 0,
2245 0, 1);
2246
2247 if (r)
2248 return r;
2249
2250 /* the base address is now in temp.x */
2251 r = r600_get_byte_address(ctx, temp_reg,
2252 NULL, src, ctx->tess_input_info, 1);
2253 if (r)
2254 return r;
2255
2256 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2257 if (r)
2258 return r;
2259 return 0;
2260 }
2261
2262 static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2263 {
2264 int r;
2265 unsigned temp_reg = r600_get_temp(ctx);
2266
2267 r = get_lds_offset0(ctx, 1, temp_reg,
2268 src->Register.Dimension ? false : true);
2269 if (r)
2270 return r;
2271 /* the base address is now in temp.x */
2272 r = r600_get_byte_address(ctx, temp_reg,
2273 NULL, src,
2274 ctx->tess_output_info, 1);
2275 if (r)
2276 return r;
2277
2278 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2279 if (r)
2280 return r;
2281 return 0;
2282 }
2283
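/* For tessellation stages, LDS-resident inputs (and TCS outputs read back
 * as sources) cannot feed ALU ops directly: fetch each such operand into a
 * fresh temporary and rewrite the source to point at it. */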
2284 static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
2285 {
2286 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2287 unsigned i;
2288
2289 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2290 struct tgsi_full_src_register *src = &inst->Src[i];
2291
2292 if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
2293 int treg = r600_get_temp(ctx);
2294 fetch_tes_input(ctx, src, treg);
2295 ctx->src[i].sel = treg;
2296 ctx->src[i].rel = 0;
2297 }
2298 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
2299 int treg = r600_get_temp(ctx);
2300 fetch_tcs_input(ctx, src, treg);
2301 ctx->src[i].sel = treg;
2302 ctx->src[i].rel = 0;
2303 }
2304 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
2305 int treg = r600_get_temp(ctx);
2306 fetch_tcs_output(ctx, src, treg);
2307 ctx->src[i].sel = treg;
2308 ctx->src[i].rel = 0;
2309 }
2310 }
2311 return 0;
2312 }
2313
2314 static int tgsi_split_constant(struct r600_shader_ctx *ctx)
2315 {
2316 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2317 struct r600_bytecode_alu alu;
2318 int i, j, k, nconst, r;
2319
2320 for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
2321 if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
2322 nconst++;
2323 }
2324 tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
2325 }
2326 for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
2327 if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
2328 continue;
2329 }
2330
2331 if (ctx->src[i].rel) {
2332 int chan = inst->Src[i].Indirect.Swizzle;
2333 int treg = r600_get_temp(ctx);
2334 if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
2335 return r;
2336
2337 ctx->src[i].kc_bank = 0;
2338 ctx->src[i].kc_rel = 0;
2339 ctx->src[i].sel = treg;
2340 ctx->src[i].rel = 0;
2341 j--;
2342 } else if (j > 0) {
2343 int treg = r600_get_temp(ctx);
2344 for (k = 0; k < 4; k++) {
2345 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2346 alu.op = ALU_OP1_MOV;
2347 alu.src[0].sel = ctx->src[i].sel;
2348 alu.src[0].chan = k;
2349 alu.src[0].rel = ctx->src[i].rel;
2350 alu.src[0].kc_bank = ctx->src[i].kc_bank;
2351 alu.src[0].kc_rel = ctx->src[i].kc_rel;
2352 alu.dst.sel = treg;
2353 alu.dst.chan = k;
2354 alu.dst.write = 1;
2355 if (k == 3)
2356 alu.last = 1;
2357 r = r600_bytecode_add_alu(ctx->bc, &alu);
2358 if (r)
2359 return r;
2360 }
2361 ctx->src[i].sel = treg;
2362 			ctx->src[i].rel = 0;
2363 j--;
2364 }
2365 }
2366 return 0;
2367 }
2368
2369 /* need to move any immediate into a temp - for trig functions, which use literals for their PI constants */
2370 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
2371 {
2372 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2373 struct r600_bytecode_alu alu;
2374 int i, j, k, nliteral, r;
2375
2376 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
2377 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2378 nliteral++;
2379 }
2380 }
2381 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
2382 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2383 int treg = r600_get_temp(ctx);
2384 for (k = 0; k < 4; k++) {
2385 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2386 alu.op = ALU_OP1_MOV;
2387 alu.src[0].sel = ctx->src[i].sel;
2388 alu.src[0].chan = k;
2389 alu.src[0].value = ctx->src[i].value[k];
2390 alu.dst.sel = treg;
2391 alu.dst.chan = k;
2392 alu.dst.write = 1;
2393 if (k == 3)
2394 alu.last = 1;
2395 r = r600_bytecode_add_alu(ctx->bc, &alu);
2396 if (r)
2397 return r;
2398 }
2399 ctx->src[i].sel = treg;
2400 j--;
2401 }
2402 }
2403 return 0;
2404 }
2405
2406 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
2407 {
2408 int i, r, count = ctx->shader->ninput;
2409
2410 for (i = 0; i < count; i++) {
2411 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2412 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
2413 if (r)
2414 return r;
2415 }
2416 }
2417 return 0;
2418 }
2419
2420 static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
2421 int stream, unsigned *stream_item_size UNUSED)
2422 {
2423 unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
2424 unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
2425 int j, r;
2426 unsigned i;
2427
2428 /* Sanity checking. */
2429 if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
2430 R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
2431 r = -EINVAL;
2432 goto out_err;
2433 }
2434 for (i = 0; i < so->num_outputs; i++) {
2435 if (so->output[i].output_buffer >= 4) {
2436 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
2437 so->output[i].output_buffer);
2438 r = -EINVAL;
2439 goto out_err;
2440 }
2441 }
2442
2443 /* Initialize locations where the outputs are stored. */
2444 for (i = 0; i < so->num_outputs; i++) {
2445
2446 so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
2447 start_comp[i] = so->output[i].start_component;
2448 /* Lower outputs with dst_offset < start_component.
2449 *
2450 * We can only output 4D vectors with a write mask, e.g. we can
2451 * only output the W component at offset 3, etc. If we want
2452 * to store Y, Z, or W at buffer offset 0, we need to use MOV
2453 * to move it to X and output X. */
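		/* e.g. streaming out only the W component at dst_offset 0:
		 * MOV tmp.x, out.w, then export tmp with comp_mask 0x1. */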
2454 if (so->output[i].dst_offset < so->output[i].start_component) {
2455 unsigned tmp = r600_get_temp(ctx);
2456
2457 for (j = 0; j < so->output[i].num_components; j++) {
2458 struct r600_bytecode_alu alu;
2459 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2460 alu.op = ALU_OP1_MOV;
2461 alu.src[0].sel = so_gpr[i];
2462 alu.src[0].chan = so->output[i].start_component + j;
2463
2464 alu.dst.sel = tmp;
2465 alu.dst.chan = j;
2466 alu.dst.write = 1;
2467 if (j == so->output[i].num_components - 1)
2468 alu.last = 1;
2469 r = r600_bytecode_add_alu(ctx->bc, &alu);
2470 if (r)
2471 return r;
2472 }
2473 start_comp[i] = 0;
2474 so_gpr[i] = tmp;
2475 }
2476 }
2477
2478 /* Write outputs to buffers. */
2479 for (i = 0; i < so->num_outputs; i++) {
2480 struct r600_bytecode_output output;
2481
2482 if (stream != -1 && stream != so->output[i].stream)
2483 continue;
2484
2485 memset(&output, 0, sizeof(struct r600_bytecode_output));
2486 output.gpr = so_gpr[i];
2487 output.elem_size = so->output[i].num_components - 1;
2488 if (output.elem_size == 2)
2489 output.elem_size = 3; // 3 not supported, write 4 with junk at end
2490 output.array_base = so->output[i].dst_offset - start_comp[i];
2491 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2492 output.burst_count = 1;
2493 /* array_size is an upper limit for the burst_count
2494 * with MEM_STREAM instructions */
2495 output.array_size = 0xFFF;
2496 output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
2497
2498 if (ctx->bc->gfx_level >= EVERGREEN) {
2499 switch (so->output[i].output_buffer) {
2500 case 0:
2501 output.op = CF_OP_MEM_STREAM0_BUF0;
2502 break;
2503 case 1:
2504 output.op = CF_OP_MEM_STREAM0_BUF1;
2505 break;
2506 case 2:
2507 output.op = CF_OP_MEM_STREAM0_BUF2;
2508 break;
2509 case 3:
2510 output.op = CF_OP_MEM_STREAM0_BUF3;
2511 break;
2512 }
2513 output.op += so->output[i].stream * 4;
2514 assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
2515 ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
2516 } else {
2517 switch (so->output[i].output_buffer) {
2518 case 0:
2519 output.op = CF_OP_MEM_STREAM0;
2520 break;
2521 case 1:
2522 output.op = CF_OP_MEM_STREAM1;
2523 break;
2524 case 2:
2525 output.op = CF_OP_MEM_STREAM2;
2526 break;
2527 case 3:
2528 output.op = CF_OP_MEM_STREAM3;
2529 break;
2530 }
2531 ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
2532 }
2533 r = r600_bytecode_add_output(ctx->bc, &output);
2534 if (r)
2535 goto out_err;
2536 }
2537 return 0;
2538 out_err:
2539 return r;
2540 }
2541
2542 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
2543 {
2544 struct r600_bytecode_alu alu;
2545 unsigned reg;
2546
2547 if (!ctx->shader->vs_out_edgeflag)
2548 return;
2549
2550 reg = ctx->shader->output[ctx->edgeflag_output].gpr;
2551
2552 /* clamp(x, 0, 1) */
2553 memset(&alu, 0, sizeof(alu));
2554 alu.op = ALU_OP1_MOV;
2555 alu.src[0].sel = reg;
2556 alu.dst.sel = reg;
2557 alu.dst.write = 1;
2558 alu.dst.clamp = 1;
2559 alu.last = 1;
2560 r600_bytecode_add_alu(ctx->bc, &alu);
2561
2562 memset(&alu, 0, sizeof(alu));
2563 alu.op = ALU_OP1_FLT_TO_INT;
2564 alu.src[0].sel = reg;
2565 alu.dst.sel = reg;
2566 alu.dst.write = 1;
2567 alu.last = 1;
2568 r600_bytecode_add_alu(ctx->bc, &alu);
2569 }
2570
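/* The GS copy shader is a small VS that runs after the GS: it reads one
 * vertex back from the GSVS ring, performs any streamout, and then exports
 * positions and parameters exactly like a regular VS would. */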
2571 int generate_gs_copy_shader(struct r600_context *rctx,
2572 struct r600_pipe_shader *gs,
2573 struct pipe_stream_output_info *so)
2574 {
2575 struct r600_shader_ctx ctx = {};
2576 struct r600_shader *gs_shader = &gs->shader;
2577 struct r600_pipe_shader *cshader;
2578 unsigned ocnt = gs_shader->noutput;
2579 struct r600_bytecode_alu alu;
2580 struct r600_bytecode_vtx vtx;
2581 struct r600_bytecode_output output;
2582 struct r600_bytecode_cf *cf_jump, *cf_pop,
2583 *last_exp_pos = NULL, *last_exp_param = NULL;
2584 int next_clip_pos = 61, next_param = 0;
2585 unsigned i, j;
2586 int ring;
2587 bool only_ring_0 = true;
2588 cshader = calloc(1, sizeof(struct r600_pipe_shader));
2589 if (!cshader)
2590 return 0;
2591
2592 memcpy(cshader->shader.output, gs_shader->output, ocnt *
2593 sizeof(struct r600_shader_io));
2594
2595 cshader->shader.noutput = ocnt;
2596
2597 ctx.shader = &cshader->shader;
2598 ctx.bc = &ctx.shader->bc;
2599 ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;
2600
2601 r600_bytecode_init(ctx.bc, rctx->b.gfx_level, rctx->b.family,
2602 rctx->screen->has_compressed_msaa_texturing);
2603
2604 ctx.bc->isa = rctx->isa;
2605
2606 cf_jump = NULL;
2607 memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));
2608
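	/* On entry R0.x packs the GSVS ring offset in its low 30 bits and
	 * (apparently) the stream ID in bits 30-31; split it so that R0.x
	 * holds the offset and R0.y the stream ID compared against below. */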
2609 /* R0.x = R0.x & 0x3fffffff */
2610 memset(&alu, 0, sizeof(alu));
2611 alu.op = ALU_OP2_AND_INT;
2612 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2613 alu.src[1].value = 0x3fffffff;
2614 alu.dst.write = 1;
2615 r600_bytecode_add_alu(ctx.bc, &alu);
2616
2617 /* R0.y = R0.x >> 30 */
2618 memset(&alu, 0, sizeof(alu));
2619 alu.op = ALU_OP2_LSHR_INT;
2620 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2621 alu.src[1].value = 0x1e;
2622 alu.dst.chan = 1;
2623 alu.dst.write = 1;
2624 alu.last = 1;
2625 r600_bytecode_add_alu(ctx.bc, &alu);
2626
2627 /* fetch vertex data from GSVS ring */
2628 for (i = 0; i < ocnt; ++i) {
2629 struct r600_shader_io *out = &ctx.shader->output[i];
2630
2631 out->gpr = i + 1;
2632 out->ring_offset = i * 16;
2633
2634 memset(&vtx, 0, sizeof(vtx));
2635 vtx.op = FETCH_OP_VFETCH;
2636 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
2637 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2638 vtx.mega_fetch_count = 16;
2639 vtx.offset = out->ring_offset;
2640 vtx.dst_gpr = out->gpr;
2641 vtx.src_gpr = 0;
2642 vtx.dst_sel_x = 0;
2643 vtx.dst_sel_y = 1;
2644 vtx.dst_sel_z = 2;
2645 vtx.dst_sel_w = 3;
2646 if (rctx->b.gfx_level >= EVERGREEN) {
2647 vtx.use_const_fields = 1;
2648 } else {
2649 vtx.data_format = FMT_32_32_32_32_FLOAT;
2650 }
2651
2652 r600_bytecode_add_vtx(ctx.bc, &vtx);
2653 }
2654 ctx.temp_reg = i + 1;
2655 for (ring = 3; ring >= 0; --ring) {
2656 bool enabled = false;
2657 for (i = 0; i < so->num_outputs; i++) {
2658 if (so->output[i].stream == ring) {
2659 enabled = true;
2660 if (ring > 0)
2661 only_ring_0 = false;
2662 break;
2663 }
2664 }
2665 if (ring != 0 && !enabled) {
2666 cshader->shader.ring_item_sizes[ring] = 0;
2667 continue;
2668 }
2669
2670 if (cf_jump) {
2671 // Patch up jump label
2672 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2673 cf_pop = ctx.bc->cf_last;
2674
2675 cf_jump->cf_addr = cf_pop->id + 2;
2676 cf_jump->pop_count = 1;
2677 cf_pop->cf_addr = cf_pop->id + 2;
2678 cf_pop->pop_count = 1;
2679 }
2680
2681 /* PRED_SETE_INT __, R0.y, ring */
2682 memset(&alu, 0, sizeof(alu));
2683 alu.op = ALU_OP2_PRED_SETE_INT;
2684 alu.src[0].chan = 1;
2685 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2686 alu.src[1].value = ring;
2687 alu.execute_mask = 1;
2688 alu.update_pred = 1;
2689 alu.last = 1;
2690 r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
2691
2692 r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
2693 cf_jump = ctx.bc->cf_last;
2694
2695 if (enabled)
2696 emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
2697 cshader->shader.ring_item_sizes[ring] = ocnt * 16;
2698 }
2699
2700 	/* the bytecode builder adds NOPs on R600 - mirror that here */
2701 if (ctx.bc->gfx_level == R600) {
2702 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2703 alu.op = ALU_OP0_NOP;
2704 alu.last = 1;
2705 r600_bytecode_add_alu(ctx.bc, &alu);
2706
2707 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2708 }
2709
2710 /* export vertex data */
2711 /* XXX factor out common code with r600_shader_from_tgsi ? */
2712 for (i = 0; i < ocnt; ++i) {
2713 struct r600_shader_io *out = &ctx.shader->output[i];
2714 bool instream0 = true;
2715 if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
2716 continue;
2717
2718 for (j = 0; j < so->num_outputs; j++) {
2719 if (so->output[j].register_index == i) {
2720 if (so->output[j].stream == 0)
2721 break;
2722 if (so->output[j].stream > 0)
2723 instream0 = false;
2724 }
2725 }
2726 if (!instream0)
2727 continue;
2728 memset(&output, 0, sizeof(output));
2729 output.gpr = out->gpr;
2730 output.elem_size = 3;
2731 output.swizzle_x = 0;
2732 output.swizzle_y = 1;
2733 output.swizzle_z = 2;
2734 output.swizzle_w = 3;
2735 output.burst_count = 1;
2736 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2737 output.op = CF_OP_EXPORT;
2738 switch (out->name) {
2739 case TGSI_SEMANTIC_POSITION:
2740 output.array_base = 60;
2741 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2742 break;
2743
2744 case TGSI_SEMANTIC_PSIZE:
2745 output.array_base = 61;
2746 if (next_clip_pos == 61)
2747 next_clip_pos = 62;
2748 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2749 output.swizzle_y = 7;
2750 output.swizzle_z = 7;
2751 output.swizzle_w = 7;
2752 ctx.shader->vs_out_misc_write = 1;
2753 ctx.shader->vs_out_point_size = 1;
2754 break;
2755 case TGSI_SEMANTIC_LAYER:
2756 if (out->spi_sid) {
2757 /* duplicate it as PARAM to pass to the pixel shader */
2758 output.array_base = next_param++;
2759 r600_bytecode_add_output(ctx.bc, &output);
2760 last_exp_param = ctx.bc->cf_last;
2761 }
2762 output.array_base = 61;
2763 if (next_clip_pos == 61)
2764 next_clip_pos = 62;
2765 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2766 output.swizzle_x = 7;
2767 output.swizzle_y = 7;
2768 output.swizzle_z = 0;
2769 output.swizzle_w = 7;
2770 ctx.shader->vs_out_misc_write = 1;
2771 ctx.shader->vs_out_layer = 1;
2772 break;
2773 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2774 if (out->spi_sid) {
2775 /* duplicate it as PARAM to pass to the pixel shader */
2776 output.array_base = next_param++;
2777 r600_bytecode_add_output(ctx.bc, &output);
2778 last_exp_param = ctx.bc->cf_last;
2779 }
2780 output.array_base = 61;
2781 if (next_clip_pos == 61)
2782 next_clip_pos = 62;
2783 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2784 ctx.shader->vs_out_misc_write = 1;
2785 ctx.shader->vs_out_viewport = 1;
2786 output.swizzle_x = 7;
2787 output.swizzle_y = 7;
2788 output.swizzle_z = 7;
2789 output.swizzle_w = 0;
2790 break;
2791 case TGSI_SEMANTIC_CLIPDIST:
2792 /* spi_sid is 0 for clipdistance outputs that were generated
2793 * for clipvertex - we don't need to pass them to PS */
2794 ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
2795 ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
2796 ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
2797 if (out->spi_sid) {
2798 /* duplicate it as PARAM to pass to the pixel shader */
2799 output.array_base = next_param++;
2800 r600_bytecode_add_output(ctx.bc, &output);
2801 last_exp_param = ctx.bc->cf_last;
2802 }
2803 output.array_base = next_clip_pos++;
2804 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2805 break;
2806 case TGSI_SEMANTIC_FOG:
2807 output.swizzle_y = 4; /* 0 */
2808 output.swizzle_z = 4; /* 0 */
2809 output.swizzle_w = 5; /* 1 */
2810 break;
2811 default:
2812 output.array_base = next_param++;
2813 break;
2814 }
2815 r600_bytecode_add_output(ctx.bc, &output);
2816 if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
2817 last_exp_param = ctx.bc->cf_last;
2818 else
2819 last_exp_pos = ctx.bc->cf_last;
2820 }
2821
2822 if (!last_exp_pos) {
2823 memset(&output, 0, sizeof(output));
2824 output.gpr = 0;
2825 output.elem_size = 3;
2826 output.swizzle_x = 7;
2827 output.swizzle_y = 7;
2828 output.swizzle_z = 7;
2829 output.swizzle_w = 7;
2830 output.burst_count = 1;
2831 output.type = 2;
2832 output.op = CF_OP_EXPORT;
2833 output.array_base = 60;
2834 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2835 r600_bytecode_add_output(ctx.bc, &output);
2836 last_exp_pos = ctx.bc->cf_last;
2837 }
2838
2839 if (!last_exp_param) {
2840 memset(&output, 0, sizeof(output));
2841 output.gpr = 0;
2842 output.elem_size = 3;
2843 output.swizzle_x = 7;
2844 output.swizzle_y = 7;
2845 output.swizzle_z = 7;
2846 output.swizzle_w = 7;
2847 output.burst_count = 1;
2848 output.type = 2;
2849 output.op = CF_OP_EXPORT;
2850 output.array_base = next_param++;
2851 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2852 r600_bytecode_add_output(ctx.bc, &output);
2853 last_exp_param = ctx.bc->cf_last;
2854 }
2855
2856 last_exp_pos->op = CF_OP_EXPORT_DONE;
2857 last_exp_param->op = CF_OP_EXPORT_DONE;
2858
2859 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2860 cf_pop = ctx.bc->cf_last;
2861
2862 cf_jump->cf_addr = cf_pop->id + 2;
2863 cf_jump->pop_count = 1;
2864 cf_pop->cf_addr = cf_pop->id + 2;
2865 cf_pop->pop_count = 1;
2866
2867 if (ctx.bc->gfx_level == CAYMAN)
2868 cm_bytecode_add_cf_end(ctx.bc);
2869 else {
2870 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2871 ctx.bc->cf_last->end_of_program = 1;
2872 }
2873
2874 gs->gs_copy_shader = cshader;
2875 cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
2876
2877 ctx.bc->nstack = 1;
2878
2879 return r600_bytecode_build(ctx.bc);
2880 }
2881
2882 static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2883 {
2884 if (ind) {
2885 struct r600_bytecode_alu alu;
2886 int r;
2887
2888 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2889 alu.op = ALU_OP2_ADD_INT;
2890 alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2891 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2892 alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2893 alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2894 alu.dst.write = 1;
2895 alu.last = 1;
2896 r = r600_bytecode_add_alu(ctx->bc, &alu);
2897 if (r)
2898 return r;
2899 }
2900 return 0;
2901 }
2902
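/* Write the current vertex's outputs to the ring buffer with MEM_RING
 * exports (ring 0-3 selects the GS output stream). For a VS/TES running as
 * ES, each output is first matched by name/sid to the ring offset the GS
 * expects for that input. */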
2903 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so UNUSED, int stream, bool ind)
2904 {
2905 struct r600_bytecode_output output;
2906 int ring_offset;
2907 unsigned i, k;
2908 int effective_stream = stream == -1 ? 0 : stream;
2909 int idx = 0;
2910
2911 for (i = 0; i < ctx->shader->noutput; i++) {
2912 if (ctx->gs_for_vs) {
2913 			/* for ES we need to look up the corresponding ring offset expected
2914 			 * by the GS (map this output to a GS input by name and sid) */
2915 			/* FIXME precompute offsets */
2916 ring_offset = -1;
2917 for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
2918 struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
2919 struct r600_shader_io *out = &ctx->shader->output[i];
2920 if (in->name == out->name && in->sid == out->sid)
2921 ring_offset = in->ring_offset;
2922 }
2923
2924 if (ring_offset == -1)
2925 continue;
2926 } else {
2927 ring_offset = idx * 16;
2928 idx++;
2929 }
2930
2931 if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
2932 continue;
2933 		/* After parsing the input decls, next_ring_offset holds the total size
2934 		 * of a single vertex's data; gs_next_vertex is the current vertex index */
2935 if (!ind)
2936 ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;
2937
2938 memset(&output, 0, sizeof(struct r600_bytecode_output));
2939 output.gpr = ctx->shader->output[i].gpr;
2940 output.elem_size = 3;
2941 output.comp_mask = 0xF;
2942 output.burst_count = 1;
2943
2944 if (ind)
2945 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
2946 else
2947 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2948
2949 switch (stream) {
2950 default:
2951 case 0:
2952 output.op = CF_OP_MEM_RING; break;
2953 case 1:
2954 output.op = CF_OP_MEM_RING1; break;
2955 case 2:
2956 output.op = CF_OP_MEM_RING2; break;
2957 case 3:
2958 output.op = CF_OP_MEM_RING3; break;
2959 }
2960
2961 if (ind) {
2962 output.array_base = ring_offset >> 2; /* in dwords */
2963 output.array_size = 0xfff;
2964 output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
2965 } else
2966 output.array_base = ring_offset >> 2; /* in dwords */
2967 r600_bytecode_add_output(ctx->bc, &output);
2968 }
2969
2970 ++ctx->gs_next_vertex;
2971 return 0;
2972 }
2973
2974
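/* Fetch the two tessellation layout vectors from the LDS info constant
 * buffer: dwords 0-3 (presumably input strides/counts, used by VS/TCS)
 * into tess_input_info, and dwords 4-7 (the output layout, used by
 * TCS/TES) into tess_output_info. */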
2975 static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2976 {
2977 int r;
2978 struct r600_bytecode_vtx vtx;
2979 int temp_val = ctx->temp_reg;
2980 	/* zero temp_val.x - it is used below as the fetch index into the LDS info constant buffer */
2981 r = single_alu_op2(ctx, ALU_OP1_MOV,
2982 temp_val, 0,
2983 V_SQ_ALU_SRC_LITERAL, 0,
2984 0, 0);
2985 if (r)
2986 return r;
2987
2988 /* used by VS/TCS */
2989 if (ctx->tess_input_info) {
2990 /* fetch tcs input values into resv space */
2991 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2992 vtx.op = FETCH_OP_VFETCH;
2993 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2994 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2995 vtx.mega_fetch_count = 16;
2996 vtx.data_format = FMT_32_32_32_32;
2997 vtx.num_format_all = 2;
2998 vtx.format_comp_all = 1;
2999 vtx.use_const_fields = 0;
3000 vtx.endian = r600_endian_swap(32);
3001 vtx.srf_mode_all = 1;
3002 vtx.offset = 0;
3003 vtx.dst_gpr = ctx->tess_input_info;
3004 vtx.dst_sel_x = 0;
3005 vtx.dst_sel_y = 1;
3006 vtx.dst_sel_z = 2;
3007 vtx.dst_sel_w = 3;
3008 vtx.src_gpr = temp_val;
3009 vtx.src_sel_x = 0;
3010
3011 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
3012 if (r)
3013 return r;
3014 }
3015
3016 /* used by TCS/TES */
3017 if (ctx->tess_output_info) {
3018 /* fetch tcs output values into resv space */
3019 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
3020 vtx.op = FETCH_OP_VFETCH;
3021 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
3022 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
3023 vtx.mega_fetch_count = 16;
3024 vtx.data_format = FMT_32_32_32_32;
3025 vtx.num_format_all = 2;
3026 vtx.format_comp_all = 1;
3027 vtx.use_const_fields = 0;
3028 vtx.endian = r600_endian_swap(32);
3029 vtx.srf_mode_all = 1;
3030 vtx.offset = 16;
3031 vtx.dst_gpr = ctx->tess_output_info;
3032 vtx.dst_sel_x = 0;
3033 vtx.dst_sel_y = 1;
3034 vtx.dst_sel_z = 2;
3035 vtx.dst_sel_w = 3;
3036 vtx.src_gpr = temp_val;
3037 vtx.src_sel_x = 0;
3038
3039 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
3040 if (r)
3041 return r;
3042 }
3043 return 0;
3044 }
3045
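/* VS-as-LS stores its outputs straight to LDS: the per-vertex base is
 * rel_id (r0.y) * vertex_dw_stride, and each 128-bit output vector is
 * written as two 64-bit LDS_WRITE_REL pairs (xy, then zw). */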
3046 static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
3047 {
3048 int j, r;
3049 int temp_reg;
3050 unsigned i;
3051
3052 /* fetch tcs input values into input_vals */
3053 ctx->tess_input_info = r600_get_temp(ctx);
3054 ctx->tess_output_info = 0;
3055 r = r600_fetch_tess_io_info(ctx);
3056 if (r)
3057 return r;
3058
3059 temp_reg = r600_get_temp(ctx);
3060 /* dst reg contains LDS address stride * idx */
3061 /* MUL vertexID, vertex_dw_stride */
3062 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
3063 temp_reg, 0,
3064 ctx->tess_input_info, 1,
3065 0, 1); /* rel id in r0.y? */
3066 if (r)
3067 return r;
3068
3069 for (i = 0; i < ctx->shader->noutput; i++) {
3070 struct r600_bytecode_alu alu;
3071 int param = r600_get_lds_unique_index(ctx->shader->output[i].name,
3072 ctx->shader->output[i].sid);
3073
3074 if (param) {
3075 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3076 temp_reg, 1,
3077 temp_reg, 0,
3078 V_SQ_ALU_SRC_LITERAL, param * 16);
3079 if (r)
3080 return r;
3081 }
3082
3083 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3084 temp_reg, 2,
3085 temp_reg, param ? 1 : 0,
3086 V_SQ_ALU_SRC_LITERAL, 8);
3087 if (r)
3088 return r;
3089
3090
3091 for (j = 0; j < 2; j++) {
3092 int chan = (j == 1) ? 2 : (param ? 1 : 0);
3093 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3094 alu.op = LDS_OP3_LDS_WRITE_REL;
3095 alu.src[0].sel = temp_reg;
3096 alu.src[0].chan = chan;
3097 alu.src[1].sel = ctx->shader->output[i].gpr;
3098 alu.src[1].chan = j * 2;
3099 alu.src[2].sel = ctx->shader->output[i].gpr;
3100 alu.src[2].chan = (j * 2) + 1;
3101 alu.last = 1;
3102 alu.dst.chan = 0;
3103 alu.lds_idx = 1;
3104 alu.is_lds_idx_op = true;
3105 r = r600_bytecode_add_alu(ctx->bc, &alu);
3106 if (r)
3107 return r;
3108 }
3109 }
3110 return 0;
3111 }
3112
3113 static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
3114 {
3115 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3116 const struct tgsi_full_dst_register *dst = &inst->Dst[0];
3117 int i, r, lasti;
3118 int temp_reg = r600_get_temp(ctx);
3119 struct r600_bytecode_alu alu;
3120 unsigned write_mask = dst->Register.WriteMask;
3121
3122 if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
3123 return 0;
3124
3125 r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
3126 if (r)
3127 return r;
3128
3129 /* the base address is now in temp.x */
3130 r = r600_get_byte_address(ctx, temp_reg,
3131 &inst->Dst[0], NULL, ctx->tess_output_info, 1);
3132 if (r)
3133 return r;
3134
3135 /* LDS write */
3136 lasti = tgsi_last_instruction(write_mask);
3137 for (i = 1; i <= lasti; i++) {
3138
3139 if (!(write_mask & (1 << i)))
3140 continue;
3141 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3142 temp_reg, i,
3143 temp_reg, 0,
3144 V_SQ_ALU_SRC_LITERAL, 4 * i);
3145 if (r)
3146 return r;
3147 }
3148
3149 for (i = 0; i <= lasti; i++) {
3150 if (!(write_mask & (1 << i)))
3151 continue;
3152
3153 if ((i == 0 && ((write_mask & 3) == 3)) ||
3154 (i == 2 && ((write_mask & 0xc) == 0xc))) {
3155 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3156 alu.op = LDS_OP3_LDS_WRITE_REL;
3157 alu.src[0].sel = temp_reg;
3158 alu.src[0].chan = i;
3159
3160 alu.src[1].sel = dst->Register.Index;
3161 alu.src[1].sel += ctx->file_offset[dst->Register.File];
3162 alu.src[1].chan = i;
3163
3164 alu.src[2].sel = dst->Register.Index;
3165 alu.src[2].sel += ctx->file_offset[dst->Register.File];
3166 alu.src[2].chan = i + 1;
3167 alu.lds_idx = 1;
3168 alu.dst.chan = 0;
3169 alu.last = 1;
3170 alu.is_lds_idx_op = true;
3171 r = r600_bytecode_add_alu(ctx->bc, &alu);
3172 if (r)
3173 return r;
3174 i += 1;
3175 continue;
3176 }
3177 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3178 alu.op = LDS_OP2_LDS_WRITE;
3179 alu.src[0].sel = temp_reg;
3180 alu.src[0].chan = i;
3181
3182 alu.src[1].sel = dst->Register.Index;
3183 alu.src[1].sel += ctx->file_offset[dst->Register.File];
3184 alu.src[1].chan = i;
3185
3186 alu.src[2].sel = V_SQ_ALU_SRC_0;
3187 alu.dst.chan = 0;
3188 alu.last = 1;
3189 alu.is_lds_idx_op = true;
3190 r = r600_bytecode_add_alu(ctx->bc, &alu);
3191 if (r)
3192 return r;
3193 }
3194 return 0;
3195 }
3196
3197 static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
3198 int output_idx, int nc)
3199 {
3200 int param;
3201 unsigned temp_reg = r600_get_temp(ctx);
3202 unsigned name = ctx->shader->output[output_idx].name;
3203 int dreg = ctx->shader->output[output_idx].gpr;
3204 int r;
3205
3206 param = r600_get_lds_unique_index(name, 0);
3207 r = get_lds_offset0(ctx, 1, temp_reg, true);
3208 if (r)
3209 return r;
3210
3211 if (param) {
3212 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3213 temp_reg, 0,
3214 temp_reg, 0,
3215 V_SQ_ALU_SRC_LITERAL, param * 16);
3216 if (r)
3217 return r;
3218 }
3219
3220 do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));
3221 return 0;
3222 }
3223
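/* Emit the tessellation factors: executed for invocation 0 only, this
 * reads the TESSOUTER/TESSINNER outputs back from LDS and writes each
 * factor with a GDS TF_WRITE as an (index, value) pair (index in .x,
 * value in .y). */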
3224 static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
3225 {
3226 int stride, outer_comps, inner_comps;
3227 int tessinner_idx = -1, tessouter_idx = -1;
3228 int i, r;
3229 unsigned j;
3230 int temp_reg = r600_get_temp(ctx);
3231 int treg[3] = {-1, -1, -1};
3232 struct r600_bytecode_alu alu;
3233 struct r600_bytecode_cf *cf_jump, *cf_pop;
3234
3235 /* only execute factor emission for invocation 0 */
3236 	/* PRED_SETE_INT __, R0.z, 0 */
3237 memset(&alu, 0, sizeof(alu));
3238 alu.op = ALU_OP2_PRED_SETE_INT;
3239 alu.src[0].chan = 2;
3240 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3241 alu.execute_mask = 1;
3242 alu.update_pred = 1;
3243 alu.last = 1;
3244 r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);
3245
3246 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
3247 cf_jump = ctx->bc->cf_last;
3248
3249 treg[0] = r600_get_temp(ctx);
3250 switch (ctx->shader->tcs_prim_mode) {
3251 case PIPE_PRIM_LINES:
3252 stride = 8; /* 2 dwords, 1 vec2 store */
3253 outer_comps = 2;
3254 inner_comps = 0;
3255 break;
3256 case PIPE_PRIM_TRIANGLES:
3257 stride = 16; /* 4 dwords, 1 vec4 store */
3258 outer_comps = 3;
3259 inner_comps = 1;
3260 treg[1] = r600_get_temp(ctx);
3261 break;
3262 case PIPE_PRIM_QUADS:
3263 stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
3264 outer_comps = 4;
3265 inner_comps = 2;
3266 treg[1] = r600_get_temp(ctx);
3267 treg[2] = r600_get_temp(ctx);
3268 break;
3269 default:
3270 assert(0);
3271 return -1;
3272 }
3273
3274 	/* R0 is PatchID, RelPatchID, InvocationID, tf_base (.x, .y, .z, .w) */
3275 /* TF_WRITE takes index in R.x, value in R.y */
3276 for (j = 0; j < ctx->shader->noutput; j++) {
3277 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
3278 tessinner_idx = j;
3279 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
3280 tessouter_idx = j;
3281 }
3282
3283 if (tessouter_idx == -1)
3284 return -1;
3285
3286 if (tessinner_idx == -1 && inner_comps)
3287 return -1;
3288
3289 if (tessouter_idx != -1) {
3290 r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps);
3291 if (r)
3292 return r;
3293 }
3294
3295 if (tessinner_idx != -1) {
3296 r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps);
3297 if (r)
3298 return r;
3299 }
3300
3301 	/* t.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride,
3302 	 * emitted as a single MULADD:
3303 	 * multiply incoming r0.y by the stride (t.x = r0.y * stride),
3304 	 * then add incoming r0.w to it (t.x = t.x + r0.w) */
3305 
3306 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
3307 temp_reg, 0,
3308 0, 1,
3309 V_SQ_ALU_SRC_LITERAL, stride,
3310 0, 3);
3311 if (r)
3312 return r;
3313
3314 for (i = 0; i < outer_comps + inner_comps; i++) {
3315 int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
3316 int out_comp = i >= outer_comps ? i - outer_comps : i;
3317
3318 if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {
3319 if (out_comp == 1)
3320 out_comp = 0;
3321 else if (out_comp == 0)
3322 out_comp = 1;
3323 }
3324
3325 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3326 treg[i / 2], (2 * (i % 2)),
3327 temp_reg, 0,
3328 V_SQ_ALU_SRC_LITERAL, 4 * i);
3329 if (r)
3330 return r;
3331 r = single_alu_op2(ctx, ALU_OP1_MOV,
3332 treg[i / 2], 1 + (2 * (i%2)),
3333 ctx->shader->output[out_idx].gpr, out_comp,
3334 0, 0);
3335 if (r)
3336 return r;
3337 }
3338 for (i = 0; i < outer_comps + inner_comps; i++) {
3339 struct r600_bytecode_gds gds;
3340
3341 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
3342 gds.src_gpr = treg[i / 2];
3343 gds.src_sel_x = 2 * (i % 2);
3344 gds.src_sel_y = 1 + (2 * (i % 2));
3345 gds.src_sel_z = 4;
3346 gds.dst_sel_x = 7;
3347 gds.dst_sel_y = 7;
3348 gds.dst_sel_z = 7;
3349 gds.dst_sel_w = 7;
3350 gds.op = FETCH_OP_TF_WRITE;
3351 r = r600_bytecode_add_gds(ctx->bc, &gds);
3352 if (r)
3353 return r;
3354 }
3355
3356 // Patch up jump label
3357 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
3358 cf_pop = ctx->bc->cf_last;
3359
3360 cf_jump->cf_addr = cf_pop->id + 2;
3361 cf_jump->pop_count = 1;
3362 cf_pop->cf_addr = cf_pop->id + 2;
3363 cf_pop->pop_count = 1;
3364
3365 return 0;
3366 }
3367
3368 /*
3369 * We have to work out the thread ID for load and atomic
3370 * operations, which store the returned value to an index
3371 * in an intermediate buffer.
3372  * The index is built from the thread id, which is
3373  * computed with the MBCNT instructions.
3374  * The shader engine ID is multiplied by 256,
3375  * and the wave id is added.
3376  * Then the result is multiplied by 64 and the thread id is
3377 * added.
3378 */
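/* In short (a sketch of the resulting index, assuming 64-lane waves):
 *   index = (se_id * 256 + wave_id) * 64 + lane_id
 * where lane_id comes from MBCNT_32LO/HI counting the active lanes below
 * the current one. */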
3379 static int load_thread_id_gpr(struct r600_shader_ctx *ctx)
3380 {
3381 struct r600_bytecode_alu alu;
3382 int r;
3383
3384 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3385 alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
3386 alu.dst.sel = ctx->temp_reg;
3387 alu.dst.chan = 0;
3388 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3389 alu.src[0].value = 0xffffffff;
3390 alu.dst.write = 1;
3391 r = r600_bytecode_add_alu(ctx->bc, &alu);
3392 if (r)
3393 return r;
3394
3395 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3396 alu.op = ALU_OP1_MBCNT_32HI_INT;
3397 alu.dst.sel = ctx->temp_reg;
3398 alu.dst.chan = 1;
3399 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3400 alu.src[0].value = 0xffffffff;
3401 alu.dst.write = 1;
3402 r = r600_bytecode_add_alu(ctx->bc, &alu);
3403 if (r)
3404 return r;
3405
3406 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3407 alu.op = ALU_OP3_MULADD_UINT24;
3408 alu.dst.sel = ctx->temp_reg;
3409 alu.dst.chan = 2;
3410 alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;
3411 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3412 alu.src[1].value = 256;
3413 alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;
3414 alu.dst.write = 1;
3415 alu.is_op3 = 1;
3416 alu.last = 1;
3417 r = r600_bytecode_add_alu(ctx->bc, &alu);
3418 if (r)
3419 return r;
3420
3421 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
3422 ctx->thread_id_gpr, 1,
3423 ctx->temp_reg, 2,
3424 V_SQ_ALU_SRC_LITERAL, 0x40,
3425 ctx->temp_reg, 0);
3426 if (r)
3427 return r;
3428 return 0;
3429 }
3430
3431 static int r600_shader_from_tgsi(struct r600_context *rctx,
3432 struct r600_pipe_shader *pipeshader,
3433 union r600_shader_key key)
3434 {
3435 struct r600_screen *rscreen = rctx->screen;
3436 struct r600_shader *shader = &pipeshader->shader;
3437 struct tgsi_token *tokens = pipeshader->selector->tokens;
3438 struct pipe_stream_output_info so = pipeshader->selector->so;
3439 struct tgsi_full_immediate *immediate;
3440 struct r600_shader_ctx ctx;
3441 struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
3442 unsigned output_done, noutput;
3443 unsigned opcode;
3444 int j, k, r = 0;
3445 unsigned i;
3446 int next_param_base = 0, next_clip_base;
3447 int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
3448 bool indirect_gprs;
3449 bool ring_outputs = false;
3450 bool lds_outputs = false;
3451 bool lds_inputs = false;
3452 bool pos_emitted = false;
3453
3454 ctx.bc = &shader->bc;
3455 ctx.shader = shader;
3456
3457 r600_bytecode_init(ctx.bc, rscreen->b.gfx_level, rscreen->b.family,
3458 rscreen->has_compressed_msaa_texturing);
3459 ctx.tokens = tokens;
3460 tgsi_scan_shader(tokens, &ctx.info);
3461 shader->indirect_files = ctx.info.indirect_files;
3462
3463 int narrays = ctx.info.array_max[TGSI_FILE_TEMPORARY];
3464 ctx.array_infos = calloc(narrays, sizeof(*ctx.array_infos));
3465 ctx.spilled_arrays = calloc(narrays, sizeof(bool));
3466 tgsi_scan_arrays(tokens, TGSI_FILE_TEMPORARY, narrays, ctx.array_infos);
3467
3468 shader->uses_helper_invocation = false;
3469 shader->uses_doubles = ctx.info.uses_doubles;
3470 shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
3471 shader->num_loops = ctx.info.opcode_count[TGSI_OPCODE_BGNLOOP];
3472 shader->uses_interpolate_at_sample = ctx.info.opcode_count[TGSI_OPCODE_INTERP_SAMPLE] != 0;
3473
3474 shader->nsys_inputs = 0;
3475
3476 shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 ||
3477 ctx.info.file_count[TGSI_FILE_BUFFER] > 0;
3478 indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
3479 tgsi_parse_init(&ctx.parse, tokens);
3480 ctx.type = ctx.info.processor;
3481 shader->processor_type = ctx.type;
3482 ctx.bc->type = shader->processor_type;
3483
3484 switch (ctx.type) {
3485 case PIPE_SHADER_VERTEX:
3486 shader->vs_as_gs_a = key.vs.as_gs_a;
3487 shader->vs_as_es = key.vs.as_es;
3488 shader->vs_as_ls = key.vs.as_ls;
3489 shader->atomic_base = key.vs.first_atomic_counter;
3490 if (shader->vs_as_es)
3491 ring_outputs = true;
3492 if (shader->vs_as_ls)
3493 lds_outputs = true;
3494 break;
3495 case PIPE_SHADER_GEOMETRY:
3496 ring_outputs = true;
3497 shader->atomic_base = key.gs.first_atomic_counter;
3498 shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix;
3499 break;
3500 case PIPE_SHADER_TESS_CTRL:
3501 shader->tcs_prim_mode = key.tcs.prim_mode;
3502 shader->atomic_base = key.tcs.first_atomic_counter;
3503 lds_outputs = true;
3504 lds_inputs = true;
3505 break;
3506 case PIPE_SHADER_TESS_EVAL:
3507 shader->tes_as_es = key.tes.as_es;
3508 shader->atomic_base = key.tes.first_atomic_counter;
3509 lds_inputs = true;
3510 if (shader->tes_as_es)
3511 ring_outputs = true;
3512 break;
3513 case PIPE_SHADER_FRAGMENT:
3514 shader->two_side = key.ps.color_two_side;
3515 shader->atomic_base = key.ps.first_atomic_counter;
3516 shader->rat_base = key.ps.nr_cbufs;
3517 shader->image_size_const_offset = key.ps.image_size_const_offset;
3518 break;
3519 case PIPE_SHADER_COMPUTE:
3520 shader->rat_base = 0;
3521 shader->image_size_const_offset = ctx.info.file_count[TGSI_FILE_SAMPLER];
3522 break;
3523 default:
3524 break;
3525 }
3526
3527 if (shader->vs_as_es || shader->tes_as_es) {
3528 ctx.gs_for_vs = &rctx->gs_shader->current->shader;
3529 } else {
3530 ctx.gs_for_vs = NULL;
3531 }
3532
3533 ctx.next_ring_offset = 0;
3534 ctx.gs_out_ring_offset = 0;
3535 ctx.gs_next_vertex = 0;
3536 ctx.gs_stream_output_info = &so;
3537
3538 ctx.thread_id_gpr = -1;
3539 ctx.face_gpr = -1;
3540 ctx.fixed_pt_position_gpr = -1;
3541 ctx.fragcoord_input = -1;
3542 ctx.colors_used = 0;
3543 ctx.clip_vertex_write = 0;
3544
3545 ctx.helper_invoc_reg = -1;
3546 ctx.cs_block_size_reg = -1;
3547 ctx.cs_grid_size_reg = -1;
3548 ctx.cs_block_size_loaded = false;
3549 ctx.cs_grid_size_loaded = false;
3550
3551 shader->nr_ps_color_exports = 0;
3552
3553
3554 /* register allocations */
3555 /* Values [0,127] correspond to GPR[0..127].
3556 * Values [128,159] correspond to constant buffer bank 0
3557 * Values [160,191] correspond to constant buffer bank 1
3558 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
3559 * Values [256,287] correspond to constant buffer bank 2 (EG)
3560 * Values [288,319] correspond to constant buffer bank 3 (EG)
3561 * Other special values are shown in the list below.
3562 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
3563 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
3564 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
3565 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
3566 * 248 SQ_ALU_SRC_0: special constant 0.0.
3567 * 249 SQ_ALU_SRC_1: special constant 1.0 float.
3568 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
3569 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
3570 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
3571 * 253 SQ_ALU_SRC_LITERAL: literal constant.
3572 * 254 SQ_ALU_SRC_PV: previous vector result.
3573 * 255 SQ_ALU_SRC_PS: previous scalar result.
3574 */
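	/* Illustrative example of the encoding above: on EG, constant c[5] in
	 * constant buffer bank 1 is addressed with sel = 160 + 5 = 165, GPR 10
	 * is simply sel = 10, and an immediate uses sel = 253
	 * (SQ_ALU_SRC_LITERAL) plus a literal slot in the ALU group. */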
3575 for (i = 0; i < TGSI_FILE_COUNT; i++) {
3576 ctx.file_offset[i] = 0;
3577 }
3578
3579 if (ctx.type == PIPE_SHADER_VERTEX) {
3580
3581 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3582 if (ctx.info.num_inputs)
3583 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
3584 }
3585 if (ctx.type == PIPE_SHADER_FRAGMENT) {
3586 if (ctx.bc->gfx_level >= EVERGREEN)
3587 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
3588 else
3589 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
3590
3591 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3592 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) {
3593 ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3594 shader->uses_helper_invocation = true;
3595 }
3596 }
3597 }
3598 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3599 		/* FIXME: 1 GPR would be enough in some cases (3 or fewer input vertices) */
3600 ctx.file_offset[TGSI_FILE_INPUT] = 2;
3601 }
3602 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3603 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3604 if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3605 bool add_tesscoord = false, add_tess_inout = false;
3606 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3607 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3608 			/* if we have tesscoord, reserve one reg for it */
3609 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
3610 add_tesscoord = true;
3611 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
3612 ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
3613 add_tess_inout = true;
3614 }
3615 if (add_tesscoord || add_tess_inout)
3616 ctx.file_offset[TGSI_FILE_INPUT]++;
3617 if (add_tess_inout)
3618 ctx.file_offset[TGSI_FILE_INPUT]+=2;
3619 }
3620 if (ctx.type == PIPE_SHADER_COMPUTE) {
3621 ctx.file_offset[TGSI_FILE_INPUT] = 2;
3622 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3623 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE)
3624 ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3625 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE)
3626 ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3627 }
3628 }
3629
3630 ctx.file_offset[TGSI_FILE_OUTPUT] =
3631 ctx.file_offset[TGSI_FILE_INPUT] +
3632 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3633 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
3634 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
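	/* Worked example: with file_max[INPUT] = 3 and file_max[OUTPUT] = 1
	 * (i.e. 4 inputs, 2 outputs) and an input offset of 1, inputs land in
	 * GPRs 1-4, outputs in GPRs 5-6, and temporaries start at GPR 7. */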
3635
3636 /* Outside the GPR range. This will be translated to one of the
3637 * kcache banks later. */
3638 ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
3639 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
3640
3641 pipeshader->scratch_space_needed = 0;
3642 int regno = ctx.file_offset[TGSI_FILE_TEMPORARY] +
3643 ctx.info.file_max[TGSI_FILE_TEMPORARY];
3644 if (regno > 124) {
3645 		choose_spill_arrays(&ctx, &regno, &pipeshader->scratch_space_needed);
3646 shader->indirect_files = ctx.info.indirect_files;
3647 }
3648 shader->needs_scratch_space = pipeshader->scratch_space_needed != 0;
3649
3650 ctx.bc->ar_reg = ++regno;
3651 ctx.bc->index_reg[0] = ++regno;
3652 ctx.bc->index_reg[1] = ++regno;
3653
3654 if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3655 ctx.tess_input_info = ++regno;
3656 ctx.tess_output_info = ++regno;
3657 } else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3658 ctx.tess_input_info = ++regno;
3659 ctx.tess_output_info = ++regno;
3660 } else if (ctx.type == PIPE_SHADER_GEOMETRY) {
3661 ctx.gs_export_gpr_tregs[0] = ++regno;
3662 ctx.gs_export_gpr_tregs[1] = ++regno;
3663 ctx.gs_export_gpr_tregs[2] = ++regno;
3664 ctx.gs_export_gpr_tregs[3] = ++regno;
3665 if (ctx.shader->gs_tri_strip_adj_fix) {
3666 ctx.gs_rotated_input[0] = ++regno;
3667 ctx.gs_rotated_input[1] = ++regno;
3668 } else {
3669 ctx.gs_rotated_input[0] = 0;
3670 ctx.gs_rotated_input[1] = 1;
3671 }
3672 }
3673
3674 if (shader->uses_images) {
3675 ctx.thread_id_gpr = ++regno;
3676 }
3677 ctx.temp_reg = ++regno;
3678
3679 shader->max_arrays = 0;
3680 shader->num_arrays = 0;
3681 if (indirect_gprs) {
3682
3683 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
3684 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
3685 ctx.file_offset[TGSI_FILE_OUTPUT] -
3686 ctx.file_offset[TGSI_FILE_INPUT],
3687 0x0F);
3688 }
3689 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
3690 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
3691 ctx.file_offset[TGSI_FILE_TEMPORARY] -
3692 ctx.file_offset[TGSI_FILE_OUTPUT],
3693 0x0F);
3694 }
3695 }
3696
3697 ctx.nliterals = 0;
3698 ctx.literals = NULL;
3699 ctx.max_driver_temp_used = 0;
3700
3701 shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
3702 ctx.info.colors_written == 1;
3703 shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
3704 shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
3705
3706 if (ctx.type == PIPE_SHADER_VERTEX ||
3707 ctx.type == PIPE_SHADER_GEOMETRY ||
3708 ctx.type == PIPE_SHADER_TESS_EVAL) {
3709 shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] +
3710 ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1;
3711 shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1;
3712 shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED];
3713 }
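	/* Worked example: 2 clip + 1 cull distance enabled gives
	 *   clip_dist_write = (1 << 2) - 1        = 0x3
	 *   cull_dist_write = ((1 << 1) - 1) << 2 = 0x4
	 *   cc_dist_mask    = (1 << (1 + 2)) - 1  = 0x7 */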
3714
3715 if (shader->vs_as_gs_a)
3716 vs_add_primid_output(&ctx, key.vs.prim_id_out);
3717
3718 if (ctx.thread_id_gpr != -1) {
3719 r = load_thread_id_gpr(&ctx);
3720 if (r)
3721 return r;
3722 }
3723
3724 if (ctx.type == PIPE_SHADER_TESS_EVAL)
3725 r600_fetch_tess_io_info(&ctx);
3726
3727 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3728 tgsi_parse_token(&ctx.parse);
3729 switch (ctx.parse.FullToken.Token.Type) {
3730 case TGSI_TOKEN_TYPE_IMMEDIATE:
3731 immediate = &ctx.parse.FullToken.FullImmediate;
3732 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
3733 if(ctx.literals == NULL) {
3734 r = -ENOMEM;
3735 goto out_err;
3736 }
3737 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
3738 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
3739 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
3740 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
3741 ctx.nliterals++;
3742 break;
3743 case TGSI_TOKEN_TYPE_DECLARATION:
3744 r = tgsi_declaration(&ctx);
3745 if (r)
3746 goto out_err;
3747 break;
3748 case TGSI_TOKEN_TYPE_INSTRUCTION:
3749 case TGSI_TOKEN_TYPE_PROPERTY:
3750 break;
3751 default:
3752 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
3753 r = -EINVAL;
3754 goto out_err;
3755 }
3756 }
3757
3758 shader->ring_item_sizes[0] = ctx.next_ring_offset;
3759 shader->ring_item_sizes[1] = 0;
3760 shader->ring_item_sizes[2] = 0;
3761 shader->ring_item_sizes[3] = 0;
3762
3763 	/* Process two-sided color if needed */
3764 if (shader->two_side && ctx.colors_used) {
3765 int i, count = ctx.shader->ninput;
3766 unsigned next_lds_loc = ctx.shader->nlds;
3767
3768 		/* Additional inputs will be allocated right after the existing
3769 		 * ones; we won't need them after the color selection, so there is
3770 		 * no need to reserve these GPRs for the rest of the shader code
3771 		 * or to adjust output offsets etc. */
3772 int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
3773 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3774
3775 		/* if two-sided and neither the face nor the sample mask is used by the shader, ensure face_gpr is allocated */
3776 if (ctx.face_gpr == -1) {
3777 i = ctx.shader->ninput++;
3778 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
3779 ctx.shader->input[i].spi_sid = 0;
3780 ctx.shader->input[i].gpr = gpr++;
3781 ctx.face_gpr = ctx.shader->input[i].gpr;
3782 }
3783
3784 for (i = 0; i < count; i++) {
3785 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
3786 int ni = ctx.shader->ninput++;
3787 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
3788 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
3789 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
3790 ctx.shader->input[ni].gpr = gpr++;
3791 // TGSI to LLVM needs to know the lds position of inputs.
3792 // Non LLVM path computes it later (in process_twoside_color)
3793 ctx.shader->input[ni].lds_pos = next_lds_loc++;
3794 ctx.shader->input[i].back_color_input = ni;
3795 if (ctx.bc->gfx_level >= EVERGREEN) {
3796 if ((r = evergreen_interp_input(&ctx, ni)))
3797 return r;
3798 }
3799 }
3800 }
3801 }
3802
3803 if (ctx.shader->uses_helper_invocation) {
3804 if (ctx.bc->gfx_level == CAYMAN)
3805 r = cm_load_helper_invocation(&ctx);
3806 else
3807 r = eg_load_helper_invocation(&ctx);
3808 if (r)
3809 return r;
3810 }
3811
3812 /*
3813 * XXX this relies on fixed_pt_position_gpr only being present when
3814 * this shader should be executed per sample. Should be the case for now...
3815 */
3816 if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) {
3817 /*
3818 * Fix up sample mask. The hw always gives us coverage mask for
3819 * the pixel. However, for per-sample shading, we need the
3820 * coverage for the shader invocation only.
3821 * Also, with disabled msaa, only the first bit should be set
3822 * (luckily the same fixup works for both problems).
3823 * For now, we can only do it if we know this shader is always
3824 * executed per sample (due to usage of bits in the shader
3825 * forcing per-sample execution).
3826 * If the fb is not multisampled, we'd do unnecessary work but
3827 * it should still be correct.
3828 		 * It does nothing, however, for sample shading requested
3829 		 * via MinSampleShading.
3830 */
3831 struct r600_bytecode_alu alu;
3832 int tmp = r600_get_temp(&ctx);
3833 assert(ctx.face_gpr != -1);
3834 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3835
3836 alu.op = ALU_OP2_LSHL_INT;
3837 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3838 alu.src[0].value = 0x1;
3839 alu.src[1].sel = ctx.fixed_pt_position_gpr;
3840 alu.src[1].chan = 3;
3841 alu.dst.sel = tmp;
3842 alu.dst.chan = 0;
3843 alu.dst.write = 1;
3844 alu.last = 1;
3845 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3846 return r;
3847
3848 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3849 alu.op = ALU_OP2_AND_INT;
3850 alu.src[0].sel = tmp;
3851 alu.src[1].sel = ctx.face_gpr;
3852 alu.src[1].chan = 2;
3853 alu.dst.sel = ctx.face_gpr;
3854 alu.dst.chan = 2;
3855 alu.dst.write = 1;
3856 alu.last = 1;
3857 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3858 return r;
3859 }
3860
3861 if (ctx.fragcoord_input >= 0) {
3862 if (ctx.bc->gfx_level == CAYMAN) {
3863 for (j = 0 ; j < 4; j++) {
3864 struct r600_bytecode_alu alu;
3865 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3866 alu.op = ALU_OP1_RECIP_IEEE;
3867 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3868 alu.src[0].chan = 3;
3869
3870 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3871 alu.dst.chan = j;
3872 alu.dst.write = (j == 3);
3873 alu.last = (j == 3);
3874 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3875 return r;
3876 }
3877 } else {
3878 struct r600_bytecode_alu alu;
3879 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3880 alu.op = ALU_OP1_RECIP_IEEE;
3881 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3882 alu.src[0].chan = 3;
3883
3884 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3885 alu.dst.chan = 3;
3886 alu.dst.write = 1;
3887 alu.last = 1;
3888 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3889 return r;
3890 }
3891 }
3892
3893 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3894 struct r600_bytecode_alu alu;
3895 int r;
3896
3897 /* GS thread with no output workaround - emit a cut at start of GS */
3898 if (ctx.bc->gfx_level == R600)
3899 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
3900
3901 for (j = 0; j < 4; j++) {
3902 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3903 alu.op = ALU_OP1_MOV;
3904 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3905 alu.src[0].value = 0;
3906 alu.dst.sel = ctx.gs_export_gpr_tregs[j];
3907 alu.dst.write = 1;
3908 alu.last = 1;
3909 r = r600_bytecode_add_alu(ctx.bc, &alu);
3910 if (r)
3911 return r;
3912 }
3913
3914 if (ctx.shader->gs_tri_strip_adj_fix) {
3915 r = single_alu_op2(&ctx, ALU_OP2_AND_INT,
3916 ctx.gs_rotated_input[0], 2,
3917 0, 2,
3918 V_SQ_ALU_SRC_LITERAL, 1);
3919 if (r)
3920 return r;
3921
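		/* The six per-vertex offsets live in r0.xyw / r1.xyz (r0.z
		 * carries the PrimitiveID, hence the chan 2 -> 3 remapping
		 * below). For triangle strips with adjacency the offsets must
		 * be rotated by four positions, i -> (i + 4) % 6, on every
		 * other primitive; the CNDE_INT selects the original or the
		 * rotated offset based on the parity bit (PrimitiveID & 1)
		 * computed above. */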
3922 for (i = 0; i < 6; i++) {
3923 int rotated = (i + 4) % 6;
3924 int offset_reg = i / 3;
3925 int offset_chan = i % 3;
3926 int rotated_offset_reg = rotated / 3;
3927 int rotated_offset_chan = rotated % 3;
3928
3929 if (offset_reg == 0 && offset_chan == 2)
3930 offset_chan = 3;
3931 if (rotated_offset_reg == 0 && rotated_offset_chan == 2)
3932 rotated_offset_chan = 3;
3933
3934 r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT,
3935 ctx.gs_rotated_input[offset_reg], offset_chan,
3936 ctx.gs_rotated_input[0], 2,
3937 offset_reg, offset_chan,
3938 rotated_offset_reg, rotated_offset_chan);
3939 if (r)
3940 return r;
3941 }
3942 }
3943 }
3944
3945 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3946 r600_fetch_tess_io_info(&ctx);
3947
3948 if (shader->two_side && ctx.colors_used) {
3949 if ((r = process_twoside_color_inputs(&ctx)))
3950 return r;
3951 }
3952
3953 tgsi_parse_init(&ctx.parse, tokens);
3954 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3955 tgsi_parse_token(&ctx.parse);
3956 switch (ctx.parse.FullToken.Token.Type) {
3957 case TGSI_TOKEN_TYPE_INSTRUCTION:
3958 r = tgsi_is_supported(&ctx);
3959 if (r)
3960 goto out_err;
3961 ctx.max_driver_temp_used = 0;
3962 /* reserve first tmp for everyone */
3963 r600_get_temp(&ctx);
3964
3965 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
3966 if ((r = tgsi_split_constant(&ctx)))
3967 goto out_err;
3968 if ((r = tgsi_split_literal_constant(&ctx)))
3969 goto out_err;
3970 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3971 if ((r = tgsi_split_gs_inputs(&ctx)))
3972 goto out_err;
3973 } else if (lds_inputs) {
3974 if ((r = tgsi_split_lds_inputs(&ctx)))
3975 goto out_err;
3976 }
3977 if (ctx.bc->gfx_level == CAYMAN)
3978 ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
3979 else if (ctx.bc->gfx_level >= EVERGREEN)
3980 ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
3981 else
3982 ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
3983
3984 ctx.bc->precise |= ctx.parse.FullToken.FullInstruction.Instruction.Precise;
3985
3986 r = ctx.inst_info->process(&ctx);
3987 if (r)
3988 goto out_err;
3989
3990 if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3991 r = r600_store_tcs_output(&ctx);
3992 if (r)
3993 goto out_err;
3994 }
3995 break;
3996 default:
3997 break;
3998 }
3999 }
4000
4001 /* Reset the temporary register counter. */
4002 ctx.max_driver_temp_used = 0;
4003
4004 noutput = shader->noutput;
4005
4006 if (!ring_outputs && ctx.clip_vertex_write) {
4007 unsigned clipdist_temp[2];
4008
4009 clipdist_temp[0] = r600_get_temp(&ctx);
4010 clipdist_temp[1] = r600_get_temp(&ctx);
4011
4012 		/* Convert the clipvertex write into clip-distance writes and
4013 		 * stop exporting the clip vertex itself. */
4014
4015 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
4016 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
4017 shader->output[noutput].gpr = clipdist_temp[0];
4018 noutput++;
4019 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
4020 shader->output[noutput].gpr = clipdist_temp[1];
4021 noutput++;
4022
4023 /* reset spi_sid for clipvertex output to avoid confusing spi */
4024 shader->output[ctx.cv_output].spi_sid = 0;
4025
4026 shader->clip_dist_write = 0xFF;
4027 shader->cc_dist_mask = 0xFF;
4028
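	/* Each of the 8 clip distances is computed as
	 *   dist[i] = dot(clipvertex, user_clip_plane[i])
	 * with the plane coefficients read from the driver constant buffer
	 * (R600_BUFFER_INFO_CONST_BUFFER) at sel 512 + i. DOT4 broadcasts its
	 * result to all four slots, so only the slot matching ochan enables
	 * its write. */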
4029 for (i = 0; i < 8; i++) {
4030 int oreg = i >> 2;
4031 int ochan = i & 3;
4032
4033 for (j = 0; j < 4; j++) {
4034 struct r600_bytecode_alu alu;
4035 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4036 alu.op = ALU_OP2_DOT4;
4037 alu.src[0].sel = shader->output[ctx.cv_output].gpr;
4038 alu.src[0].chan = j;
4039
4040 alu.src[1].sel = 512 + i;
4041 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
4042 alu.src[1].chan = j;
4043
4044 alu.dst.sel = clipdist_temp[oreg];
4045 alu.dst.chan = j;
4046 alu.dst.write = (j == ochan);
4047 if (j == 3)
4048 alu.last = 1;
4049 r = r600_bytecode_add_alu(ctx.bc, &alu);
4050 if (r)
4051 return r;
4052 }
4053 }
4054 }
4055
4056 /* Add stream outputs. */
4057 if (so.num_outputs) {
4058 bool emit = false;
4059 if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX)
4060 emit = true;
4061 if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL)
4062 emit = true;
4063 if (emit)
4064 emit_streamout(&ctx, &so, -1, NULL);
4065 }
4066 pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
4067 convert_edgeflag_to_int(&ctx);
4068
4069 if (ctx.type == PIPE_SHADER_TESS_CTRL)
4070 r600_emit_tess_factor(&ctx);
4071
4072 if (lds_outputs) {
4073 if (ctx.type == PIPE_SHADER_VERTEX) {
4074 if (ctx.shader->noutput)
4075 emit_lds_vs_writes(&ctx);
4076 }
4077 } else if (ring_outputs) {
4078 if (shader->vs_as_es || shader->tes_as_es) {
4079 ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
4080 ctx.gs_export_gpr_tregs[1] = -1;
4081 ctx.gs_export_gpr_tregs[2] = -1;
4082 ctx.gs_export_gpr_tregs[3] = -1;
4083
4084 emit_gs_ring_writes(&ctx, &so, -1, FALSE);
4085 }
4086 } else {
4087 /* Export output */
4088 next_clip_base = shader->vs_out_misc_write ? 62 : 61;
4089
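		/* Position exports use array bases 60+: 60 is the position
		 * itself and 61 the misc vector (PSIZE in x, EDGEFLAG in y,
		 * LAYER in z, VIEWPORT_INDEX in w); clip distances follow,
		 * which is why next_clip_base starts one higher when the misc
		 * vector is written. Parameter exports are numbered
		 * independently from 0 via next_param_base. */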
4090 for (i = 0, j = 0; i < noutput; i++, j++) {
4091 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4092 output[j].gpr = shader->output[i].gpr;
4093 output[j].elem_size = 3;
4094 output[j].swizzle_x = 0;
4095 output[j].swizzle_y = 1;
4096 output[j].swizzle_z = 2;
4097 output[j].swizzle_w = 3;
4098 output[j].burst_count = 1;
4099 output[j].type = 0xffffffff;
4100 output[j].op = CF_OP_EXPORT;
4101 switch (ctx.type) {
4102 case PIPE_SHADER_VERTEX:
4103 case PIPE_SHADER_TESS_EVAL:
4104 switch (shader->output[i].name) {
4105 case TGSI_SEMANTIC_POSITION:
4106 output[j].array_base = 60;
4107 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4108 pos_emitted = true;
4109 break;
4110
4111 case TGSI_SEMANTIC_PSIZE:
4112 output[j].array_base = 61;
4113 output[j].swizzle_y = 7;
4114 output[j].swizzle_z = 7;
4115 output[j].swizzle_w = 7;
4116 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4117 pos_emitted = true;
4118 break;
4119 case TGSI_SEMANTIC_EDGEFLAG:
4120 output[j].array_base = 61;
4121 output[j].swizzle_x = 7;
4122 output[j].swizzle_y = 0;
4123 output[j].swizzle_z = 7;
4124 output[j].swizzle_w = 7;
4125 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4126 pos_emitted = true;
4127 break;
4128 case TGSI_SEMANTIC_LAYER:
4129 /* spi_sid is 0 for outputs that are
4130 * not consumed by PS */
4131 if (shader->output[i].spi_sid) {
4132 output[j].array_base = next_param_base++;
4133 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4134 j++;
4135 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4136 }
4137 output[j].array_base = 61;
4138 output[j].swizzle_x = 7;
4139 output[j].swizzle_y = 7;
4140 output[j].swizzle_z = 0;
4141 output[j].swizzle_w = 7;
4142 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4143 pos_emitted = true;
4144 break;
4145 case TGSI_SEMANTIC_VIEWPORT_INDEX:
4146 /* spi_sid is 0 for outputs that are
4147 * not consumed by PS */
4148 if (shader->output[i].spi_sid) {
4149 output[j].array_base = next_param_base++;
4150 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4151 j++;
4152 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4153 }
4154 output[j].array_base = 61;
4155 output[j].swizzle_x = 7;
4156 output[j].swizzle_y = 7;
4157 output[j].swizzle_z = 7;
4158 output[j].swizzle_w = 0;
4159 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4160 pos_emitted = true;
4161 break;
4162 case TGSI_SEMANTIC_CLIPVERTEX:
4163 j--;
4164 break;
4165 case TGSI_SEMANTIC_CLIPDIST:
4166 output[j].array_base = next_clip_base++;
4167 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4168 pos_emitted = true;
4169 /* spi_sid is 0 for clipdistance outputs that were generated
4170 * for clipvertex - we don't need to pass them to PS */
4171 if (shader->output[i].spi_sid) {
4172 j++;
4173 /* duplicate it as PARAM to pass to the pixel shader */
4174 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4175 output[j].array_base = next_param_base++;
4176 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4177 }
4178 break;
4179 case TGSI_SEMANTIC_FOG:
4180 output[j].swizzle_y = 4; /* 0 */
4181 output[j].swizzle_z = 4; /* 0 */
4182 output[j].swizzle_w = 5; /* 1 */
4183 break;
4184 case TGSI_SEMANTIC_PRIMID:
4185 output[j].swizzle_x = 2;
4186 output[j].swizzle_y = 4; /* 0 */
4187 output[j].swizzle_z = 4; /* 0 */
4188 output[j].swizzle_w = 4; /* 0 */
4189 break;
4190 }
4191
4192 break;
4193 case PIPE_SHADER_FRAGMENT:
4194 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
4195 /* never export more colors than the number of CBs */
4196 if (shader->output[i].sid >= max_color_exports) {
4197 /* skip export */
4198 j--;
4199 continue;
4200 }
4201 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
4202 output[j].array_base = shader->output[i].sid;
4203 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4204 shader->nr_ps_color_exports++;
4205 shader->ps_color_export_mask |= (0xf << (shader->output[i].sid * 4));
4206
4207 /* If the i-th target format is set, all previous target formats must
4208 * be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well.
4209 */
4210 if (shader->output[i].sid > 0)
4211 for (unsigned x = 0; x < shader->output[i].sid; x++)
4212 shader->ps_color_export_mask |= (1 << (x*4));
4213
4214 if (shader->output[i].sid > shader->ps_export_highest)
4215 shader->ps_export_highest = shader->output[i].sid;
4216 if (shader->fs_write_all && (rscreen->b.gfx_level >= EVERGREEN)) {
4217 for (k = 1; k < max_color_exports; k++) {
4218 j++;
4219 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4220 output[j].gpr = shader->output[i].gpr;
4221 output[j].elem_size = 3;
4222 output[j].swizzle_x = 0;
4223 output[j].swizzle_y = 1;
4224 output[j].swizzle_z = 2;
4225 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
4226 output[j].burst_count = 1;
4227 output[j].array_base = k;
4228 output[j].op = CF_OP_EXPORT;
4229 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4230 shader->nr_ps_color_exports++;
4231 if (k > shader->ps_export_highest)
4232 shader->ps_export_highest = k;
4233 					shader->ps_color_export_mask |= (0xf << (k * 4));
4234 }
4235 }
4236 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
4237 output[j].array_base = 61;
4238 output[j].swizzle_x = 2;
4239 output[j].swizzle_y = 7;
4240 output[j].swizzle_z = output[j].swizzle_w = 7;
4241 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4242 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
4243 output[j].array_base = 61;
4244 output[j].swizzle_x = 7;
4245 output[j].swizzle_y = 1;
4246 output[j].swizzle_z = output[j].swizzle_w = 7;
4247 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4248 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
4249 output[j].array_base = 61;
4250 output[j].swizzle_x = 7;
4251 output[j].swizzle_y = 7;
4252 output[j].swizzle_z = 0;
4253 output[j].swizzle_w = 7;
4254 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4255 } else {
4256 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
4257 r = -EINVAL;
4258 goto out_err;
4259 }
4260 break;
4261 case PIPE_SHADER_TESS_CTRL:
4262 break;
4263 default:
4264 R600_ERR("unsupported processor type %d\n", ctx.type);
4265 r = -EINVAL;
4266 goto out_err;
4267 }
4268
4269 if (output[j].type == 0xffffffff) {
4270 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4271 output[j].array_base = next_param_base++;
4272 }
4273 }
4274
4275 /* add fake position export */
4276 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) {
4277 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4278 output[j].gpr = 0;
4279 output[j].elem_size = 3;
4280 output[j].swizzle_x = 7;
4281 output[j].swizzle_y = 7;
4282 output[j].swizzle_z = 7;
4283 output[j].swizzle_w = 7;
4284 output[j].burst_count = 1;
4285 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4286 output[j].array_base = 60;
4287 output[j].op = CF_OP_EXPORT;
4288 j++;
4289 }
4290
4291 /* add fake param output for vertex shader if no param is exported */
4292 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) {
4293 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4294 output[j].gpr = 0;
4295 output[j].elem_size = 3;
4296 output[j].swizzle_x = 7;
4297 output[j].swizzle_y = 7;
4298 output[j].swizzle_z = 7;
4299 output[j].swizzle_w = 7;
4300 output[j].burst_count = 1;
4301 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4302 output[j].array_base = 0;
4303 output[j].op = CF_OP_EXPORT;
4304 j++;
4305 }
4306
4307 /* add fake pixel export */
4308 if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) {
4309 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4310 output[j].gpr = 0;
4311 output[j].elem_size = 3;
4312 output[j].swizzle_x = 7;
4313 output[j].swizzle_y = 7;
4314 output[j].swizzle_z = 7;
4315 output[j].swizzle_w = 7;
4316 output[j].burst_count = 1;
4317 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4318 output[j].array_base = 0;
4319 output[j].op = CF_OP_EXPORT;
4320 j++;
4321 shader->nr_ps_color_exports++;
4322 shader->ps_color_export_mask = 0xf;
4323 }
4324
4325 noutput = j;
4326
4327 /* set export done on last export of each type */
4328 for (k = noutput - 1, output_done = 0; k >= 0; k--) {
4329 if (!(output_done & (1 << output[k].type))) {
4330 output_done |= (1 << output[k].type);
4331 output[k].op = CF_OP_EXPORT_DONE;
4332 }
4333 }
4334 /* add output to bytecode */
4335 for (i = 0; i < noutput; i++) {
4336 r = r600_bytecode_add_output(ctx.bc, &output[i]);
4337 if (r)
4338 goto out_err;
4339 }
4340 }
4341
4342 /* add program end */
4343 if (ctx.bc->gfx_level == CAYMAN)
4344 cm_bytecode_add_cf_end(ctx.bc);
4345 else {
4346 const struct cf_op_info *last = NULL;
4347
4348 if (ctx.bc->cf_last)
4349 last = r600_isa_cf(ctx.bc->cf_last->op);
4350
4351 /* alu clause instructions don't have EOP bit, so add NOP */
4352 if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP)
4353 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
4354
4355 ctx.bc->cf_last->end_of_program = 1;
4356 }
4357
4358 /* check GPR limit - we have 124 = 128 - 4
4359 * (4 are reserved as alu clause temporary registers) */
4360 if (ctx.bc->ngpr > 124) {
4361 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
4362 r = -ENOMEM;
4363 goto out_err;
4364 }
4365
4366 if (ctx.type == PIPE_SHADER_GEOMETRY) {
4367 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
4368 return r;
4369 }
4370
4371 free(ctx.spilled_arrays);
4372 free(ctx.array_infos);
4373 free(ctx.literals);
4374 tgsi_parse_free(&ctx.parse);
4375 return 0;
4376 out_err:
4377 free(ctx.spilled_arrays);
4378 free(ctx.array_infos);
4379 free(ctx.literals);
4380 tgsi_parse_free(&ctx.parse);
4381 return r;
4382 }
4383
4384 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
4385 {
4386 const unsigned tgsi_opcode =
4387 ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
4388 R600_ERR("%s tgsi opcode unsupported\n",
4389 tgsi_get_opcode_name(tgsi_opcode));
4390 return -EINVAL;
4391 }
4392
4393 static int tgsi_end(struct r600_shader_ctx *ctx UNUSED)
4394 {
4395 return 0;
4396 }
4397
4398 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
4399 const struct r600_shader_src *shader_src,
4400 unsigned chan)
4401 {
4402 bc_src->sel = shader_src->sel;
4403 bc_src->chan = shader_src->swizzle[chan];
4404 bc_src->neg = shader_src->neg;
4405 bc_src->abs = shader_src->abs;
4406 bc_src->rel = shader_src->rel;
4407 bc_src->value = shader_src->value[bc_src->chan];
4408 bc_src->kc_bank = shader_src->kc_bank;
4409 bc_src->kc_rel = shader_src->kc_rel;
4410 }
4411
4412 static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
4413 {
4414 bc_src->abs = 1;
4415 bc_src->neg = 0;
4416 }
4417
4418 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
4419 {
4420 bc_src->neg = !bc_src->neg;
4421 }
4422
4423 static void tgsi_dst(struct r600_shader_ctx *ctx,
4424 const struct tgsi_full_dst_register *tgsi_dst,
4425 unsigned swizzle,
4426 struct r600_bytecode_alu_dst *r600_dst)
4427 {
4428 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4429
4430 if (tgsi_dst->Register.File == TGSI_FILE_TEMPORARY) {
4431 bool spilled;
4432 unsigned idx;
4433
4434 idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_dst->Register.Index, &spilled);
4435
4436 if (spilled) {
4437 struct r600_bytecode_output cf;
4438 int reg = 0;
4439 int r;
4440 bool add_pending_output = true;
4441
4442 memset(&cf, 0, sizeof(struct r600_bytecode_output));
4443 get_spilled_array_base_and_size(ctx, tgsi_dst->Register.Index,
4444 &cf.array_base, &cf.array_size);
4445
4446 			/* If no component has spilled yet, reserve a register and add the
4447 			 * spill code (ctx->bc->n_pending_outputs is cleared after each instruction group). */
4448 if (ctx->bc->n_pending_outputs == 0) {
4449 reg = r600_get_temp(ctx);
4450 } else {
4451 				/* If we are already spilling and the output address is the
4452 				 * same as before, just reuse the same slot */
4453 struct r600_bytecode_output *tmpl = &ctx->bc->pending_outputs[ctx->bc->n_pending_outputs-1];
4454 if ((cf.array_base + idx == tmpl->array_base) ||
4455 (cf.array_base == tmpl->array_base &&
4456 tmpl->index_gpr == ctx->bc->ar_reg &&
4457 tgsi_dst->Register.Indirect)) {
4458 reg = ctx->bc->pending_outputs[0].gpr;
4459 add_pending_output = false;
4460 } else {
4461 reg = r600_get_temp(ctx);
4462 }
4463 }
4464
4465 r600_dst->sel = reg;
4466 r600_dst->chan = swizzle;
4467 r600_dst->write = 1;
4468 if (inst->Instruction.Saturate) {
4469 r600_dst->clamp = 1;
4470 }
4471
4472 /* Add new outputs as pending */
4473 if (add_pending_output) {
4474 cf.op = CF_OP_MEM_SCRATCH;
4475 cf.elem_size = 3;
4476 cf.gpr = reg;
4477 cf.type = r600_bytecode_write_export_ack_type(ctx->bc, tgsi_dst->Register.Indirect);
4478 cf.mark = 1;
4479 cf.comp_mask = inst->Dst[0].Register.WriteMask;
4480 cf.swizzle_x = 0;
4481 cf.swizzle_y = 1;
4482 cf.swizzle_z = 2;
4483 cf.swizzle_w = 3;
4484 cf.burst_count = 1;
4485
4486 if (tgsi_dst->Register.Indirect) {
4487 cf.index_gpr = ctx->bc->ar_reg;
4488 } else {
4489 cf.array_base += idx;
4490 cf.array_size = 0;
4491 }
4492
4493 r = r600_bytecode_add_pending_output(ctx->bc, &cf);
4494 if (r)
4495 return;
4496
4497 r600_bytecode_add_ack(ctx->bc);
4498 }
4499 return;
4500 }
4501 else {
4502 r600_dst->sel = idx;
4503 }
4504 }
4505 else {
4506 r600_dst->sel = tgsi_dst->Register.Index;
4507 r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
4508 }
4509 r600_dst->chan = swizzle;
4510 r600_dst->write = 1;
4511 if (inst->Instruction.Saturate) {
4512 r600_dst->clamp = 1;
4513 }
4514 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
4515 if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
4516 return;
4517 }
4518 }
4519 if (tgsi_dst->Register.Indirect)
4520 r600_dst->rel = V_SQ_REL_RELATIVE;
4521
4522 }
4523
4524 static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override)
4525 {
4526 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4527 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4528 struct r600_bytecode_alu alu;
4529 int i, j, r, lasti = tgsi_last_instruction(write_mask);
4530 int use_tmp = 0;
4531 int swizzle_x = inst->Src[0].Register.SwizzleX;
4532
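	/* A double result occupies a channel pair, so a one-component TGSI
	 * writemask really selects the xy or zw pair. The switch below widens
	 * the mask accordingly and, when the destination pair does not match
	 * the source pair indicated by SwizzleX, routes the result through
	 * ctx->temp_reg (use_tmp - 1 is the temp channel that ends up holding
	 * the value to copy back). */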
4533 if (singledest) {
4534 switch (write_mask) {
4535 case 0x1:
4536 if (swizzle_x == 2) {
4537 write_mask = 0xc;
4538 use_tmp = 3;
4539 } else
4540 write_mask = 0x3;
4541 break;
4542 case 0x2:
4543 if (swizzle_x == 2) {
4544 write_mask = 0xc;
4545 use_tmp = 3;
4546 } else {
4547 write_mask = 0x3;
4548 use_tmp = 1;
4549 }
4550 break;
4551 case 0x4:
4552 if (swizzle_x == 0) {
4553 write_mask = 0x3;
4554 use_tmp = 1;
4555 } else
4556 write_mask = 0xc;
4557 break;
4558 case 0x8:
4559 if (swizzle_x == 0) {
4560 write_mask = 0x3;
4561 use_tmp = 1;
4562 } else {
4563 write_mask = 0xc;
4564 use_tmp = 3;
4565 }
4566 break;
4567 }
4568 }
4569
4570 lasti = tgsi_last_instruction(write_mask);
4571 for (i = 0; i <= lasti; i++) {
4572
4573 if (!(write_mask & (1 << i)))
4574 continue;
4575
4576 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4577
4578 if (singledest) {
4579 if (use_tmp || dest_temp) {
4580 alu.dst.sel = use_tmp ? ctx->temp_reg : dest_temp;
4581 alu.dst.chan = i;
4582 alu.dst.write = 1;
4583 } else {
4584 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4585 }
4586 if (i == 1 || i == 3)
4587 alu.dst.write = 0;
4588 } else
4589 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4590
4591 alu.op = op_override ? op_override : ctx->inst_info->op;
4592 if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
4593 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4594 } else if (!swap) {
4595 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4596 r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
4597 }
4598 } else {
4599 r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
4600 r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
4601 }
4602
4603 /* handle some special cases */
4604 if (i == 1 || i == 3) {
4605 switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
4606 case TGSI_OPCODE_DABS:
4607 r600_bytecode_src_set_abs(&alu.src[0]);
4608 break;
4609 default:
4610 break;
4611 }
4612 }
4613 if (i == lasti) {
4614 alu.last = 1;
4615 }
4616 r = r600_bytecode_add_alu(ctx->bc, &alu);
4617 if (r)
4618 return r;
4619 }
4620
4621 if (use_tmp) {
4622 write_mask = inst->Dst[0].Register.WriteMask;
4623
4624 lasti = tgsi_last_instruction(write_mask);
4625 /* move result from temp to dst */
4626 for (i = 0; i <= lasti; i++) {
4627 if (!(write_mask & (1 << i)))
4628 continue;
4629
4630 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4631 alu.op = ALU_OP1_MOV;
4632
4633 if (dest_temp) {
4634 alu.dst.sel = dest_temp;
4635 alu.dst.chan = i;
4636 alu.dst.write = 1;
4637 } else
4638 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4639 alu.src[0].sel = ctx->temp_reg;
4640 alu.src[0].chan = use_tmp - 1;
4641 alu.last = (i == lasti);
4642
4643 r = r600_bytecode_add_alu(ctx->bc, &alu);
4644 if (r)
4645 return r;
4646 }
4647 }
4648 return 0;
4649 }
4650
4651 static int tgsi_op2_64(struct r600_shader_ctx *ctx)
4652 {
4653 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4654 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4655 	/* validate the writemask: 64-bit ops must write whole channel pairs */
4656 if ((write_mask & 0x3) != 0x3 &&
4657 (write_mask & 0xc) != 0xc) {
4658 fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
4659 return -1;
4660 }
4661 return tgsi_op2_64_params(ctx, false, false, 0, 0);
4662 }
4663
4664 static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
4665 {
4666 return tgsi_op2_64_params(ctx, true, false, 0, 0);
4667 }
4668
4669 static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
4670 {
4671 return tgsi_op2_64_params(ctx, true, true, 0, 0);
4672 }
4673
4674 static int tgsi_op3_64(struct r600_shader_ctx *ctx)
4675 {
4676 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4677 struct r600_bytecode_alu alu;
4678 int i, j, r;
4679 int lasti = 3;
4680 int tmp = r600_get_temp(ctx);
4681
4682 for (i = 0; i < lasti + 1; i++) {
4683
4684 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4685 alu.op = ctx->inst_info->op;
4686 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4687 r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
4688 }
4689
4690 if (inst->Dst[0].Register.WriteMask & (1 << i))
4691 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4692 else
4693 alu.dst.sel = tmp;
4694
4695 alu.dst.chan = i;
4696 alu.is_op3 = 1;
4697 if (i == lasti) {
4698 alu.last = 1;
4699 }
4700 r = r600_bytecode_add_alu(ctx->bc, &alu);
4701 if (r)
4702 return r;
4703 }
4704 return 0;
4705 }
4706
4707 static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
4708 {
4709 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4710 struct r600_bytecode_alu alu;
4711 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4712 int i, j, r, lasti = tgsi_last_instruction(write_mask);
4713 /* use temp register if trans_only and more than one dst component */
4714 int use_tmp = trans_only && (write_mask ^ (1 << lasti));
4715 unsigned op = ctx->inst_info->op;
4716
4717 if (op == ALU_OP2_MUL_IEEE &&
4718 ctx->info.properties[TGSI_PROPERTY_LEGACY_MATH_RULES])
4719 op = ALU_OP2_MUL;
4720
4721 /* nir_to_tgsi lowers nir_op_isub to UADD + negate, since r600 doesn't support
4722 * source modifiers with integer ops we switch back to SUB_INT */
4723 bool src1_neg = ctx->src[1].neg;
4724 if (op == ALU_OP2_ADD_INT && src1_neg) {
4725 src1_neg = false;
4726 op = ALU_OP2_SUB_INT;
4727 }
4728
4729 for (i = 0; i <= lasti; i++) {
4730 if (!(write_mask & (1 << i)))
4731 continue;
4732
4733 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4734 if (use_tmp) {
4735 alu.dst.sel = ctx->temp_reg;
4736 alu.dst.chan = i;
4737 alu.dst.write = 1;
4738 } else
4739 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4740
4741 alu.op = op;
4742 if (!swap) {
4743 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4744 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
4745 }
4746 alu.src[1].neg = src1_neg;
4747 } else {
4748 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4749 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4750 }
4751 if (i == lasti || trans_only) {
4752 alu.last = 1;
4753 }
4754 r = r600_bytecode_add_alu(ctx->bc, &alu);
4755 if (r)
4756 return r;
4757 }
4758
4759 if (use_tmp) {
4760 /* move result from temp to dst */
4761 for (i = 0; i <= lasti; i++) {
4762 if (!(write_mask & (1 << i)))
4763 continue;
4764
4765 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4766 alu.op = ALU_OP1_MOV;
4767 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4768 alu.src[0].sel = ctx->temp_reg;
4769 alu.src[0].chan = i;
4770 alu.last = (i == lasti);
4771
4772 r = r600_bytecode_add_alu(ctx->bc, &alu);
4773 if (r)
4774 return r;
4775 }
4776 }
4777 return 0;
4778 }
4779
4780 static int tgsi_op2(struct r600_shader_ctx *ctx)
4781 {
4782 return tgsi_op2_s(ctx, 0, 0);
4783 }
4784
4785 static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
4786 {
4787 return tgsi_op2_s(ctx, 1, 0);
4788 }
4789
4790 static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
4791 {
4792 return tgsi_op2_s(ctx, 0, 1);
4793 }
4794
4795 static int tgsi_ineg(struct r600_shader_ctx *ctx)
4796 {
4797 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4798 struct r600_bytecode_alu alu;
4799 int i, r;
4800 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4801
4802 for (i = 0; i < lasti + 1; i++) {
4803
4804 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4805 continue;
4806 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4807 alu.op = ctx->inst_info->op;
4808
4809 alu.src[0].sel = V_SQ_ALU_SRC_0;
4810
4811 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4812
4813 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4814
4815 if (i == lasti) {
4816 alu.last = 1;
4817 }
4818 r = r600_bytecode_add_alu(ctx->bc, &alu);
4819 if (r)
4820 return r;
4821 }
4822 return 0;
4823
4824 }
4825
4826 static int tgsi_dneg(struct r600_shader_ctx *ctx)
4827 {
4828 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4829 struct r600_bytecode_alu alu;
4830 int i, r;
4831 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4832
4833 for (i = 0; i < lasti + 1; i++) {
4834
4835 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4836 continue;
4837 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4838 alu.op = ALU_OP1_MOV;
4839
4840 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4841
4842 if (i == 1 || i == 3)
4843 r600_bytecode_src_toggle_neg(&alu.src[0]);
4844 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4845
4846 if (i == lasti) {
4847 alu.last = 1;
4848 }
4849 r = r600_bytecode_add_alu(ctx->bc, &alu);
4850 if (r)
4851 return r;
4852 }
4853 return 0;
4854
4855 }
4856
4857 static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
4858 {
4859 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4860 struct r600_bytecode_alu alu;
4861 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4862 int i, j, r;
4863
4864 for (i = 0; i <= 3; i++) {
4865 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4866 alu.op = ctx->inst_info->op;
4867
4868 alu.dst.sel = ctx->temp_reg;
4869 alu.dst.chan = i;
4870 alu.dst.write = 1;
4871 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4872 r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
4873 }
4874
4875 if (i == 3)
4876 alu.last = 1;
4877
4878 r = r600_bytecode_add_alu(ctx->bc, &alu);
4879 if (r)
4880 return r;
4881 }
4882
4883 /* Replicate significand result across channels. */
4884 for (i = 0; i <= 3; i++) {
4885 if (!(write_mask & (1 << i)))
4886 continue;
4887
4888 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4889 alu.op = ALU_OP1_MOV;
4890 alu.src[0].chan = (i & 1) + 2;
4891 alu.src[0].sel = ctx->temp_reg;
4892
4893 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4894 alu.dst.write = 1;
4895 alu.last = 1;
4896 r = r600_bytecode_add_alu(ctx->bc, &alu);
4897 if (r)
4898 return r;
4899 }
4900
4901 for (i = 0; i <= 3; i++) {
4902 if (inst->Dst[1].Register.WriteMask & (1 << i)) {
4903 			/* MOV the exponent result (temp channel 1) to the first enabled channel of dst1 */
4904 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4905 alu.op = ALU_OP1_MOV;
4906 alu.src[0].chan = 1;
4907 alu.src[0].sel = ctx->temp_reg;
4908
4909 tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
4910 alu.last = 1;
4911 r = r600_bytecode_add_alu(ctx->bc, &alu);
4912 if (r)
4913 return r;
4914 break;
4915 }
4916 }
4917 return 0;
4918 }
4919
4920
4921 static int egcm_int_to_double(struct r600_shader_ctx *ctx)
4922 {
4923 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4924 struct r600_bytecode_alu alu;
4925 int i, c, r;
4926 int write_mask = inst->Dst[0].Register.WriteMask;
4927 int temp_reg = r600_get_temp(ctx);
4928
4929 assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
4930 inst->Instruction.Opcode == TGSI_OPCODE_U2D);
4931
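	/* A 32-bit float cannot hold every 32-bit integer exactly, so the
	 * conversion goes via an exact split (sketch):
	 *   d = (double)(float)(i & 0xffffff00) + (double)(float)(i & 0xff)
	 * Both parts have at most 24 significant bits, convert to float
	 * exactly, and are recombined with ADD_64 at the end. */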
4932 for (c = 0; c < 2; c++) {
4933 int dchan = c * 2;
4934 if (write_mask & (0x3 << dchan)) {
4935 			/* split into a high 24-bit and a low 8-bit part */
4936 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4937 alu.op = ALU_OP2_AND_INT;
4938 alu.dst.sel = temp_reg;
4939 alu.dst.chan = dchan;
4940 r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
4941 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4942 alu.src[1].value = 0xffffff00;
4943 alu.dst.write = 1;
4944 r = r600_bytecode_add_alu(ctx->bc, &alu);
4945 if (r)
4946 return r;
4947
4948 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4949 alu.op = ALU_OP2_AND_INT;
4950 alu.dst.sel = temp_reg;
4951 alu.dst.chan = dchan + 1;
4952 r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
4953 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4954 alu.src[1].value = 0xff;
4955 alu.dst.write = 1;
4956 alu.last = 1;
4957 r = r600_bytecode_add_alu(ctx->bc, &alu);
4958 if (r)
4959 return r;
4960 }
4961 }
4962
4963 for (c = 0; c < 2; c++) {
4964 int dchan = c * 2;
4965 if (write_mask & (0x3 << dchan)) {
4966 for (i = dchan; i <= dchan + 1; i++) {
4967 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4968 alu.op = i == dchan ? ctx->inst_info->op : ALU_OP1_UINT_TO_FLT;
4969
4970 alu.src[0].sel = temp_reg;
4971 alu.src[0].chan = i;
4972 alu.dst.sel = temp_reg;
4973 alu.dst.chan = i;
4974 alu.dst.write = 1;
4975 if (ctx->bc->gfx_level == CAYMAN)
4976 alu.last = i == dchan + 1;
4977 else
4978 alu.last = 1; /* trans only ops on evergreen */
4979
4980 r = r600_bytecode_add_alu(ctx->bc, &alu);
4981 if (r)
4982 return r;
4983 }
4984 }
4985 }
4986
4987 for (c = 0; c < 2; c++) {
4988 int dchan = c * 2;
4989 if (write_mask & (0x3 << dchan)) {
4990 for (i = 0; i < 4; i++) {
4991 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4992 alu.op = ALU_OP1_FLT32_TO_FLT64;
4993
4994 alu.src[0].chan = dchan + (i / 2);
4995 if (i == 0 || i == 2)
4996 alu.src[0].sel = temp_reg;
4997 else {
4998 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
4999 alu.src[0].value = 0x0;
5000 }
5001 alu.dst.sel = ctx->temp_reg;
5002 alu.dst.chan = i;
5003 alu.last = i == 3;
5004 alu.dst.write = 1;
5005
5006 r = r600_bytecode_add_alu(ctx->bc, &alu);
5007 if (r)
5008 return r;
5009 }
5010
5011 for (i = 0; i <= 1; i++) {
5012 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5013 alu.op = ALU_OP2_ADD_64;
5014
5015 alu.src[0].chan = fp64_switch(i);
5016 alu.src[0].sel = ctx->temp_reg;
5017
5018 alu.src[1].chan = fp64_switch(i + 2);
5019 alu.src[1].sel = ctx->temp_reg;
5020 tgsi_dst(ctx, &inst->Dst[0], dchan + i, &alu.dst);
5021 alu.last = i == 1;
5022
5023 r = r600_bytecode_add_alu(ctx->bc, &alu);
5024 if (r)
5025 return r;
5026 }
5027 }
5028 }
5029
5030 return 0;
5031 }
5032
5033 static int egcm_double_to_int(struct r600_shader_ctx *ctx)
5034 {
5035 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5036 struct r600_bytecode_alu alu;
5037 int i, r;
5038 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5039 int treg = r600_get_temp(ctx);
5040 assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
5041 inst->Instruction.Opcode == TGSI_OPCODE_D2U);
5042
5043 	/* do a 64->32 conversion into a temp register */
5044 r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32);
5045 if (r)
5046 return r;
5047
5048 for (i = 0; i <= lasti; i++) {
5049 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5050 continue;
5051 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5052 alu.op = ctx->inst_info->op;
5053
5054 alu.src[0].chan = i;
5055 alu.src[0].sel = treg;
5056 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5057 alu.last = (i == lasti);
5058
5059 r = r600_bytecode_add_alu(ctx->bc, &alu);
5060 if (r)
5061 return r;
5062 }
5063
5064 return 0;
5065 }
5066
5067 static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
5068 unsigned op,
5069 int dst_reg,
5070 struct r600_shader_src *src,
5071 bool abs)
5072 {
5073 struct r600_bytecode_alu alu;
5074 const int last_slot = 3;
5075 int r;
5076
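	/* Double sources are consumed as (MSW, LSW) pairs - hence src0 takes
	 * channel 1 and src1 channel 0 below - and the 64-bit result lands in
	 * channels 0/1, so only those two writes are enabled. */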
5077 /* these have to write the result to X/Y by the looks of it */
5078 for (int i = 0 ; i < last_slot; i++) {
5079 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5080 alu.op = op;
5081
5082 r600_bytecode_src(&alu.src[0], src, 1);
5083 r600_bytecode_src(&alu.src[1], src, 0);
5084
5085 if (abs)
5086 r600_bytecode_src_set_abs(&alu.src[1]);
5087
5088 alu.dst.sel = dst_reg;
5089 alu.dst.chan = i;
5090 alu.dst.write = (i == 0 || i == 1);
5091
5092 if (bc->gfx_level != CAYMAN || i == last_slot - 1)
5093 alu.last = 1;
5094 r = r600_bytecode_add_alu(bc, &alu);
5095 if (r)
5096 return r;
5097 }
5098
5099 return 0;
5100 }
5101
5102 static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
5103 {
5104 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5105 int i, r;
5106 struct r600_bytecode_alu alu;
5107 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5108 int t1 = ctx->temp_reg;
5109
5110 	/* there should be only one source register */
5111 assert(inst->Instruction.NumSrcRegs == 1);
5112
5113 /* only support one double at a time */
5114 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
5115 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
5116
5117 r = cayman_emit_unary_double_raw(
5118 ctx->bc, ctx->inst_info->op, t1,
5119 &ctx->src[0],
5120 ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
5121 ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
5122 if (r)
5123 return r;
5124
5125 for (i = 0 ; i <= lasti; i++) {
5126 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5127 continue;
5128 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5129 alu.op = ALU_OP1_MOV;
5130 alu.src[0].sel = t1;
5131 alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
5132 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5133 alu.dst.write = 1;
5134 if (i == lasti)
5135 alu.last = 1;
5136 r = r600_bytecode_add_alu(ctx->bc, &alu);
5137 if (r)
5138 return r;
5139 }
5140 return 0;
5141 }
5142
5143 static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
5144 {
5145 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5146 int i, j, r;
5147 struct r600_bytecode_alu alu;
5148 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5149
5150 for (i = 0 ; i < last_slot; i++) {
5151 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5152 alu.op = ctx->inst_info->op;
5153 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5154 r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
5155
5156 /* RSQ should take the absolute value of src */
5157 if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
5158 r600_bytecode_src_set_abs(&alu.src[j]);
5159 }
5160 }
5161 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5162 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5163
5164 if (i == last_slot - 1)
5165 alu.last = 1;
5166 r = r600_bytecode_add_alu(ctx->bc, &alu);
5167 if (r)
5168 return r;
5169 }
5170 return 0;
5171 }
5172
5173 static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
5174 {
5175 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5176 int i, j, k, r;
5177 struct r600_bytecode_alu alu;
5178 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5179 int t1 = ctx->temp_reg;
5180
5181 for (k = 0; k <= lasti; k++) {
5182 if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
5183 continue;
5184
5185 for (i = 0 ; i < 4; i++) {
5186 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5187 alu.op = ctx->inst_info->op;
5188 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5189 r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
5190 }
5191 alu.dst.sel = t1;
5192 alu.dst.chan = i;
5193 alu.dst.write = (i == k);
5194 if (i == 3)
5195 alu.last = 1;
5196 r = r600_bytecode_add_alu(ctx->bc, &alu);
5197 if (r)
5198 return r;
5199 }
5200 }
5201
5202 for (i = 0 ; i <= lasti; i++) {
5203 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5204 continue;
5205 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5206 alu.op = ALU_OP1_MOV;
5207 alu.src[0].sel = t1;
5208 alu.src[0].chan = i;
5209 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5210 alu.dst.write = 1;
5211 if (i == lasti)
5212 alu.last = 1;
5213 r = r600_bytecode_add_alu(ctx->bc, &alu);
5214 if (r)
5215 return r;
5216 }
5217
5218 return 0;
5219 }
5220
5221
5222 static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
5223 {
5224 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5225 int i, j, k, r;
5226 struct r600_bytecode_alu alu;
5227 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5228 int t1 = ctx->temp_reg;
5229
5230 /* t1 would get overwritten below if we actually tried to
5231 * multiply two pairs of doubles at a time. */
5232 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
5233 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
5234
5235 k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
5236
5237 for (i = 0; i < 4; i++) {
5238 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5239 alu.op = ctx->inst_info->op;
5240 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5241 r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
5242 }
5243 alu.dst.sel = t1;
5244 alu.dst.chan = i;
5245 alu.dst.write = 1;
5246 if (i == 3)
5247 alu.last = 1;
5248 r = r600_bytecode_add_alu(ctx->bc, &alu);
5249 if (r)
5250 return r;
5251 }
5252
5253 for (i = 0; i <= lasti; i++) {
5254 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5255 continue;
5256 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5257 alu.op = ALU_OP1_MOV;
5258 alu.src[0].sel = t1;
5259 alu.src[0].chan = i;
5260 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5261 alu.dst.write = 1;
5262 if (i == lasti)
5263 alu.last = 1;
5264 r = r600_bytecode_add_alu(ctx->bc, &alu);
5265 if (r)
5266 return r;
5267 }
5268
5269 return 0;
5270 }
5271
5272 /*
5273 * Emit RECIP_64 + MUL_64 to implement division.
5274 */
5275 static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
5276 {
5277 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5278 int r;
5279 struct r600_bytecode_alu alu;
5280 int t1 = ctx->temp_reg;
5281 int k;
5282
5283 /* Only support one double at a time. This is the same constraint as
5284 * in DMUL lowering. */
5285 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
5286 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
5287
5288 k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
5289
5290 r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
5291 if (r)
5292 return r;
5293
5294 for (int i = 0; i < 4; i++) {
5295 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5296 alu.op = ALU_OP2_MUL_64;
5297
5298 r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));
5299
5300 alu.src[1].sel = t1;
5301 alu.src[1].chan = (i == 3) ? 0 : 1;
5302
5303 alu.dst.sel = t1;
5304 alu.dst.chan = i;
5305 alu.dst.write = 1;
5306 if (i == 3)
5307 alu.last = 1;
5308 r = r600_bytecode_add_alu(ctx->bc, &alu);
5309 if (r)
5310 return r;
5311 }
5312
5313 for (int i = 0; i < 2; i++) {
5314 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5315 alu.op = ALU_OP1_MOV;
5316 alu.src[0].sel = t1;
5317 alu.src[0].chan = i;
5318 tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
5319 alu.dst.write = 1;
5320 if (i == 1)
5321 alu.last = 1;
5322 r = r600_bytecode_add_alu(ctx->bc, &alu);
5323 if (r)
5324 return r;
5325 }
5326 return 0;
5327 }
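
/* A minimal sketch (not driver code) of the lowering above: DDIV is
 * emitted as a double-precision reciprocal followed by a multiply, so
 * the quotient is only as accurate as RECIP_64's approximation. */
static inline double ref_ddiv(double a, double b)
{
	double rcp = 1.0 / b; /* RECIP_64 (approximate on hardware) */
	return a * rcp;       /* MUL_64 */
}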
5328
5329 /*
5330 * r600 - trunc to -PI..PI range
5331 * r700 - normalize by dividing by 2PI
5332 * see fdo bug 27901
5333 */
5334 static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
5335 {
5336 int r;
5337 struct r600_bytecode_alu alu;
5338
5339 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5340 alu.op = ALU_OP3_MULADD;
5341 alu.is_op3 = 1;
5342
5343 alu.dst.chan = 0;
5344 alu.dst.sel = ctx->temp_reg;
5345 alu.dst.write = 1;
5346
5347 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5348
5349 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5350 alu.src[1].chan = 0;
5351 alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
5352 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
5353 alu.src[2].chan = 0;
5354 alu.last = 1;
5355 r = r600_bytecode_add_alu(ctx->bc, &alu);
5356 if (r)
5357 return r;
5358
5359 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5360 alu.op = ALU_OP1_FRACT;
5361
5362 alu.dst.chan = 0;
5363 alu.dst.sel = ctx->temp_reg;
5364 alu.dst.write = 1;
5365
5366 alu.src[0].sel = ctx->temp_reg;
5367 alu.src[0].chan = 0;
5368 alu.last = 1;
5369 r = r600_bytecode_add_alu(ctx->bc, &alu);
5370 if (r)
5371 return r;
5372
5373 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5374 alu.op = ALU_OP3_MULADD;
5375 alu.is_op3 = 1;
5376
5377 alu.dst.chan = 0;
5378 alu.dst.sel = ctx->temp_reg;
5379 alu.dst.write = 1;
5380
5381 alu.src[0].sel = ctx->temp_reg;
5382 alu.src[0].chan = 0;
5383
5384 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5385 alu.src[1].chan = 0;
5386 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5387 alu.src[2].chan = 0;
5388
5389 if (ctx->bc->gfx_level == R600) {
5390 alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
5391 alu.src[2].value = u_bitcast_f2u(-M_PI);
5392 } else {
5393 alu.src[1].sel = V_SQ_ALU_SRC_1;
5394 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
5395 alu.src[2].neg = 1;
5396 }
5397
5398 alu.last = 1;
5399 r = r600_bytecode_add_alu(ctx->bc, &alu);
5400 if (r)
5401 return r;
5402 return 0;
5403 }
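
/* Host-side sketch (not driver code) of the range reduction above,
 * with FRACT modeled as x - floor(x): the angle is scaled to turns,
 * wrapped, then either mapped back to [-PI, PI) on r600 or left as a
 * biased turn count for r700 and newer, whose SIN/COS take normalized
 * input. */
static inline float ref_trig_setup(float src, int is_r600)
{
	float t = src * (float)(0.5 * M_1_PI) + 0.5f; /* MULADD */
	t = t - floorf(t);                            /* FRACT */
	if (is_r600)
		return t * (float)(2.0 * M_PI) - (float)M_PI;
	return t - 0.5f;
}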
5404
5405 static int cayman_trig(struct r600_shader_ctx *ctx)
5406 {
5407 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5408 struct r600_bytecode_alu alu;
5409 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5410 int i, r;
5411
5412 r = tgsi_setup_trig(ctx);
5413 if (r)
5414 return r;
5415
5416
5417 for (i = 0; i < last_slot; i++) {
5418 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5419 alu.op = ctx->inst_info->op;
5420 alu.dst.chan = i;
5421
5422 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5423 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5424
5425 alu.src[0].sel = ctx->temp_reg;
5426 alu.src[0].chan = 0;
5427 if (i == last_slot - 1)
5428 alu.last = 1;
5429 r = r600_bytecode_add_alu(ctx->bc, &alu);
5430 if (r)
5431 return r;
5432 }
5433 return 0;
5434 }
5435
5436 static int tgsi_trig(struct r600_shader_ctx *ctx)
5437 {
5438 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5439 struct r600_bytecode_alu alu;
5440 int i, r;
5441 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5442
5443 r = tgsi_setup_trig(ctx);
5444 if (r)
5445 return r;
5446
5447 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5448 alu.op = ctx->inst_info->op;
5449 alu.dst.chan = 0;
5450 alu.dst.sel = ctx->temp_reg;
5451 alu.dst.write = 1;
5452
5453 alu.src[0].sel = ctx->temp_reg;
5454 alu.src[0].chan = 0;
5455 alu.last = 1;
5456 r = r600_bytecode_add_alu(ctx->bc, &alu);
5457 if (r)
5458 return r;
5459
5460 /* replicate result */
5461 for (i = 0; i < lasti + 1; i++) {
5462 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5463 continue;
5464
5465 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5466 alu.op = ALU_OP1_MOV;
5467
5468 alu.src[0].sel = ctx->temp_reg;
5469 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5470 if (i == lasti)
5471 alu.last = 1;
5472 r = r600_bytecode_add_alu(ctx->bc, &alu);
5473 if (r)
5474 return r;
5475 }
5476 return 0;
5477 }
5478
5479 static int tgsi_kill(struct r600_shader_ctx *ctx)
5480 {
5481 const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5482 struct r600_bytecode_alu alu;
5483 int i, r;
5484
5485 for (i = 0; i < 4; i++) {
5486 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5487 alu.op = ctx->inst_info->op;
5488
5489 alu.dst.chan = i;
5490
5491 alu.src[0].sel = V_SQ_ALU_SRC_0;
5492
5493 if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
5494 alu.src[1].sel = V_SQ_ALU_SRC_1;
5495 alu.src[1].neg = 1;
5496 } else {
5497 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5498 }
5499 if (i == 3) {
5500 alu.last = 1;
5501 }
5502 r = r600_bytecode_add_alu(ctx->bc, &alu);
5503 if (r)
5504 return r;
5505 }
5506
5507 /* kill must be last in ALU */
5508 ctx->bc->force_add_cf = 1;
5509 ctx->shader->uses_kill = TRUE;
5510 return 0;
5511 }
5512
5513 static int tgsi_lit(struct r600_shader_ctx *ctx)
5514 {
5515 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5516 struct r600_bytecode_alu alu;
5517 int r;
5518
5519 /* tmp.x = max(src.y, 0.0) */
5520 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5521 alu.op = ALU_OP2_MAX;
5522 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
5523 alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
5524 alu.src[1].chan = 1;
5525
5526 alu.dst.sel = ctx->temp_reg;
5527 alu.dst.chan = 0;
5528 alu.dst.write = 1;
5529
5530 alu.last = 1;
5531 r = r600_bytecode_add_alu(ctx->bc, &alu);
5532 if (r)
5533 return r;
5534
5535 if (inst->Dst[0].Register.WriteMask & (1 << 2))
5536 {
5537 int chan;
5538 int sel;
5539 unsigned i;
5540
5541 if (ctx->bc->gfx_level == CAYMAN) {
5542 for (i = 0; i < 3; i++) {
5543 /* tmp.z = log(tmp.x) */
5544 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5545 alu.op = ALU_OP1_LOG_CLAMPED;
5546 alu.src[0].sel = ctx->temp_reg;
5547 alu.src[0].chan = 0;
5548 alu.dst.sel = ctx->temp_reg;
5549 alu.dst.chan = i;
5550 if (i == 2) {
5551 alu.dst.write = 1;
5552 alu.last = 1;
5553 } else
5554 alu.dst.write = 0;
5555
5556 r = r600_bytecode_add_alu(ctx->bc, &alu);
5557 if (r)
5558 return r;
5559 }
5560 } else {
5561 /* tmp.z = log(tmp.x) */
5562 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5563 alu.op = ALU_OP1_LOG_CLAMPED;
5564 alu.src[0].sel = ctx->temp_reg;
5565 alu.src[0].chan = 0;
5566 alu.dst.sel = ctx->temp_reg;
5567 alu.dst.chan = 2;
5568 alu.dst.write = 1;
5569 alu.last = 1;
5570 r = r600_bytecode_add_alu(ctx->bc, &alu);
5571 if (r)
5572 return r;
5573 }
5574
5575 chan = alu.dst.chan;
5576 sel = alu.dst.sel;
5577
5578 		/* tmp.x = MUL_LIT(tmp.z, src.w, src.x) */
5579 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5580 alu.op = ALU_OP3_MUL_LIT;
5581 alu.src[0].sel = sel;
5582 alu.src[0].chan = chan;
5583 r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
5584 r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
5585 alu.dst.sel = ctx->temp_reg;
5586 alu.dst.chan = 0;
5587 alu.dst.write = 1;
5588 alu.is_op3 = 1;
5589 alu.last = 1;
5590 r = r600_bytecode_add_alu(ctx->bc, &alu);
5591 if (r)
5592 return r;
5593
5594 if (ctx->bc->gfx_level == CAYMAN) {
5595 for (i = 0; i < 3; i++) {
5596 /* dst.z = exp(tmp.x) */
5597 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5598 alu.op = ALU_OP1_EXP_IEEE;
5599 alu.src[0].sel = ctx->temp_reg;
5600 alu.src[0].chan = 0;
5601 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5602 if (i == 2) {
5603 alu.dst.write = 1;
5604 alu.last = 1;
5605 } else
5606 alu.dst.write = 0;
5607 r = r600_bytecode_add_alu(ctx->bc, &alu);
5608 if (r)
5609 return r;
5610 }
5611 } else {
5612 /* dst.z = exp(tmp.x) */
5613 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5614 alu.op = ALU_OP1_EXP_IEEE;
5615 alu.src[0].sel = ctx->temp_reg;
5616 alu.src[0].chan = 0;
5617 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
5618 alu.last = 1;
5619 r = r600_bytecode_add_alu(ctx->bc, &alu);
5620 if (r)
5621 return r;
5622 }
5623 }
5624
5625 	/* dst.x <- 1.0 */
5626 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5627 alu.op = ALU_OP1_MOV;
5628 alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
5629 alu.src[0].chan = 0;
5630 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
5631 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
5632 r = r600_bytecode_add_alu(ctx->bc, &alu);
5633 if (r)
5634 return r;
5635
5636 /* dst.y = max(src.x, 0.0) */
5637 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5638 alu.op = ALU_OP2_MAX;
5639 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5640 alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
5641 alu.src[1].chan = 0;
5642 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
5643 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
5644 r = r600_bytecode_add_alu(ctx->bc, &alu);
5645 if (r)
5646 return r;
5647
5648 	/* dst.w <- 1.0 */
5649 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5650 alu.op = ALU_OP1_MOV;
5651 alu.src[0].sel = V_SQ_ALU_SRC_1;
5652 alu.src[0].chan = 0;
5653 tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
5654 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
5655 alu.last = 1;
5656 r = r600_bytecode_add_alu(ctx->bc, &alu);
5657 if (r)
5658 return r;
5659
5660 return 0;
5661 }
5662
5663 static int tgsi_rsq(struct r600_shader_ctx *ctx)
5664 {
5665 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5666 struct r600_bytecode_alu alu;
5667 int i, r;
5668
5669 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5670
5671 alu.op = ALU_OP1_RECIPSQRT_IEEE;
5672
5673 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5674 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5675 r600_bytecode_src_set_abs(&alu.src[i]);
5676 }
5677 alu.dst.sel = ctx->temp_reg;
5678 alu.dst.write = 1;
5679 alu.last = 1;
5680 r = r600_bytecode_add_alu(ctx->bc, &alu);
5681 if (r)
5682 return r;
5683 /* replicate result */
5684 return tgsi_helper_tempx_replicate(ctx);
5685 }
5686
5687 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
5688 {
5689 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5690 struct r600_bytecode_alu alu;
5691 int i, r;
5692
5693 for (i = 0; i < 4; i++) {
5694 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5695 alu.src[0].sel = ctx->temp_reg;
5696 alu.op = ALU_OP1_MOV;
5697 alu.dst.chan = i;
5698 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5699 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5700 if (i == 3)
5701 alu.last = 1;
5702 r = r600_bytecode_add_alu(ctx->bc, &alu);
5703 if (r)
5704 return r;
5705 }
5706 return 0;
5707 }
5708
5709 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
5710 {
5711 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5712 struct r600_bytecode_alu alu;
5713 int i, r;
5714
5715 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5716 alu.op = ctx->inst_info->op;
5717 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5718 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5719 }
5720 alu.dst.sel = ctx->temp_reg;
5721 alu.dst.write = 1;
5722 alu.last = 1;
5723 r = r600_bytecode_add_alu(ctx->bc, &alu);
5724 if (r)
5725 return r;
5726 /* replicate result */
5727 return tgsi_helper_tempx_replicate(ctx);
5728 }
5729
5730 static int cayman_pow(struct r600_shader_ctx *ctx)
5731 {
5732 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5733 int i, r;
5734 struct r600_bytecode_alu alu;
5735 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5736
5737 for (i = 0; i < 3; i++) {
5738 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5739 alu.op = ALU_OP1_LOG_IEEE;
5740 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5741 alu.dst.sel = ctx->temp_reg;
5742 alu.dst.chan = i;
5743 alu.dst.write = 1;
5744 if (i == 2)
5745 alu.last = 1;
5746 r = r600_bytecode_add_alu(ctx->bc, &alu);
5747 if (r)
5748 return r;
5749 }
5750
5751 /* b * LOG2(a) */
5752 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5753 alu.op = ALU_OP2_MUL;
5754 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5755 alu.src[1].sel = ctx->temp_reg;
5756 alu.dst.sel = ctx->temp_reg;
5757 alu.dst.write = 1;
5758 alu.last = 1;
5759 r = r600_bytecode_add_alu(ctx->bc, &alu);
5760 if (r)
5761 return r;
5762
5763 for (i = 0; i < last_slot; i++) {
5764 		/* POW(a,b) = EXP2(b * LOG2(a)) */
5765 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5766 alu.op = ALU_OP1_EXP_IEEE;
5767 alu.src[0].sel = ctx->temp_reg;
5768
5769 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5770 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5771 if (i == last_slot - 1)
5772 alu.last = 1;
5773 r = r600_bytecode_add_alu(ctx->bc, &alu);
5774 if (r)
5775 return r;
5776 }
5777 return 0;
5778 }
5779
5780 static int tgsi_pow(struct r600_shader_ctx *ctx)
5781 {
5782 struct r600_bytecode_alu alu;
5783 int r;
5784
5785 /* LOG2(a) */
5786 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5787 alu.op = ALU_OP1_LOG_IEEE;
5788 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5789 alu.dst.sel = ctx->temp_reg;
5790 alu.dst.write = 1;
5791 alu.last = 1;
5792 r = r600_bytecode_add_alu(ctx->bc, &alu);
5793 if (r)
5794 return r;
5795 /* b * LOG2(a) */
5796 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5797 alu.op = ALU_OP2_MUL;
5798 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5799 alu.src[1].sel = ctx->temp_reg;
5800 alu.dst.sel = ctx->temp_reg;
5801 alu.dst.write = 1;
5802 alu.last = 1;
5803 r = r600_bytecode_add_alu(ctx->bc, &alu);
5804 if (r)
5805 return r;
5806 	/* POW(a,b) = EXP2(b * LOG2(a)) */
5807 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5808 alu.op = ALU_OP1_EXP_IEEE;
5809 alu.src[0].sel = ctx->temp_reg;
5810 alu.dst.sel = ctx->temp_reg;
5811 alu.dst.write = 1;
5812 alu.last = 1;
5813 r = r600_bytecode_add_alu(ctx->bc, &alu);
5814 if (r)
5815 return r;
5816 return tgsi_helper_tempx_replicate(ctx);
5817 }
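
/* Equivalent host math for the sequence above (a sketch, not driver
 * code): POW is lowered to LOG2/MUL/EXP2, so edge cases follow the
 * LOG_IEEE/EXP_IEEE rules rather than powf(). */
static inline float ref_pow(float a, float b)
{
	return exp2f(b * log2f(a)); /* POW(a,b) = EXP2(b * LOG2(a)) */
}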
5818
5819 static int emit_mul_int_op(struct r600_bytecode *bc,
5820 struct r600_bytecode_alu *alu_src)
5821 {
5822 struct r600_bytecode_alu alu;
5823 int i, r;
5824 alu = *alu_src;
5825 if (bc->gfx_level == CAYMAN) {
5826 for (i = 0; i < 4; i++) {
5827 alu.dst.chan = i;
5828 alu.dst.write = (i == alu_src->dst.chan);
5829 alu.last = (i == 3);
5830
5831 r = r600_bytecode_add_alu(bc, &alu);
5832 if (r)
5833 return r;
5834 }
5835 } else {
5836 alu.last = 1;
5837 r = r600_bytecode_add_alu(bc, &alu);
5838 if (r)
5839 return r;
5840 }
5841 return 0;
5842 }
5843
5844 static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
5845 {
5846 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5847 struct r600_bytecode_alu alu;
5848 int i, r, j;
5849 unsigned write_mask = inst->Dst[0].Register.WriteMask;
5850 int lasti = tgsi_last_instruction(write_mask);
5851 int tmp0 = ctx->temp_reg;
5852 int tmp1 = r600_get_temp(ctx);
5853 int tmp2 = r600_get_temp(ctx);
5854 int tmp3 = r600_get_temp(ctx);
5855 int tmp4 = 0;
5856
5857 /* Use additional temp if dst register and src register are the same */
5858 if (inst->Src[0].Register.Index == inst->Dst[0].Register.Index ||
5859 inst->Src[1].Register.Index == inst->Dst[0].Register.Index) {
5860 tmp4 = r600_get_temp(ctx);
5861 }
5862
5863 /* Unsigned path:
5864 *
5865 	 * we need to represent src1 as src2*q + r, where q is the quotient and r is the remainder
5866 *
5867 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
5868 * 2. tmp0.z = lo (tmp0.x * src2)
5869 * 3. tmp0.w = -tmp0.z
5870 * 4. tmp0.y = hi (tmp0.x * src2)
5871 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
5872 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
5873 * 7. tmp1.x = tmp0.x - tmp0.w
5874 * 8. tmp1.y = tmp0.x + tmp0.w
5875 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
5876 * 10. tmp0.z = hi(tmp0.x * src1) = q
5877 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
5878 *
5879 * 12. tmp0.w = src1 - tmp0.y = r
5880 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
5881 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
5882 *
5883 * if DIV
5884 *
5885 * 15. tmp1.z = tmp0.z + 1 = q + 1
5886 * 16. tmp1.w = tmp0.z - 1 = q - 1
5887 *
5888 * else MOD
5889 *
5890 * 15. tmp1.z = tmp0.w - src2 = r - src2
5891 * 16. tmp1.w = tmp0.w + src2 = r + src2
5892 *
5893 * endif
5894 *
5895 * 17. tmp1.x = tmp1.x & tmp1.y
5896 *
5897 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
5898 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
5899 *
5900 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
5901 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
5902 *
5903 * Signed path:
5904 *
5905 * Same as unsigned, using abs values of the operands,
5906 * and fixing the sign of the result in the end.
5907 */
5908
5909 for (i = 0; i < 4; i++) {
5910 if (!(write_mask & (1<<i)))
5911 continue;
5912
5913 if (signed_op) {
5914
5915 /* tmp2.x = -src0 */
5916 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5917 alu.op = ALU_OP2_SUB_INT;
5918
5919 alu.dst.sel = tmp2;
5920 alu.dst.chan = 0;
5921 alu.dst.write = 1;
5922
5923 alu.src[0].sel = V_SQ_ALU_SRC_0;
5924
5925 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5926
5927 alu.last = 1;
5928 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5929 return r;
5930
5931 /* tmp2.y = -src1 */
5932 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5933 alu.op = ALU_OP2_SUB_INT;
5934
5935 alu.dst.sel = tmp2;
5936 alu.dst.chan = 1;
5937 alu.dst.write = 1;
5938
5939 alu.src[0].sel = V_SQ_ALU_SRC_0;
5940
5941 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5942
5943 alu.last = 1;
5944 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5945 return r;
5946
5947 			/* tmp2.z sign bit is set if src0 and src1 signs differ */
5948 			/* it will be the sign of the quotient */
5949 if (!mod) {
5950
5951 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5952 alu.op = ALU_OP2_XOR_INT;
5953
5954 alu.dst.sel = tmp2;
5955 alu.dst.chan = 2;
5956 alu.dst.write = 1;
5957
5958 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5959 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5960
5961 alu.last = 1;
5962 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5963 return r;
5964 }
5965
5966 /* tmp2.x = |src0| */
5967 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5968 alu.op = ALU_OP3_CNDGE_INT;
5969 alu.is_op3 = 1;
5970
5971 alu.dst.sel = tmp2;
5972 alu.dst.chan = 0;
5973 alu.dst.write = 1;
5974
5975 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5976 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5977 alu.src[2].sel = tmp2;
5978 alu.src[2].chan = 0;
5979
5980 alu.last = 1;
5981 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5982 return r;
5983
5984 /* tmp2.y = |src1| */
5985 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5986 alu.op = ALU_OP3_CNDGE_INT;
5987 alu.is_op3 = 1;
5988
5989 alu.dst.sel = tmp2;
5990 alu.dst.chan = 1;
5991 alu.dst.write = 1;
5992
5993 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5994 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5995 alu.src[2].sel = tmp2;
5996 alu.src[2].chan = 1;
5997
5998 alu.last = 1;
5999 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6000 return r;
6001
6002 }
6003
6004 /* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
6005 if (ctx->bc->gfx_level == CAYMAN) {
6006 /* tmp3.x = u2f(src2) */
6007 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6008 alu.op = ALU_OP1_UINT_TO_FLT;
6009
6010 alu.dst.sel = tmp3;
6011 alu.dst.chan = 0;
6012 alu.dst.write = 1;
6013
6014 if (signed_op) {
6015 alu.src[0].sel = tmp2;
6016 alu.src[0].chan = 1;
6017 } else {
6018 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6019 }
6020
6021 alu.last = 1;
6022 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6023 return r;
6024
6025 /* tmp0.x = recip(tmp3.x) */
6026 for (j = 0 ; j < 3; j++) {
6027 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6028 alu.op = ALU_OP1_RECIP_IEEE;
6029
6030 alu.dst.sel = tmp0;
6031 alu.dst.chan = j;
6032 alu.dst.write = (j == 0);
6033
6034 alu.src[0].sel = tmp3;
6035 alu.src[0].chan = 0;
6036
6037 if (j == 2)
6038 alu.last = 1;
6039 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6040 return r;
6041 }
6042
6043 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6044 alu.op = ALU_OP2_MUL;
6045
6046 alu.src[0].sel = tmp0;
6047 alu.src[0].chan = 0;
6048
6049 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6050 alu.src[1].value = 0x4f800000;
6051
6052 alu.dst.sel = tmp3;
6053 alu.dst.write = 1;
6054 alu.last = 1;
6055 r = r600_bytecode_add_alu(ctx->bc, &alu);
6056 if (r)
6057 return r;
6058
6059 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6060 alu.op = ALU_OP1_FLT_TO_UINT;
6061
6062 alu.dst.sel = tmp0;
6063 alu.dst.chan = 0;
6064 alu.dst.write = 1;
6065
6066 alu.src[0].sel = tmp3;
6067 alu.src[0].chan = 0;
6068
6069 alu.last = 1;
6070 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6071 return r;
6072
6073 } else {
6074 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6075 alu.op = ALU_OP1_RECIP_UINT;
6076
6077 alu.dst.sel = tmp0;
6078 alu.dst.chan = 0;
6079 alu.dst.write = 1;
6080
6081 if (signed_op) {
6082 alu.src[0].sel = tmp2;
6083 alu.src[0].chan = 1;
6084 } else {
6085 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6086 }
6087
6088 alu.last = 1;
6089 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6090 return r;
6091 }
6092
6093 /* 2. tmp0.z = lo (tmp0.x * src2) */
6094 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6095 alu.op = ALU_OP2_MULLO_UINT;
6096
6097 alu.dst.sel = tmp0;
6098 alu.dst.chan = 2;
6099 alu.dst.write = 1;
6100
6101 alu.src[0].sel = tmp0;
6102 alu.src[0].chan = 0;
6103 if (signed_op) {
6104 alu.src[1].sel = tmp2;
6105 alu.src[1].chan = 1;
6106 } else {
6107 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6108 }
6109
6110 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6111 return r;
6112
6113 /* 3. tmp0.w = -tmp0.z */
6114 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6115 alu.op = ALU_OP2_SUB_INT;
6116
6117 alu.dst.sel = tmp0;
6118 alu.dst.chan = 3;
6119 alu.dst.write = 1;
6120
6121 alu.src[0].sel = V_SQ_ALU_SRC_0;
6122 alu.src[1].sel = tmp0;
6123 alu.src[1].chan = 2;
6124
6125 alu.last = 1;
6126 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6127 return r;
6128
6129 /* 4. tmp0.y = hi (tmp0.x * src2) */
6130 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6131 alu.op = ALU_OP2_MULHI_UINT;
6132
6133 alu.dst.sel = tmp0;
6134 alu.dst.chan = 1;
6135 alu.dst.write = 1;
6136
6137 alu.src[0].sel = tmp0;
6138 alu.src[0].chan = 0;
6139
6140 if (signed_op) {
6141 alu.src[1].sel = tmp2;
6142 alu.src[1].chan = 1;
6143 } else {
6144 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6145 }
6146
6147 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6148 return r;
6149
6150 		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2)) */
6151 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6152 alu.op = ALU_OP3_CNDE_INT;
6153 alu.is_op3 = 1;
6154
6155 alu.dst.sel = tmp0;
6156 alu.dst.chan = 2;
6157 alu.dst.write = 1;
6158
6159 alu.src[0].sel = tmp0;
6160 alu.src[0].chan = 1;
6161 alu.src[1].sel = tmp0;
6162 alu.src[1].chan = 3;
6163 alu.src[2].sel = tmp0;
6164 alu.src[2].chan = 2;
6165
6166 alu.last = 1;
6167 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6168 return r;
6169
6170 /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
6171 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6172 alu.op = ALU_OP2_MULHI_UINT;
6173
6174 alu.dst.sel = tmp0;
6175 alu.dst.chan = 3;
6176 alu.dst.write = 1;
6177
6178 alu.src[0].sel = tmp0;
6179 alu.src[0].chan = 2;
6180
6181 alu.src[1].sel = tmp0;
6182 alu.src[1].chan = 0;
6183
6184 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6185 return r;
6186
6187 /* 7. tmp1.x = tmp0.x - tmp0.w */
6188 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6189 alu.op = ALU_OP2_SUB_INT;
6190
6191 alu.dst.sel = tmp1;
6192 alu.dst.chan = 0;
6193 alu.dst.write = 1;
6194
6195 alu.src[0].sel = tmp0;
6196 alu.src[0].chan = 0;
6197 alu.src[1].sel = tmp0;
6198 alu.src[1].chan = 3;
6199
6200 alu.last = 1;
6201 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6202 return r;
6203
6204 /* 8. tmp1.y = tmp0.x + tmp0.w */
6205 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6206 alu.op = ALU_OP2_ADD_INT;
6207
6208 alu.dst.sel = tmp1;
6209 alu.dst.chan = 1;
6210 alu.dst.write = 1;
6211
6212 alu.src[0].sel = tmp0;
6213 alu.src[0].chan = 0;
6214 alu.src[1].sel = tmp0;
6215 alu.src[1].chan = 3;
6216
6217 alu.last = 1;
6218 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6219 return r;
6220
6221 /* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
6222 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6223 alu.op = ALU_OP3_CNDE_INT;
6224 alu.is_op3 = 1;
6225
6226 alu.dst.sel = tmp0;
6227 alu.dst.chan = 0;
6228 alu.dst.write = 1;
6229
6230 alu.src[0].sel = tmp0;
6231 alu.src[0].chan = 1;
6232 alu.src[1].sel = tmp1;
6233 alu.src[1].chan = 1;
6234 alu.src[2].sel = tmp1;
6235 alu.src[2].chan = 0;
6236
6237 alu.last = 1;
6238 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6239 return r;
6240
6241 /* 10. tmp0.z = hi(tmp0.x * src1) = q */
6242 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6243 alu.op = ALU_OP2_MULHI_UINT;
6244
6245 alu.dst.sel = tmp0;
6246 alu.dst.chan = 2;
6247 alu.dst.write = 1;
6248
6249 alu.src[0].sel = tmp0;
6250 alu.src[0].chan = 0;
6251
6252 if (signed_op) {
6253 alu.src[1].sel = tmp2;
6254 alu.src[1].chan = 0;
6255 } else {
6256 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6257 }
6258
6259 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6260 return r;
6261
6262 /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
6263 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6264 alu.op = ALU_OP2_MULLO_UINT;
6265
6266 alu.dst.sel = tmp0;
6267 alu.dst.chan = 1;
6268 alu.dst.write = 1;
6269
6270 if (signed_op) {
6271 alu.src[0].sel = tmp2;
6272 alu.src[0].chan = 1;
6273 } else {
6274 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6275 }
6276
6277 alu.src[1].sel = tmp0;
6278 alu.src[1].chan = 2;
6279
6280 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6281 return r;
6282
6283 /* 12. tmp0.w = src1 - tmp0.y = r */
6284 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6285 alu.op = ALU_OP2_SUB_INT;
6286
6287 alu.dst.sel = tmp0;
6288 alu.dst.chan = 3;
6289 alu.dst.write = 1;
6290
6291 if (signed_op) {
6292 alu.src[0].sel = tmp2;
6293 alu.src[0].chan = 0;
6294 } else {
6295 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6296 }
6297
6298 alu.src[1].sel = tmp0;
6299 alu.src[1].chan = 1;
6300
6301 alu.last = 1;
6302 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6303 return r;
6304
6305 /* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */
6306 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6307 alu.op = ALU_OP2_SETGE_UINT;
6308
6309 alu.dst.sel = tmp1;
6310 alu.dst.chan = 0;
6311 alu.dst.write = 1;
6312
6313 alu.src[0].sel = tmp0;
6314 alu.src[0].chan = 3;
6315 if (signed_op) {
6316 alu.src[1].sel = tmp2;
6317 alu.src[1].chan = 1;
6318 } else {
6319 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6320 }
6321
6322 alu.last = 1;
6323 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6324 return r;
6325
6326 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */
6327 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6328 alu.op = ALU_OP2_SETGE_UINT;
6329
6330 alu.dst.sel = tmp1;
6331 alu.dst.chan = 1;
6332 alu.dst.write = 1;
6333
6334 if (signed_op) {
6335 alu.src[0].sel = tmp2;
6336 alu.src[0].chan = 0;
6337 } else {
6338 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6339 }
6340
6341 alu.src[1].sel = tmp0;
6342 alu.src[1].chan = 1;
6343
6344 alu.last = 1;
6345 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6346 return r;
6347
6348 if (mod) { /* UMOD */
6349
6350 /* 15. tmp1.z = tmp0.w - src2 = r - src2 */
6351 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6352 alu.op = ALU_OP2_SUB_INT;
6353
6354 alu.dst.sel = tmp1;
6355 alu.dst.chan = 2;
6356 alu.dst.write = 1;
6357
6358 alu.src[0].sel = tmp0;
6359 alu.src[0].chan = 3;
6360
6361 if (signed_op) {
6362 alu.src[1].sel = tmp2;
6363 alu.src[1].chan = 1;
6364 } else {
6365 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6366 }
6367
6368 alu.last = 1;
6369 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6370 return r;
6371
6372 /* 16. tmp1.w = tmp0.w + src2 = r + src2 */
6373 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6374 alu.op = ALU_OP2_ADD_INT;
6375
6376 alu.dst.sel = tmp1;
6377 alu.dst.chan = 3;
6378 alu.dst.write = 1;
6379
6380 alu.src[0].sel = tmp0;
6381 alu.src[0].chan = 3;
6382 if (signed_op) {
6383 alu.src[1].sel = tmp2;
6384 alu.src[1].chan = 1;
6385 } else {
6386 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6387 }
6388
6389 alu.last = 1;
6390 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6391 return r;
6392
6393 } else { /* UDIV */
6394
6395 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */
6396 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6397 alu.op = ALU_OP2_ADD_INT;
6398
6399 alu.dst.sel = tmp1;
6400 alu.dst.chan = 2;
6401 alu.dst.write = 1;
6402
6403 alu.src[0].sel = tmp0;
6404 alu.src[0].chan = 2;
6405 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6406
6407 alu.last = 1;
6408 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6409 return r;
6410
6411 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */
6412 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6413 alu.op = ALU_OP2_ADD_INT;
6414
6415 alu.dst.sel = tmp1;
6416 alu.dst.chan = 3;
6417 alu.dst.write = 1;
6418
6419 alu.src[0].sel = tmp0;
6420 alu.src[0].chan = 2;
6421 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
6422
6423 alu.last = 1;
6424 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6425 return r;
6426
6427 }
6428
6429 /* 17. tmp1.x = tmp1.x & tmp1.y */
6430 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6431 alu.op = ALU_OP2_AND_INT;
6432
6433 alu.dst.sel = tmp1;
6434 alu.dst.chan = 0;
6435 alu.dst.write = 1;
6436
6437 alu.src[0].sel = tmp1;
6438 alu.src[0].chan = 0;
6439 alu.src[1].sel = tmp1;
6440 alu.src[1].chan = 1;
6441
6442 alu.last = 1;
6443 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6444 return r;
6445
6446 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */
6447 /* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */
6448 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6449 alu.op = ALU_OP3_CNDE_INT;
6450 alu.is_op3 = 1;
6451
6452 alu.dst.sel = tmp0;
6453 alu.dst.chan = 2;
6454 alu.dst.write = 1;
6455
6456 alu.src[0].sel = tmp1;
6457 alu.src[0].chan = 0;
6458 alu.src[1].sel = tmp0;
6459 alu.src[1].chan = mod ? 3 : 2;
6460 alu.src[2].sel = tmp1;
6461 alu.src[2].chan = 2;
6462
6463 alu.last = 1;
6464 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6465 return r;
6466
6467 /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
6468 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6469 alu.op = ALU_OP3_CNDE_INT;
6470 alu.is_op3 = 1;
6471
6472 if (signed_op) {
6473 alu.dst.sel = tmp0;
6474 alu.dst.chan = 2;
6475 alu.dst.write = 1;
6476 } else {
6477 if (tmp4 > 0) {
6478 alu.dst.sel = tmp4;
6479 alu.dst.chan = i;
6480 alu.dst.write = 1;
6481 } else {
6482 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6483 }
6484 }
6485
6486 alu.src[0].sel = tmp1;
6487 alu.src[0].chan = 1;
6488 alu.src[1].sel = tmp1;
6489 alu.src[1].chan = 3;
6490 alu.src[2].sel = tmp0;
6491 alu.src[2].chan = 2;
6492
6493 alu.last = 1;
6494 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6495 return r;
6496
6497 if (signed_op) {
6498
6499 /* fix the sign of the result */
6500
6501 if (mod) {
6502
6503 /* tmp0.x = -tmp0.z */
6504 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6505 alu.op = ALU_OP2_SUB_INT;
6506
6507 alu.dst.sel = tmp0;
6508 alu.dst.chan = 0;
6509 alu.dst.write = 1;
6510
6511 alu.src[0].sel = V_SQ_ALU_SRC_0;
6512 alu.src[1].sel = tmp0;
6513 alu.src[1].chan = 2;
6514
6515 alu.last = 1;
6516 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6517 return r;
6518
6519 /* sign of the remainder is the same as the sign of src0 */
6520 /* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
6521 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6522 alu.op = ALU_OP3_CNDGE_INT;
6523 alu.is_op3 = 1;
6524
6525 if (tmp4 > 0) {
6526 alu.dst.sel = tmp4;
6527 alu.dst.chan = i;
6528 alu.dst.write = 1;
6529 } else {
6530 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6531 }
6532
6533 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6534 alu.src[1].sel = tmp0;
6535 alu.src[1].chan = 2;
6536 alu.src[2].sel = tmp0;
6537 alu.src[2].chan = 0;
6538
6539 alu.last = 1;
6540 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6541 return r;
6542
6543 } else {
6544
6545 /* tmp0.x = -tmp0.z */
6546 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6547 alu.op = ALU_OP2_SUB_INT;
6548
6549 alu.dst.sel = tmp0;
6550 alu.dst.chan = 0;
6551 alu.dst.write = 1;
6552
6553 alu.src[0].sel = V_SQ_ALU_SRC_0;
6554 alu.src[1].sel = tmp0;
6555 alu.src[1].chan = 2;
6556
6557 alu.last = 1;
6558 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6559 return r;
6560
6561 /* fix the quotient sign (same as the sign of src0*src1) */
6562 /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
6563 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6564 alu.op = ALU_OP3_CNDGE_INT;
6565 alu.is_op3 = 1;
6566
6567 if (tmp4 > 0) {
6568 alu.dst.sel = tmp4;
6569 alu.dst.chan = i;
6570 alu.dst.write = 1;
6571 } else {
6572 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6573 }
6574
6575 alu.src[0].sel = tmp2;
6576 alu.src[0].chan = 2;
6577 alu.src[1].sel = tmp0;
6578 alu.src[1].chan = 2;
6579 alu.src[2].sel = tmp0;
6580 alu.src[2].chan = 0;
6581
6582 alu.last = 1;
6583 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6584 return r;
6585 }
6586 }
6587 }
6588
6589 if (tmp4 > 0) {
6590 for (i = 0; i <= lasti; ++i) {
6591 if (!(write_mask & (1<<i)))
6592 continue;
6593
6594 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6595 alu.op = ALU_OP1_MOV;
6596 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6597 alu.src[0].sel = tmp4;
6598 alu.src[0].chan = i;
6599
6600 if (i == lasti)
6601 alu.last = 1;
6602 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6603 return r;
6604 }
6605 }
6606
6607 return 0;
6608 }
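
/* Reference model (not driver code) for the numbered steps above, with
 * RECIP_UINT idealized as a saturated floor(2^32/d). The real opcode is
 * only approximate, which is why the shader spends steps 2-9 refining
 * the reciprocal and steps 13-19 correcting q and r. */
static inline uint32_t ref_divmod_u32(uint32_t n, uint32_t d, int mod)
{
	uint64_t rcp64;
	uint32_t rcp, q, r;

	if (d == 0)
		return 0xffffffffu; /* step 20: dst = MAX_UINT */

	rcp64 = (1ull << 32) / d; /* steps 1-9, idealized */
	rcp = rcp64 > 0xffffffffull ? 0xffffffffu : (uint32_t)rcp64;

	q = (uint32_t)(((uint64_t)rcp * n) >> 32); /* step 10 */
	r = n - q * d;                             /* steps 11-12 */

	if (r >= d) { /* steps 13, 15, 17-18 */
		q += 1;
		r -= d;
	}
	/* the r < 0 fixup of steps 14, 16 and 19 cannot fire here since
	 * this reciprocal never overestimates; the hardware error can go
	 * either way */
	return mod ? r : q;
}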
6609
6610 static int tgsi_udiv(struct r600_shader_ctx *ctx)
6611 {
6612 return tgsi_divmod(ctx, 0, 0);
6613 }
6614
6615 static int tgsi_umod(struct r600_shader_ctx *ctx)
6616 {
6617 return tgsi_divmod(ctx, 1, 0);
6618 }
6619
6620 static int tgsi_idiv(struct r600_shader_ctx *ctx)
6621 {
6622 return tgsi_divmod(ctx, 0, 1);
6623 }
6624
6625 static int tgsi_imod(struct r600_shader_ctx *ctx)
6626 {
6627 return tgsi_divmod(ctx, 1, 1);
6628 }
6629
6630
6631 static int tgsi_f2i(struct r600_shader_ctx *ctx)
6632 {
6633 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6634 struct r600_bytecode_alu alu;
6635 int i, r;
6636 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6637 int last_inst = tgsi_last_instruction(write_mask);
6638
6639 for (i = 0; i < 4; i++) {
6640 if (!(write_mask & (1<<i)))
6641 continue;
6642
6643 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6644 alu.op = ALU_OP1_TRUNC;
6645
6646 alu.dst.sel = ctx->temp_reg;
6647 alu.dst.chan = i;
6648 alu.dst.write = 1;
6649
6650 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6651 if (i == last_inst)
6652 alu.last = 1;
6653 r = r600_bytecode_add_alu(ctx->bc, &alu);
6654 if (r)
6655 return r;
6656 }
6657
6658 for (i = 0; i < 4; i++) {
6659 if (!(write_mask & (1<<i)))
6660 continue;
6661
6662 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6663 alu.op = ctx->inst_info->op;
6664
6665 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6666
6667 alu.src[0].sel = ctx->temp_reg;
6668 alu.src[0].chan = i;
6669
6670 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
6671 alu.last = 1;
6672 r = r600_bytecode_add_alu(ctx->bc, &alu);
6673 if (r)
6674 return r;
6675 }
6676
6677 return 0;
6678 }
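
/* The TRUNC + FLT_TO_INT/UINT pair above as host math (a sketch, not
 * driver code): truncating first means the conversion always sees a
 * whole number, so the result rounds toward zero. */
static inline int32_t ref_f2i(float x)
{
	return (int32_t)truncf(x); /* TRUNC, then FLT_TO_INT */
}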
6679
6680 static int tgsi_iabs(struct r600_shader_ctx *ctx)
6681 {
6682 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6683 struct r600_bytecode_alu alu;
6684 int i, r;
6685 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6686 int last_inst = tgsi_last_instruction(write_mask);
6687
6688 /* tmp = -src */
6689 for (i = 0; i < 4; i++) {
6690 if (!(write_mask & (1<<i)))
6691 continue;
6692
6693 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6694 alu.op = ALU_OP2_SUB_INT;
6695
6696 alu.dst.sel = ctx->temp_reg;
6697 alu.dst.chan = i;
6698 alu.dst.write = 1;
6699
6700 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6701 alu.src[0].sel = V_SQ_ALU_SRC_0;
6702
6703 if (i == last_inst)
6704 alu.last = 1;
6705 r = r600_bytecode_add_alu(ctx->bc, &alu);
6706 if (r)
6707 return r;
6708 }
6709
6710 /* dst = (src >= 0 ? src : tmp) */
6711 for (i = 0; i < 4; i++) {
6712 if (!(write_mask & (1<<i)))
6713 continue;
6714
6715 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6716 alu.op = ALU_OP3_CNDGE_INT;
6717 alu.is_op3 = 1;
6718 alu.dst.write = 1;
6719
6720 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6721
6722 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6723 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6724 alu.src[2].sel = ctx->temp_reg;
6725 alu.src[2].chan = i;
6726
6727 if (i == last_inst)
6728 alu.last = 1;
6729 r = r600_bytecode_add_alu(ctx->bc, &alu);
6730 if (r)
6731 return r;
6732 }
6733 return 0;
6734 }
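
/* What the two passes above compute per enabled channel (a sketch,
 * not driver code); like the hardware, this wraps for INT_MIN. */
static inline int32_t ref_iabs(int32_t x)
{
	int32_t neg = (int32_t)(0u - (uint32_t)x); /* SUB_INT into the temp */
	return x >= 0 ? x : neg;                   /* CNDGE_INT picks src or -src */
}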
6735
6736 static int tgsi_issg(struct r600_shader_ctx *ctx)
6737 {
6738 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6739 struct r600_bytecode_alu alu;
6740 int i, r;
6741 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6742 int last_inst = tgsi_last_instruction(write_mask);
6743
6744 /* tmp = (src >= 0 ? src : -1) */
6745 for (i = 0; i < 4; i++) {
6746 if (!(write_mask & (1<<i)))
6747 continue;
6748
6749 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6750 alu.op = ALU_OP3_CNDGE_INT;
6751 alu.is_op3 = 1;
6752
6753 alu.dst.sel = ctx->temp_reg;
6754 alu.dst.chan = i;
6755 alu.dst.write = 1;
6756
6757 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6758 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6759 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
6760
6761 if (i == last_inst)
6762 alu.last = 1;
6763 r = r600_bytecode_add_alu(ctx->bc, &alu);
6764 if (r)
6765 return r;
6766 }
6767
6768 /* dst = (tmp > 0 ? 1 : tmp) */
6769 for (i = 0; i < 4; i++) {
6770 if (!(write_mask & (1<<i)))
6771 continue;
6772
6773 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6774 alu.op = ALU_OP3_CNDGT_INT;
6775 alu.is_op3 = 1;
6776 alu.dst.write = 1;
6777
6778 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6779
6780 alu.src[0].sel = ctx->temp_reg;
6781 alu.src[0].chan = i;
6782
6783 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6784
6785 alu.src[2].sel = ctx->temp_reg;
6786 alu.src[2].chan = i;
6787
6788 if (i == last_inst)
6789 alu.last = 1;
6790 r = r600_bytecode_add_alu(ctx->bc, &alu);
6791 if (r)
6792 return r;
6793 }
6794 return 0;
6795 }
6796
6797
6798
6799 static int tgsi_ssg(struct r600_shader_ctx *ctx)
6800 {
6801 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6802 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6803 int last_inst = tgsi_last_instruction(write_mask);
6804 struct r600_bytecode_alu alu;
6805 int i, r;
6806
6807 /* tmp = (src > 0 ? 1 : src) */
6808 for (i = 0; i <= last_inst; i++) {
6809 if (!(write_mask & (1 << i)))
6810 continue;
6811 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6812 alu.op = ALU_OP3_CNDGT;
6813 alu.is_op3 = 1;
6814
6815 alu.dst.sel = ctx->temp_reg;
6816 alu.dst.chan = i;
6817
6818 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6819 alu.src[1].sel = V_SQ_ALU_SRC_1;
6820 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6821
6822 if (i == last_inst)
6823 alu.last = 1;
6824 r = r600_bytecode_add_alu(ctx->bc, &alu);
6825 if (r)
6826 return r;
6827 }
6828
6829 /* dst = (-tmp > 0 ? -1 : tmp) */
6830 for (i = 0; i <= last_inst; i++) {
6831 if (!(write_mask & (1 << i)))
6832 continue;
6833 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6834 alu.op = ALU_OP3_CNDGT;
6835 alu.is_op3 = 1;
6836 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6837
6838 alu.src[0].sel = ctx->temp_reg;
6839 alu.src[0].chan = i;
6840 alu.src[0].neg = 1;
6841
6842 alu.src[1].sel = V_SQ_ALU_SRC_1;
6843 alu.src[1].neg = 1;
6844
6845 alu.src[2].sel = ctx->temp_reg;
6846 alu.src[2].chan = i;
6847
6848 if (i == last_inst)
6849 alu.last = 1;
6850 r = r600_bytecode_add_alu(ctx->bc, &alu);
6851 if (r)
6852 return r;
6853 }
6854 return 0;
6855 }
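
/* Host model of the float sign above (a sketch, not driver code): two
 * conditional moves clamp to {-1, 0, +1} while letting 0.0 (and NaN)
 * fall through unchanged. */
static inline float ref_ssg(float x)
{
	float t = (x > 0.0f) ? 1.0f : x; /* CNDGT */
	return (-t > 0.0f) ? -1.0f : t;  /* CNDGT on negated operands */
}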
6856
6857 static int tgsi_bfi(struct r600_shader_ctx *ctx)
6858 {
6859 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6860 struct r600_bytecode_alu alu;
6861 int i, r, t1, t2;
6862
6863 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6864 int last_inst = tgsi_last_instruction(write_mask);
6865
6866 t1 = r600_get_temp(ctx);
6867
6868 for (i = 0; i < 4; i++) {
6869 if (!(write_mask & (1<<i)))
6870 continue;
6871
6872 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6873 alu.op = ALU_OP2_SETGE_INT;
6874 r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
6875 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6876 alu.src[1].value = 32;
6877 alu.dst.sel = ctx->temp_reg;
6878 alu.dst.chan = i;
6879 alu.dst.write = 1;
6880 alu.last = i == last_inst;
6881 r = r600_bytecode_add_alu(ctx->bc, &alu);
6882 if (r)
6883 return r;
6884 }
6885
6886 for (i = 0; i < 4; i++) {
6887 if (!(write_mask & (1<<i)))
6888 continue;
6889
6890 /* create mask tmp */
6891 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6892 alu.op = ALU_OP2_BFM_INT;
6893 alu.dst.sel = t1;
6894 alu.dst.chan = i;
6895 alu.dst.write = 1;
6896 alu.last = i == last_inst;
6897
6898 r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
6899 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6900
6901 r = r600_bytecode_add_alu(ctx->bc, &alu);
6902 if (r)
6903 return r;
6904 }
6905
6906 t2 = r600_get_temp(ctx);
6907
6908 for (i = 0; i < 4; i++) {
6909 if (!(write_mask & (1<<i)))
6910 continue;
6911
6912 /* shift insert left */
6913 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6914 alu.op = ALU_OP2_LSHL_INT;
6915 alu.dst.sel = t2;
6916 alu.dst.chan = i;
6917 alu.dst.write = 1;
6918 alu.last = i == last_inst;
6919
6920 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6921 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6922
6923 r = r600_bytecode_add_alu(ctx->bc, &alu);
6924 if (r)
6925 return r;
6926 }
6927
6928 for (i = 0; i < 4; i++) {
6929 if (!(write_mask & (1<<i)))
6930 continue;
6931
6932 /* actual bitfield insert */
6933 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6934 alu.op = ALU_OP3_BFI_INT;
6935 alu.is_op3 = 1;
6936 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6937 alu.dst.chan = i;
6938 alu.dst.write = 1;
6939 alu.last = i == last_inst;
6940
6941 alu.src[0].sel = t1;
6942 alu.src[0].chan = i;
6943 alu.src[1].sel = t2;
6944 alu.src[1].chan = i;
6945 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6946
6947 r = r600_bytecode_add_alu(ctx->bc, &alu);
6948 if (r)
6949 return r;
6950 }
6951
6952 for (i = 0; i < 4; i++) {
6953 if (!(write_mask & (1<<i)))
6954 continue;
6955 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6956 alu.op = ALU_OP3_CNDE_INT;
6957 alu.is_op3 = 1;
6958 alu.src[0].sel = ctx->temp_reg;
6959 alu.src[0].chan = i;
6960 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
6961
6962 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6963
6964 alu.src[1].sel = alu.dst.sel;
6965 alu.src[1].chan = i;
6966
6967 alu.last = i == last_inst;
6968 r = r600_bytecode_add_alu(ctx->bc, &alu);
6969 if (r)
6970 return r;
6971 }
6972 return 0;
6973 }
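
/* Reference for the lowering above (a sketch, not driver code),
 * assuming offset < 32: BFM builds the mask, LSHL positions the bits,
 * BFI merges, and the SETGE/CNDE pair makes width >= 32 return the
 * insert value whole. */
static inline uint32_t ref_bfi(uint32_t base, uint32_t insert,
			       uint32_t offset, uint32_t width)
{
	uint32_t mask, shifted;

	if (width >= 32) /* SETGE_INT flag, applied by the final CNDE_INT */
		return insert;
	mask = ((1u << width) - 1u) << offset;    /* BFM_INT */
	shifted = insert << offset;               /* LSHL_INT */
	return (shifted & mask) | (base & ~mask); /* BFI_INT */
}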
6974
6975 static int tgsi_msb(struct r600_shader_ctx *ctx)
6976 {
6977 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6978 struct r600_bytecode_alu alu;
6979 int i, r, t1, t2;
6980
6981 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6982 int last_inst = tgsi_last_instruction(write_mask);
6983
6984 assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
6985 ctx->inst_info->op == ALU_OP1_FFBH_UINT);
6986
6987 t1 = ctx->temp_reg;
6988
6989 /* bit position is indexed from lsb by TGSI, and from msb by the hardware */
6990 for (i = 0; i < 4; i++) {
6991 if (!(write_mask & (1<<i)))
6992 continue;
6993
6994 /* t1 = FFBH_INT / FFBH_UINT */
6995 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6996 alu.op = ctx->inst_info->op;
6997 alu.dst.sel = t1;
6998 alu.dst.chan = i;
6999 alu.dst.write = 1;
7000 alu.last = i == last_inst;
7001
7002 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7003
7004 r = r600_bytecode_add_alu(ctx->bc, &alu);
7005 if (r)
7006 return r;
7007 }
7008
7009 t2 = r600_get_temp(ctx);
7010
7011 for (i = 0; i < 4; i++) {
7012 if (!(write_mask & (1<<i)))
7013 continue;
7014
7015 /* t2 = 31 - t1 */
7016 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7017 alu.op = ALU_OP2_SUB_INT;
7018 alu.dst.sel = t2;
7019 alu.dst.chan = i;
7020 alu.dst.write = 1;
7021 alu.last = i == last_inst;
7022
7023 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
7024 alu.src[0].value = 31;
7025 alu.src[1].sel = t1;
7026 alu.src[1].chan = i;
7027
7028 r = r600_bytecode_add_alu(ctx->bc, &alu);
7029 if (r)
7030 return r;
7031 }
7032
7033 for (i = 0; i < 4; i++) {
7034 if (!(write_mask & (1<<i)))
7035 continue;
7036
7037 /* result = t1 >= 0 ? t2 : t1 */
7038 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7039 alu.op = ALU_OP3_CNDGE_INT;
7040 alu.is_op3 = 1;
7041 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7042 alu.dst.chan = i;
7043 alu.dst.write = 1;
7044 alu.last = i == last_inst;
7045
7046 alu.src[0].sel = t1;
7047 alu.src[0].chan = i;
7048 alu.src[1].sel = t2;
7049 alu.src[1].chan = i;
7050 alu.src[2].sel = t1;
7051 alu.src[2].chan = i;
7052
7053 r = r600_bytecode_add_alu(ctx->bc, &alu);
7054 if (r)
7055 return r;
7056 }
7057
7058 return 0;
7059 }
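
/* Host model of the MSB lowering above (a sketch, not driver code),
 * using a GCC/clang builtin and assuming FFBH_UINT returns -1 for 0:
 * the hardware counts from the MSB, so the index is flipped to the
 * LSB-based position TGSI expects, keeping -1 as "no bit found". */
static inline int32_t ref_umsb(uint32_t x)
{
	int32_t t1 = x ? __builtin_clz(x) : -1; /* FFBH_UINT */
	int32_t t2 = 31 - t1;                   /* SUB_INT */
	return (t1 >= 0) ? t2 : t1;             /* CNDGE_INT */
}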
7060
7061 static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
7062 {
7063 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7064 struct r600_bytecode_alu alu;
7065 int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
7066 unsigned location;
7067 const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;
7068
7069 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
7070
7071 /* Interpolators have been marked for use already by allocate_system_value_inputs */
7072 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
7073 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7074 location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
7075 }
7076 else {
7077 location = TGSI_INTERPOLATE_LOC_CENTROID;
7078 ctx->shader->input[input].uses_interpolate_at_centroid = 1;
7079 }
7080
7081 k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
7082 if (k < 0)
7083 k = 0;
7084 interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
7085 interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);
7086
7087 /* NOTE: currently offset is not perspective correct */
7088 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
7089 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7090 int sample_gpr = -1;
7091 int gradientsH, gradientsV;
7092 struct r600_bytecode_tex tex;
7093
7094 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7095 sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
7096 }
7097
7098 gradientsH = r600_get_temp(ctx);
7099 gradientsV = r600_get_temp(ctx);
7100 for (i = 0; i < 2; i++) {
7101 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7102 tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
7103 tex.src_gpr = interp_gpr;
7104 tex.src_sel_x = interp_base_chan + 0;
7105 tex.src_sel_y = interp_base_chan + 1;
7106 tex.src_sel_z = 0;
7107 tex.src_sel_w = 0;
7108 tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
7109 tex.dst_sel_x = 0;
7110 tex.dst_sel_y = 1;
7111 tex.dst_sel_z = 7;
7112 tex.dst_sel_w = 7;
7113 tex.inst_mod = 1; // Use per pixel gradient calculation
7114 tex.sampler_id = 0;
7115 tex.resource_id = tex.sampler_id;
7116 r = r600_bytecode_add_tex(ctx->bc, &tex);
7117 if (r)
7118 return r;
7119 }
7120
7121 for (i = 0; i < 2; i++) {
7122 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7123 alu.op = ALU_OP3_MULADD;
7124 alu.is_op3 = 1;
7125 alu.src[0].sel = gradientsH;
7126 alu.src[0].chan = i;
7127 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7128 alu.src[1].sel = sample_gpr;
7129 alu.src[1].chan = 2;
7130 }
7131 else {
7132 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
7133 }
7134 alu.src[2].sel = interp_gpr;
7135 alu.src[2].chan = interp_base_chan + i;
7136 alu.dst.sel = ctx->temp_reg;
7137 alu.dst.chan = i;
7138 alu.last = i == 1;
7139
7140 r = r600_bytecode_add_alu(ctx->bc, &alu);
7141 if (r)
7142 return r;
7143 }
7144
7145 for (i = 0; i < 2; i++) {
7146 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7147 alu.op = ALU_OP3_MULADD;
7148 alu.is_op3 = 1;
7149 alu.src[0].sel = gradientsV;
7150 alu.src[0].chan = i;
7151 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7152 alu.src[1].sel = sample_gpr;
7153 alu.src[1].chan = 3;
7154 }
7155 else {
7156 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
7157 }
7158 alu.src[2].sel = ctx->temp_reg;
7159 alu.src[2].chan = i;
7160 alu.dst.sel = ctx->temp_reg;
7161 alu.dst.chan = i;
7162 alu.last = i == 1;
7163
7164 r = r600_bytecode_add_alu(ctx->bc, &alu);
7165 if (r)
7166 return r;
7167 }
7168 }
7169
7170 tmp = r600_get_temp(ctx);
7171 for (i = 0; i < 8; i++) {
7172 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7173 alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;
7174
7175 alu.dst.sel = tmp;
7176 if ((i > 1 && i < 6)) {
7177 alu.dst.write = 1;
7178 }
7179 else {
7180 alu.dst.write = 0;
7181 }
7182 alu.dst.chan = i % 4;
7183
7184 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
7185 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7186 alu.src[0].sel = ctx->temp_reg;
7187 alu.src[0].chan = 1 - (i % 2);
7188 } else {
7189 alu.src[0].sel = interp_gpr;
7190 alu.src[0].chan = interp_base_chan + 1 - (i % 2);
7191 }
7192 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
7193 alu.src[1].chan = 0;
7194
7195 alu.last = i % 4 == 3;
7196 alu.bank_swizzle_force = SQ_ALU_VEC_210;
7197
7198 r = r600_bytecode_add_alu(ctx->bc, &alu);
7199 if (r)
7200 return r;
7201 }
7202
7203 // INTERP can't swizzle dst
7204 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7205 for (i = 0; i <= lasti; i++) {
7206 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7207 continue;
7208
7209 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7210 alu.op = ALU_OP1_MOV;
7211 alu.src[0].sel = tmp;
7212 alu.src[0].chan = ctx->src[0].swizzle[i];
7213 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7214 alu.dst.write = 1;
7215 alu.last = i == lasti;
7216 r = r600_bytecode_add_alu(ctx->bc, &alu);
7217 if (r)
7218 return r;
7219 }
7220
7221 return 0;
7222 }
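
/* The offset path above as host math (a sketch, not driver code): the
 * (i, j) interpolants are shifted by a pixel-space offset using the
 * gradients fetched with GET_GRADIENTS_H/V, one MULADD pass per axis. */
static inline void ref_offset_ij(float ij[2], const float grad_h[2],
				 const float grad_v[2], float dx, float dy)
{
	for (int c = 0; c < 2; c++) {
		float t = grad_h[c] * dx + ij[c]; /* first MULADD pass */
		ij[c] = grad_v[c] * dy + t;       /* second MULADD pass */
	}
}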
7223
7224
7225 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
7226 {
7227 struct r600_bytecode_alu alu;
7228 int i, r;
7229
7230 for (i = 0; i < 4; i++) {
7231 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7232 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
7233 alu.op = ALU_OP0_NOP;
7234 alu.dst.chan = i;
7235 } else {
7236 alu.op = ALU_OP1_MOV;
7237 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7238 alu.src[0].sel = ctx->temp_reg;
7239 alu.src[0].chan = i;
7240 }
7241 if (i == 3) {
7242 alu.last = 1;
7243 }
7244 r = r600_bytecode_add_alu(ctx->bc, &alu);
7245 if (r)
7246 return r;
7247 }
7248 return 0;
7249 }
7250
7251 static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
7252 unsigned writemask,
7253 struct r600_bytecode_alu_src *bc_src,
7254 const struct r600_shader_src *shader_src)
7255 {
7256 struct r600_bytecode_alu alu;
7257 int i, r;
7258 int lasti = tgsi_last_instruction(writemask);
7259 int temp_reg = 0;
7260
7261 r600_bytecode_src(&bc_src[0], shader_src, 0);
7262 r600_bytecode_src(&bc_src[1], shader_src, 1);
7263 r600_bytecode_src(&bc_src[2], shader_src, 2);
7264 r600_bytecode_src(&bc_src[3], shader_src, 3);
7265
7266 if (bc_src->abs) {
7267 temp_reg = r600_get_temp(ctx);
7268
7269 for (i = 0; i < lasti + 1; i++) {
7270 if (!(writemask & (1 << i)))
7271 continue;
7272 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7273 alu.op = ALU_OP1_MOV;
7274 alu.dst.sel = temp_reg;
7275 alu.dst.chan = i;
7276 alu.dst.write = 1;
7277 alu.src[0] = bc_src[i];
7278 if (i == lasti) {
7279 alu.last = 1;
7280 }
7281 r = r600_bytecode_add_alu(ctx->bc, &alu);
7282 if (r)
7283 return r;
7284 memset(&bc_src[i], 0, sizeof(*bc_src));
7285 bc_src[i].sel = temp_reg;
7286 bc_src[i].chan = i;
7287 }
7288 }
7289 return 0;
7290 }
7291
7292 static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst)
7293 {
7294 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7295 struct r600_bytecode_alu alu;
7296 struct r600_bytecode_alu_src srcs[4][4];
7297 int i, j, r;
7298 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7299 unsigned op = ctx->inst_info->op;
7300
7301 if (op == ALU_OP3_MULADD_IEEE &&
7302 ctx->info.properties[TGSI_PROPERTY_LEGACY_MATH_RULES])
7303 op = ALU_OP3_MULADD;
7304
7305 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7306 r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
7307 srcs[j], &ctx->src[j]);
7308 if (r)
7309 return r;
7310 }
7311
7312 for (i = 0; i < lasti + 1; i++) {
7313 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7314 continue;
7315
7316 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7317 alu.op = op;
7318 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7319 alu.src[j] = srcs[j][i];
7320 }
7321
7322 if (dst == -1) {
7323 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7324 } else {
7325 alu.dst.sel = dst;
7326 }
7327 alu.dst.chan = i;
7328 alu.dst.write = 1;
7329 alu.is_op3 = 1;
7330 if (i == lasti) {
7331 alu.last = 1;
7332 }
7333 r = r600_bytecode_add_alu(ctx->bc, &alu);
7334 if (r)
7335 return r;
7336 }
7337 return 0;
7338 }
7339
7340 static int tgsi_op3(struct r600_shader_ctx *ctx)
7341 {
7342 return tgsi_op3_dst(ctx, -1);
7343 }
7344
7345 static int tgsi_dp(struct r600_shader_ctx *ctx)
7346 {
7347 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7348 struct r600_bytecode_alu alu;
7349 int i, j, r;
7350 unsigned op = ctx->inst_info->op;
7351 if (op == ALU_OP2_DOT4_IEEE &&
7352 ctx->info.properties[TGSI_PROPERTY_LEGACY_MATH_RULES])
7353 op = ALU_OP2_DOT4;
7354
7355 for (i = 0; i < 4; i++) {
7356 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7357 alu.op = op;
7358 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7359 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
7360 }
7361
7362 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7363 alu.dst.chan = i;
7364 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
7365 /* handle some special cases */
7366 switch (inst->Instruction.Opcode) {
7367 case TGSI_OPCODE_DP2:
7368 if (i > 1) {
7369 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
7370 alu.src[0].chan = alu.src[1].chan = 0;
7371 }
7372 break;
7373 case TGSI_OPCODE_DP3:
7374 if (i > 2) {
7375 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
7376 alu.src[0].chan = alu.src[1].chan = 0;
7377 }
7378 break;
7379 default:
7380 break;
7381 }
7382 if (i == 3) {
7383 alu.last = 1;
7384 }
7385 r = r600_bytecode_add_alu(ctx->bc, &alu);
7386 if (r)
7387 return r;
7388 }
7389 return 0;
7390 }
7391
7392 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
7393 unsigned index)
7394 {
7395 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7396 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
7397 inst->Src[index].Register.File != TGSI_FILE_INPUT &&
7398 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
7399 ctx->src[index].neg || ctx->src[index].abs ||
7400 (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
7401 }
7402
7403 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
7404 unsigned index)
7405 {
7406 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7407 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
7408 }
7409
7410 static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
7411 {
7412 struct r600_bytecode_vtx vtx;
7413 struct r600_bytecode_alu alu;
7414 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7415 int src_gpr, r, i;
7416 int id = tgsi_tex_get_src_gpr(ctx, 1);
7417 int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7418
7419 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7420 if (src_requires_loading) {
7421 for (i = 0; i < 4; i++) {
7422 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7423 alu.op = ALU_OP1_MOV;
7424 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7425 alu.dst.sel = ctx->temp_reg;
7426 alu.dst.chan = i;
7427 if (i == 3)
7428 alu.last = 1;
7429 alu.dst.write = 1;
7430 r = r600_bytecode_add_alu(ctx->bc, &alu);
7431 if (r)
7432 return r;
7433 }
7434 src_gpr = ctx->temp_reg;
7435 }
7436
7437 memset(&vtx, 0, sizeof(vtx));
7438 vtx.op = FETCH_OP_VFETCH;
7439 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
7440 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
7441 vtx.src_gpr = src_gpr;
7442 vtx.mega_fetch_count = 16;
7443 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7444 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
7445 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
7446 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
7447 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
7448 vtx.use_const_fields = 1;
7449 vtx.buffer_index_mode = sampler_index_mode;
7450
7451 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
7452 return r;
7453
7454 if (ctx->bc->gfx_level >= EVERGREEN)
7455 return 0;
7456
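/* Pre-Evergreen parts cannot apply the destination format masking during
 * the fetch itself, so fix the result up afterwards: AND each written
 * channel with a mask word from the buffer-info constants, then OR the
 * alpha default into .w (assuming the two-dwords-per-buffer layout this
 * file uses for R600_BUFFER_INFO_CONST_BUFFER). */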
7457 for (i = 0; i < 4; i++) {
7458 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7459 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7460 continue;
7461
7462 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7463 alu.op = ALU_OP2_AND_INT;
7464
7465 alu.dst.chan = i;
7466 alu.dst.sel = vtx.dst_gpr;
7467 alu.dst.write = 1;
7468
7469 alu.src[0].sel = vtx.dst_gpr;
7470 alu.src[0].chan = i;
7471
7472 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
7473 alu.src[1].sel += (id * 2);
7474 alu.src[1].chan = i % 4;
7475 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7476
7477 if (i == lasti)
7478 alu.last = 1;
7479 r = r600_bytecode_add_alu(ctx->bc, &alu);
7480 if (r)
7481 return r;
7482 }
7483
7484 if (inst->Dst[0].Register.WriteMask & 3) {
7485 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7486 alu.op = ALU_OP2_OR_INT;
7487
7488 alu.dst.chan = 3;
7489 alu.dst.sel = vtx.dst_gpr;
7490 alu.dst.write = 1;
7491
7492 alu.src[0].sel = vtx.dst_gpr;
7493 alu.src[0].chan = 3;
7494
7495 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
7496 alu.src[1].chan = 0;
7497 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7498
7499 alu.last = 1;
7500 r = r600_bytecode_add_alu(ctx->bc, &alu);
7501 if (r)
7502 return r;
7503 }
7504 return 0;
7505 }
7506
7507 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset, int eg_buffer_base)
7508 {
7509 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7510 int r;
7511 int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
7512 int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7513
7514 if (ctx->bc->gfx_level < EVERGREEN) {
7515 struct r600_bytecode_alu alu;
7516 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7517 alu.op = ALU_OP1_MOV;
7518 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
7519 /* on r600 we have them in the second dword */
7520 alu.src[0].sel += (id * 2) + 1;
7521 alu.src[0].chan = 1;
7522 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7523 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
7524 alu.last = 1;
7525 r = r600_bytecode_add_alu(ctx->bc, &alu);
7526 if (r)
7527 return r;
7528 return 0;
7529 } else {
7530 struct r600_bytecode_vtx vtx;
7531 memset(&vtx, 0, sizeof(vtx));
7532 vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
7533 vtx.buffer_id = id + eg_buffer_base;
7534 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
7535 vtx.src_gpr = 0;
7536 vtx.mega_fetch_count = 16; /* no idea here really... */
7537 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7538 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
7539 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7; /* SEL_Y */
7540 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7; /* SEL_Z */
7541 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 4 : 7; /* SEL_W */
7542 vtx.data_format = FMT_32_32_32_32;
7543 vtx.buffer_index_mode = sampler_index_mode;
7544
7545 if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
7546 return r;
7547 return 0;
7548 }
7549 }
7550
7551
7552 static int tgsi_tex(struct r600_shader_ctx *ctx)
7553 {
7554 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7555 struct r600_bytecode_tex tex;
7556 struct r600_bytecode_tex grad_offs[3];
7557 struct r600_bytecode_alu alu;
7558 unsigned src_gpr;
7559 int r, i, j, n_grad_offs = 0;
7560 int opcode;
7561 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
7562 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7563 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
7564 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
7565
7566 bool txf_add_offsets = inst->Texture.NumOffsets &&
7567 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7568 inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
7569
7570 /* Texture fetch instructions can only use GPRs as sources, and they can
7571 * neither negate a source nor take its absolute value */
7572 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
7573 tgsi_tex_src_requires_loading(ctx, 0)) ||
7574 read_compressed_msaa || txf_add_offsets;
7575
7576 boolean src_loaded = FALSE;
7577 unsigned sampler_src_reg = 1;
7578 int8_t offset_x = 0, offset_y = 0, offset_z = 0;
7579 boolean has_txq_cube_array_z = false;
7580 unsigned sampler_index_mode;
7581 int array_index_offset_channel = -1;
7582
7583 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
7584 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7585 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
7586 if (inst->Dst[0].Register.WriteMask & 4) {
7587 ctx->shader->has_txq_cube_array_z_comp = true;
7588 has_txq_cube_array_z = true;
7589 }
7590
7591 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
7592 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7593 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
7594 inst->Instruction.Opcode == TGSI_OPCODE_TG4)
7595 sampler_src_reg = 2;
7596
7597 /* TGSI moves the sampler to src reg 3 for TXD */
7598 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
7599 sampler_src_reg = 3;
7600
7601 sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7602
7603 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7604
7605 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
7606 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
7607 if (ctx->bc->gfx_level < EVERGREEN)
7608 ctx->shader->uses_tex_buffers = true;
7609 return r600_do_buffer_txq(ctx, 1, 0, R600_MAX_CONST_BUFFERS);
7610 }
7611 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
7612 if (ctx->bc->gfx_level < EVERGREEN)
7613 ctx->shader->uses_tex_buffers = true;
7614 return do_vtx_fetch_inst(ctx, src_requires_loading);
7615 }
7616 }
7617
7618 if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
7619 int out_chan;
7620 /* Add perspective divide */
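/* i.e. t = RECIP_IEEE(coord.w); coord.xyz *= t; coord.w = 1.0 */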
7621 if (ctx->bc->gfx_level == CAYMAN) {
7622 out_chan = 2;
7623 for (i = 0; i < 3; i++) {
7624 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7625 alu.op = ALU_OP1_RECIP_IEEE;
7626 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7627
7628 alu.dst.sel = ctx->temp_reg;
7629 alu.dst.chan = i;
7630 if (i == 2)
7631 alu.last = 1;
7632 if (out_chan == i)
7633 alu.dst.write = 1;
7634 r = r600_bytecode_add_alu(ctx->bc, &alu);
7635 if (r)
7636 return r;
7637 }
7638
7639 } else {
7640 out_chan = 3;
7641 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7642 alu.op = ALU_OP1_RECIP_IEEE;
7643 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7644
7645 alu.dst.sel = ctx->temp_reg;
7646 alu.dst.chan = out_chan;
7647 alu.last = 1;
7648 alu.dst.write = 1;
7649 r = r600_bytecode_add_alu(ctx->bc, &alu);
7650 if (r)
7651 return r;
7652 }
7653
7654 for (i = 0; i < 3; i++) {
7655 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7656 alu.op = ALU_OP2_MUL;
7657 alu.src[0].sel = ctx->temp_reg;
7658 alu.src[0].chan = out_chan;
7659 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
7660 alu.dst.sel = ctx->temp_reg;
7661 alu.dst.chan = i;
7662 alu.dst.write = 1;
7663 r = r600_bytecode_add_alu(ctx->bc, &alu);
7664 if (r)
7665 return r;
7666 }
7667 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7668 alu.op = ALU_OP1_MOV;
7669 alu.src[0].sel = V_SQ_ALU_SRC_1;
7670 alu.src[0].chan = 0;
7671 alu.dst.sel = ctx->temp_reg;
7672 alu.dst.chan = 3;
7673 alu.last = 1;
7674 alu.dst.write = 1;
7675 r = r600_bytecode_add_alu(ctx->bc, &alu);
7676 if (r)
7677 return r;
7678 src_loaded = TRUE;
7679 src_gpr = ctx->temp_reg;
7680 }
7681
7682
7683 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7684 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7685 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7686 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7687 inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
7688
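/* CUBE returns the two face-local coordinates in .xy, the major-axis
 * magnitude in .z and the face id in .w. The .xy coordinates then get
 * divided by |z| and offset by 1.5 (the MULADDs below), centering them
 * at 1.5 the way the texture unit expects cube face coordinates. */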
7689 static const unsigned src0_swizzle[] = {2, 2, 0, 1};
7690 static const unsigned src1_swizzle[] = {1, 0, 2, 2};
7691
7692 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
7693 for (i = 0; i < 4; i++) {
7694 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7695 alu.op = ALU_OP2_CUBE;
7696 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
7697 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
7698 alu.dst.sel = ctx->temp_reg;
7699 alu.dst.chan = i;
7700 if (i == 3)
7701 alu.last = 1;
7702 alu.dst.write = 1;
7703 r = r600_bytecode_add_alu(ctx->bc, &alu);
7704 if (r)
7705 return r;
7706 }
7707
7708 /* tmp1.z = RCP_e(|tmp1.z|) */
7709 if (ctx->bc->gfx_level == CAYMAN) {
7710 for (i = 0; i < 3; i++) {
7711 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7712 alu.op = ALU_OP1_RECIP_IEEE;
7713 alu.src[0].sel = ctx->temp_reg;
7714 alu.src[0].chan = 2;
7715 alu.src[0].abs = 1;
7716 alu.dst.sel = ctx->temp_reg;
7717 alu.dst.chan = i;
7718 if (i == 2)
7719 alu.dst.write = 1;
7720 if (i == 2)
7721 alu.last = 1;
7722 r = r600_bytecode_add_alu(ctx->bc, &alu);
7723 if (r)
7724 return r;
7725 }
7726 } else {
7727 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7728 alu.op = ALU_OP1_RECIP_IEEE;
7729 alu.src[0].sel = ctx->temp_reg;
7730 alu.src[0].chan = 2;
7731 alu.src[0].abs = 1;
7732 alu.dst.sel = ctx->temp_reg;
7733 alu.dst.chan = 2;
7734 alu.dst.write = 1;
7735 alu.last = 1;
7736 r = r600_bytecode_add_alu(ctx->bc, &alu);
7737 if (r)
7738 return r;
7739 }
7740
7741 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x
7742 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x
7743 * muladd has no writemask, have to use another temp
7744 */
7745 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7746 alu.op = ALU_OP3_MULADD;
7747 alu.is_op3 = 1;
7748
7749 alu.src[0].sel = ctx->temp_reg;
7750 alu.src[0].chan = 0;
7751 alu.src[1].sel = ctx->temp_reg;
7752 alu.src[1].chan = 2;
7753
7754 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7755 alu.src[2].chan = 0;
7756 alu.src[2].value = u_bitcast_f2u(1.5f);
7757
7758 alu.dst.sel = ctx->temp_reg;
7759 alu.dst.chan = 0;
7760 alu.dst.write = 1;
7761
7762 r = r600_bytecode_add_alu(ctx->bc, &alu);
7763 if (r)
7764 return r;
7765
7766 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7767 alu.op = ALU_OP3_MULADD;
7768 alu.is_op3 = 1;
7769
7770 alu.src[0].sel = ctx->temp_reg;
7771 alu.src[0].chan = 1;
7772 alu.src[1].sel = ctx->temp_reg;
7773 alu.src[1].chan = 2;
7774
7775 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7776 alu.src[2].chan = 0;
7777 alu.src[2].value = u_bitcast_f2u(1.5f);
7778
7779 alu.dst.sel = ctx->temp_reg;
7780 alu.dst.chan = 1;
7781 alu.dst.write = 1;
7782
7783 alu.last = 1;
7784 r = r600_bytecode_add_alu(ctx->bc, &alu);
7785 if (r)
7786 return r;
7787 /* write initial compare value into Z component
7788 - W src 0 for shadow cube
7789 - X src 1 for shadow cube array */
7790 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7791 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7792 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7793 alu.op = ALU_OP1_MOV;
7794 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
7795 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7796 else
7797 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7798 alu.dst.sel = ctx->temp_reg;
7799 alu.dst.chan = 2;
7800 alu.dst.write = 1;
7801 alu.last = 1;
7802 r = r600_bytecode_add_alu(ctx->bc, &alu);
7803 if (r)
7804 return r;
7805 }
7806
7807 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7808 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7809 if (ctx->bc->gfx_level >= EVERGREEN) {
7810 int mytmp = r600_get_temp(ctx);
7811 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7812 alu.op = ALU_OP1_MOV;
7813 alu.src[0].sel = ctx->temp_reg;
7814 alu.src[0].chan = 3;
7815 alu.dst.sel = mytmp;
7816 alu.dst.chan = 0;
7817 alu.dst.write = 1;
7818 alu.last = 1;
7819 r = r600_bytecode_add_alu(ctx->bc, &alu);
7820 if (r)
7821 return r;
7822
7823 /* Evaluate the array index according to floor(idx + 0.5). This
7824 * needs to be done before merging the face select value, because
7825 * otherwise the fractional part of the array index will interfere
7826 * with the face select value */
7827 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7828 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7829 alu.op = ALU_OP1_RNDNE;
7830 alu.dst.sel = ctx->temp_reg;
7831 alu.dst.chan = 3;
7832 alu.dst.write = 1;
7833 alu.last = 1;
7834 r = r600_bytecode_add_alu(ctx->bc, &alu);
7835 if (r)
7836 return r;
7837
7838 /* Because the array slice index and the cube face index are merged
7839 * into one value we have to make sure the array slice index is >= 0,
7840 * otherwise the face selection will fail */
7841 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7842 alu.op = ALU_OP2_MAX;
7843 alu.src[0].sel = ctx->temp_reg;
7844 alu.src[0].chan = 3;
7845 alu.src[1].sel = V_SQ_ALU_SRC_0;
7846 alu.dst.sel = ctx->temp_reg;
7847 alu.dst.chan = 3;
7848 alu.dst.write = 1;
7849 alu.last = 1;
7850 r = r600_bytecode_add_alu(ctx->bc, &alu);
7851 if (r)
7852 return r;
7853
7854 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */
7855 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7856 alu.op = ALU_OP3_MULADD;
7857 alu.is_op3 = 1;
7858 alu.src[0].sel = ctx->temp_reg;
7859 alu.src[0].chan = 3;
7860 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7861 alu.src[1].chan = 0;
7862 alu.src[1].value = u_bitcast_f2u(8.0f);
7863 alu.src[2].sel = mytmp;
7864 alu.src[2].chan = 0;
7865 alu.dst.sel = ctx->temp_reg;
7866 alu.dst.chan = 3;
7867 alu.dst.write = 1;
7868 alu.last = 1;
7869 r = r600_bytecode_add_alu(ctx->bc, &alu);
7870 if (r)
7871 return r;
7872 } else if (ctx->bc->gfx_level < EVERGREEN) {
7873 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7874 tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
7875 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7876 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7877 tex.src_gpr = r600_get_temp(ctx);
7878 tex.src_sel_x = 0;
7879 tex.src_sel_y = 0;
7880 tex.src_sel_z = 0;
7881 tex.src_sel_w = 0;
7882 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7883 tex.coord_type_x = 1;
7884 tex.coord_type_y = 1;
7885 tex.coord_type_z = 1;
7886 tex.coord_type_w = 1;
7887 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7888 alu.op = ALU_OP1_MOV;
7889 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7890 alu.dst.sel = tex.src_gpr;
7891 alu.dst.chan = 0;
7892 alu.last = 1;
7893 alu.dst.write = 1;
7894 r = r600_bytecode_add_alu(ctx->bc, &alu);
7895 if (r)
7896 return r;
7897
7898 r = r600_bytecode_add_tex(ctx->bc, &tex);
7899 if (r)
7900 return r;
7901 }
7902
7903 }
7904
7905 /* for the cube forms of lod and bias we need to route the lod/bias value into .z of the coordinate */
7906 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
7907 inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
7908 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7909 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
7910 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7911 alu.op = ALU_OP1_MOV;
7912 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7913 inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
7914 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7915 else
7916 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7917 alu.dst.sel = ctx->temp_reg;
7918 alu.dst.chan = 2;
7919 alu.last = 1;
7920 alu.dst.write = 1;
7921 r = r600_bytecode_add_alu(ctx->bc, &alu);
7922 if (r)
7923 return r;
7924 }
7925
7926 src_loaded = TRUE;
7927 src_gpr = ctx->temp_reg;
7928 }
7929
7930 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
7931 int temp_h = 0, temp_v = 0;
7932 int start_val = 0;
7933
7934 /* if we've already loaded the src (i.e. CUBE), don't reload it. */
7935 if (src_loaded == TRUE)
7936 start_val = 1;
7937 else
7938 src_loaded = TRUE;
7939 for (i = start_val; i < 3; i++) {
7940 int treg = r600_get_temp(ctx);
7941
7942 if (i == 0)
7943 src_gpr = treg;
7944 else if (i == 1)
7945 temp_h = treg;
7946 else
7947 temp_v = treg;
7948
7949 for (j = 0; j < 4; j++) {
7950 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7951 alu.op = ALU_OP1_MOV;
7952 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
7953 alu.dst.sel = treg;
7954 alu.dst.chan = j;
7955 if (j == 3)
7956 alu.last = 1;
7957 alu.dst.write = 1;
7958 r = r600_bytecode_add_alu(ctx->bc, &alu);
7959 if (r)
7960 return r;
7961 }
7962 }
7963 for (i = 1; i < 3; i++) {
7964 /* set gradients h/v */
7965 struct r600_bytecode_tex *t = &grad_offs[n_grad_offs++];
7966 memset(t, 0, sizeof(struct r600_bytecode_tex));
7967 t->op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
7968 FETCH_OP_SET_GRADIENTS_V;
7969 t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7970 t->sampler_index_mode = sampler_index_mode;
7971 t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
7972 t->resource_index_mode = sampler_index_mode;
7973
7974 t->src_gpr = (i == 1) ? temp_h : temp_v;
7975 t->src_sel_x = 0;
7976 t->src_sel_y = 1;
7977 t->src_sel_z = 2;
7978 t->src_sel_w = 3;
7979
7980 t->dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
7981 t->dst_sel_x = t->dst_sel_y = t->dst_sel_z = t->dst_sel_w = 7;
7982 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
7983 t->coord_type_x = 1;
7984 t->coord_type_y = 1;
7985 t->coord_type_z = 1;
7986 t->coord_type_w = 1;
7987 }
7988 }
7989 }
7990
7991 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7992 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
7993 * incorrectly forces nearest filtering if the texture format is integer.
7994 * The only effect it has on Gather4, which always returns 4 texels for
7995 * bilinear filtering, is that the final coordinates are off by 0.5 of
7996 * the texel size.
7997 *
7998 * The workaround is to subtract 0.5 from the unnormalized coordinates,
7999 * or (0.5 / size) from the normalized coordinates.
8000 */
8001 if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT ||
8002 inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) {
8003 int treg = r600_get_temp(ctx);
8004
8005 /* mov array and comparison coordinate to temp_reg if needed */
8006 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8007 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8008 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) && !src_loaded) {
8009 int end = inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ? 3 : 2;
8010 for (i = 2; i <= end; i++) {
8011 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8012 alu.op = ALU_OP1_MOV;
8013 alu.dst.sel = ctx->temp_reg;
8014 alu.dst.chan = i;
8015 alu.dst.write = 1;
8016 alu.last = (i == end);
8017 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8018 r = r600_bytecode_add_alu(ctx->bc, &alu);
8019 if (r)
8020 return r;
8021 }
8022 }
8023
8024 if (inst->Texture.Texture == TGSI_TEXTURE_RECT ||
8025 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) {
8026 for (i = 0; i < 2; i++) {
8027 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8028 alu.op = ALU_OP2_ADD;
8029 alu.dst.sel = ctx->temp_reg;
8030 alu.dst.chan = i;
8031 alu.dst.write = 1;
8032 alu.last = i == 1;
8033 if (src_loaded) {
8034 alu.src[0].sel = ctx->temp_reg;
8035 alu.src[0].chan = i;
8036 } else
8037 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8038 alu.src[1].sel = V_SQ_ALU_SRC_0_5;
8039 alu.src[1].neg = 1;
8040 r = r600_bytecode_add_alu(ctx->bc, &alu);
8041 if (r)
8042 return r;
8043 }
8044 } else {
8045 /* execute a TXQ */
8046 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8047 tex.op = FETCH_OP_GET_TEXTURE_RESINFO;
8048 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8049 tex.sampler_index_mode = sampler_index_mode;
8050 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8051 tex.resource_index_mode = sampler_index_mode;
8052 tex.dst_gpr = treg;
8053 tex.src_sel_x = 4;
8054 tex.src_sel_y = 4;
8055 tex.src_sel_z = 4;
8056 tex.src_sel_w = 4;
8057 tex.dst_sel_x = 0;
8058 tex.dst_sel_y = 1;
8059 tex.dst_sel_z = 7;
8060 tex.dst_sel_w = 7;
8061 r = r600_bytecode_add_tex(ctx->bc, &tex);
8062 if (r)
8063 return r;
8064
8065 /* coord.xy = -0.5 * (1.0/int_to_flt(size)) + coord.xy */
8066 if (ctx->bc->gfx_level == CAYMAN) {
8068 for (i = 0; i < 2; i++) {
8069 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8070 alu.op = ALU_OP1_INT_TO_FLT;
8071 alu.dst.sel = treg;
8072 alu.dst.chan = i;
8073 alu.dst.write = 1;
8074 alu.src[0].sel = treg;
8075 alu.src[0].chan = i;
8076 alu.last = (i == 1) ? 1 : 0;
8077 r = r600_bytecode_add_alu(ctx->bc, &alu);
8078 if (r)
8079 return r;
8080 }
8081 for (j = 0; j < 2; j++) {
8082 for (i = 0; i < 3; i++) {
8083 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8084 alu.op = ALU_OP1_RECIP_IEEE;
8085 alu.src[0].sel = treg;
8086 alu.src[0].chan = j;
8087 alu.dst.sel = treg;
8088 alu.dst.chan = i;
8089 if (i == 2)
8090 alu.last = 1;
8091 if (i == j)
8092 alu.dst.write = 1;
8093 r = r600_bytecode_add_alu(ctx->bc, &alu);
8094 if (r)
8095 return r;
8096 }
8097 }
8098 } else {
8099 for (i = 0; i < 2; i++) {
8100 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8101 alu.op = ALU_OP1_INT_TO_FLT;
8102 alu.dst.sel = treg;
8103 alu.dst.chan = i;
8104 alu.dst.write = 1;
8105 alu.src[0].sel = treg;
8106 alu.src[0].chan = i;
8107 alu.last = 1;
8108 r = r600_bytecode_add_alu(ctx->bc, &alu);
8109 if (r)
8110 return r;
8111 }
8112 for (i = 0; i < 2; i++) {
8113 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8114 alu.op = ALU_OP1_RECIP_IEEE;
8115 alu.src[0].sel = treg;
8116 alu.src[0].chan = i;
8117 alu.dst.sel = treg;
8118 alu.dst.chan = i;
8119 alu.last = 1;
8120 alu.dst.write = 1;
8121 r = r600_bytecode_add_alu(ctx->bc, &alu);
8122 if (r)
8123 return r;
8124 }
8125 }
8126 for (i = 0; i < 2; i++) {
8127 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8128 alu.op = ALU_OP3_MULADD;
8129 alu.is_op3 = 1;
8130 alu.dst.sel = ctx->temp_reg;
8131 alu.dst.chan = i;
8132 alu.dst.write = 1;
8133 alu.last = i == 1;
8134 alu.src[0].sel = treg;
8135 alu.src[0].chan = i;
8136 alu.src[1].sel = V_SQ_ALU_SRC_0_5;
8137 alu.src[1].neg = 1;
8138 if (src_loaded) {
8139 alu.src[2].sel = ctx->temp_reg;
8140 alu.src[2].chan = i;
8141 } else
8142 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
8143 r = r600_bytecode_add_alu(ctx->bc, &alu);
8144 if (r)
8145 return r;
8146 }
8147 }
8148 src_loaded = TRUE;
8149 src_gpr = ctx->temp_reg;
8150 }
8151 }
8152
8153 if (src_requires_loading && !src_loaded) {
8154 for (i = 0; i < 4; i++) {
8155 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8156 alu.op = ALU_OP1_MOV;
8157 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8158 alu.dst.sel = ctx->temp_reg;
8159 alu.dst.chan = i;
8160 if (i == 3)
8161 alu.last = 1;
8162 alu.dst.write = 1;
8163 r = r600_bytecode_add_alu(ctx->bc, &alu);
8164 if (r)
8165 return r;
8166 }
8167 src_loaded = TRUE;
8168 src_gpr = ctx->temp_reg;
8169 }
8170
8171 /* get offset values */
8172 if (inst->Texture.NumOffsets) {
8173 assert(inst->Texture.NumOffsets == 1);
8174
8175 /* The texture offset feature doesn't work with the TXF instruction
8176 * and must be emulated by adding the offset to the texture coordinates. */
8177 if (txf_add_offsets) {
8178 const struct tgsi_texture_offset *off = inst->TexOffsets;
8179
8180 switch (inst->Texture.Texture) {
8181 case TGSI_TEXTURE_3D:
8182 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8183 alu.op = ALU_OP2_ADD_INT;
8184 alu.src[0].sel = src_gpr;
8185 alu.src[0].chan = 2;
8186 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8187 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
8188 alu.dst.sel = src_gpr;
8189 alu.dst.chan = 2;
8190 alu.dst.write = 1;
8191 alu.last = 1;
8192 r = r600_bytecode_add_alu(ctx->bc, &alu);
8193 if (r)
8194 return r;
8195 FALLTHROUGH;
8196
8197 case TGSI_TEXTURE_2D:
8198 case TGSI_TEXTURE_SHADOW2D:
8199 case TGSI_TEXTURE_RECT:
8200 case TGSI_TEXTURE_SHADOWRECT:
8201 case TGSI_TEXTURE_2D_ARRAY:
8202 case TGSI_TEXTURE_SHADOW2D_ARRAY:
8203 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8204 alu.op = ALU_OP2_ADD_INT;
8205 alu.src[0].sel = src_gpr;
8206 alu.src[0].chan = 1;
8207 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8208 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
8209 alu.dst.sel = src_gpr;
8210 alu.dst.chan = 1;
8211 alu.dst.write = 1;
8212 alu.last = 1;
8213 r = r600_bytecode_add_alu(ctx->bc, &alu);
8214 if (r)
8215 return r;
8216 FALLTHROUGH;
8217
8218 case TGSI_TEXTURE_1D:
8219 case TGSI_TEXTURE_SHADOW1D:
8220 case TGSI_TEXTURE_1D_ARRAY:
8221 case TGSI_TEXTURE_SHADOW1D_ARRAY:
8222 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8223 alu.op = ALU_OP2_ADD_INT;
8224 alu.src[0].sel = src_gpr;
8225 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8226 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
8227 alu.dst.sel = src_gpr;
8228 alu.dst.write = 1;
8229 alu.last = 1;
8230 r = r600_bytecode_add_alu(ctx->bc, &alu);
8231 if (r)
8232 return r;
8233 break;
8234 /* texture offsets do not apply to other texture targets */
8235 }
8236 } else {
8237 switch (inst->Texture.Texture) {
8238 case TGSI_TEXTURE_3D:
8239 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
8240 FALLTHROUGH;
8241 case TGSI_TEXTURE_2D:
8242 case TGSI_TEXTURE_SHADOW2D:
8243 case TGSI_TEXTURE_RECT:
8244 case TGSI_TEXTURE_SHADOWRECT:
8245 case TGSI_TEXTURE_2D_ARRAY:
8246 case TGSI_TEXTURE_SHADOW2D_ARRAY:
8247 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
8248 FALLTHROUGH;
8249 case TGSI_TEXTURE_1D:
8250 case TGSI_TEXTURE_SHADOW1D:
8251 case TGSI_TEXTURE_1D_ARRAY:
8252 case TGSI_TEXTURE_SHADOW1D_ARRAY:
8253 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
8254 }
8255 }
8256 }
8257
8258 /* Obtain the sample index for reading a compressed MSAA color texture.
8259 * To read the FMASK, we use the ldfptr instruction, which tells us
8260 * where the samples are stored.
8261 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
8262 * which is the identity mapping. Each nibble says which physical sample
8263 * should be fetched to get that sample.
8264 *
8265 * Assume src.z contains the sample index. It should be modified like this:
8266 * src.z = (ldfptr() >> (src.z * 4)) & 0xF;
8267 * Then fetch the texel with src.
8268 */
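/* Worked example: with the identity FMASK 0x76543210 and src.z = 2,
 * (0x76543210 >> (2*4)) & 0xF = 2, so physical sample 2 is fetched. */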
8269 if (read_compressed_msaa) {
8270 unsigned sample_chan = 3;
8271 unsigned temp = r600_get_temp(ctx);
8272 assert(src_loaded);
8273
8274 /* temp.w = ldfptr() */
8275 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8276 tex.op = FETCH_OP_LD;
8277 tex.inst_mod = 1; /* to indicate this is ldfptr */
8278 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8279 tex.sampler_index_mode = sampler_index_mode;
8280 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8281 tex.resource_index_mode = sampler_index_mode;
8282 tex.src_gpr = src_gpr;
8283 tex.dst_gpr = temp;
8284 tex.dst_sel_x = 7; /* mask out these components */
8285 tex.dst_sel_y = 7;
8286 tex.dst_sel_z = 7;
8287 tex.dst_sel_w = 0; /* store X */
8288 tex.src_sel_x = 0;
8289 tex.src_sel_y = 1;
8290 tex.src_sel_z = 2;
8291 tex.src_sel_w = 3;
8292 tex.offset_x = offset_x;
8293 tex.offset_y = offset_y;
8294 tex.offset_z = offset_z;
8295 r = r600_bytecode_add_tex(ctx->bc, &tex);
8296 if (r)
8297 return r;
8298
8299 /* temp.x = sample_index*4 */
8300 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8301 alu.op = ALU_OP2_MULLO_INT;
8302 alu.src[0].sel = src_gpr;
8303 alu.src[0].chan = sample_chan;
8304 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8305 alu.src[1].value = 4;
8306 alu.dst.sel = temp;
8307 alu.dst.chan = 0;
8308 alu.dst.write = 1;
8309 r = emit_mul_int_op(ctx->bc, &alu);
8310 if (r)
8311 return r;
8312
8313 /* sample_index = temp.w >> temp.x */
8314 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8315 alu.op = ALU_OP2_LSHR_INT;
8316 alu.src[0].sel = temp;
8317 alu.src[0].chan = 3;
8318 alu.src[1].sel = temp;
8319 alu.src[1].chan = 0;
8320 alu.dst.sel = src_gpr;
8321 alu.dst.chan = sample_chan;
8322 alu.dst.write = 1;
8323 alu.last = 1;
8324 r = r600_bytecode_add_alu(ctx->bc, &alu);
8325 if (r)
8326 return r;
8327
8328 /* sample_index & 0xF */
8329 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8330 alu.op = ALU_OP2_AND_INT;
8331 alu.src[0].sel = src_gpr;
8332 alu.src[0].chan = sample_chan;
8333 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8334 alu.src[1].value = 0xF;
8335 alu.dst.sel = src_gpr;
8336 alu.dst.chan = sample_chan;
8337 alu.dst.write = 1;
8338 alu.last = 1;
8339 r = r600_bytecode_add_alu(ctx->bc, &alu);
8340 if (r)
8341 return r;
8342 #if 0
8343 /* visualize the FMASK */
8344 for (i = 0; i < 4; i++) {
8345 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8346 alu.op = ALU_OP1_INT_TO_FLT;
8347 alu.src[0].sel = src_gpr;
8348 alu.src[0].chan = sample_chan;
8349 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8350 alu.dst.chan = i;
8351 alu.dst.write = 1;
8352 alu.last = 1;
8353 r = r600_bytecode_add_alu(ctx->bc, &alu);
8354 if (r)
8355 return r;
8356 }
8357 return 0;
8358 #endif
8359 }
8360
8361 /* does this shader want the number of layers from TXQ for a cube array? */
8362 if (has_txq_cube_array_z) {
8363 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8364
8365 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8366 alu.op = ALU_OP1_MOV;
8367
8368 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
8369 if (ctx->bc->gfx_level >= EVERGREEN) {
8370 /* on evergreen each dword is the number of cubes */
8371 alu.src[0].sel += id / 4;
8372 alu.src[0].chan = id % 4;
8373 } else {
8374 /* on r600 we have them at channel 2 of the second dword */
8375 alu.src[0].sel += (id * 2) + 1;
8376 alu.src[0].chan = 2;
8377 }
8378 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
8379 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
8380 alu.last = 1;
8381 r = r600_bytecode_add_alu(ctx->bc, &alu);
8382 if (r)
8383 return r;
8384 /* disable writemask from texture instruction */
8385 inst->Dst[0].Register.WriteMask &= ~4;
8386 }
8387
8388 opcode = ctx->inst_info->op;
8389 if (opcode == FETCH_OP_GATHER4 &&
8390 inst->TexOffsets[0].File != TGSI_FILE_NULL &&
8391 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
8392 struct r600_bytecode_tex *t;
8393 opcode = FETCH_OP_GATHER4_O;
8394
8395 /* GATHER4_O/GATHER4_C_O use offset values loaded by
8396 SET_TEXTURE_OFFSETS instruction. The immediate offset values
8397 encoded in the instruction are ignored. */
8398 t = &grad_offs[n_grad_offs++];
8399 memset(t, 0, sizeof(struct r600_bytecode_tex));
8400 t->op = FETCH_OP_SET_TEXTURE_OFFSETS;
8401 t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8402 t->sampler_index_mode = sampler_index_mode;
8403 t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
8404 t->resource_index_mode = sampler_index_mode;
8405
8406 t->src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
8407 t->src_sel_x = inst->TexOffsets[0].SwizzleX;
8408 t->src_sel_y = inst->TexOffsets[0].SwizzleY;
8409 if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8410 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)
8411 /* make sure array index selector is 0, this is just a safety
8412 * precaution because TGSI seems to emit something strange here */
8413 t->src_sel_z = 4;
8414 else
8415 t->src_sel_z = inst->TexOffsets[0].SwizzleZ;
8416
8417 t->src_sel_w = 4;
8418
8419 t->dst_sel_x = 7;
8420 t->dst_sel_y = 7;
8421 t->dst_sel_z = 7;
8422 t->dst_sel_w = 7;
8423 }
8424
8425 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
8426 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8427 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
8428 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
8429 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
8430 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
8431 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
8432 switch (opcode) {
8433 case FETCH_OP_SAMPLE:
8434 opcode = FETCH_OP_SAMPLE_C;
8435 break;
8436 case FETCH_OP_SAMPLE_L:
8437 opcode = FETCH_OP_SAMPLE_C_L;
8438 break;
8439 case FETCH_OP_SAMPLE_LB:
8440 opcode = FETCH_OP_SAMPLE_C_LB;
8441 break;
8442 case FETCH_OP_SAMPLE_G:
8443 opcode = FETCH_OP_SAMPLE_C_G;
8444 break;
8445 /* Texture gather variants */
8446 case FETCH_OP_GATHER4:
8447 opcode = FETCH_OP_GATHER4_C;
8448 break;
8449 case FETCH_OP_GATHER4_O:
8450 opcode = FETCH_OP_GATHER4_C_O;
8451 break;
8452 }
8453 }
8454
8455 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8456 tex.op = opcode;
8457
8458 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8459 tex.sampler_index_mode = sampler_index_mode;
8460 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8461 tex.resource_index_mode = sampler_index_mode;
8462 tex.src_gpr = src_gpr;
8463 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8464
8465 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
8466 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
8467 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
8468 }
8469
8470 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
8471 if (inst->Src[1].Register.File != TGSI_FILE_IMMEDIATE) {
8472 /* TGSI doesn't have a spot to put the component for
8473 * shadowcubes, so it drops it on the floor. Just
8474 * assume the user wanted component 0 (it's a shadow,
8475 * anything else would be absurd).
8476 */
8477 assert(inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY);
8478 tex.inst_mod = 0;
8479 } else {
8480 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
8481 tex.inst_mod = texture_component_select;
8482 }
8483
8484 if (ctx->bc->gfx_level == CAYMAN) {
8485 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8486 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8487 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8488 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8489 } else {
8490 /* GATHER4 result order is different from TGSI TG4 */
8491 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 1 : 7;
8492 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 2 : 7;
8493 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 0 : 7;
8494 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8495 }
8496 }
8497 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
8498 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8499 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8500 tex.dst_sel_z = 7;
8501 tex.dst_sel_w = 7;
8502 }
8503 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
8504 tex.dst_sel_x = 3;
8505 tex.dst_sel_y = 7;
8506 tex.dst_sel_z = 7;
8507 tex.dst_sel_w = 7;
8508 }
8509 else {
8510 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8511 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8512 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8513 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8514 }
8515
8516
8517 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
8518 tex.src_sel_x = 4;
8519 tex.src_sel_y = 4;
8520 tex.src_sel_z = 4;
8521 tex.src_sel_w = 4;
8522 } else if (src_loaded) {
8523 tex.src_sel_x = 0;
8524 tex.src_sel_y = 1;
8525 tex.src_sel_z = 2;
8526 tex.src_sel_w = 3;
8527 } else {
8528 tex.src_sel_x = ctx->src[0].swizzle[0];
8529 tex.src_sel_y = ctx->src[0].swizzle[1];
8530 tex.src_sel_z = ctx->src[0].swizzle[2];
8531 tex.src_sel_w = ctx->src[0].swizzle[3];
8532 tex.src_rel = ctx->src[0].rel;
8533 }
8534
8535 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
8536 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
8537 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8538 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
8539 tex.src_sel_x = 1;
8540 tex.src_sel_y = 0;
8541 tex.src_sel_z = 3;
8542 tex.src_sel_w = 2; /* route Z compare or Lod value into W */
8543 }
8544
8545 if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
8546 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
8547 tex.coord_type_x = 1;
8548 tex.coord_type_y = 1;
8549 }
8550 tex.coord_type_z = 1;
8551 tex.coord_type_w = 1;
8552
8553 tex.offset_x = offset_x;
8554 tex.offset_y = offset_y;
8555 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
8556 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8557 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
8558 tex.offset_z = 0;
8559 }
8560 else {
8561 tex.offset_z = offset_z;
8562 }
8563
8564 /* Put the depth for comparison in W.
8565 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
8566 * Some instructions expect the depth in Z. */
8567 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
8568 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8569 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
8570 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
8571 opcode != FETCH_OP_SAMPLE_C_L &&
8572 opcode != FETCH_OP_SAMPLE_C_LB) {
8573 tex.src_sel_w = tex.src_sel_z;
8574 }
8575
8576 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
8577 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
8578 if (opcode == FETCH_OP_SAMPLE_C_L ||
8579 opcode == FETCH_OP_SAMPLE_C_LB) {
8580 /* the array index is read from Y */
8581 tex.coord_type_y = 0;
8582 array_index_offset_channel = tex.src_sel_y;
8583 } else {
8584 /* the array index is read from Z */
8585 tex.coord_type_z = 0;
8586 tex.src_sel_z = tex.src_sel_y;
8587 array_index_offset_channel = tex.src_sel_z;
8588 }
8589 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8590 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {
8591 tex.coord_type_z = 0;
8592 array_index_offset_channel = tex.src_sel_z;
8593 } else if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8594 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
8595 (ctx->bc->gfx_level >= EVERGREEN))
8596 /* the array index is read from Z, coordinate will be corrected elsewhere */
8597 tex.coord_type_z = 0;
8598
8599 /* We have array access to a 1D or 2D ARRAY and the coordinates are
8600 * floats, not ints -> round to evaluate the array index */
8601 if (array_index_offset_channel >= 0 &&
8602 opcode != FETCH_OP_LD &&
8603 opcode != FETCH_OP_GET_TEXTURE_RESINFO) {
8604 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8605 alu.src[0].sel = tex.src_gpr;
8606 alu.src[0].chan = array_index_offset_channel;
8607 alu.src[0].rel = tex.src_rel;
8608 alu.op = ALU_OP1_RNDNE;
8609 alu.dst.sel = tex.src_gpr;
8610 alu.dst.chan = array_index_offset_channel;
8611 alu.dst.rel = tex.src_rel;
8612 alu.dst.write = 1;
8613 alu.last = 1;
8614 r = r600_bytecode_add_alu(ctx->bc, &alu);
8615 if (r)
8616 return r;
8617 }
8618
8619 /* mask unused source components */
8620 if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
8621 switch (inst->Texture.Texture) {
8622 case TGSI_TEXTURE_2D:
8623 case TGSI_TEXTURE_RECT:
8624 tex.src_sel_z = 7;
8625 tex.src_sel_w = 7;
8626 break;
8627 case TGSI_TEXTURE_1D_ARRAY:
8628 tex.src_sel_y = 7;
8629 tex.src_sel_w = 7;
8630 break;
8631 case TGSI_TEXTURE_1D:
8632 tex.src_sel_y = 7;
8633 tex.src_sel_z = 7;
8634 tex.src_sel_w = 7;
8635 break;
8636 }
8637 }
8638
8639 /* Emit set gradient and offset instructions. */
8640 for (i = 0; i < n_grad_offs; ++i) {
8641 r = r600_bytecode_add_tex(ctx->bc, &grad_offs[i]);
8642 if (r)
8643 return r;
8644 }
8645
8646 r = r600_bytecode_add_tex(ctx->bc, &tex);
8647 if (r)
8648 return r;
8649
8650 /* add shadow ambient support - gallium doesn't do it yet */
8651 return 0;
8652 }
8653
8654 static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
8655 struct tgsi_full_src_register *src)
8656 {
8657 unsigned i;
8658
8659 uint32_t index = src->Register.Index;
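/* A counter's hw slot is the hw_idx of the range containing it plus its
 * offset from the range start, matched on the declaring buffer id. */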
8660 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
8661 if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
8662 continue;
8663 if (index > ctx->shader->atomics[i].end)
8664 continue;
8665 if (index < ctx->shader->atomics[i].start)
8666 continue;
8667 uint32_t offset = (index - ctx->shader->atomics[i].start);
8668 return ctx->shader->atomics[i].hw_idx + offset;
8669 }
8670 assert(0);
8671 return -1;
8672 }
8673
8674 static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
8675 int *uav_id_p, int *uav_index_mode_p)
8676 {
8677 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8678 int uav_id, uav_index_mode = 0;
8679 int r;
8680 bool is_cm = (ctx->bc->gfx_level == CAYMAN);
8681
8682 uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
8683
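/* On Cayman the GDS address comes from a register rather than from the CF
 * word's uav_id/index_mode fields, so stage uav_id * 4 (one dword per
 * counter) in temp_reg, adding ADDR << 2 first for indirect accesses. */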
8684 if (inst->Src[0].Register.Indirect) {
8685 if (is_cm) {
8686 struct r600_bytecode_alu alu;
8687 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8688 alu.op = ALU_OP2_LSHL_INT;
8689 alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
8690 alu.src[0].chan = 0;
8691 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8692 alu.src[1].value = 2;
8693 alu.dst.sel = ctx->temp_reg;
8694 alu.dst.chan = 0;
8695 alu.dst.write = 1;
8696 alu.last = 1;
8697 r = r600_bytecode_add_alu(ctx->bc, &alu);
8698 if (r)
8699 return r;
8700
8701 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
8702 ctx->temp_reg, 0,
8703 ctx->temp_reg, 0,
8704 V_SQ_ALU_SRC_LITERAL, uav_id * 4);
8705 if (r)
8706 return r;
8707 } else
8708 uav_index_mode = 2;
8709 } else if (is_cm) {
8710 r = single_alu_op2(ctx, ALU_OP1_MOV,
8711 ctx->temp_reg, 0,
8712 V_SQ_ALU_SRC_LITERAL, uav_id * 4,
8713 0, 0);
8714 if (r)
8715 return r;
8716 }
8717 *uav_id_p = uav_id;
8718 *uav_index_mode_p = uav_index_mode;
8719 return 0;
8720 }
8721
8722 static int tgsi_load_gds(struct r600_shader_ctx *ctx)
8723 {
8724 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8725 int r;
8726 struct r600_bytecode_gds gds;
8727 int uav_id = 0;
8728 int uav_index_mode = 0;
8729 bool is_cm = (ctx->bc->gfx_level == CAYMAN);
8730
8731 r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
8732 if (r)
8733 return r;
8734
8735 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
8736 gds.op = FETCH_OP_GDS_READ_RET;
8737 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8738 gds.uav_id = is_cm ? 0 : uav_id;
8739 gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
8740 gds.src_gpr = ctx->temp_reg;
8741 gds.src_sel_x = (is_cm) ? 0 : 4;
8742 gds.src_sel_y = 4;
8743 gds.src_sel_z = 4;
8744 gds.dst_sel_x = 0;
8745 gds.dst_sel_y = 7;
8746 gds.dst_sel_z = 7;
8747 gds.dst_sel_w = 7;
8748 gds.src_gpr2 = 0;
8749 gds.alloc_consume = !is_cm;
8750 r = r600_bytecode_add_gds(ctx->bc, &gds);
8751 if (r)
8752 return r;
8753
8754 ctx->bc->cf_last->vpm = 1;
8755 return 0;
8756 }
8757
8758 /* load a memory instruction's index source; this fixes up 1D arrays properly */
8759 static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
8760 {
8761 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8762 int r, i;
8763 struct r600_bytecode_alu alu;
8764 int temp_reg = r600_get_temp(ctx);
8765
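/* Build the four-component resource index the RAT path expects: channels
 * beyond the target's dimensionality are written as 0, and for 1D arrays
 * the layer index is routed from source .y into .z. */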
8766 for (i = 0; i < 4; i++) {
8767 bool def_val = true, write_zero = false;
8768 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8769 alu.op = ALU_OP1_MOV;
8770 alu.dst.sel = temp_reg;
8771 alu.dst.chan = i;
8772
8773 switch (inst->Memory.Texture) {
8774 case TGSI_TEXTURE_BUFFER:
8775 case TGSI_TEXTURE_1D:
8776 if (i == 1 || i == 2 || i == 3) {
8777 write_zero = true;
8778 }
8779 break;
8780 case TGSI_TEXTURE_1D_ARRAY:
8781 if (i == 1 || i == 3)
8782 write_zero = true;
8783 else if (i == 2) {
8784 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
8785 def_val = false;
8786 }
8787 break;
8788 case TGSI_TEXTURE_2D:
8789 if (i == 2 || i == 3)
8790 write_zero = true;
8791 break;
8792 default:
8793 if (i == 3)
8794 write_zero = true;
8795 break;
8796 }
8797
8798 if (write_zero) {
8799 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
8800 alu.src[0].value = 0;
8801 } else if (def_val) {
8802 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
8803 }
8804
8805 if (i == 3)
8806 alu.last = 1;
8807 alu.dst.write = 1;
8808 r = r600_bytecode_add_alu(ctx->bc, &alu);
8809 if (r)
8810 return r;
8811 }
8812 *idx_gpr = temp_reg;
8813 return 0;
8814 }
8815
8816 static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx,
8817 int temp_reg)
8818 {
8819 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8820 int r;
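/* The source supplies a byte offset but the fetch indexes dwords, so
 * divide by four (>> 2) whether the offset is an immediate or lives in a
 * register. */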
8821 if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) {
8822 int value = (ctx->literals[4 * inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]);
8823 r = single_alu_op2(ctx, ALU_OP1_MOV,
8824 temp_reg, 0,
8825 V_SQ_ALU_SRC_LITERAL, value >> 2,
8826 0, 0);
8827 if (r)
8828 return r;
8829 } else {
8830 struct r600_bytecode_alu alu;
8831 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8832 alu.op = ALU_OP2_LSHR_INT;
8833 r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0);
8834 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8835 alu.src[1].value = 2;
8836 alu.dst.sel = temp_reg;
8837 alu.dst.write = 1;
8838 alu.last = 1;
8839 r = r600_bytecode_add_alu(ctx->bc, &alu);
8840 if (r)
8841 return r;
8842 }
8843 return 0;
8844 }
8845
8846 /* ADDR[1,2] are stored in index_reg[0,1] on EG, and can be used for indexing
8847 * images and ssbos. We assume that indirects are indexed by ADDR[2], as that's
8848 * what GLSL-to-TGSI emitted.
8849 */
8850 static unsigned tgsi_indirect_to_rat_index_mode(struct tgsi_ind_register ind)
8851 {
8852 if (ind.File == TGSI_FILE_NULL)
8853 return 0; /* CF_INDEX_NONE */
8854 else {
8855 assert(ind.Index == 2);
8856 return 2; /* CF_INDEX_1 */
8857 }
8858 }
8859
8860 static int tgsi_load_buffer(struct r600_shader_ctx *ctx)
8861 {
8862 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8863 /* have to work out the offset into the RAT immediate return buffer */
8864 struct r600_bytecode_vtx vtx;
8865 struct r600_bytecode_cf *cf;
8866 int r;
8867 int temp_reg = r600_get_temp(ctx);
8868 unsigned rat_index_mode = tgsi_indirect_to_rat_index_mode(inst->Src[0].Indirect);
8869 unsigned base;
8870
8871 base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE];
8872
8873 r = load_buffer_coord(ctx, 1, temp_reg);
8874 if (r)
8875 return r;
8876 ctx->bc->cf_last->barrier = 1;
8877 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8878 vtx.op = FETCH_OP_VFETCH;
8879 vtx.buffer_id = inst->Src[0].Register.Index + base;
8880 vtx.buffer_index_mode = rat_index_mode;
8881 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8882 vtx.src_gpr = temp_reg;
8883 vtx.src_sel_x = 0;
8884 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8885 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
8886 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
8887 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
8888 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
8889 vtx.num_format_all = 1;
8890 vtx.format_comp_all = 1;
8891 vtx.srf_mode_all = 0;
8892
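/* Pick the narrowest 32-bit-per-component fetch format that still covers
 * the highest written destination component. */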
8893 if (inst->Dst[0].Register.WriteMask & 8) {
8894 vtx.data_format = FMT_32_32_32_32;
8895 vtx.use_const_fields = 0;
8896 } else if (inst->Dst[0].Register.WriteMask & 4) {
8897 vtx.data_format = FMT_32_32_32;
8898 vtx.use_const_fields = 0;
8899 } else if (inst->Dst[0].Register.WriteMask & 2) {
8900 vtx.data_format = FMT_32_32;
8901 vtx.use_const_fields = 0;
8902 } else {
8903 vtx.data_format = FMT_32;
8904 vtx.use_const_fields = 0;
8905 }
8906
8907 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8908 if (r)
8909 return r;
8910 cf = ctx->bc->cf_last;
8911 cf->barrier = 1;
8912 return 0;
8913 }
8914
8915 static int tgsi_load_rat(struct r600_shader_ctx *ctx)
8916 {
8917 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8918 /* have to work out the offset into the RAT immediate return buffer */
8919 struct r600_bytecode_vtx vtx;
8920 struct r600_bytecode_cf *cf;
8921 int r;
8922 int idx_gpr;
8923 unsigned format, num_format, format_comp, endian;
8924 const struct util_format_description *desc;
8925 unsigned rat_index_mode = tgsi_indirect_to_rat_index_mode(inst->Src[0].Indirect);
8926 unsigned immed_base;
8927
8928 immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
8929 r = load_index_src(ctx, 1, &idx_gpr);
8930 if (r)
8931 return r;
8932
8933 if (rat_index_mode)
8934 egcm_load_index_reg(ctx->bc, 1, false);
8935
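/* Typed RAT loads take two steps: a MEM_RAT NOP_RTN export makes the RAT
 * return the addressed texel into the immediate return buffer, and once
 * the ack arrives a VFETCH indexed by the thread id reads it back in the
 * requested format. */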
8936 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8937 cf = ctx->bc->cf_last;
8938
8939 cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
8940 cf->rat.inst = V_RAT_INST_NOP_RTN;
8941 cf->rat.index_mode = rat_index_mode;
8942 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
8943 cf->output.gpr = ctx->thread_id_gpr;
8944 cf->output.index_gpr = idx_gpr;
8945 cf->output.comp_mask = 0xf;
8946 cf->output.burst_count = 1;
8947 cf->vpm = 1;
8948 cf->barrier = 1;
8949 cf->mark = 1;
8950 cf->output.elem_size = 0;
8951
8952 r600_bytecode_add_ack(ctx->bc);
8953 r600_bytecode_wait_acks(ctx->bc);
8954
8955 desc = util_format_description(inst->Memory.Format);
8956 r600_vertex_data_type(inst->Memory.Format,
8957 &format, &num_format, &format_comp, &endian);
8958 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8959 vtx.op = FETCH_OP_VFETCH;
8960 vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
8961 vtx.buffer_index_mode = rat_index_mode;
8962 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8963 vtx.src_gpr = ctx->thread_id_gpr;
8964 vtx.src_sel_x = 1;
8965 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8966 vtx.dst_sel_x = desc->swizzle[0];
8967 vtx.dst_sel_y = desc->swizzle[1];
8968 vtx.dst_sel_z = desc->swizzle[2];
8969 vtx.dst_sel_w = desc->swizzle[3];
8970 vtx.srf_mode_all = 1;
8971 vtx.data_format = format;
8972 vtx.num_format_all = num_format;
8973 vtx.format_comp_all = format_comp;
8974 vtx.endian = endian;
8975 vtx.offset = 0;
8976 vtx.mega_fetch_count = 3;
8977 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8978 if (r)
8979 return r;
8980 cf = ctx->bc->cf_last;
8981 cf->barrier = 1;
8982 return 0;
8983 }
8984
8985 static int tgsi_load_lds(struct r600_shader_ctx *ctx)
8986 {
8987 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8988 struct r600_bytecode_alu alu;
8989 int r;
8990 int temp_reg = r600_get_temp(ctx);
8991
8992 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8993 alu.op = ALU_OP1_MOV;
8994 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
8995 alu.dst.sel = temp_reg;
8996 alu.dst.write = 1;
8997 alu.last = 1;
8998 r = r600_bytecode_add_alu(ctx->bc, &alu);
8999 if (r)
9000 return r;
9001
9002 r = do_lds_fetch_values(ctx, temp_reg,
9003 ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask);
9004 if (r)
9005 return r;
9006 return 0;
9007 }
9008
9009 static int tgsi_load(struct r600_shader_ctx *ctx)
9010 {
9011 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9012 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
9013 return tgsi_load_rat(ctx);
9014 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
9015 return tgsi_load_gds(ctx);
9016 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
9017 return tgsi_load_buffer(ctx);
9018 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
9019 return tgsi_load_lds(ctx);
9020 return 0;
9021 }
9022
9023 static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
9024 {
9025 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9026 struct r600_bytecode_cf *cf;
9027 int r, i;
9028 unsigned rat_index_mode = tgsi_indirect_to_rat_index_mode(inst->Dst[0].Indirect);
9029 int lasti;
9030 int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx);
9031
9032 r = load_buffer_coord(ctx, 0, treg2);
9033 if (r)
9034 return r;
9035
9036 if (rat_index_mode)
9037 egcm_load_index_reg(ctx->bc, 1, false);
9038
9039 for (i = 0; i <= 3; i++) {
9040 struct r600_bytecode_alu alu;
9041 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9042 alu.op = ALU_OP1_MOV;
9043 alu.dst.sel = temp_reg;
9044 alu.dst.chan = i;
9045 alu.src[0].sel = V_SQ_ALU_SRC_0;
9046 alu.last = (i == 3);
9047 alu.dst.write = 1;
9048 r = r600_bytecode_add_alu(ctx->bc, &alu);
9049 if (r)
9050 return r;
9051 }
9052
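/* Buffer stores go out one dword at a time: for each written component the
 * index GPR is set to base + i and a STORE_TYPED with comp_mask = 1 writes
 * the single value staged in ctx->temp_reg.x. */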
9053 cf = NULL;
9054 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9055 for (i = 0; i <= lasti; i++) {
9056 struct r600_bytecode_alu alu;
9057 if (!((1 << i) & inst->Dst[0].Register.WriteMask))
9058 continue;
9059
9060 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
9061 temp_reg, 0,
9062 treg2, 0,
9063 V_SQ_ALU_SRC_LITERAL, i);
9064 if (r)
9065 return r;
9066
9067 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9068 alu.op = ALU_OP1_MOV;
9069 alu.dst.sel = ctx->temp_reg;
9070 alu.dst.chan = 0;
9071
9072 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
9073 alu.last = 1;
9074 alu.dst.write = 1;
9075 r = r600_bytecode_add_alu(ctx->bc, &alu);
9076 if (r)
9077 return r;
9078
9079 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
9080 cf = ctx->bc->cf_last;
9081
9082 cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE];
9083 cf->rat.inst = V_RAT_INST_STORE_TYPED;
9084 cf->rat.index_mode = rat_index_mode;
9085 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
9086 cf->output.gpr = ctx->temp_reg;
9087 cf->output.index_gpr = temp_reg;
9088 cf->output.comp_mask = 1;
9089 cf->output.burst_count = 1;
9090 cf->vpm = 1;
9091 cf->barrier = 1;
9092 cf->output.elem_size = 0;
9093 }
9094
9095 /* Request an ack from the last write emitted. */
9096 if (cf) {
9097 cf->mark = true;
9098 cf->output.type = r600_bytecode_write_export_ack_type(ctx->bc, true);
9099 r600_bytecode_add_ack(ctx->bc);
9100 }
9101
9102 return 0;
9103 }
9104
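/* Store to an image RAT: gather the source into ctx->temp_reg unless it is
 * already a temporary, then emit a single MEM_RAT STORE_TYPED covering all
 * four components and request an ack so the write can be waited on. */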
9105 static int tgsi_store_rat(struct r600_shader_ctx *ctx)
9106 {
9107 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9108 struct r600_bytecode_cf *cf;
9109 bool src_requires_loading = false;
9110 int val_gpr, idx_gpr;
9111 int r, i;
9112 unsigned rat_index_mode = tgsi_indirect_to_rat_index_mode(inst->Dst[0].Indirect);
9113
9114 r = load_index_src(ctx, 0, &idx_gpr);
9115 if (r)
9116 return r;
9117
9118 if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)
9119 src_requires_loading = true;
9120
9121 if (src_requires_loading) {
9122 struct r600_bytecode_alu alu;
9123 for (i = 0; i < 4; i++) {
9124 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9125 alu.op = ALU_OP1_MOV;
9126 alu.dst.sel = ctx->temp_reg;
9127 alu.dst.chan = i;
9128
9129 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
9130 if (i == 3)
9131 alu.last = 1;
9132 alu.dst.write = 1;
9133 r = r600_bytecode_add_alu(ctx->bc, &alu);
9134 if (r)
9135 return r;
9136 }
9137 val_gpr = ctx->temp_reg;
9138 } else
9139 val_gpr = tgsi_tex_get_src_gpr(ctx, 1);
9140 if (rat_index_mode)
9141 egcm_load_index_reg(ctx->bc, 1, false);
9142
9143 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
9144 cf = ctx->bc->cf_last;
9145
9146 cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
9147 cf->rat.inst = V_RAT_INST_STORE_TYPED;
9148 cf->rat.index_mode = rat_index_mode;
9149 cf->output.type = r600_bytecode_write_export_ack_type(ctx->bc, true);
9150 cf->output.gpr = val_gpr;
9151 cf->output.index_gpr = idx_gpr;
9152 cf->output.comp_mask = 0xf;
9153 cf->output.burst_count = 1;
9154 cf->vpm = 1;
9155 cf->barrier = 1;
9156 cf->output.elem_size = 0;
9157 cf->mark = 1;
9158
9159 r600_bytecode_add_ack(ctx->bc);
9160
9161 return 0;
9162 }
9163
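/* Store to LDS: stage the base byte address in channel 0 of a temp, derive
 * per-channel addresses (base + 4 * channel), then write each enabled
 * channel; adjacent xy/zw pairs are merged into one LDS_WRITE_REL. */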
9164 static int tgsi_store_lds(struct r600_shader_ctx *ctx)
9165 {
9166 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9167 struct r600_bytecode_alu alu;
9168 int r, i, lasti;
9169 int write_mask = inst->Dst[0].Register.WriteMask;
9170 int temp_reg = r600_get_temp(ctx);
9171
9172 /* LDS write */
9173 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9174 alu.op = ALU_OP1_MOV;
9175 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9176 alu.dst.sel = temp_reg;
9177 alu.dst.write = 1;
9178 alu.last = 1;
9179 r = r600_bytecode_add_alu(ctx->bc, &alu);
9180 if (r)
9181 return r;
9182
9183 lasti = tgsi_last_instruction(write_mask);
9184 for (i = 1; i <= lasti; i++) {
9185 if (!(write_mask & (1 << i)))
9186 continue;
9187 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
9188 temp_reg, i,
9189 temp_reg, 0,
9190 V_SQ_ALU_SRC_LITERAL, 4 * i);
9191 if (r)
9192 return r;
9193 }
9194 for (i = 0; i <= lasti; i++) {
9195 if (!(write_mask & (1 << i)))
9196 continue;
9197
9198 if ((i == 0 && ((write_mask & 3) == 3)) ||
9199 (i == 2 && ((write_mask & 0xc) == 0xc))) {
9200 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9201 alu.op = LDS_OP3_LDS_WRITE_REL;
9202
9203 alu.src[0].sel = temp_reg;
9204 alu.src[0].chan = i;
9205 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
9206 r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1);
9207 alu.last = 1;
9208 alu.is_lds_idx_op = true;
9209 alu.lds_idx = 1;
9210 r = r600_bytecode_add_alu(ctx->bc, &alu);
9211 if (r)
9212 return r;
9213 i += 1;
9214 continue;
9215 }
9216 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9217 alu.op = LDS_OP2_LDS_WRITE;
9218
9219 alu.src[0].sel = temp_reg;
9220 alu.src[0].chan = i;
9221 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
9222
9223 alu.last = 1;
9224 alu.is_lds_idx_op = true;
9225
9226 r = r600_bytecode_add_alu(ctx->bc, &alu);
9227 if (r)
9228 return r;
9229 }
9230 return 0;
9231 }
9232
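/* Dispatch TGSI STORE by destination file: SSBOs use the per-channel buffer
 * RAT path, compute shared memory goes to LDS, and images take the typed
 * RAT store. */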
9233 static int tgsi_store(struct r600_shader_ctx *ctx)
9234 {
9235 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9236 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
9237 return tgsi_store_buffer_rat(ctx);
9238 else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
9239 return tgsi_store_lds(ctx);
9240 else
9241 return tgsi_store_rat(ctx);
9242 }
9243
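/* Atomic on an image or SSBO through a RAT: stage the operand (and the
 * compare value for CMPXCHG) in thread_id_gpr, issue the MEM_RAT atomic,
 * wait for its ack, then read the old value back with a vertex fetch from
 * the immediate-return buffer. */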
9244 static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
9245 {
9246 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9247 /* have to work out the offset into the RAT immediate return buffer */
9248 struct r600_bytecode_alu alu;
9249 struct r600_bytecode_vtx vtx;
9250 struct r600_bytecode_cf *cf;
9251 int r;
9252 int idx_gpr;
9253 unsigned format, num_format, format_comp, endian;
9254 const struct util_format_description *desc;
9255 unsigned rat_index_mode = tgsi_indirect_to_rat_index_mode(inst->Src[0].Indirect);
9256 unsigned immed_base;
9257 unsigned rat_base;
9258
9259 immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
9260 rat_base = ctx->shader->rat_base;
9261
9262 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
9263 immed_base += ctx->info.file_count[TGSI_FILE_IMAGE];
9264 rat_base += ctx->info.file_count[TGSI_FILE_IMAGE];
9265
9266 r = load_buffer_coord(ctx, 1, ctx->temp_reg);
9267 if (r)
9268 return r;
9269 idx_gpr = ctx->temp_reg;
9270 } else {
9271 r = load_index_src(ctx, 1, &idx_gpr);
9272 if (r)
9273 return r;
9274 }
9275
9276 if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {
9277 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9278 alu.op = ALU_OP1_MOV;
9279 alu.dst.sel = ctx->thread_id_gpr;
9280 alu.dst.chan = 0;
9281 alu.dst.write = 1;
9282 r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
9283 alu.last = 1;
9284 r = r600_bytecode_add_alu(ctx->bc, &alu);
9285 if (r)
9286 return r;
9287
9288 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9289 alu.op = ALU_OP1_MOV;
9290 alu.dst.sel = ctx->thread_id_gpr;
9291 if (ctx->bc->gfx_level == CAYMAN)
9292 alu.dst.chan = 2;
9293 else
9294 alu.dst.chan = 3;
9295 alu.dst.write = 1;
9296 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
9297 alu.last = 1;
9298 r = r600_bytecode_add_alu(ctx->bc, &alu);
9299 if (r)
9300 return r;
9301 } else {
9302 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9303 alu.op = ALU_OP1_MOV;
9304 alu.dst.sel = ctx->thread_id_gpr;
9305 alu.dst.chan = 0;
9306 alu.dst.write = 1;
9307 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
9308 alu.last = 1;
9309 r = r600_bytecode_add_alu(ctx->bc, &alu);
9310 if (r)
9311 return r;
9312 }
9313
9314 if (rat_index_mode)
9315 egcm_load_index_reg(ctx->bc, 1, false);
9316 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
9317 cf = ctx->bc->cf_last;
9318
9319 cf->rat.id = rat_base + inst->Src[0].Register.Index;
9320 cf->rat.inst = ctx->inst_info->op;
9321 cf->rat.index_mode = rat_index_mode;
9322 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
9323 cf->output.gpr = ctx->thread_id_gpr;
9324 cf->output.index_gpr = idx_gpr;
9325 cf->output.comp_mask = 0xf;
9326 cf->output.burst_count = 1;
9327 cf->vpm = 1;
9328 cf->barrier = 1;
9329 cf->mark = 1;
9330 cf->output.elem_size = 0;
9331
9332 r600_bytecode_add_ack(ctx->bc);
9333 r600_bytecode_wait_acks(ctx->bc);
9334
9335 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
9336 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
9337 desc = util_format_description(inst->Memory.Format);
9338 r600_vertex_data_type(inst->Memory.Format,
9339 &format, &num_format, &format_comp, &endian);
9340 vtx.dst_sel_x = desc->swizzle[0];
9341 } else {
9342 format = FMT_32;
9343 num_format = 1;
9344 format_comp = 0;
9345 endian = 0;
9346 vtx.dst_sel_x = 0;
9347 }
9348 vtx.op = FETCH_OP_VFETCH;
9349 vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
9350 vtx.buffer_index_mode = rat_index_mode;
9351 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
9352 vtx.src_gpr = ctx->thread_id_gpr;
9353 vtx.src_sel_x = 1;
9354 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
9355 vtx.dst_sel_y = 7;
9356 vtx.dst_sel_z = 7;
9357 vtx.dst_sel_w = 7;
9358 vtx.use_const_fields = 0;
9359 vtx.srf_mode_all = 1;
9360 vtx.data_format = format;
9361 vtx.num_format_all = num_format;
9362 vtx.format_comp_all = format_comp;
9363 vtx.endian = endian;
9364 vtx.offset = 0;
9365 vtx.mega_fetch_count = 0xf;
9366 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
9367 if (r)
9368 return r;
9369 cf = ctx->bc->cf_last;
9370 cf->vpm = 1;
9371 cf->barrier = 1;
9372 return 0;
9373 }
9374
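/* Map a TGSI atomic opcode to the corresponding returning GDS fetch op;
 * -1 if there is no GDS equivalent. */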
9375 static int get_gds_op(int opcode)
9376 {
9377 switch (opcode) {
9378 case TGSI_OPCODE_ATOMUADD:
9379 return FETCH_OP_GDS_ADD_RET;
9380 case TGSI_OPCODE_ATOMAND:
9381 return FETCH_OP_GDS_AND_RET;
9382 case TGSI_OPCODE_ATOMOR:
9383 return FETCH_OP_GDS_OR_RET;
9384 case TGSI_OPCODE_ATOMXOR:
9385 return FETCH_OP_GDS_XOR_RET;
9386 case TGSI_OPCODE_ATOMUMIN:
9387 return FETCH_OP_GDS_MIN_UINT_RET;
9388 case TGSI_OPCODE_ATOMUMAX:
9389 return FETCH_OP_GDS_MAX_UINT_RET;
9390 case TGSI_OPCODE_ATOMIMIN:
9391 return FETCH_OP_GDS_MIN_INT_RET;
9392 case TGSI_OPCODE_ATOMIMAX:
9393 return FETCH_OP_GDS_MAX_INT_RET;
9394 case TGSI_OPCODE_ATOMXCHG:
9395 return FETCH_OP_GDS_XCHG_RET;
9396 case TGSI_OPCODE_ATOMCAS:
9397 return FETCH_OP_GDS_CMP_XCHG_RET;
9398 default:
9399 return -1;
9400 }
9401 }
9402
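/* Atomic on a HW atomic counter in GDS. Cayman does not use the
 * alloc/consume uav_id path, so operands are staged in different channels
 * there, and an ADD of a negative immediate is rewritten as GDS_SUB of its
 * absolute value. */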
9403 static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
9404 {
9405 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9406 struct r600_bytecode_gds gds;
9407 struct r600_bytecode_alu alu;
9408 int gds_op = get_gds_op(inst->Instruction.Opcode);
9409 int r;
9410 int uav_id = 0;
9411 int uav_index_mode = 0;
9412 bool is_cm = (ctx->bc->gfx_level == CAYMAN);
9413
9414 if (gds_op == -1) {
9415 fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
9416 return -1;
9417 }
9418
9419 r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
9420 if (r)
9421 return r;
9422
9423 if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) {
9424 if (inst->Src[3].Register.File == TGSI_FILE_IMMEDIATE) {
9425 int value = (ctx->literals[4 * inst->Src[3].Register.Index + inst->Src[3].Register.SwizzleX]);
9426 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9427 alu.op = ALU_OP1_MOV;
9428 alu.dst.sel = ctx->temp_reg;
9429 alu.dst.chan = is_cm ? 2 : 1;
9430 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
9431 alu.src[0].value = value;
9432 alu.last = 1;
9433 alu.dst.write = 1;
9434 r = r600_bytecode_add_alu(ctx->bc, &alu);
9435 if (r)
9436 return r;
9437 } else {
9438 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9439 alu.op = ALU_OP1_MOV;
9440 alu.dst.sel = ctx->temp_reg;
9441 alu.dst.chan = is_cm ? 2 : 1;
9442 r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
9443 alu.last = 1;
9444 alu.dst.write = 1;
9445 r = r600_bytecode_add_alu(ctx->bc, &alu);
9446 if (r)
9447 return r;
9448 }
9449 }
9450 if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
9451 int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
9452 int abs_value = abs(value);
9453 if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
9454 gds_op = FETCH_OP_GDS_SUB_RET;
9455 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9456 alu.op = ALU_OP1_MOV;
9457 alu.dst.sel = ctx->temp_reg;
9458 alu.dst.chan = is_cm ? 1 : 0;
9459 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
9460 alu.src[0].value = abs_value;
9461 alu.last = 1;
9462 alu.dst.write = 1;
9463 r = r600_bytecode_add_alu(ctx->bc, &alu);
9464 if (r)
9465 return r;
9466 } else {
9467 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9468 alu.op = ALU_OP1_MOV;
9469 alu.dst.sel = ctx->temp_reg;
9470 alu.dst.chan = is_cm ? 1 : 0;
9471 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
9472 alu.last = 1;
9473 alu.dst.write = 1;
9474 r = r600_bytecode_add_alu(ctx->bc, &alu);
9475 if (r)
9476 return r;
9477 }
9478
9480 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
9481 gds.op = gds_op;
9482 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
9483 gds.uav_id = is_cm ? 0 : uav_id;
9484 gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
9485 gds.src_gpr = ctx->temp_reg;
9486 gds.src_gpr2 = 0;
9487 gds.src_sel_x = is_cm ? 0 : 4;
9488 gds.src_sel_y = is_cm ? 1 : 0;
9489 if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET)
9490 gds.src_sel_z = is_cm ? 2 : 1;
9491 else
9492 gds.src_sel_z = 7;
9493 gds.dst_sel_x = 0;
9494 gds.dst_sel_y = 7;
9495 gds.dst_sel_z = 7;
9496 gds.dst_sel_w = 7;
9497 gds.alloc_consume = !is_cm;
9498
9499 r = r600_bytecode_add_gds(ctx->bc, &gds);
9500 if (r)
9501 return r;
9502 ctx->bc->cf_last->vpm = 1;
9503 return 0;
9504 }
9505
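/* Map a TGSI atomic opcode to the matching returning LDS ALU op; -1 if
 * unsupported. */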
9506 static int get_lds_op(int opcode)
9507 {
9508 switch (opcode) {
9509 case TGSI_OPCODE_ATOMUADD:
9510 return LDS_OP2_LDS_ADD_RET;
9511 case TGSI_OPCODE_ATOMAND:
9512 return LDS_OP2_LDS_AND_RET;
9513 case TGSI_OPCODE_ATOMOR:
9514 return LDS_OP2_LDS_OR_RET;
9515 case TGSI_OPCODE_ATOMXOR:
9516 return LDS_OP2_LDS_XOR_RET;
9517 case TGSI_OPCODE_ATOMUMIN:
9518 return LDS_OP2_LDS_MIN_UINT_RET;
9519 case TGSI_OPCODE_ATOMUMAX:
9520 return LDS_OP2_LDS_MAX_UINT_RET;
9521 case TGSI_OPCODE_ATOMIMIN:
9522 return LDS_OP2_LDS_MIN_INT_RET;
9523 case TGSI_OPCODE_ATOMIMAX:
9524 return LDS_OP2_LDS_MAX_INT_RET;
9525 case TGSI_OPCODE_ATOMXCHG:
9526 return LDS_OP2_LDS_XCHG_RET;
9527 case TGSI_OPCODE_ATOMCAS:
9528 return LDS_OP3_LDS_CMP_XCHG_RET;
9529 default:
9530 return -1;
9531 }
9532 }
9533
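/* Atomic on compute shared memory: issue the returning LDS op with address
 * and operand(s), then pop the previous value from the LDS output queue
 * into the destination. */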
9534 static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx)
9535 {
9536 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9537 int lds_op = get_lds_op(inst->Instruction.Opcode);
9538 int r;
9539
9540 struct r600_bytecode_alu alu;
9541 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9542 alu.op = lds_op;
9543 alu.is_lds_idx_op = true;
9544 alu.last = 1;
9545 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
9546 r600_bytecode_src(&alu.src[1], &ctx->src[2], 0);
9547 if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET)
9548 r600_bytecode_src(&alu.src[2], &ctx->src[3], 0);
9549 else
9550 alu.src[2].sel = V_SQ_ALU_SRC_0;
9551 r = r600_bytecode_add_alu(ctx->bc, &alu);
9552 if (r)
9553 return r;
9554
9555 /* then read from LDS_OQ_A_POP */
9556 memset(&alu, 0, sizeof(alu));
9557
9558 alu.op = ALU_OP1_MOV;
9559 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
9560 alu.src[0].chan = 0;
9561 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
9562 alu.dst.write = 1;
9563 alu.last = 1;
9564 r = r600_bytecode_add_alu(ctx->bc, &alu);
9565 if (r)
9566 return r;
9567
9568 return 0;
9569 }
9570
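/* Dispatch TGSI atomics by source file: RAT for images and SSBOs, GDS for
 * HW atomic counters, LDS for shared memory. */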
9571 static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
9572 {
9573 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9574 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
9575 return tgsi_atomic_op_rat(ctx);
9576 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
9577 return tgsi_atomic_op_gds(ctx);
9578 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
9579 return tgsi_atomic_op_rat(ctx);
9580 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
9581 return tgsi_atomic_op_lds(ctx);
9582 return 0;
9583 }
9584
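/* RESQ (resource size query): buffer targets are answered by
 * r600_do_buffer_txq(); other resources use a TXQ-style texture
 * instruction, with the cube-array layer count (dst.z) patched in
 * separately from the buffer-info constants when requested. */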
9585 static int tgsi_resq(struct r600_shader_ctx *ctx)
9586 {
9587 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9588 unsigned sampler_index_mode;
9589 struct r600_bytecode_tex tex;
9590 int r;
9591 boolean has_txq_cube_array_z = false;
9592
9593 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
9594 (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
9595 if (ctx->bc->gfx_level < EVERGREEN)
9596 ctx->shader->uses_tex_buffers = true;
9597 unsigned eg_buffer_base = R600_IMAGE_REAL_RESOURCE_OFFSET;
9599 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
9600 eg_buffer_base += ctx->info.file_count[TGSI_FILE_IMAGE];
9601 return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset, eg_buffer_base);
9602 }
9603
9604 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&
9605 inst->Dst[0].Register.WriteMask & 4) {
9606 ctx->shader->has_txq_cube_array_z_comp = true;
9607 has_txq_cube_array_z = true;
9608 }
9609
9610 sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
9611 if (sampler_index_mode)
9612 egcm_load_index_reg(ctx->bc, 1, false);
9613
9615 /* does this shader want the number of layers from TXQ for a cube array? */
9616 if (has_txq_cube_array_z) {
9617 int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset;
9618 struct r600_bytecode_alu alu;
9619
9620 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9621 alu.op = ALU_OP1_MOV;
9622
9623 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
9624 /* with eg each dword of the buffer info constants holds the cube count for one resource */
9625 alu.src[0].sel += id / 4;
9626 alu.src[0].chan = id % 4;
9627 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
9628 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
9629 alu.last = 1;
9630 r = r600_bytecode_add_alu(ctx->bc, &alu);
9631 if (r)
9632 return r;
9633 /* disable writemask from texture instruction */
9634 inst->Dst[0].Register.WriteMask &= ~4;
9635 }
9636 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
9637 tex.op = ctx->inst_info->op;
9638 tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index;
9639 tex.sampler_index_mode = sampler_index_mode;
9640 tex.resource_id = tex.sampler_id;
9641 tex.resource_index_mode = sampler_index_mode;
9642 tex.src_sel_x = 4;
9643 tex.src_sel_y = 4;
9644 tex.src_sel_z = 4;
9645 tex.src_sel_w = 4;
9646 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
9647 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
9648 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
9649 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
9650 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
9651 r = r600_bytecode_add_tex(ctx->bc, &tex);
9652 if (r)
9653 return r;
9654
9655 return 0;
9656 }
9657
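/* LRP: dst = src0 * src1 + (1 - src0) * src2. A src0 of the inline 0.5
 * constant collapses to (src1 + src2) * 0.5 via the divide-by-two output
 * modifier; otherwise expand to ADD, MUL and a final MULADD. */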
9658 static int tgsi_lrp(struct r600_shader_ctx *ctx)
9659 {
9660 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9661 struct r600_bytecode_alu alu;
9662 unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9663 struct r600_bytecode_alu_src srcs[2][4];
9664 unsigned i;
9665 int r;
9666
9667 /* optimize if it's just an equal balance */
9668 if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
9669 for (i = 0; i < lasti + 1; i++) {
9670 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9671 continue;
9672
9673 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9674 alu.op = ALU_OP2_ADD;
9675 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
9676 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9677 alu.omod = 3;
9678 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9679 alu.dst.chan = i;
9680 if (i == lasti) {
9681 alu.last = 1;
9682 }
9683 r = r600_bytecode_add_alu(ctx->bc, &alu);
9684 if (r)
9685 return r;
9686 }
9687 return 0;
9688 }
9689
9690 /* 1 - src0 */
9691 for (i = 0; i < lasti + 1; i++) {
9692 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9693 continue;
9694
9695 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9696 alu.op = ALU_OP2_ADD;
9697 alu.src[0].sel = V_SQ_ALU_SRC_1;
9698 alu.src[0].chan = 0;
9699 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
9700 r600_bytecode_src_toggle_neg(&alu.src[1]);
9701 alu.dst.sel = ctx->temp_reg;
9702 alu.dst.chan = i;
9703 if (i == lasti) {
9704 alu.last = 1;
9705 }
9706 alu.dst.write = 1;
9707 r = r600_bytecode_add_alu(ctx->bc, &alu);
9708 if (r)
9709 return r;
9710 }
9711
9712 /* (1 - src0) * src2 */
9713 for (i = 0; i < lasti + 1; i++) {
9714 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9715 continue;
9716
9717 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9718 alu.op = ALU_OP2_MUL;
9719 alu.src[0].sel = ctx->temp_reg;
9720 alu.src[0].chan = i;
9721 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9722 alu.dst.sel = ctx->temp_reg;
9723 alu.dst.chan = i;
9724 if (i == lasti) {
9725 alu.last = 1;
9726 }
9727 alu.dst.write = 1;
9728 r = r600_bytecode_add_alu(ctx->bc, &alu);
9729 if (r)
9730 return r;
9731 }
9732
9733 /* src0 * src1 + (1 - src0) * src2 */
9734
9735 for (i = 0; i < 2; i++) {
9736 r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
9737 srcs[i], &ctx->src[i]);
9738 if (r)
9739 return r;
9740 }
9741
9742 for (i = 0; i < lasti + 1; i++) {
9743 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9744 continue;
9745
9746 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9747 alu.op = ALU_OP3_MULADD;
9748 alu.is_op3 = 1;
9749 alu.src[0] = srcs[0][i];
9750 alu.src[1] = srcs[1][i];
9751 alu.src[2].sel = ctx->temp_reg;
9752 alu.src[2].chan = i;
9753
9754 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9755 alu.dst.chan = i;
9756 if (i == lasti) {
9757 alu.last = 1;
9758 }
9759 r = r600_bytecode_add_alu(ctx->bc, &alu);
9760 if (r)
9761 return r;
9762 }
9763 return 0;
9764 }
9765
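/* CMP: dst = (src0 < 0) ? src1 : src2, implemented as CNDGE with the
 * true/false operands swapped; a negated-abs src0 folds into CNDE since
 * -|x| >= 0 only when x == 0. */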
9766 static int tgsi_cmp(struct r600_shader_ctx *ctx)
9767 {
9768 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9769 struct r600_bytecode_alu alu;
9770 int i, r, j;
9771 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9772 struct r600_bytecode_alu_src srcs[3][4];
9773
9774 unsigned op;
9775
9776 if (ctx->src[0].abs && ctx->src[0].neg) {
9777 op = ALU_OP3_CNDE;
9778 ctx->src[0].abs = 0;
9779 ctx->src[0].neg = 0;
9780 } else {
9781 op = ALU_OP3_CNDGE;
9782 }
9783
9784 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
9785 r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
9786 srcs[j], &ctx->src[j]);
9787 if (r)
9788 return r;
9789 }
9790
9791 for (i = 0; i < lasti + 1; i++) {
9792 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9793 continue;
9794
9795 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9796 alu.op = op;
9797 alu.src[0] = srcs[0][i];
9798 alu.src[1] = srcs[2][i];
9799 alu.src[2] = srcs[1][i];
9800
9801 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9802 alu.dst.chan = i;
9803 alu.dst.write = 1;
9804 alu.is_op3 = 1;
9805 if (i == lasti)
9806 alu.last = 1;
9807 r = r600_bytecode_add_alu(ctx->bc, &alu);
9808 if (r)
9809 return r;
9810 }
9811 return 0;
9812 }
9813
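/* UCMP: dst = (src0 != 0) ? src1 : src2, via CNDE_INT with the operands
 * swapped. */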
9814 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
9815 {
9816 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9817 struct r600_bytecode_alu alu;
9818 int i, r;
9819 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9820
9821 for (i = 0; i < lasti + 1; i++) {
9822 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9823 continue;
9824
9825 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9826 alu.op = ALU_OP3_CNDE_INT;
9827 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9828 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9829 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
9830 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9831 alu.dst.chan = i;
9832 alu.dst.write = 1;
9833 alu.is_op3 = 1;
9834 if (i == lasti)
9835 alu.last = 1;
9836 r = r600_bytecode_add_alu(ctx->bc, &alu);
9837 if (r)
9838 return r;
9839 }
9840 return 0;
9841 }
9842
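/* EXP: result = (2^floor(src.x), fract(src.x), 2^src.x, 1.0), built per
 * enabled channel in a temp and copied out at the end; Cayman must issue
 * the transcendental EXP_IEEE across three vector slots. */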
9843 static int tgsi_exp(struct r600_shader_ctx *ctx)
9844 {
9845 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9846 struct r600_bytecode_alu alu;
9847 int r;
9848 unsigned i;
9849
9850 /* result.x = 2^floor(src); */
9851 if (inst->Dst[0].Register.WriteMask & 1) {
9852 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9853
9854 alu.op = ALU_OP1_FLOOR;
9855 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9856
9857 alu.dst.sel = ctx->temp_reg;
9858 alu.dst.chan = 0;
9859 alu.dst.write = 1;
9860 alu.last = 1;
9861 r = r600_bytecode_add_alu(ctx->bc, &alu);
9862 if (r)
9863 return r;
9864
9865 if (ctx->bc->gfx_level == CAYMAN) {
9866 for (i = 0; i < 3; i++) {
9867 alu.op = ALU_OP1_EXP_IEEE;
9868 alu.src[0].sel = ctx->temp_reg;
9869 alu.src[0].chan = 0;
9870
9871 alu.dst.sel = ctx->temp_reg;
9872 alu.dst.chan = i;
9873 alu.dst.write = i == 0;
9874 alu.last = i == 2;
9875 r = r600_bytecode_add_alu(ctx->bc, &alu);
9876 if (r)
9877 return r;
9878 }
9879 } else {
9880 alu.op = ALU_OP1_EXP_IEEE;
9881 alu.src[0].sel = ctx->temp_reg;
9882 alu.src[0].chan = 0;
9883
9884 alu.dst.sel = ctx->temp_reg;
9885 alu.dst.chan = 0;
9886 alu.dst.write = 1;
9887 alu.last = 1;
9888 r = r600_bytecode_add_alu(ctx->bc, &alu);
9889 if (r)
9890 return r;
9891 }
9892 }
9893
9894 /* result.y = src.x - floor(src.x); */
9895 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
9896 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9897
9898 alu.op = ALU_OP1_FRACT;
9899 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9900
9901 alu.dst.sel = ctx->temp_reg;
9902 #if 0
9903 r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9904 if (r)
9905 return r;
9906 #endif
9907 alu.dst.write = 1;
9908 alu.dst.chan = 1;
9909
9910 alu.last = 1;
9911
9912 r = r600_bytecode_add_alu(ctx->bc, &alu);
9913 if (r)
9914 return r;
9915 }
9916
9917 /* result.z = RoughApprox2ToX(src.x); */
9918 if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
9919 if (ctx->bc->gfx_level == CAYMAN) {
9920 for (i = 0; i < 3; i++) {
9921 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9922 alu.op = ALU_OP1_EXP_IEEE;
9923 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9924
9925 alu.dst.sel = ctx->temp_reg;
9926 alu.dst.chan = i;
9927 if (i == 2) {
9928 alu.dst.write = 1;
9929 alu.last = 1;
9930 }
9931
9932 r = r600_bytecode_add_alu(ctx->bc, &alu);
9933 if (r)
9934 return r;
9935 }
9936 } else {
9937 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9938 alu.op = ALU_OP1_EXP_IEEE;
9939 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9940
9941 alu.dst.sel = ctx->temp_reg;
9942 alu.dst.write = 1;
9943 alu.dst.chan = 2;
9944
9945 alu.last = 1;
9946
9947 r = r600_bytecode_add_alu(ctx->bc, &alu);
9948 if (r)
9949 return r;
9950 }
9951 }
9952
9953 /* result.w = 1.0;*/
9954 if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
9955 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9956
9957 alu.op = ALU_OP1_MOV;
9958 alu.src[0].sel = V_SQ_ALU_SRC_1;
9959 alu.src[0].chan = 0;
9960
9961 alu.dst.sel = ctx->temp_reg;
9962 alu.dst.chan = 3;
9963 alu.dst.write = 1;
9964 alu.last = 1;
9965 r = r600_bytecode_add_alu(ctx->bc, &alu);
9966 if (r)
9967 return r;
9968 }
9969 return tgsi_helper_copy(ctx, inst);
9970 }
9971
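/* LOG: result = (floor(log2|src.x|), |src.x| / 2^floor(log2|src.x|),
 * log2|src.x|, 1.0). The y component is built as |src.x| *
 * RECIP(EXP(FLOOR(LOG(|src.x|)))); Cayman again needs the 3-slot
 * replication for the transcendentals. */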
9972 static int tgsi_log(struct r600_shader_ctx *ctx)
9973 {
9974 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9975 struct r600_bytecode_alu alu;
9976 int r;
9977 unsigned i;
9978
9979 /* result.x = floor(log2(|src|)); */
9980 if (inst->Dst[0].Register.WriteMask & 1) {
9981 if (ctx->bc->gfx_level == CAYMAN) {
9982 for (i = 0; i < 3; i++) {
9983 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9984
9985 alu.op = ALU_OP1_LOG_IEEE;
9986 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9987 r600_bytecode_src_set_abs(&alu.src[0]);
9988
9989 alu.dst.sel = ctx->temp_reg;
9990 alu.dst.chan = i;
9991 if (i == 0)
9992 alu.dst.write = 1;
9993 if (i == 2)
9994 alu.last = 1;
9995 r = r600_bytecode_add_alu(ctx->bc, &alu);
9996 if (r)
9997 return r;
9998 }
9999
10000 } else {
10001 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10002
10003 alu.op = ALU_OP1_LOG_IEEE;
10004 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10005 r600_bytecode_src_set_abs(&alu.src[0]);
10006
10007 alu.dst.sel = ctx->temp_reg;
10008 alu.dst.chan = 0;
10009 alu.dst.write = 1;
10010 alu.last = 1;
10011 r = r600_bytecode_add_alu(ctx->bc, &alu);
10012 if (r)
10013 return r;
10014 }
10015
10016 alu.op = ALU_OP1_FLOOR;
10017 alu.src[0].sel = ctx->temp_reg;
10018 alu.src[0].chan = 0;
10019
10020 alu.dst.sel = ctx->temp_reg;
10021 alu.dst.chan = 0;
10022 alu.dst.write = 1;
10023 alu.last = 1;
10024
10025 r = r600_bytecode_add_alu(ctx->bc, &alu);
10026 if (r)
10027 return r;
10028 }
10029
10030 /* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
10031 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
10032
10033 if (ctx->bc->gfx_level == CAYMAN) {
10034 for (i = 0; i < 3; i++) {
10035 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10036
10037 alu.op = ALU_OP1_LOG_IEEE;
10038 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10039 r600_bytecode_src_set_abs(&alu.src[0]);
10040
10041 alu.dst.sel = ctx->temp_reg;
10042 alu.dst.chan = i;
10043 if (i == 1)
10044 alu.dst.write = 1;
10045 if (i == 2)
10046 alu.last = 1;
10047
10048 r = r600_bytecode_add_alu(ctx->bc, &alu);
10049 if (r)
10050 return r;
10051 }
10052 } else {
10053 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10054
10055 alu.op = ALU_OP1_LOG_IEEE;
10056 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10057 r600_bytecode_src_set_abs(&alu.src[0]);
10058
10059 alu.dst.sel = ctx->temp_reg;
10060 alu.dst.chan = 1;
10061 alu.dst.write = 1;
10062 alu.last = 1;
10063
10064 r = r600_bytecode_add_alu(ctx->bc, &alu);
10065 if (r)
10066 return r;
10067 }
10068
10069 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10070
10071 alu.op = ALU_OP1_FLOOR;
10072 alu.src[0].sel = ctx->temp_reg;
10073 alu.src[0].chan = 1;
10074
10075 alu.dst.sel = ctx->temp_reg;
10076 alu.dst.chan = 1;
10077 alu.dst.write = 1;
10078 alu.last = 1;
10079
10080 r = r600_bytecode_add_alu(ctx->bc, &alu);
10081 if (r)
10082 return r;
10083
10084 if (ctx->bc->gfx_level == CAYMAN) {
10085 for (i = 0; i < 3; i++) {
10086 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10087 alu.op = ALU_OP1_EXP_IEEE;
10088 alu.src[0].sel = ctx->temp_reg;
10089 alu.src[0].chan = 1;
10090
10091 alu.dst.sel = ctx->temp_reg;
10092 alu.dst.chan = i;
10093 if (i == 1)
10094 alu.dst.write = 1;
10095 if (i == 2)
10096 alu.last = 1;
10097
10098 r = r600_bytecode_add_alu(ctx->bc, &alu);
10099 if (r)
10100 return r;
10101 }
10102 } else {
10103 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10104 alu.op = ALU_OP1_EXP_IEEE;
10105 alu.src[0].sel = ctx->temp_reg;
10106 alu.src[0].chan = 1;
10107
10108 alu.dst.sel = ctx->temp_reg;
10109 alu.dst.chan = 1;
10110 alu.dst.write = 1;
10111 alu.last = 1;
10112
10113 r = r600_bytecode_add_alu(ctx->bc, &alu);
10114 if (r)
10115 return r;
10116 }
10117
10118 if (ctx->bc->gfx_level == CAYMAN) {
10119 for (i = 0; i < 3; i++) {
10120 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10121 alu.op = ALU_OP1_RECIP_IEEE;
10122 alu.src[0].sel = ctx->temp_reg;
10123 alu.src[0].chan = 1;
10124
10125 alu.dst.sel = ctx->temp_reg;
10126 alu.dst.chan = i;
10127 if (i == 1)
10128 alu.dst.write = 1;
10129 if (i == 2)
10130 alu.last = 1;
10131
10132 r = r600_bytecode_add_alu(ctx->bc, &alu);
10133 if (r)
10134 return r;
10135 }
10136 } else {
10137 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10138 alu.op = ALU_OP1_RECIP_IEEE;
10139 alu.src[0].sel = ctx->temp_reg;
10140 alu.src[0].chan = 1;
10141
10142 alu.dst.sel = ctx->temp_reg;
10143 alu.dst.chan = 1;
10144 alu.dst.write = 1;
10145 alu.last = 1;
10146
10147 r = r600_bytecode_add_alu(ctx->bc, &alu);
10148 if (r)
10149 return r;
10150 }
10151
10152 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10153
10154 alu.op = ALU_OP2_MUL;
10155
10156 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10157 r600_bytecode_src_set_abs(&alu.src[0]);
10158
10159 alu.src[1].sel = ctx->temp_reg;
10160 alu.src[1].chan = 1;
10161
10162 alu.dst.sel = ctx->temp_reg;
10163 alu.dst.chan = 1;
10164 alu.dst.write = 1;
10165 alu.last = 1;
10166
10167 r = r600_bytecode_add_alu(ctx->bc, &alu);
10168 if (r)
10169 return r;
10170 }
10171
10172 /* result.z = log2(|src|);*/
10173 if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
10174 if (ctx->bc->gfx_level == CAYMAN) {
10175 for (i = 0; i < 3; i++) {
10176 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10177
10178 alu.op = ALU_OP1_LOG_IEEE;
10179 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10180 r600_bytecode_src_set_abs(&alu.src[0]);
10181
10182 alu.dst.sel = ctx->temp_reg;
10183 if (i == 2)
10184 alu.dst.write = 1;
10185 alu.dst.chan = i;
10186 if (i == 2)
10187 alu.last = 1;
10188
10189 r = r600_bytecode_add_alu(ctx->bc, &alu);
10190 if (r)
10191 return r;
10192 }
10193 } else {
10194 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10195
10196 alu.op = ALU_OP1_LOG_IEEE;
10197 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10198 r600_bytecode_src_set_abs(&alu.src[0]);
10199
10200 alu.dst.sel = ctx->temp_reg;
10201 alu.dst.write = 1;
10202 alu.dst.chan = 2;
10203 alu.last = 1;
10204
10205 r = r600_bytecode_add_alu(ctx->bc, &alu);
10206 if (r)
10207 return r;
10208 }
10209 }
10210
10211 /* result.w = 1.0; */
10212 if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
10213 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10214
10215 alu.op = ALU_OP1_MOV;
10216 alu.src[0].sel = V_SQ_ALU_SRC_1;
10217 alu.src[0].chan = 0;
10218
10219 alu.dst.sel = ctx->temp_reg;
10220 alu.dst.chan = 3;
10221 alu.dst.write = 1;
10222 alu.last = 1;
10223
10224 r = r600_bytecode_add_alu(ctx->bc, &alu);
10225 if (r)
10226 return r;
10227 }
10228
10229 return tgsi_helper_copy(ctx, inst);
10230 }
10231
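/* Evergreen+ ARL/ARR/UARL: convert the index (FLT_TO_INT_FLOOR for ARL,
 * FLT_TO_INT for ARR, plain MOV for UARL) directly into the address
 * register file and invalidate the cached AR/index registers. */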
10232 static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
10233 {
10234 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10235 struct r600_bytecode_alu alu;
10236 int r;
10237 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10238 unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
10239
10240 assert(inst->Dst[0].Register.Index < 3);
10241 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10242
10243 switch (inst->Instruction.Opcode) {
10244 case TGSI_OPCODE_ARL:
10245 alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
10246 break;
10247 case TGSI_OPCODE_ARR:
10248 alu.op = ALU_OP1_FLT_TO_INT;
10249 break;
10250 case TGSI_OPCODE_UARL:
10251 alu.op = ALU_OP1_MOV;
10252 break;
10253 default:
10254 assert(0);
10255 return -1;
10256 }
10257
10258 for (i = 0; i <= lasti; ++i) {
10259 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10260 continue;
10261 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10262 alu.last = i == lasti;
10263 alu.dst.sel = reg;
10264 alu.dst.chan = i;
10265 alu.dst.write = 1;
10266 r = r600_bytecode_add_alu(ctx->bc, &alu);
10267 if (r)
10268 return r;
10269 }
10270
10271 if (inst->Dst[0].Register.Index > 0)
10272 ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
10273 else
10274 ctx->bc->ar_loaded = 0;
10275
10276 return 0;
10277 }
10278 static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
10279 {
10280 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10281 struct r600_bytecode_alu alu;
10282 int r;
10283 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10284
10285 switch (inst->Instruction.Opcode) {
10286 case TGSI_OPCODE_ARL:
10287 memset(&alu, 0, sizeof(alu));
10288 alu.op = ALU_OP1_FLOOR;
10289 alu.dst.sel = ctx->bc->ar_reg;
10290 alu.dst.write = 1;
10291 for (i = 0; i <= lasti; ++i) {
10292 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
10293 alu.dst.chan = i;
10294 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10295 alu.last = i == lasti;
10296 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
10297 return r;
10298 }
10299 }
10300
10301 memset(&alu, 0, sizeof(alu));
10302 alu.op = ALU_OP1_FLT_TO_INT;
10303 alu.src[0].sel = ctx->bc->ar_reg;
10304 alu.dst.sel = ctx->bc->ar_reg;
10305 alu.dst.write = 1;
10306 /* FLT_TO_INT is trans-only on r600/r700 */
10307 alu.last = TRUE;
10308 for (i = 0; i <= lasti; ++i) {
10309 alu.dst.chan = i;
10310 alu.src[0].chan = i;
10311 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
10312 return r;
10313 }
10314 break;
10315 case TGSI_OPCODE_ARR:
10316 memset(&alu, 0, sizeof(alu));
10317 alu.op = ALU_OP1_FLT_TO_INT;
10318 alu.dst.sel = ctx->bc->ar_reg;
10319 alu.dst.write = 1;
10320 /* FLT_TO_INT is trans-only on r600/r700 */
10321 alu.last = TRUE;
10322 for (i = 0; i <= lasti; ++i) {
10323 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
10324 alu.dst.chan = i;
10325 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10326 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
10327 return r;
10328 }
10329 }
10330 break;
10331 case TGSI_OPCODE_UARL:
10332 memset(&alu, 0, sizeof(alu));
10333 alu.op = ALU_OP1_MOV;
10334 alu.dst.sel = ctx->bc->ar_reg;
10335 alu.dst.write = 1;
10336 for (i = 0; i <= lasti; ++i) {
10337 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
10338 alu.dst.chan = i;
10339 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10340 alu.last = i == lasti;
10341 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
10342 return r;
10343 }
10344 }
10345 break;
10346 default:
10347 assert(0);
10348 return -1;
10349 }
10350
10351 ctx->bc->ar_loaded = 0;
10352 return 0;
10353 }
10354
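/* DST: dst = (1, src0.y * src1.y, src0.z, src1.w), emitted as four MULs
 * with the inline 1.0 constant standing in for the fixed components. */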
10355 static int tgsi_opdst(struct r600_shader_ctx *ctx)
10356 {
10357 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10358 struct r600_bytecode_alu alu;
10359 int i, r = 0;
10360
10361 for (i = 0; i < 4; i++) {
10362 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10363
10364 alu.op = ALU_OP2_MUL;
10365 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10366
10367 if (i == 0 || i == 3) {
10368 alu.src[0].sel = V_SQ_ALU_SRC_1;
10369 } else {
10370 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10371 }
10372
10373 if (i == 0 || i == 2) {
10374 alu.src[1].sel = V_SQ_ALU_SRC_1;
10375 } else {
10376 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
10377 }
10378 if (i == 3)
10379 alu.last = 1;
10380 r = r600_bytecode_add_alu(ctx->bc, &alu);
10381 if (r)
10382 return r;
10383 }
10384 return 0;
10385 }
10386
10387 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type,
10388 struct r600_bytecode_alu_src *src)
10389 {
10390 struct r600_bytecode_alu alu;
10391 int r;
10392
10393 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10394 alu.op = opcode;
10395 alu.execute_mask = 1;
10396 alu.update_pred = 1;
10397
10398 alu.dst.sel = ctx->temp_reg;
10399 alu.dst.write = 1;
10400 alu.dst.chan = 0;
10401
10402 alu.src[0] = *src;
10403 alu.src[1].sel = V_SQ_ALU_SRC_0;
10404 alu.src[1].chan = 0;
10405
10406 alu.last = 1;
10407
10408 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
10409 if (r)
10410 return r;
10411 return 0;
10412 }
10413
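/* Emit stack pops, folding one or two of them into the preceding ALU
 * clause (ALU_POP_AFTER / ALU_POP2_AFTER) when possible, otherwise
 * emitting an explicit POP CF instruction. */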
10414 static int pops(struct r600_shader_ctx *ctx, int pops)
10415 {
10416 unsigned force_pop = ctx->bc->force_add_cf;
10417
10418 if (!force_pop) {
10419 int alu_pop = 3;
10420 if (ctx->bc->cf_last) {
10421 if (ctx->bc->cf_last->op == CF_OP_ALU)
10422 alu_pop = 0;
10423 else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
10424 alu_pop = 1;
10425 }
10426 alu_pop += pops;
10427 if (alu_pop == 1) {
10428 ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
10429 ctx->bc->force_add_cf = 1;
10430 } else if (alu_pop == 2) {
10431 ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
10432 ctx->bc->force_add_cf = 1;
10433 } else {
10434 force_pop = 1;
10435 }
10436 }
10437
10438 if (force_pop) {
10439 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
10440 ctx->bc->cf_last->pop_count = pops;
10441 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
10442 }
10443
10444 return 0;
10445 }
10446
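/* Track the worst-case CF stack usage in elements and update max_entries;
 * the per-family adjustments below follow the hardware docs plus the
 * empirical notes kept inline. */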
10447 static inline int callstack_update_max_depth(struct r600_shader_ctx *ctx,
10448 unsigned reason)
10449 {
10450 struct r600_stack_info *stack = &ctx->bc->stack;
10451 unsigned elements;
10452 int entries;
10453
10454 unsigned entry_size = stack->entry_size;
10455
10456 elements = (stack->loop + stack->push_wqm) * entry_size;
10457 elements += stack->push;
10458
10459 switch (ctx->bc->gfx_level) {
10460 case R600:
10461 case R700:
10462 /* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
10463 * the stack must be reserved to hold the current active/continue
10464 * masks */
10465 if (reason == FC_PUSH_VPM || stack->push > 0) {
10466 elements += 2;
10467 }
10468 break;
10469
10470 case CAYMAN:
10471 /* r9xx: any stack operation on empty stack consumes 2 additional
10472 * elements */
10473 elements += 2;
10474
10475 FALLTHROUGH;
10476 /* FIXME: do the two elements added above cover the cases for the
10477 * r8xx+ below? */
10478
10479 case EVERGREEN:
10480 /* r8xx+: 2 extra elements are not always required, but one extra
10481 * element must be added for each of the following cases:
10482 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
10483 * stack usage.
10484 * (Currently we don't use ALU_ELSE_AFTER.)
10485 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
10486 * PUSH instruction executed.
10487 *
10488 * NOTE: it seems we also need to reserve additional element in some
10489 * other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
10490 * then STACK_SIZE should be 2 instead of 1 */
10491 if (reason == FC_PUSH_VPM || stack->push > 0) {
10492 elements += 1;
10493 }
10494 break;
10495
10496 default:
10497 assert(0);
10498 break;
10499 }
10500
10501 /* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
10502 * for all chips, so we use 4 in the final formula, not the real entry_size
10503 * for the chip */
10504 entry_size = 4;
10505
10506 entries = (elements + (entry_size - 1)) / entry_size;
10507
10508 if (entries > stack->max_entries)
10509 stack->max_entries = entries;
10510 return elements;
10511 }
10512
10513 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
10514 {
10515 switch(reason) {
10516 case FC_PUSH_VPM:
10517 --ctx->bc->stack.push;
10518 assert(ctx->bc->stack.push >= 0);
10519 break;
10520 case FC_PUSH_WQM:
10521 --ctx->bc->stack.push_wqm;
10522 assert(ctx->bc->stack.push_wqm >= 0);
10523 break;
10524 case FC_LOOP:
10525 --ctx->bc->stack.loop;
10526 assert(ctx->bc->stack.loop >= 0);
10527 break;
10528 default:
10529 assert(0);
10530 break;
10531 }
10532 }
10533
10534 static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
10535 {
10536 switch (reason) {
10537 case FC_PUSH_VPM:
10538 ++ctx->bc->stack.push;
10539 break;
10540 case FC_PUSH_WQM:
10541 ++ctx->bc->stack.push_wqm;
10542 break;
10543 case FC_LOOP:
10544 ++ctx->bc->stack.loop;
10545 break;
10546 default:
10547 assert(0);
10548 }
10549
10550 return callstack_update_max_depth(ctx, reason);
10551 }
10552
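/* Flow-control stack helpers: "mid" entries record ELSE/BREAK/CONT CF
 * instructions so their jump targets can be patched when the enclosing
 * block or loop is closed. */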
10553 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
10554 {
10555 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
10556
10557 sp->mid = realloc((void *)sp->mid,
10558 sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
10559 sp->mid[sp->num_mid] = ctx->bc->cf_last;
10560 sp->num_mid++;
10561 }
10562
10563 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
10564 {
10565 assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));
10566 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
10567 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
10568 ctx->bc->fc_sp++;
10569 }
10570
10571 static void fc_poplevel(struct r600_shader_ctx *ctx)
10572 {
10573 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
10574 free(sp->mid);
10575 sp->mid = NULL;
10576 sp->num_mid = 0;
10577 sp->start = NULL;
10578 sp->type = 0;
10579 ctx->bc->fc_sp--;
10580 }
10581
10582 #if 0
10583 static int emit_return(struct r600_shader_ctx *ctx)
10584 {
10585 r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN);
10586 return 0;
10587 }
10588
10589 static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
10590 {
10591
10592 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
10593 ctx->bc->cf_last->pop_count = pops;
10594 /* XXX work out offset */
10595 return 0;
10596 }
10597
10598 static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
10599 {
10600 return 0;
10601 }
10602
10603 static void emit_testflag(struct r600_shader_ctx *ctx)
10604 {
10605
10606 }
10607
10608 static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
10609 {
10610 emit_testflag(ctx);
10611 emit_jump_to_offset(ctx, 1, 4);
10612 emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
10613 pops(ctx, ifidx + 1);
10614 emit_return(ctx);
10615 }
10616
10617 static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
10618 {
10619 emit_testflag(ctx);
10620
10621 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10622 ctx->bc->cf_last->pop_count = 1;
10623
10624 fc_set_mid(ctx, fc_sp);
10625
10626 pops(ctx, 1);
10627 }
10628 #endif
10629
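/* Open an IF block: push a stack level, emit the predicate-setting ALU
 * (normally as ALU_PUSH_BEFORE) and a JUMP whose target is patched later
 * by ELSE/ENDIF. */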
10630 static int emit_if(struct r600_shader_ctx *ctx, int opcode,
10631 struct r600_bytecode_alu_src *src)
10632 {
10633 int alu_type = CF_OP_ALU_PUSH_BEFORE;
10634 bool needs_workaround = false;
10635 int elems = callstack_push(ctx, FC_PUSH_VPM);
10636
10637 if (ctx->bc->gfx_level == CAYMAN && ctx->bc->stack.loop > 1)
10638 needs_workaround = true;
10639
10640 if (ctx->bc->gfx_level == EVERGREEN && ctx_needs_stack_workaround_8xx(ctx)) {
10641 unsigned dmod1 = (elems - 1) % ctx->bc->stack.entry_size;
10642 unsigned dmod2 = (elems) % ctx->bc->stack.entry_size;
10643
10644 if (elems && (!dmod1 || !dmod2))
10645 needs_workaround = true;
10646 }
10647
10648 /* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
10649 * LOOP_STARTxxx for nested loops may put the branch stack into a state
10650 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
10651 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
10652 if (needs_workaround) {
10653 r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
10654 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
10655 alu_type = CF_OP_ALU;
10656 }
10657
10658 emit_logic_pred(ctx, opcode, alu_type, src);
10659
10660 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
10661
10662 fc_pushlevel(ctx, FC_IF);
10663
10664 return 0;
10665 }
10666
10667 static int tgsi_if(struct r600_shader_ctx *ctx)
10668 {
10669 struct r600_bytecode_alu_src alu_src;
10670 r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10671
10672 return emit_if(ctx, ALU_OP2_PRED_SETNE, &alu_src);
10673 }
10674
10675 static int tgsi_uif(struct r600_shader_ctx *ctx)
10676 {
10677 struct r600_bytecode_alu_src alu_src;
10678 r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10679 return emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
10680 }
10681
10682 static int tgsi_else(struct r600_shader_ctx *ctx)
10683 {
10684 r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
10685 ctx->bc->cf_last->pop_count = 1;
10686
10687 fc_set_mid(ctx, ctx->bc->fc_sp - 1);
10688 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;
10689 return 0;
10690 }
10691
10692 static int tgsi_endif(struct r600_shader_ctx *ctx)
10693 {
10694 int offset = 2;
10695 pops(ctx, 1);
10696 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {
10697 R600_ERR("if/endif unbalanced in shader\n");
10698 return -1;
10699 }
10700
10701 /* ALU_EXTENDED needs 4 DWords instead of two, adjust jump target offset accordingly */
10702 if (ctx->bc->cf_last->eg_alu_extended)
10703 offset += 2;
10704
10705 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) {
10706 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + offset;
10707 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;
10708 } else {
10709 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + offset;
10710 }
10711 fc_poplevel(ctx);
10712
10713 callstack_pop(ctx, FC_PUSH_VPM);
10714 return 0;
10715 }
10716
10717 static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
10718 {
10719 /* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
10720 * limited to 4096 iterations, like the other LOOP_* instructions. */
10721 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
10722
10723 fc_pushlevel(ctx, FC_LOOP);
10724
10725 /* check stack depth */
10726 callstack_push(ctx, FC_LOOP);
10727 return 0;
10728 }
10729
10730 static int tgsi_endloop(struct r600_shader_ctx *ctx)
10731 {
10732 int i;
10733
10734 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
10735
10736 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
10737 R600_ERR("loop/endloop in shader code are not paired.\n");
10738 return -EINVAL;
10739 }
10740
10741 /* fixup loop pointers - from r600isa
10742 LOOP END points to CF after LOOP START,
10743 LOOP START points to CF after LOOP END,
10744 BRK/CONT point to LOOP END CF
10745 */
10746 ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;
10747
10748 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
10749
10750 for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
10751 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
10752 }
10753 /* XXX add LOOPRET support */
10754 fc_poplevel(ctx);
10755 callstack_pop(ctx, FC_LOOP);
10756 return 0;
10757 }
10758
10759 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
10760 {
10761 unsigned int fscp;
10762
10763 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
10764 {
10765 if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
10766 break;
10767 }
10768
10769 if (fscp == 0) {
10770 R600_ERR("Break not inside loop/endloop pair\n");
10771 return -EINVAL;
10772 }
10773
10774 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10775
10776 fc_set_mid(ctx, fscp - 1);
10777
10778 return 0;
10779 }
10780
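/* GS EMIT/CUT: flush the pending ring writes for the selected stream before
 * an EMIT_VERTEX, then emit the CF instruction with the stream number in
 * its count field, bumping the ring offset after an emit. */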
10781 static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
10782 {
10783 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10784 int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
10785 int r;
10786
10787 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
10788 emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
10789
10790 r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10791 if (!r) {
10792 ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
10793 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
10794 return emit_inc_ring_offset(ctx, stream, TRUE);
10795 }
10796 return r;
10797 }
10798
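/* UMAD: lowered to MULLO_UINT into a temp followed by ADD_INT with src2. */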
10799 static int tgsi_umad(struct r600_shader_ctx *ctx)
10800 {
10801 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10802 struct r600_bytecode_alu alu;
10803 int i, j, r;
10804 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10805
10806 /* src0 * src1 */
10807 for (i = 0; i < lasti + 1; i++) {
10808 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10809 continue;
10810
10811 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10812
10813 alu.dst.chan = i;
10814 alu.dst.sel = ctx->temp_reg;
10815 alu.dst.write = 1;
10816
10817 alu.op = ALU_OP2_MULLO_UINT;
10818 for (j = 0; j < 2; j++) {
10819 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
10820 }
10821
10822 alu.last = 1;
10823 r = emit_mul_int_op(ctx->bc, &alu);
10824 if (r)
10825 return r;
10826 }
10827
10829 for (i = 0; i < lasti + 1; i++) {
10830 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10831 continue;
10832
10833 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10834 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10835
10836 alu.op = ALU_OP2_ADD_INT;
10837
10838 alu.src[0].sel = ctx->temp_reg;
10839 alu.src[0].chan = i;
10840
10841 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
10842 if (i == lasti) {
10843 alu.last = 1;
10844 }
10845 r = r600_bytecode_add_alu(ctx->bc, &alu);
10846 if (r)
10847 return r;
10848 }
10849 return 0;
10850 }
10851
10852 static int tgsi_pk2h(struct r600_shader_ctx *ctx)
10853 {
10854 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10855 struct r600_bytecode_alu alu;
10856 int r, i;
10857 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10858
10859 /* temp.xy = f32_to_f16(src) */
10860 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10861 alu.op = ALU_OP1_FLT32_TO_FLT16;
10862 alu.dst.chan = 0;
10863 alu.dst.sel = ctx->temp_reg;
10864 alu.dst.write = 1;
10865 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10866 r = r600_bytecode_add_alu(ctx->bc, &alu);
10867 if (r)
10868 return r;
10869 alu.dst.chan = 1;
10870 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
10871 alu.last = 1;
10872 r = r600_bytecode_add_alu(ctx->bc, &alu);
10873 if (r)
10874 return r;
10875
10876 /* dst.x = temp.y * 0x10000 + temp.x */
10877 for (i = 0; i < lasti + 1; i++) {
10878 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10879 continue;
10880
10881 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10882 alu.op = ALU_OP3_MULADD_UINT24;
10883 alu.is_op3 = 1;
10884 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10885 alu.last = i == lasti;
10886 alu.src[0].sel = ctx->temp_reg;
10887 alu.src[0].chan = 1;
10888 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10889 alu.src[1].value = 0x10000;
10890 alu.src[2].sel = ctx->temp_reg;
10891 alu.src[2].chan = 0;
10892 r = r600_bytecode_add_alu(ctx->bc, &alu);
10893 if (r)
10894 return r;
10895 }
10896
10897 return 0;
10898 }
10899
10900 static int tgsi_up2h(struct r600_shader_ctx *ctx)
10901 {
10902 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10903 struct r600_bytecode_alu alu;
10904 int r, i;
10905 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10906
10907 /* temp.x = src.x */
10908 /* note: no need to mask out the high bits */
10909 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10910 alu.op = ALU_OP1_MOV;
10911 alu.dst.chan = 0;
10912 alu.dst.sel = ctx->temp_reg;
10913 alu.dst.write = 1;
10914 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10915 r = r600_bytecode_add_alu(ctx->bc, &alu);
10916 if (r)
10917 return r;
10918
10919 /* temp.y = src.x >> 16 */
10920 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10921 alu.op = ALU_OP2_LSHR_INT;
10922 alu.dst.chan = 1;
10923 alu.dst.sel = ctx->temp_reg;
10924 alu.dst.write = 1;
10925 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10926 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10927 alu.src[1].value = 16;
10928 alu.last = 1;
10929 r = r600_bytecode_add_alu(ctx->bc, &alu);
10930 if (r)
10931 return r;
10932
10933 /* dst.wz = dst.xy = f16_to_f32(temp.xy) */
10934 for (i = 0; i < lasti + 1; i++) {
10935 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10936 continue;
10937 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10938 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10939 alu.op = ALU_OP1_FLT16_TO_FLT32;
10940 alu.src[0].sel = ctx->temp_reg;
10941 alu.src[0].chan = i % 2;
10942 alu.last = i == lasti;
10943 r = r600_bytecode_add_alu(ctx->bc, &alu);
10944 if (r)
10945 return r;
10946 }
10947
10948 return 0;
10949 }
10950
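/* BFE with the width >= 32 special case: run the regular op3 BFE, then use
 * SETGE_INT/CNDE_INT so that a width (src2) of 32 or more returns the
 * source operand unchanged. */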
10951 static int tgsi_bfe(struct r600_shader_ctx *ctx)
10952 {
10953 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10954 struct r600_bytecode_alu alu;
10955 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10956 int r, i;
10957 int dst = -1;
10958
10959 if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&
10960 inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||
10961 (inst->Src[2].Register.File == inst->Dst[0].Register.File &&
10962 inst->Src[2].Register.Index == inst->Dst[0].Register.Index))
10963 dst = r600_get_temp(ctx);
10964
10965 r = tgsi_op3_dst(ctx, dst);
10966 if (r)
10967 return r;
10968
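	/* The hardware BFE presumably honours only the low bits of the width
	 * operand, so a width of 32 or more would wrap to a bogus result.
	 * Detect that case per channel here and, in the CNDE below, pass the
	 * source through unmodified instead of using the BFE emitted above.
	 */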
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (dst != -1)
			alu.src[1].sel = dst;
		else
			alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

static int tgsi_clock(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

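	/* TGSI CLOCK wants a 64-bit counter in dst.xy; on evergreen the
	 * shader core exposes it as the two special ALU sources
	 * TIME_LO / TIME_HI, moved into the destination as an unsigned pair.
	 */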
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

static int emit_u64add(struct r600_shader_ctx *ctx, int op,
		       int treg,
		       int src0_sel, int src0_chan,
		       int src1_sel, int src1_chan)
{
	struct r600_bytecode_alu alu;
	int r;
	int opc;

	if (op == ALU_OP2_ADD_INT)
		opc = ALU_OP2_ADDC_UINT;
	else
		opc = ALU_OP2_SUBB_UINT;

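	/* 64-bit add/sub as an add-with-carry chain:
	 *   treg.x = lo0 op lo1
	 *   treg.y = hi0 op hi1
	 *   treg.z = carry/borrow out of the low half (ADDC/SUBB recomputes
	 *            it from the original low sources rather than reusing
	 *            treg.x)
	 *   treg.y = treg.y op treg.z
	 */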
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	alu.src[0].sel = src0_sel;
	alu.src[0].chan = src0_chan + 0;
	alu.src[1].sel = src1_sel;
	alu.src[1].chan = src1_chan + 0;
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = src0_sel;
	alu.src[0].chan = src0_chan + 1;
	alu.src[1].sel = src1_sel;
	alu.src[1].chan = src1_chan + 1;
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = opc;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.last = 1;
	alu.src[0].sel = src0_sel;
	alu.src[0].chan = src0_chan + 0;
	alu.src[1].sel = src1_sel;
	alu.src[1].chan = src1_chan + 0;
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

static int egcm_u64add(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;
	int op = ALU_OP2_ADD_INT, opc = ALU_OP2_ADDC_UINT;

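	/* A negate flag on src1 here encodes 64-bit subtraction, so switch to
	 * the SUB/SUBB pair; the flag itself is cleared on every use below.
	 */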
	if (ctx->src[1].neg) {
		op = ALU_OP2_SUB_INT;
		opc = ALU_OP2_SUBB_UINT;
	}
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = opc;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

static int egcm_i64neg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;
	const int op = ALU_OP2_SUB_INT;
	const int opc = ALU_OP2_SUBB_UINT;

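	/* Two's-complement negation, done as the 64-bit subtraction 0 - src
	 * with the same SUB/SUBB borrow chain as the u64 add/sub path.
	 */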
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	alu.src[0].sel = V_SQ_ALU_SRC_0;
	r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = V_SQ_ALU_SRC_0;
	r600_bytecode_src(&alu.src[1], &ctx->src[0], 1);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = opc;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.last = 1;
	alu.src[0].sel = V_SQ_ALU_SRC_0;
	r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* result.x = mul_lo(a.x, b.x)
 * result.y = mul_hi(a.x, b.x) + a.x * b.y + a.y * b.x
 */
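/* With a = a.x + 2^32 * a.y and b = b.x + 2^32 * b.y, the low 64 bits of
 * a * b work out to
 *
 *   lo32 = mullo(a.x, b.x)
 *   hi32 = mulhi(a.x, b.x) + mullo(a.x, b.y) + mullo(a.y, b.x)
 *
 * since the a.y * b.y term and any carries out of hi32 lie beyond bit 63.
 */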
static int egcm_u64mul(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;

	/* temp.x = mul_lo a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = mul_hi a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULHI_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = mul a.x, b.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.w = mul a.y, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 3;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = temp.z + temp.w */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 2;
	alu.src[1].sel = treg;
	alu.src[1].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = temp.y + temp.z */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = temp.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}

static int emit_u64sge(struct r600_shader_ctx *ctx,
		       int treg,
		       int src0_sel, int src0_base_chan,
		       int src1_sel, int src1_base_chan)
{
	int r;
	/* for 64-bit sge */
	/* result (in treg.x) = (src0.y > src1.y) || ((src0.y == src1.y) && (src0.x >= src1.x)) */
	r = single_alu_op2(ctx, ALU_OP2_SETGT_UINT,
			   treg, 1,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 0,
			   src0_sel, src0_base_chan,
			   src1_sel, src1_base_chan);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_SETE_INT,
			   treg, 2,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_AND_INT,
			   treg, 0,
			   treg, 0,
			   treg, 2);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   treg, 0,
			   treg, 0,
			   treg, 1);
	if (r)
		return r;
	return 0;
}

/* This isn't a complete div; it's just enough for the qbo shader to work. */
static int egcm_u64div(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	struct r600_bytecode_alu_src alu_num_hi, alu_num_lo, alu_denom_hi, alu_denom_lo, alu_src;
	int r, i;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	/* make sure we are dividing by a constant with 0 in the high bits */
	if (ctx->src[1].sel != V_SQ_ALU_SRC_LITERAL)
		return -1;
	if (ctx->src[1].value[ctx->src[1].swizzle[1]] != 0)
		return -1;
	/* make sure we are doing one division */
	if (inst->Dst[0].Register.WriteMask != 0x3)
		return -1;

	/* emit_if uses ctx->temp_reg, so we can't */
	int treg = r600_get_temp(ctx);
	int tmp_num = r600_get_temp(ctx);
	int sub_tmp = r600_get_temp(ctx);

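	/* Shift-and-subtract long division, fully unrolled: the remainder
	 * starts as the numerator in tmp_num.xy, the quotient is built up
	 * bit by bit in tmp_num.zw, and each step subtracts denom << n
	 * whenever the remainder still covers it.
	 */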
	/* the temporary quotient is accumulated in tmp_num.zw */
	r600_bytecode_src(&alu_num_lo, &ctx->src[0], 0);
	r600_bytecode_src(&alu_num_hi, &ctx->src[0], 1);
	r600_bytecode_src(&alu_denom_lo, &ctx->src[1], 0);
	r600_bytecode_src(&alu_denom_hi, &ctx->src[1], 1);

	/* MOV tmp_num.xy, numerator */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 0,
			   alu_num_lo.sel, alu_num_lo.chan,
			   0, 0);
	if (r)
		return r;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	/* treg.x is log2_denom */
	/* normally this would get the MSB of the denominator's high word,
	 * but we know that is always 0 here. */
	r = single_alu_op2(ctx,
			   ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, 32,
			   0, 0);
	if (r)
		return r;

	/* normally we'd check denom_hi for 0, but we know it already is */
	/* treg.y = num_hi >= denom_lo */
	r = single_alu_op2(ctx,
			   ALU_OP2_SETGE_UINT,
			   treg, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	/* the loops to unroll start here */
	/* get the msb first: treg.x = msb(src[1].x) */
	int msb_lo = util_last_bit(alu_denom_lo.value);
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, msb_lo,
			   0, 0);
	if (r)
		return r;

	/* unroll the asm here */
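	/* Step i decides quotient bit (31 - i) of the high word: treg.z gates
	 * the step on the shift being in range for the denominator's MSB,
	 * treg.y tests remainder_hi >= denom << (31 - i), and only when both
	 * hold is the subtraction performed and the bit set in tmp_num.w.
	 */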
	for (i = 0; i < 31; i++) {
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, i,
				   treg, 0);
		if (r)
			return r;

		/* we can do this shift on the CPU */
		uint32_t denom_lo_shl = alu_denom_lo.value << (31 - i);
		/* treg.y = tmp_num.y >= denom_lo_shl */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   treg, 2);
		if (r)
			return r;

		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
				   tmp_num, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 3,
				   tmp_num, 3,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 31, so manually peel the last loop
	 * iteration.
	 */
	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
			   tmp_num, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 3,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	r = tgsi_endif(ctx);
	if (r)
		return r;

	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* onto the second loop to unroll */
	for (i = 0; i < 31; i++) {
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   V_SQ_ALU_SRC_LITERAL, (63 - (31 - i)),
				   treg, 0);
		if (r)
			return r;

		uint64_t denom_shl = (uint64_t)alu_denom_lo.value << (31 - i);
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 3,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
				   0, 0);
		if (r)
			return r;

		r = emit_u64sge(ctx, sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   sub_tmp, 0);
		if (r)
			return r;

		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;

		r = emit_u64add(ctx, ALU_OP2_SUB_INT,
				sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 0,
				   sub_tmp, 0,
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 1,
				   sub_tmp, 1,
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 2,
				   tmp_num, 2,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 63, so manually peel the last loop
	 * iteration.
	 */
	uint64_t denom_shl = (uint64_t)alu_denom_lo.value;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 2,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 3,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
			   0, 0);
	if (r)
		return r;

	r = emit_u64sge(ctx, sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = sub_tmp;
	alu_src.chan = 0;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	r = emit_u64add(ctx, ALU_OP2_SUB_INT,
			sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 2,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	r = tgsi_endif(ctx);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 2;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

static int egcm_u64sne(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;

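	/* 64-bit inequality splits per half:
	 * dst = (a.x != b.x) | (a.y != b.y)
	 */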
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_SETNE_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_SETNE_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_OR_INT;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	alu.src[1].sel = treg;
	alu.src[1].chan = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},

	[TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
	/* MIN_DX10 returns a non-NaN result if one src is NaN; MIN returns NaN */
	[TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21] = { ALU_OP0_NOP, tgsi_unsupported},
	[22] = { ALU_OP0_NOP, tgsi_unsupported},
	[23] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
	[25] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
	[31] = { ALU_OP0_NOP, tgsi_unsupported},
	[32] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_unsupported},
	[34] = { ALU_OP0_NOP, tgsi_unsupported},
	[35] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
	[TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[44] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
	[46] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
	[51] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[59] = { ALU_OP0_NOP, tgsi_unsupported},
	[60] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_r600_arl},
	[62] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
	[67] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
	[76] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
	[81] = { ALU_OP0_NOP, tgsi_unsupported},
	[82] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
	[93] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ] = { ALU_OP0_NOP, tgsi_unsupported},
	[106] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_unsupported},
	[113] = { ALU_OP0_NOP, tgsi_unsupported},
	[114] = { ALU_OP0_NOP, tgsi_unsupported},
	[115] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
	[TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
	[TGSI_OPCODE_DFMA] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
	[163] = { ALU_OP0_NOP, tgsi_unsupported},
	[164] = { ALU_OP0_NOP, tgsi_unsupported},
	[165] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
};

static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
	[TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21] = { ALU_OP0_NOP, tgsi_unsupported},
	[22] = { ALU_OP0_NOP, tgsi_unsupported},
	[23] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
	[25] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
	[31] = { ALU_OP0_NOP, tgsi_unsupported},
	[32] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},
	[34] = { ALU_OP0_NOP, tgsi_unsupported},
	[35] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
	[TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
	[TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[44] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
	[46] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
	[51] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
	[TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[59] = { ALU_OP0_NOP, tgsi_unsupported},
	[60] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
	[62] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
	[67] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
	[76] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[82] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
	[88] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
	[93] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
	[106] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_membar},
	[113] = { ALU_OP0_NOP, tgsi_unsupported},
	[114] = { ALU_OP0_NOP, tgsi_unsupported},
	[115] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
	[TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
	/* Refer below for TGSI_OPCODE_DFMA */
	[TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_f2i},
	[TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
	[TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
	[TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
	[163] = { ALU_OP0_NOP, tgsi_unsupported},
	[164] = { ALU_OP0_NOP, tgsi_unsupported},
	[165] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
	[TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
	[TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
	[TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne },
	[TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add },
	[TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul },
	[TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div },
	[TGSI_OPCODE_I64NEG] = { ALU_OP0_NOP, egcm_i64neg },
	[TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
};
12209
12210 static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
12211 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
12212 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
12213 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
12214 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
12215 [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
12216 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
12217 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
12218 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
12219 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
12220 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
12221 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
12222 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
12223 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
12224 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
12225 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
12226 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
12227 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
12228 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
12229 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
12230 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
12231 [21] = { ALU_OP0_NOP, tgsi_unsupported},
12232 [22] = { ALU_OP0_NOP, tgsi_unsupported},
12233 [23] = { ALU_OP0_NOP, tgsi_unsupported},
12234 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
12235 [25] = { ALU_OP0_NOP, tgsi_unsupported},
12236 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
12237 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
12238 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
12239 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
12240 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow},
12241 [31] = { ALU_OP0_NOP, tgsi_unsupported},
12242 [32] = { ALU_OP0_NOP, tgsi_unsupported},
12243 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},
12244 [34] = { ALU_OP0_NOP, tgsi_unsupported},
12245 [35] = { ALU_OP0_NOP, tgsi_unsupported},
12246 [TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig},
12247 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
12248 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
12249 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
12250 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
12251 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
12252 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
12253 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
12254 [44] = { ALU_OP0_NOP, tgsi_unsupported},
12255 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
12256 [46] = { ALU_OP0_NOP, tgsi_unsupported},
12257 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
12258 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig},
12259 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
12260 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
12261 [51] = { ALU_OP0_NOP, tgsi_unsupported},
12262 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
12263 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
12264 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
12265 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
12266 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
12267 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
12268 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
12269 [59] = { ALU_OP0_NOP, tgsi_unsupported},
12270 [60] = { ALU_OP0_NOP, tgsi_unsupported},
12271 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
12272 [62] = { ALU_OP0_NOP, tgsi_unsupported},
12273 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
12274 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
12275 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
12276 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
12277 [67] = { ALU_OP0_NOP, tgsi_unsupported},
12278 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
12279 [69] = { ALU_OP0_NOP, tgsi_unsupported},
12280 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
12281 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
12282 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
12283 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
12284 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
12285 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
12286 [76] = { ALU_OP0_NOP, tgsi_unsupported},
12287 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
12288 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
12289 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
12290 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
12291 [82] = { ALU_OP0_NOP, tgsi_unsupported},
12292 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
12293 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2},
12294 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
12295 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
12296 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
12297 [88] = { ALU_OP0_NOP, tgsi_unsupported},
12298 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
12299 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
12300 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
12301 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
12302 [93] = { ALU_OP0_NOP, tgsi_unsupported},
12303 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
12304 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
12305 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
12306 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
12307 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
12308 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
12309 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
12310 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
12311 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
12312 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
12313 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
12314 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
12315 [106] = { ALU_OP0_NOP, tgsi_unsupported},
12316 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
12317 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
12318 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
12319 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
12320 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
12321 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_membar},
12322 [113] = { ALU_OP0_NOP, tgsi_unsupported},
12323 [114] = { ALU_OP0_NOP, tgsi_unsupported},
12324 [115] = { ALU_OP0_NOP, tgsi_unsupported},
12325 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
12326 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
12327 /* Refer below for TGSI_OPCODE_DFMA */
12328 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2},
12329 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
12330 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
12331 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
12332 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
12333 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
12334 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
12335 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
12336 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2},
12337 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2},
12338 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
12339 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
12340 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
12341 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
12342 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
12343 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
12344 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
12345 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
12346 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
12347 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
12348 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
12349 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
12350 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
12351 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
12352 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
12353 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
	[TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
	[163] = { ALU_OP0_NOP, tgsi_unsupported},
	[164] = { ALU_OP0_NOP, tgsi_unsupported},
	[165] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
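	/* Atomic entries carry a RAT (Random Access Target) instruction
	 * encoding in the op field rather than an ALU opcode; tgsi_atomic_op
	 * dispatches to the matching memory path when emitting. */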
	[TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
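	/* TEX2/TXB2/TXL2 are the texture variants that take a second source
	 * register when the coordinates plus compare/bias/LOD arguments do
	 * not fit in one register; they reuse the normal sample paths. */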
	[TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
	[TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
	[TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
	[TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
	[TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
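	/* Double precision: operands occupy channel pairs, handled by the
	 * dedicated 64-bit emit helpers. The cayman_* variants replicate
	 * t-slot-only double operations across the vector lanes, as
	 * described in the CAYMAN notes at the top of this file. */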
	[TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
	[TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
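	/* 64-bit integer ops have no native hardware instruction; the
	 * egcm_* helpers lower them to sequences of 32-bit ALU ops, so
	 * the op field is just a NOP placeholder. */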
	[TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne },
	[TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add },
	[TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul },
	[TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div },
	[TGSI_OPCODE_I64NEG] = { ALU_OP0_NOP, egcm_i64neg },
	[TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
};
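
/* For orientation, a rough sketch of how these tables are consumed (the
 * field names below are approximations, not a compiled excerpt): the TGSI
 * parse loop in r600_shader_from_tgsi() selects the per-chip table, indexes
 * it directly with the instruction's opcode number, and invokes the emit
 * callback, which reads the hardware op from the selected entry:
 *
 *	unsigned opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
 *	ctx.inst_info = &inst_table[opcode];
 *	r = ctx.inst_info->process(&ctx);
 */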