/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "r600_sq.h"
#include "r600_formats.h"
#include "r600_opcodes.h"
#include "r600_shader.h"
#include "r600d.h"

#include "sb/sb_public.h"

#include "pipe/p_shader_tokens.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_scan.h"
#include "tgsi/tgsi_dump.h"
#include "util/u_bitcast.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include <stdio.h>
#include <errno.h>

/* CAYMAN notes
Why CAYMAN got loops for lots of instructions is explained here.

-These 8xx t-slot only ops are implemented in all vector slots.
MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
These 8xx t-slot only opcodes become vector ops, with all four
slots expecting the arguments on sources a and b. Result is
broadcast to all channels.
MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
These 8xx t-slot only opcodes become vector ops in the z, y, and
x slots.
EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
SQRT_IEEE/_64
SIN/COS
The w slot may have an independent co-issued operation, or if the
result is required to be in the w slot, the opcode above may be
issued in the w slot as well.
The compiler must issue the source argument to slots z, y, and x
*/

/* Contents of r0 on entry to various shaders

 VS - .x = VertexID
      .y = RelVertexID (??)
      .w = InstanceID

 GS - r0.xyw, r1.xyz = per-vertex offsets
      r0.z = PrimitiveID

 TCS - .x = PatchID
       .y = RelPatchID (??)
       .z = InvocationID
       .w = tess factor base.

 TES - .x = TessCoord.x
     - .y = TessCoord.y
     - .z = RelPatchID (??)
     - .w = PrimitiveID

 PS - face_gpr.z = SampleMask
      face_gpr.w = SampleID
*/
#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
static int r600_shader_from_tgsi(struct r600_context *rctx,
				 struct r600_pipe_shader *pipeshader,
				 union r600_shader_key key);

static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
			       int size, unsigned comp_mask)
{
	if (!size)
		return;

	if (ps->num_arrays == ps->max_arrays) {
		ps->max_arrays += 64;
		ps->arrays = realloc(ps->arrays, ps->max_arrays *
				     sizeof(struct r600_shader_array));
	}

	int n = ps->num_arrays;
	++ps->num_arrays;

	ps->arrays[n].comp_mask = comp_mask;
	ps->arrays[n].gpr_start = start_gpr;
	ps->arrays[n].gpr_count = size;
}

static void r600_dump_streamout(struct pipe_stream_output_info *so)
{
	unsigned i;

	fprintf(stderr, "STREAMOUT\n");
	for (i = 0; i < so->num_outputs; i++) {
		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
				so->output[i].start_component;
		fprintf(stderr, "  %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
			i,
			so->output[i].stream,
			so->output[i].output_buffer,
			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
			so->output[i].register_index,
			mask & 1 ? "x" : "",
			mask & 2 ? "y" : "",
			mask & 4 ? "z" : "",
			mask & 8 ? "w" : "",
			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
	}
}

static int store_shader(struct pipe_context *ctx,
			struct r600_pipe_shader *shader)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	uint32_t *ptr, i;

	if (shader->bo == NULL) {
		shader->bo = (struct r600_resource*)
			pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
		if (shader->bo == NULL) {
			return -ENOMEM;
		}
		ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
		if (R600_BIG_ENDIAN) {
			for (i = 0; i < shader->shader.bc.ndw; ++i) {
				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
			}
		} else {
			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
		}
		rctx->b.ws->buffer_unmap(shader->bo->buf);
	}

	return 0;
}

int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b,
					 tgsi_get_processor_type(sel->tokens));
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}
	if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
		/* only disable for vertex shaders in tess paths */
		if (key.vs.as_ls)
			use_sb = 0;
	}
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE);

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	use_sb &= !shader->shader.uses_atomics;
	use_sb &= !shader->shader.uses_images;

	/* Check if the bytecode has already been built. */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
					     dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case PIPE_SHADER_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case PIPE_SHADER_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case PIPE_SHADER_GEOMETRY:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case PIPE_SHADER_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_COMPUTE:
		evergreen_update_ls_state(ctx, shader);
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}

void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
{
	r600_resource_reference(&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}

/*
 * tgsi -> r600 shader
 */
struct r600_shader_tgsi_instruction;

struct r600_shader_src {
	unsigned sel;
	unsigned swizzle[4];
	unsigned neg;
	unsigned abs;
	unsigned rel;
	unsigned kc_bank;
	boolean kc_rel; /* true if cache bank is indexed */
	uint32_t value[4];
};

struct eg_interp {
	boolean enabled;
	unsigned ij_index;
};

struct r600_shader_ctx {
	struct tgsi_shader_info info;
	struct tgsi_parse_context parse;
	const struct tgsi_token *tokens;
	unsigned type;
	unsigned file_offset[TGSI_FILE_COUNT];
	unsigned temp_reg;
	const struct r600_shader_tgsi_instruction *inst_info;
	struct r600_bytecode *bc;
	struct r600_shader *shader;
	struct r600_shader_src src[4];
	uint32_t *literals;
	uint32_t nliterals;
	uint32_t max_driver_temp_used;
	/* needed for evergreen interpolation */
	struct eg_interp eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int fixed_pt_position_gpr;
	int colors_used;
	boolean clip_vertex_write;
	unsigned cv_output;
	unsigned edgeflag_output;
	int cs_block_size_reg;
	int cs_grid_size_reg;
	bool cs_block_size_loaded, cs_grid_size_loaded;
	int fragcoord_input;
	int next_ring_offset;
	int gs_out_ring_offset;
	int gs_next_vertex;
	struct r600_shader *gs_for_vs;
	int gs_export_gpr_tregs[4];
	int gs_rotated_input[2];
	const struct pipe_stream_output_info *gs_stream_output_info;
	unsigned enabled_stream_buffers_mask;
	unsigned tess_input_info; /* temp with tess input offsets */
	unsigned tess_output_info; /* temp with tess output offsets */
	unsigned thread_id_gpr; /* temp with thread id calculated for images */
	bool thread_id_gpr_loaded;
};

struct r600_shader_tgsi_instruction {
	unsigned op;
	int (*process)(struct r600_shader_ctx *ctx);
};

static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
static int tgsi_else(struct r600_shader_ctx *ctx);
static int tgsi_endif(struct r600_shader_ctx *ctx);
static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
static int tgsi_endloop(struct r600_shader_ctx *ctx);
static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
		unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
		unsigned int dst_reg);
static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
			const struct r600_shader_src *shader_src,
			unsigned chan);
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
			       unsigned dst_reg, unsigned mask);

static bool ctx_needs_stack_workaround_8xx(struct r600_shader_ctx *ctx)
{
	if (ctx->bc->family == CHIP_HEMLOCK ||
	    ctx->bc->family == CHIP_CYPRESS ||
	    ctx->bc->family == CHIP_JUNIPER)
		return false;
	return true;
}

static int tgsi_last_instruction(unsigned writemask)
{
	int i, lasti = 0;

	for (i = 0; i < 4; i++) {
		if (writemask & (1 << i)) {
			lasti = i;
		}
	}
	return lasti;
}

static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	unsigned j;

	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
			switch (i->Src[j].Register.File) {
			case TGSI_FILE_CONSTANT:
			case TGSI_FILE_HW_ATOMIC:
				break;
			case TGSI_FILE_INPUT:
				if (ctx->type == PIPE_SHADER_GEOMETRY ||
				    ctx->type == PIPE_SHADER_TESS_CTRL ||
				    ctx->type == PIPE_SHADER_TESS_EVAL)
					break;
				/* fallthrough */
			case TGSI_FILE_OUTPUT:
				if (ctx->type == PIPE_SHADER_TESS_CTRL)
					break;
				/* fallthrough */
			default:
				R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
					 i->Src[j].Register.File,
					 i->Src[j].Register.Dimension);
				return -EINVAL;
			}
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			if (ctx->type == PIPE_SHADER_TESS_CTRL)
				continue;
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}

int eg_get_interpolator_index(unsigned interpolate, unsigned location)
{
	if (interpolate == TGSI_INTERPOLATE_COLOR ||
	    interpolate == TGSI_INTERPOLATE_LINEAR ||
	    interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
	{
		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
		int loc;

		switch(location) {
		case TGSI_INTERPOLATE_LOC_CENTER:
			loc = 1;
			break;
		case TGSI_INTERPOLATE_LOC_CENTROID:
			loc = 2;
			break;
		case TGSI_INTERPOLATE_LOC_SAMPLE:
		default:
			loc = 0; break;
		}

		return is_linear * 3 + loc;
	}

	return -1;
}

static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
					     int input)
{
	int i = eg_get_interpolator_index(
		ctx->shader->input[input].interpolate,
		ctx->shader->input[input].interpolate_location);
	assert(i >= 0);
	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
}

static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;
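	/* each interpolator GPR packs two ij pairs: pair 0 in .xy, pair 1 in
	 * .zw, so base_chan points at the upper channel of the selected pair
	 * and the loop below reads base_chan / base_chan - 1 alternately */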

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_INTERP_LOAD_P0;

		alu.dst.sel = ctx->shader->input[input].gpr;
		alu.dst.write = 1;

		alu.dst.chan = i;

		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[0].chan = i;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/*
 * Special export handling in shaders
 *
 * shader export ARRAY_BASE for EXPORT_POS:
 * 60 is position
 * 61 is misc vector
 * 62, 63 are clip distance vectors
 *
 * The use of the values exported in 61-63 is controlled by PA_CL_VS_OUT_CNTL:
 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
 * exclusive with the render target index)
 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
 *
 *
 * shader export ARRAY_BASE for EXPORT_PIXEL:
 * 0-7 CB targets
 * 61 computed Z vector
 *
 * The use of the values exported in the computed Z vector is controlled
 * by DB_SHADER_CONTROL:
 * Z_EXPORT_ENABLE - Z as a float in RED
 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
 * DB_SOURCE_FORMAT - export control restrictions
 *
 */


/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
static int r600_spi_sid(struct r600_shader_io * io)
{
	int index, name = io->name;

	/* These params are handled differently, they don't need
	 * semantic indices, so we'll use 0 for them.
	 */
	if (name == TGSI_SEMANTIC_POSITION ||
	    name == TGSI_SEMANTIC_PSIZE ||
	    name == TGSI_SEMANTIC_EDGEFLAG ||
	    name == TGSI_SEMANTIC_FACE ||
	    name == TGSI_SEMANTIC_SAMPLEMASK)
		index = 0;
	else {
		if (name == TGSI_SEMANTIC_GENERIC) {
			/* For generic params simply use sid from tgsi */
			index = io->sid;
		} else {
			/* For non-generic params - pack name and sid into 8 bits */
			index = 0x80 | (name<<3) | (io->sid);
		}
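			/* worked example with made-up numbers: name 9, sid 1
			 * packs to 0x80 | (9 << 3) | 1 = 0xC9, and the
			 * increment below then biases it to 0xCA */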
		}

		/* Make sure that all really used indices have nonzero value, so
		 * we can just compare it to 0 later instead of comparing the name
		 * with different values to detect special cases. */
		index++;
	}

	return index;
};

/* we need this to get a common lds index for vs/tcs/tes input/outputs */
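/* The resulting per-vertex slots are: POSITION = 0, PSIZE = 1,
 * CLIPDIST0/1 = 2/3, GENERIC[i] = 4 + i; the per-patch slots are:
 * TESSOUTER = 0, TESSINNER = 1, PATCH[i] = 2 + i. */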
int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return 1;
	case TGSI_SEMANTIC_CLIPDIST:
		assert(index <= 1);
		return 2 + index;
	case TGSI_SEMANTIC_GENERIC:
		if (index <= 63-4)
			return 4 + index;
		else
			/* same explanation as in the default statement,
			 * the only user hitting this is st/nine.
			 */
			return 0;

	/* patch indices are completely separate and thus start from 0 */
	case TGSI_SEMANTIC_TESSOUTER:
		return 0;
	case TGSI_SEMANTIC_TESSINNER:
		return 1;
	case TGSI_SEMANTIC_PATCH:
		return 2 + index;

	default:
		/* Don't fail here. The result of this function is only used
		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
		 * occur, but this function is called for all vertex shaders
		 * before it's known whether LS will be compiled or not.
		 */
		return 0;
	}
}

/* turn input into interpolate on EG */
static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
{
	int r = 0;

	if (ctx->shader->input[index].spi_sid) {
		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
		if (ctx->shader->input[index].interpolate > 0) {
			evergreen_interp_assign_ij_index(ctx, index);
			r = evergreen_interp_alu(ctx, index);
		} else {
			r = evergreen_interp_flat(ctx, index);
		}
	}
	return r;
}

static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
{
	struct r600_bytecode_alu alu;
	int i, r;
	int gpr_front = ctx->shader->input[front].gpr;
	int gpr_back = ctx->shader->input[back].gpr;

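	/* per channel: dst = (face > 0) ? front : back, via CNDGT */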
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;
		alu.dst.write = 1;
		alu.dst.sel = gpr_front;
		alu.src[0].sel = ctx->face_gpr;
		alu.src[1].sel = gpr_front;
		alu.src[2].sel = gpr_back;

		alu.dst.chan = i;
		alu.src[1].chan = i;
		alu.src[2].chan = i;
		alu.last = (i==3);

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;
	}

	return 0;
}

/* execute a single slot ALU calculation */
static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val)
{
	struct r600_bytecode_alu alu;
	int r, i;

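	/* CAYMAN has no t-slot, so MULLO_INT is replicated across all four
	 * vector slots (see the CAYMAN notes at the top of this file);
	 * only the slot matching dst_chan writes its result */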
	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = op;
			alu.src[0].sel = src0_sel;
			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[0].value = src0_chan_val;
			else
				alu.src[0].chan = src0_chan_val;
			alu.src[1].sel = src1_sel;
			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[1].value = src1_chan_val;
			else
				alu.src[1].chan = src1_chan_val;
			alu.dst.sel = dst_sel;
			alu.dst.chan = i;
			alu.dst.write = i == dst_chan;
			alu.last = (i == 3);
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* execute a single slot ALU calculation */
static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val,
			  int src2_sel, unsigned src2_chan_val)
{
	struct r600_bytecode_alu alu;
	int r;

	/* validate this for other ops */
	assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.src[2].sel = src2_sel;
	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[2].value = src2_chan_val;
	else
		alu.src[2].chan = src2_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.is_op3 = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* put it in temp_reg.x */
static int get_lds_offset0(struct r600_shader_ctx *ctx,
			   int rel_patch_chan,
			   int temp_reg, bool is_patch_var)
{
	int r;

	/* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
	/* ADD
	   Dimension - patch0_offset (input_vals.z),
	   Non-dim - patch0_data_offset (input_vals.w)
	*/
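	/* i.e. temp_reg.x = rel_patch_id * patch_stride +
	 *      (is_patch_var ? patch0_data_offset : patch0_offset) */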
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   ctx->tess_output_info, 0,
			   0, rel_patch_chan,
			   ctx->tess_output_info, is_patch_var ? 3 : 2);
	if (r)
		return r;
	return 0;
}

static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
{
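	/* address-file slot 0 is the AR register; slots 1..n map to the
	 * extra index registers */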
	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
}

static int r600_get_temp(struct r600_shader_ctx *ctx)
{
	return ctx->temp_reg + ctx->max_driver_temp_used++;
}

static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
{
	int i;
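	/* PrimitiveID arrives in r0.z, hence gpr 0 and write_mask 0x4 (.z) */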
	i = ctx->shader->noutput++;
	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
	ctx->shader->output[i].sid = 0;
	ctx->shader->output[i].gpr = 0;
	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
	ctx->shader->output[i].write_mask = 0x4;
	ctx->shader->output[i].spi_sid = prim_id_sid;

	return 0;
}

static int tgsi_barrier(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.last = 1;

	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < ARRAY_SIZE(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == PIPE_SHADER_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				if (ctx->bc->chip_class >= EVERGREEN) {
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < ARRAY_SIZE(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == PIPE_SHADER_VERTEX ||
			    ctx->type == PIPE_SHADER_GEOMETRY ||
			    ctx->type == PIPE_SHADER_TESS_EVAL) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == PIPE_SHADER_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			} else if (ctx->type == PIPE_SHADER_FRAGMENT) {
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_COLOR:
					ctx->shader->nr_ps_max_color_exports++;
					break;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				r600_add_gpr_array(ctx->shader,
						   ctx->file_offset[TGSI_FILE_TEMPORARY] +
						   d->Range.First,
						   d->Range.Last - d->Range.First + 1, 0x0F);
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
	case TGSI_FILE_BUFFER:
	case TGSI_FILE_IMAGE:
	case TGSI_FILE_MEMORY:
		break;

	case TGSI_FILE_HW_ATOMIC:
		i = ctx->shader->nhwatomic_ranges;
		ctx->shader->atomics[i].start = d->Range.First;
		ctx->shader->atomics[i].end = d->Range.Last;
		ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
		ctx->shader->atomics[i].array_id = d->Array.ArrayID;
		ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
		ctx->shader->nhwatomic_ranges++;
		ctx->shader->nhwatomic += count;
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
		    d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
		    d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
			 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
			int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
			int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
			unsigned temp_reg = r600_get_temp(ctx);

			r = get_lds_offset0(ctx, 2, temp_reg, true);
			if (r)
				return r;

			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 0,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;

			do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
		}
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
			/* MOV r1.x, r0.x;
			   MOV r1.y, r0.y;
			*/
			for (i = 0; i < 2; i++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = 0;
				alu.src[0].chan = 0 + i;
				alu.dst.sel = 1;
				alu.dst.chan = 0 + i;
				alu.dst.write = 1;
				alu.last = (i == 1) ? 1 : 0;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			/* ADD r1.z, 1.0f, -r0.x */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = V_SQ_ALU_SRC_1;
			alu.src[1].sel = 1;
			alu.src[1].chan = 0;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* ADD r1.z, r1.z, -r1.y */
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = 1;
			alu.src[0].chan = 2;
			alu.src[1].sel = 1;
			alu.src[1].chan = 1;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
			break;
		}
		break;
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}

static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;
		int *reg;
		unsigned name, alternate_name;
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int num_regs = 0;
	unsigned k, i;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
			    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					inputs[1].enabled = true; /* needs SAMPLEID */
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				if (k >= 0)
					ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				for (k = 0; k < ARRAY_SIZE(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
					    d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;
			ctx->shader->nsys_inputs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}

/*
 * for evergreen we need to scan the shader to find the number of GPRs we need to
 * reserve for interpolation and system values
 *
 * we need to know if we are going to emit
 * any sample or centroid inputs
 * if perspective and linear are required
 */
static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
{
	unsigned i;
	int num_baryc;
	struct tgsi_parse_context parse;

	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));

	for (i = 0; i < ctx->info.num_inputs; i++) {
		int k;
		/* skip position/face/mask/sampleid */
		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
			continue;

		k = eg_get_interpolator_index(
			ctx->info.input_interpolate[i],
			ctx->info.input_interpolate_loc[i]);
		if (k >= 0)
			ctx->eg_interpolators[k].enabled = TRUE;
	}

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
			    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				if (k >= 0)
					ctx->eg_interpolators[k].enabled = true;
			}
		}
	}

	tgsi_parse_free(&parse);

	/* assign gpr to each interpolator according to priority */
	num_baryc = 0;
	for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
		if (ctx->eg_interpolators[i].enabled) {
			ctx->eg_interpolators[i].ij_index = num_baryc;
			num_baryc++;
		}
	}

	/* XXX PULL MODEL and LINE STIPPLE */

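	/* each GPR holds two ij barycentric pairs (see evergreen_interp_alu),
	 * so round up to the number of GPRs actually needed */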
	num_baryc = (num_baryc + 1) >> 1;
	return allocate_system_value_inputs(ctx, num_baryc);
}

/* sample_id == NULL means fetch for current sample */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	assert(ctx->fixed_pt_position_gpr != -1);

	t1 = r600_get_temp(ctx);

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 0;
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	return t1;
}

static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	if (load_block && ctx->cs_block_size_loaded)
		return ctx->cs_block_size_reg;
	if (!load_block && ctx->cs_grid_size_loaded)
		return ctx->cs_grid_size_reg;

	t1 = load_block ? ctx->cs_block_size_reg : ctx->cs_grid_size_reg;
	struct r600_bytecode_alu alu;
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_0;
	alu.dst.sel = t1;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = t1;
	vtx.src_sel_x = 0;

	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 7;
	vtx.data_format = FMT_32_32_32_32;
	vtx.num_format_all = 1;
	vtx.format_comp_all = 0;
	vtx.use_const_fields = 0;
	vtx.offset = load_block ? 0 : 16; // first element is size of buffer
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	if (load_block)
		ctx->cs_block_size_loaded = true;
	else
		ctx->cs_grid_size_loaded = true;
	return t1;
}

static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
		    (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
		    (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			r600_src->swizzle[0] = 2; // Z value
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
			r600_src->sel = 1;
		} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
			r600_src->sel = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
			r600_src->sel = 2;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
			if (ctx->type == PIPE_SHADER_TESS_CTRL) {
				r600_src->sel = ctx->tess_input_info;
				r600_src->swizzle[0] = 2;
				r600_src->swizzle[1] = 2;
				r600_src->swizzle[2] = 2;
				r600_src->swizzle[3] = 2;
			} else {
				r600_src->sel = ctx->tess_input_info;
				r600_src->swizzle[0] = 3;
				r600_src->swizzle[1] = 3;
				r600_src->swizzle[2] = 3;
				r600_src->swizzle[3] = 3;
			}
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, false);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, true);
		}
	} else {
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}

static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
				unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
				unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.src[0].chan = ar_chan;

		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.chan = ar_chan;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ar_reg;
	vtx.src_sel_x = ar_chan;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
	vtx.endian = r600_endian_swap(32);
	vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}

static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	int r;
	unsigned index = src->Register.Index;
	unsigned vtx_id = src->Dimension.Index;
	int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
	int offset_chan = vtx_id % 3;
	int t2 = 0;

	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */

	if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
		offset_chan = 3;

	if (src->Dimension.Indirect || src->Register.Indirect)
		t2 = r600_get_temp(ctx);

	if (src->Dimension.Indirect) {
		int treg[3];
		struct r600_bytecode_alu alu;
		int r, i;
		unsigned addr_reg;
		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
		if (src->DimIndirect.Index > 0) {
			r = single_alu_op2(ctx, ALU_OP1_MOV,
					   ctx->bc->ar_reg, 0,
					   addr_reg, 0,
					   0, 0);
			if (r)
				return r;
		}
		/*
		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
		   at least this is what fglrx seems to do. */
		for (i = 0; i < 3; i++) {
			treg[i] = r600_get_temp(ctx);
		}
		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);

		for (i = 0; i < 3; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].sel = ctx->gs_rotated_input[0];
			alu.src[0].chan = i == 2 ? 3 : i;
			alu.dst.sel = treg[i];
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = treg[0];
		alu.src[0].rel = 1;
		alu.dst.sel = t2;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
	}

	if (src->Register.Indirect) {
		int addr_reg;
		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];

		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);

		/* pull the value from index_reg */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   t2, 1,
				   addr_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, first);
		if (r)
			return r;
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   t2, 0,
				   t2, 1,
				   V_SQ_ALU_SRC_LITERAL, 4,
				   offset_reg, offset_chan);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
		index = src->Register.Index - first;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = offset_reg;
	vtx.src_sel_x = offset_chan;
	vtx.offset = index * 16; /* bytes */
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	if (ctx->bc->chip_class >= EVERGREEN) {
		vtx.use_const_fields = 1;
	} else {
		vtx.data_format = FMT_32_32_32_32_FLOAT;
	}

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}

static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned i;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		struct tgsi_full_src_register *src = &inst->Src[i];

		if (src->Register.File == TGSI_FILE_INPUT) {
			if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
				/* primitive id is in R0.z */
				ctx->src[i].sel = 0;
				ctx->src[i].swizzle[0] = 2;
			}
		}
		if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
			int treg = r600_get_temp(ctx);

			fetch_gs_input(ctx, src, treg);
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
		}
	}
	return 0;
}


1687 /* Tessellation shaders pass outputs to the next shader using LDS.
1688 *
1689 * LS outputs = TCS(HS) inputs
1690 * TCS(HS) outputs = TES(DS) inputs
1691 *
1692 * The LDS layout is:
1693 * - TCS inputs for patch 0
1694 * - TCS inputs for patch 1
1695 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
1696 * - ...
1697 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
1698 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
1699 * - TCS outputs for patch 1
1700 * - Per-patch TCS outputs for patch 1
1701 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
1702 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
1703 * - ...
1704 *
1705 * All three shaders VS(LS), TCS, TES share the same LDS space.
1706 */
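/* Editor's sketch (not upstream documentation): with the layout above, the
 * byte address of one component of a vertex input/output works out to
 *   addr = patch_base + vertex_index * stride_bytes + param * 16 + chan * 4
 * where param is the slot from r600_get_lds_unique_index(); the first three
 * terms are built by r600_get_byte_address() below and the per-channel
 * "+ chan * 4" step is added in do_lds_fetch_values(). */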
1707 /* this will return with the byte address in temp_reg.x */
1708 static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
1709 const struct tgsi_full_dst_register *dst,
1710 const struct tgsi_full_src_register *src,
1711 int stride_bytes_reg, int stride_bytes_chan)
1712 {
1713 struct tgsi_full_dst_register reg;
1714 ubyte *name, *index, *array_first;
1715 int r;
1716 int param;
1717 struct tgsi_shader_info *info = &ctx->info;
1718 /* Set the register description. The address computation is the same
1719 * for sources and destinations. */
1720 if (src) {
1721 reg.Register.File = src->Register.File;
1722 reg.Register.Index = src->Register.Index;
1723 reg.Register.Indirect = src->Register.Indirect;
1724 reg.Register.Dimension = src->Register.Dimension;
1725 reg.Indirect = src->Indirect;
1726 reg.Dimension = src->Dimension;
1727 reg.DimIndirect = src->DimIndirect;
1728 } else
1729 reg = *dst;
1730
1731 /* If the register is 2-dimensional (e.g. an array of vertices
1732 * in a primitive), calculate the base address of the vertex. */
1733 if (reg.Register.Dimension) {
1734 int sel, chan;
1735 if (reg.Dimension.Indirect) {
1736 unsigned addr_reg;
1737 assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);
1738
1739 addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
1740 /* pull the value from index_reg */
1741 sel = addr_reg;
1742 chan = 0;
1743 } else {
1744 sel = V_SQ_ALU_SRC_LITERAL;
1745 chan = reg.Dimension.Index;
1746 }
1747
1748 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
1749 temp_reg, 0,
1750 stride_bytes_reg, stride_bytes_chan,
1751 sel, chan,
1752 temp_reg, 0);
1753 if (r)
1754 return r;
1755 }
1756
1757 if (reg.Register.File == TGSI_FILE_INPUT) {
1758 name = info->input_semantic_name;
1759 index = info->input_semantic_index;
1760 array_first = info->input_array_first;
1761 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
1762 name = info->output_semantic_name;
1763 index = info->output_semantic_index;
1764 array_first = info->output_array_first;
1765 } else {
1766 assert(0);
1767 return -1;
1768 }
1769 if (reg.Register.Indirect) {
1770 int addr_reg;
1771 int first;
1772 /* Add the relative address of the element. */
1773 if (reg.Indirect.ArrayID)
1774 first = array_first[reg.Indirect.ArrayID];
1775 else
1776 first = reg.Register.Index;
1777
1778 addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);
1779
1780 /* pull the value from index_reg */
1781 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
1782 temp_reg, 0,
1783 V_SQ_ALU_SRC_LITERAL, 16,
1784 addr_reg, 0,
1785 temp_reg, 0);
1786 if (r)
1787 return r;
1788
1789 param = r600_get_lds_unique_index(name[first],
1790 index[first]);
1791
1792 } else {
1793 param = r600_get_lds_unique_index(name[reg.Register.Index],
1794 index[reg.Register.Index]);
1795 }
1796
1797 /* add to base_addr - passed in temp_reg.x */
1798 if (param) {
1799 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1800 temp_reg, 0,
1801 temp_reg, 0,
1802 V_SQ_ALU_SRC_LITERAL, param * 16);
1803 if (r)
1804 return r;
1805
1806 }
1807 return 0;
1808 }
1809
1810 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
1811 unsigned dst_reg, unsigned mask)
1812 {
1813 struct r600_bytecode_alu alu;
1814 int r, i, lasti;
1815
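	/* each ALU instruction is two dwords, so ndw >> 1 is the number of
	 * instructions in the current clause; force a new CF here, presumably
	 * to stay safely below the hardware's ALU clause size limit */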
1816 if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
1817 ctx->bc->force_add_cf = 1;
1818
1819 lasti = tgsi_last_instruction(mask);
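	/* channel 0 reuses the base address already in temp.x; the remaining
	 * enabled channels get base + 4*i (byte offset of dword i) */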
1820 for (i = 1; i <= lasti; i++) {
1821 if (!(mask & (1 << i)))
1822 continue;
1823
1824 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1825 temp_reg, i,
1826 temp_reg, 0,
1827 V_SQ_ALU_SRC_LITERAL, 4 * i);
1828 if (r)
1829 return r;
1830 }
1831 for (i = 0; i <= lasti; i++) {
1832 if (!(mask & (1 << i)))
1833 continue;
1834
1835 /* emit an LDS_READ_RET */
1836 memset(&alu, 0, sizeof(alu));
1837 alu.op = LDS_OP1_LDS_READ_RET;
1838 alu.src[0].sel = temp_reg;
1839 alu.src[0].chan = i;
1840 alu.src[1].sel = V_SQ_ALU_SRC_0;
1841 alu.src[2].sel = V_SQ_ALU_SRC_0;
1842 alu.dst.chan = 0;
1843 alu.is_lds_idx_op = true;
1844 alu.last = 1;
1845 r = r600_bytecode_add_alu(ctx->bc, &alu);
1846 if (r)
1847 return r;
1848 }
1849 for (i = 0; i <= lasti; i++) {
1850 if (!(mask & (1 << i)))
1851 continue;
1852
1853 /* then read from LDS_OQ_A_POP */
1854 memset(&alu, 0, sizeof(alu));
1855
1856 alu.op = ALU_OP1_MOV;
1857 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
1858 alu.src[0].chan = 0;
1859 alu.dst.sel = dst_reg;
1860 alu.dst.chan = i;
1861 alu.dst.write = 1;
1862 alu.last = 1;
1863 r = r600_bytecode_add_alu(ctx->bc, &alu);
1864 if (r)
1865 return r;
1866 }
1867 return 0;
1868 }
1869
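/* Build a channel mask from a source swizzle. For example, a .xxyz swizzle
 * (SwizzleX=0, SwizzleY=0, SwizzleZ=1, SwizzleW=2) yields mask 0x7, so only
 * channels x, y and z need to be fetched. */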
1870 static int fetch_mask(struct tgsi_src_register *reg)
1871 {
1872 int mask = 0;
1873 mask |= 1 << reg->SwizzleX;
1874 mask |= 1 << reg->SwizzleY;
1875 mask |= 1 << reg->SwizzleZ;
1876 mask |= 1 << reg->SwizzleW;
1877 return mask;
1878 }
1879
1880 static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1881 {
1882 int r;
1883 unsigned temp_reg = r600_get_temp(ctx);
1884
1885 	r = get_lds_offset0(ctx, 2, temp_reg,
1886 			    !src->Register.Dimension);
1887 if (r)
1888 return r;
1889
1890 /* the base address is now in temp.x */
1891 r = r600_get_byte_address(ctx, temp_reg,
1892 NULL, src, ctx->tess_output_info, 1);
1893 if (r)
1894 return r;
1895
1896 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1897 if (r)
1898 return r;
1899 return 0;
1900 }
1901
1902 static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1903 {
1904 int r;
1905 unsigned temp_reg = r600_get_temp(ctx);
1906
1907 /* t.x = ips * r0.y */
1908 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
1909 temp_reg, 0,
1910 ctx->tess_input_info, 0,
1911 0, 1);
1912
1913 if (r)
1914 return r;
1915
1916 /* the base address is now in temp.x */
1917 r = r600_get_byte_address(ctx, temp_reg,
1918 NULL, src, ctx->tess_input_info, 1);
1919 if (r)
1920 return r;
1921
1922 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1923 if (r)
1924 return r;
1925 return 0;
1926 }
1927
1928 static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1929 {
1930 int r;
1931 unsigned temp_reg = r600_get_temp(ctx);
1932
1933 	r = get_lds_offset0(ctx, 1, temp_reg,
1934 			    !src->Register.Dimension);
1935 if (r)
1936 return r;
1937 /* the base address is now in temp.x */
1938 r = r600_get_byte_address(ctx, temp_reg,
1939 NULL, src,
1940 ctx->tess_output_info, 1);
1941 if (r)
1942 return r;
1943
1944 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1945 if (r)
1946 return r;
1947 return 0;
1948 }
1949
1950 static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
1951 {
1952 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1953 	unsigned i;
	int r;
1954
1955 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1956 struct tgsi_full_src_register *src = &inst->Src[i];
1957
1958 if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
1959 int treg = r600_get_temp(ctx);
1960 			r = fetch_tes_input(ctx, src, treg);
			if (r)
				return r;
1961 ctx->src[i].sel = treg;
1962 ctx->src[i].rel = 0;
1963 }
1964 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
1965 int treg = r600_get_temp(ctx);
1966 			r = fetch_tcs_input(ctx, src, treg);
			if (r)
				return r;
1967 ctx->src[i].sel = treg;
1968 ctx->src[i].rel = 0;
1969 }
1970 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
1971 int treg = r600_get_temp(ctx);
1972 			r = fetch_tcs_output(ctx, src, treg);
			if (r)
				return r;
1973 ctx->src[i].sel = treg;
1974 ctx->src[i].rel = 0;
1975 }
1976 }
1977 return 0;
1978 }
1979
1980 static int tgsi_split_constant(struct r600_shader_ctx *ctx)
1981 {
1982 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1983 struct r600_bytecode_alu alu;
1984 int i, j, k, nconst, r;
1985
1986 for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
1987 if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
1988 nconst++;
1989 }
1990 tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
1991 }
1992 for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
1993 if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
1994 continue;
1995 }
1996
1997 if (ctx->src[i].rel) {
1998 int chan = inst->Src[i].Indirect.Swizzle;
1999 int treg = r600_get_temp(ctx);
2000 if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
2001 return r;
2002
2003 ctx->src[i].kc_bank = 0;
2004 ctx->src[i].kc_rel = 0;
2005 ctx->src[i].sel = treg;
2006 ctx->src[i].rel = 0;
2007 j--;
2008 } else if (j > 0) {
2009 int treg = r600_get_temp(ctx);
2010 for (k = 0; k < 4; k++) {
2011 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2012 alu.op = ALU_OP1_MOV;
2013 alu.src[0].sel = ctx->src[i].sel;
2014 alu.src[0].chan = k;
2015 alu.src[0].rel = ctx->src[i].rel;
2016 alu.src[0].kc_bank = ctx->src[i].kc_bank;
2017 alu.src[0].kc_rel = ctx->src[i].kc_rel;
2018 alu.dst.sel = treg;
2019 alu.dst.chan = k;
2020 alu.dst.write = 1;
2021 if (k == 3)
2022 alu.last = 1;
2023 r = r600_bytecode_add_alu(ctx->bc, &alu);
2024 if (r)
2025 return r;
2026 }
2027 ctx->src[i].sel = treg;
2028 			ctx->src[i].rel = 0;
2029 j--;
2030 }
2031 }
2032 return 0;
2033 }
2034
2035 /* we need to move any immediates into a temp register - e.g. the trig functions use literals for their PI constants */
2036 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
2037 {
2038 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2039 struct r600_bytecode_alu alu;
2040 int i, j, k, nliteral, r;
2041
2042 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
2043 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2044 nliteral++;
2045 }
2046 }
2047 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
2048 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2049 int treg = r600_get_temp(ctx);
2050 for (k = 0; k < 4; k++) {
2051 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2052 alu.op = ALU_OP1_MOV;
2053 alu.src[0].sel = ctx->src[i].sel;
2054 alu.src[0].chan = k;
2055 alu.src[0].value = ctx->src[i].value[k];
2056 alu.dst.sel = treg;
2057 alu.dst.chan = k;
2058 alu.dst.write = 1;
2059 if (k == 3)
2060 alu.last = 1;
2061 r = r600_bytecode_add_alu(ctx->bc, &alu);
2062 if (r)
2063 return r;
2064 }
2065 ctx->src[i].sel = treg;
2066 j--;
2067 }
2068 }
2069 return 0;
2070 }
2071
2072 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
2073 {
2074 int i, r, count = ctx->shader->ninput;
2075
2076 for (i = 0; i < count; i++) {
2077 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2078 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
2079 if (r)
2080 return r;
2081 }
2082 }
2083 return 0;
2084 }
2085
2086 static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
2087 int stream, unsigned *stream_item_size UNUSED)
2088 {
2089 unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
2090 unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
2091 int j, r;
2092 unsigned i;
2093
2094 /* Sanity checking. */
2095 if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
2096 R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
2097 r = -EINVAL;
2098 goto out_err;
2099 }
2100 for (i = 0; i < so->num_outputs; i++) {
2101 if (so->output[i].output_buffer >= 4) {
2102 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
2103 so->output[i].output_buffer);
2104 r = -EINVAL;
2105 goto out_err;
2106 }
2107 }
2108
2109 /* Initialize locations where the outputs are stored. */
2110 for (i = 0; i < so->num_outputs; i++) {
2111
2112 so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
2113 start_comp[i] = so->output[i].start_component;
2114 /* Lower outputs with dst_offset < start_component.
2115 *
2116 * We can only output 4D vectors with a write mask, e.g. we can
2117 * only output the W component at offset 3, etc. If we want
2118 * to store Y, Z, or W at buffer offset 0, we need to use MOV
2119 * to move it to X and output X. */
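		/* e.g. streaming out .zw with dst_offset 0 becomes
		 *   MOV tmp.x, out.z; MOV tmp.y, out.w;
		 * and tmp.xy is exported instead */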
2120 if (so->output[i].dst_offset < so->output[i].start_component) {
2121 unsigned tmp = r600_get_temp(ctx);
2122
2123 for (j = 0; j < so->output[i].num_components; j++) {
2124 struct r600_bytecode_alu alu;
2125 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2126 alu.op = ALU_OP1_MOV;
2127 alu.src[0].sel = so_gpr[i];
2128 alu.src[0].chan = so->output[i].start_component + j;
2129
2130 alu.dst.sel = tmp;
2131 alu.dst.chan = j;
2132 alu.dst.write = 1;
2133 if (j == so->output[i].num_components - 1)
2134 alu.last = 1;
2135 r = r600_bytecode_add_alu(ctx->bc, &alu);
2136 if (r)
2137 return r;
2138 }
2139 start_comp[i] = 0;
2140 so_gpr[i] = tmp;
2141 }
2142 }
2143
2144 /* Write outputs to buffers. */
2145 for (i = 0; i < so->num_outputs; i++) {
2146 struct r600_bytecode_output output;
2147
2148 if (stream != -1 && stream != so->output[i].stream)
2149 continue;
2150
2151 memset(&output, 0, sizeof(struct r600_bytecode_output));
2152 output.gpr = so_gpr[i];
2153 output.elem_size = so->output[i].num_components - 1;
2154 if (output.elem_size == 2)
2155 output.elem_size = 3; // 3 not supported, write 4 with junk at end
2156 output.array_base = so->output[i].dst_offset - start_comp[i];
2157 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2158 output.burst_count = 1;
2159 /* array_size is an upper limit for the burst_count
2160 * with MEM_STREAM instructions */
2161 output.array_size = 0xFFF;
2162 output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
2163
2164 if (ctx->bc->chip_class >= EVERGREEN) {
2165 switch (so->output[i].output_buffer) {
2166 case 0:
2167 output.op = CF_OP_MEM_STREAM0_BUF0;
2168 break;
2169 case 1:
2170 output.op = CF_OP_MEM_STREAM0_BUF1;
2171 break;
2172 case 2:
2173 output.op = CF_OP_MEM_STREAM0_BUF2;
2174 break;
2175 case 3:
2176 output.op = CF_OP_MEM_STREAM0_BUF3;
2177 break;
2178 }
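			/* the CF_OP_MEM_STREAMn_BUFm opcodes are laid out
			 * consecutively, four buffers per stream, so this
			 * step selects MEM_STREAM<stream>_BUF<output_buffer>
			 * (the assert below checks we stayed in range) */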
2179 output.op += so->output[i].stream * 4;
2180 assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
2181 ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
2182 } else {
2183 switch (so->output[i].output_buffer) {
2184 case 0:
2185 output.op = CF_OP_MEM_STREAM0;
2186 break;
2187 case 1:
2188 output.op = CF_OP_MEM_STREAM1;
2189 break;
2190 case 2:
2191 output.op = CF_OP_MEM_STREAM2;
2192 break;
2193 case 3:
2194 output.op = CF_OP_MEM_STREAM3;
2195 break;
2196 }
2197 ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
2198 }
2199 r = r600_bytecode_add_output(ctx->bc, &output);
2200 if (r)
2201 goto out_err;
2202 }
2203 return 0;
2204 out_err:
2205 return r;
2206 }
2207
2208 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
2209 {
2210 struct r600_bytecode_alu alu;
2211 unsigned reg;
2212
2213 if (!ctx->shader->vs_out_edgeflag)
2214 return;
2215
2216 reg = ctx->shader->output[ctx->edgeflag_output].gpr;
2217
2218 /* clamp(x, 0, 1) */
2219 memset(&alu, 0, sizeof(alu));
2220 alu.op = ALU_OP1_MOV;
2221 alu.src[0].sel = reg;
2222 alu.dst.sel = reg;
2223 alu.dst.write = 1;
2224 alu.dst.clamp = 1;
2225 alu.last = 1;
2226 r600_bytecode_add_alu(ctx->bc, &alu);
2227
2228 memset(&alu, 0, sizeof(alu));
2229 alu.op = ALU_OP1_FLT_TO_INT;
2230 alu.src[0].sel = reg;
2231 alu.dst.sel = reg;
2232 alu.dst.write = 1;
2233 alu.last = 1;
2234 r600_bytecode_add_alu(ctx->bc, &alu);
2235 }
2236
2237 static int generate_gs_copy_shader(struct r600_context *rctx,
2238 struct r600_pipe_shader *gs,
2239 struct pipe_stream_output_info *so)
2240 {
2241 struct r600_shader_ctx ctx = {};
2242 struct r600_shader *gs_shader = &gs->shader;
2243 struct r600_pipe_shader *cshader;
2244 unsigned ocnt = gs_shader->noutput;
2245 struct r600_bytecode_alu alu;
2246 struct r600_bytecode_vtx vtx;
2247 struct r600_bytecode_output output;
2248 struct r600_bytecode_cf *cf_jump, *cf_pop,
2249 *last_exp_pos = NULL, *last_exp_param = NULL;
2250 int next_clip_pos = 61, next_param = 0;
2251 unsigned i, j;
2252 int ring;
2253 bool only_ring_0 = true;
2254 cshader = calloc(1, sizeof(struct r600_pipe_shader));
2255 if (!cshader)
2256 return 0;
2257
2258 memcpy(cshader->shader.output, gs_shader->output, ocnt *
2259 sizeof(struct r600_shader_io));
2260
2261 cshader->shader.noutput = ocnt;
2262
2263 ctx.shader = &cshader->shader;
2264 ctx.bc = &ctx.shader->bc;
2265 ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;
2266
2267 r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
2268 rctx->screen->has_compressed_msaa_texturing);
2269
2270 ctx.bc->isa = rctx->isa;
2271
2272 cf_jump = NULL;
2273 memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));
2274
2275 /* R0.x = R0.x & 0x3fffffff */
2276 memset(&alu, 0, sizeof(alu));
2277 alu.op = ALU_OP2_AND_INT;
2278 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2279 alu.src[1].value = 0x3fffffff;
2280 alu.dst.write = 1;
2281 r600_bytecode_add_alu(ctx.bc, &alu);
2282
2283 /* R0.y = R0.x >> 30 */
2284 memset(&alu, 0, sizeof(alu));
2285 alu.op = ALU_OP2_LSHR_INT;
2286 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2287 alu.src[1].value = 0x1e;
2288 alu.dst.chan = 1;
2289 alu.dst.write = 1;
2290 alu.last = 1;
2291 r600_bytecode_add_alu(ctx.bc, &alu);
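	/* both ops above issue in one ALU group, so each reads the original
	 * R0.x: R0.x now holds the vertex fetch index into the GSVS ring and
	 * R0.y the stream index from the top two bits */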
2292
2293 /* fetch vertex data from GSVS ring */
2294 for (i = 0; i < ocnt; ++i) {
2295 struct r600_shader_io *out = &ctx.shader->output[i];
2296
2297 out->gpr = i + 1;
2298 out->ring_offset = i * 16;
2299
2300 memset(&vtx, 0, sizeof(vtx));
2301 vtx.op = FETCH_OP_VFETCH;
2302 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
2303 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2304 vtx.mega_fetch_count = 16;
2305 vtx.offset = out->ring_offset;
2306 vtx.dst_gpr = out->gpr;
2307 vtx.src_gpr = 0;
2308 vtx.dst_sel_x = 0;
2309 vtx.dst_sel_y = 1;
2310 vtx.dst_sel_z = 2;
2311 vtx.dst_sel_w = 3;
2312 if (rctx->b.chip_class >= EVERGREEN) {
2313 vtx.use_const_fields = 1;
2314 } else {
2315 vtx.data_format = FMT_32_32_32_32_FLOAT;
2316 }
2317
2318 r600_bytecode_add_vtx(ctx.bc, &vtx);
2319 }
2320 ctx.temp_reg = i + 1;
2321 for (ring = 3; ring >= 0; --ring) {
2322 bool enabled = false;
2323 for (i = 0; i < so->num_outputs; i++) {
2324 if (so->output[i].stream == ring) {
2325 enabled = true;
2326 if (ring > 0)
2327 only_ring_0 = false;
2328 break;
2329 }
2330 }
2331 if (ring != 0 && !enabled) {
2332 cshader->shader.ring_item_sizes[ring] = 0;
2333 continue;
2334 }
2335
2336 if (cf_jump) {
2337 // Patch up jump label
2338 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2339 cf_pop = ctx.bc->cf_last;
2340
2341 cf_jump->cf_addr = cf_pop->id + 2;
2342 cf_jump->pop_count = 1;
2343 cf_pop->cf_addr = cf_pop->id + 2;
2344 cf_pop->pop_count = 1;
2345 }
2346
2347 /* PRED_SETE_INT __, R0.y, ring */
2348 memset(&alu, 0, sizeof(alu));
2349 alu.op = ALU_OP2_PRED_SETE_INT;
2350 alu.src[0].chan = 1;
2351 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2352 alu.src[1].value = ring;
2353 alu.execute_mask = 1;
2354 alu.update_pred = 1;
2355 alu.last = 1;
2356 r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
2357
2358 r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
2359 cf_jump = ctx.bc->cf_last;
2360
2361 if (enabled)
2362 emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
2363 cshader->shader.ring_item_sizes[ring] = ocnt * 16;
2364 }
2365
2366 /* bc adds nops - copy it */
2367 if (ctx.bc->chip_class == R600) {
2368 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2369 alu.op = ALU_OP0_NOP;
2370 alu.last = 1;
2371 r600_bytecode_add_alu(ctx.bc, &alu);
2372
2373 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2374 }
2375
2376 /* export vertex data */
2377 /* XXX factor out common code with r600_shader_from_tgsi ? */
2378 for (i = 0; i < ocnt; ++i) {
2379 struct r600_shader_io *out = &ctx.shader->output[i];
2380 bool instream0 = true;
2381 if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
2382 continue;
2383
2384 for (j = 0; j < so->num_outputs; j++) {
2385 if (so->output[j].register_index == i) {
2386 if (so->output[j].stream == 0)
2387 break;
2388 if (so->output[j].stream > 0)
2389 instream0 = false;
2390 }
2391 }
2392 if (!instream0)
2393 continue;
2394 memset(&output, 0, sizeof(output));
2395 output.gpr = out->gpr;
2396 output.elem_size = 3;
2397 output.swizzle_x = 0;
2398 output.swizzle_y = 1;
2399 output.swizzle_z = 2;
2400 output.swizzle_w = 3;
2401 output.burst_count = 1;
2402 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2403 output.op = CF_OP_EXPORT;
2404 switch (out->name) {
2405 case TGSI_SEMANTIC_POSITION:
2406 output.array_base = 60;
2407 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2408 break;
2409
2410 case TGSI_SEMANTIC_PSIZE:
2411 output.array_base = 61;
2412 if (next_clip_pos == 61)
2413 next_clip_pos = 62;
2414 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2415 output.swizzle_y = 7;
2416 output.swizzle_z = 7;
2417 output.swizzle_w = 7;
2418 ctx.shader->vs_out_misc_write = 1;
2419 ctx.shader->vs_out_point_size = 1;
2420 break;
2421 case TGSI_SEMANTIC_LAYER:
2422 if (out->spi_sid) {
2423 /* duplicate it as PARAM to pass to the pixel shader */
2424 output.array_base = next_param++;
2425 r600_bytecode_add_output(ctx.bc, &output);
2426 last_exp_param = ctx.bc->cf_last;
2427 }
2428 output.array_base = 61;
2429 if (next_clip_pos == 61)
2430 next_clip_pos = 62;
2431 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2432 output.swizzle_x = 7;
2433 output.swizzle_y = 7;
2434 output.swizzle_z = 0;
2435 output.swizzle_w = 7;
2436 ctx.shader->vs_out_misc_write = 1;
2437 ctx.shader->vs_out_layer = 1;
2438 break;
2439 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2440 if (out->spi_sid) {
2441 /* duplicate it as PARAM to pass to the pixel shader */
2442 output.array_base = next_param++;
2443 r600_bytecode_add_output(ctx.bc, &output);
2444 last_exp_param = ctx.bc->cf_last;
2445 }
2446 output.array_base = 61;
2447 if (next_clip_pos == 61)
2448 next_clip_pos = 62;
2449 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2450 ctx.shader->vs_out_misc_write = 1;
2451 ctx.shader->vs_out_viewport = 1;
2452 output.swizzle_x = 7;
2453 output.swizzle_y = 7;
2454 output.swizzle_z = 7;
2455 output.swizzle_w = 0;
2456 break;
2457 case TGSI_SEMANTIC_CLIPDIST:
2458 /* spi_sid is 0 for clipdistance outputs that were generated
2459 * for clipvertex - we don't need to pass them to PS */
2460 ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
2461 ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
2462 ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
2463 if (out->spi_sid) {
2464 /* duplicate it as PARAM to pass to the pixel shader */
2465 output.array_base = next_param++;
2466 r600_bytecode_add_output(ctx.bc, &output);
2467 last_exp_param = ctx.bc->cf_last;
2468 }
2469 output.array_base = next_clip_pos++;
2470 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2471 break;
2472 case TGSI_SEMANTIC_FOG:
2473 output.swizzle_y = 4; /* 0 */
2474 output.swizzle_z = 4; /* 0 */
2475 output.swizzle_w = 5; /* 1 */
2476 break;
2477 default:
2478 output.array_base = next_param++;
2479 break;
2480 }
2481 r600_bytecode_add_output(ctx.bc, &output);
2482 if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
2483 last_exp_param = ctx.bc->cf_last;
2484 else
2485 last_exp_pos = ctx.bc->cf_last;
2486 }
2487
2488 if (!last_exp_pos) {
2489 memset(&output, 0, sizeof(output));
2490 output.gpr = 0;
2491 output.elem_size = 3;
2492 output.swizzle_x = 7;
2493 output.swizzle_y = 7;
2494 output.swizzle_z = 7;
2495 output.swizzle_w = 7;
2496 output.burst_count = 1;
2498 output.op = CF_OP_EXPORT;
2499 output.array_base = 60;
2500 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2501 r600_bytecode_add_output(ctx.bc, &output);
2502 last_exp_pos = ctx.bc->cf_last;
2503 }
2504
2505 if (!last_exp_param) {
2506 memset(&output, 0, sizeof(output));
2507 output.gpr = 0;
2508 output.elem_size = 3;
2509 output.swizzle_x = 7;
2510 output.swizzle_y = 7;
2511 output.swizzle_z = 7;
2512 output.swizzle_w = 7;
2513 output.burst_count = 1;
2515 output.op = CF_OP_EXPORT;
2516 output.array_base = next_param++;
2517 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2518 r600_bytecode_add_output(ctx.bc, &output);
2519 last_exp_param = ctx.bc->cf_last;
2520 }
2521
2522 last_exp_pos->op = CF_OP_EXPORT_DONE;
2523 last_exp_param->op = CF_OP_EXPORT_DONE;
2524
2525 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2526 cf_pop = ctx.bc->cf_last;
2527
2528 cf_jump->cf_addr = cf_pop->id + 2;
2529 cf_jump->pop_count = 1;
2530 cf_pop->cf_addr = cf_pop->id + 2;
2531 cf_pop->pop_count = 1;
2532
2533 if (ctx.bc->chip_class == CAYMAN)
2534 cm_bytecode_add_cf_end(ctx.bc);
2535 else {
2536 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2537 ctx.bc->cf_last->end_of_program = 1;
2538 }
2539
2540 gs->gs_copy_shader = cshader;
2541 cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
2542
2543 ctx.bc->nstack = 1;
2544
2545 return r600_bytecode_build(ctx.bc);
2546 }
2547
2548 static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2549 {
2550 if (ind) {
2551 struct r600_bytecode_alu alu;
2552 int r;
2553
2554 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2555 alu.op = ALU_OP2_ADD_INT;
2556 alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2557 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2558 alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2559 alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2560 alu.dst.write = 1;
2561 alu.last = 1;
2562 r = r600_bytecode_add_alu(ctx->bc, &alu);
2563 if (r)
2564 return r;
2565 }
2566 return 0;
2567 }
2568
2569 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so UNUSED, int stream, bool ind)
2570 {
2571 struct r600_bytecode_output output;
2572 int ring_offset;
2573 unsigned i, k;
2574 int effective_stream = stream == -1 ? 0 : stream;
2575 int idx = 0;
2576
2577 for (i = 0; i < ctx->shader->noutput; i++) {
2578 if (ctx->gs_for_vs) {
2579 			/* for ES we need to look up the corresponding ring offset
2580 			 * expected by GS (map this output to a GS input by name and sid) */
2581 /* FIXME precompute offsets */
2582 ring_offset = -1;
2583 for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
2584 struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
2585 struct r600_shader_io *out = &ctx->shader->output[i];
2586 if (in->name == out->name && in->sid == out->sid)
2587 ring_offset = in->ring_offset;
2588 }
2589
2590 if (ring_offset == -1)
2591 continue;
2592 } else {
2593 ring_offset = idx * 16;
2594 idx++;
2595 }
2596
2597 if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
2598 continue;
2599 		/* gs_out_ring_offset (set while parsing the input decls) holds the total
2600 		 * size of a single vertex's data; gs_next_vertex is the current vertex index */
2601 if (!ind)
2602 ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;
2603
2604 memset(&output, 0, sizeof(struct r600_bytecode_output));
2605 output.gpr = ctx->shader->output[i].gpr;
2606 output.elem_size = 3;
2607 output.comp_mask = 0xF;
2608 output.burst_count = 1;
2609
2610 if (ind)
2611 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
2612 else
2613 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2614
2615 switch (stream) {
2616 default:
2617 case 0:
2618 output.op = CF_OP_MEM_RING; break;
2619 case 1:
2620 output.op = CF_OP_MEM_RING1; break;
2621 case 2:
2622 output.op = CF_OP_MEM_RING2; break;
2623 case 3:
2624 output.op = CF_OP_MEM_RING3; break;
2625 }
2626
2627 if (ind) {
2628 output.array_base = ring_offset >> 2; /* in dwords */
2629 output.array_size = 0xfff;
2630 output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
2631 } else
2632 output.array_base = ring_offset >> 2; /* in dwords */
2633 r600_bytecode_add_output(ctx->bc, &output);
2634 }
2635
2636 ++ctx->gs_next_vertex;
2637 return 0;
2638 }
2639
2640
2641 static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2642 {
2643 int r;
2644 struct r600_bytecode_vtx vtx;
2645 int temp_val = ctx->temp_reg;
2646 	/* zero temp.x; it serves as the fetch index for the info VFETCHes below */
2647 r = single_alu_op2(ctx, ALU_OP1_MOV,
2648 temp_val, 0,
2649 V_SQ_ALU_SRC_LITERAL, 0,
2650 0, 0);
2651 if (r)
2652 return r;
2653
2654 /* used by VS/TCS */
2655 if (ctx->tess_input_info) {
2656 /* fetch tcs input values into resv space */
2657 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2658 vtx.op = FETCH_OP_VFETCH;
2659 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2660 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2661 vtx.mega_fetch_count = 16;
2662 vtx.data_format = FMT_32_32_32_32;
2663 vtx.num_format_all = 2;
2664 vtx.format_comp_all = 1;
2665 vtx.use_const_fields = 0;
2666 vtx.endian = r600_endian_swap(32);
2667 vtx.srf_mode_all = 1;
2668 vtx.offset = 0;
2669 vtx.dst_gpr = ctx->tess_input_info;
2670 vtx.dst_sel_x = 0;
2671 vtx.dst_sel_y = 1;
2672 vtx.dst_sel_z = 2;
2673 vtx.dst_sel_w = 3;
2674 vtx.src_gpr = temp_val;
2675 vtx.src_sel_x = 0;
2676
2677 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2678 if (r)
2679 return r;
2680 }
2681
2682 /* used by TCS/TES */
2683 if (ctx->tess_output_info) {
2684 /* fetch tcs output values into resv space */
2685 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2686 vtx.op = FETCH_OP_VFETCH;
2687 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2688 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2689 vtx.mega_fetch_count = 16;
2690 vtx.data_format = FMT_32_32_32_32;
2691 vtx.num_format_all = 2;
2692 vtx.format_comp_all = 1;
2693 vtx.use_const_fields = 0;
2694 vtx.endian = r600_endian_swap(32);
2695 vtx.srf_mode_all = 1;
2696 vtx.offset = 16;
2697 vtx.dst_gpr = ctx->tess_output_info;
2698 vtx.dst_sel_x = 0;
2699 vtx.dst_sel_y = 1;
2700 vtx.dst_sel_z = 2;
2701 vtx.dst_sel_w = 3;
2702 vtx.src_gpr = temp_val;
2703 vtx.src_sel_x = 0;
2704
2705 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2706 if (r)
2707 return r;
2708 }
2709 return 0;
2710 }
2711
2712 static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
2713 {
2714 int j, r;
2715 int temp_reg;
2716 unsigned i;
2717
2718 /* fetch tcs input values into input_vals */
2719 ctx->tess_input_info = r600_get_temp(ctx);
2720 ctx->tess_output_info = 0;
2721 r = r600_fetch_tess_io_info(ctx);
2722 if (r)
2723 return r;
2724
2725 temp_reg = r600_get_temp(ctx);
2726 /* dst reg contains LDS address stride * idx */
2727 /* MUL vertexID, vertex_dw_stride */
2728 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
2729 temp_reg, 0,
2730 ctx->tess_input_info, 1,
2731 0, 1); /* rel id in r0.y? */
2732 if (r)
2733 return r;
2734
2735 for (i = 0; i < ctx->shader->noutput; i++) {
2736 struct r600_bytecode_alu alu;
2737 int param = r600_get_lds_unique_index(ctx->shader->output[i].name, ctx->shader->output[i].sid);
2738
2739 if (param) {
2740 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2741 temp_reg, 1,
2742 temp_reg, 0,
2743 V_SQ_ALU_SRC_LITERAL, param * 16);
2744 if (r)
2745 return r;
2746 }
2747
2748 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2749 temp_reg, 2,
2750 temp_reg, param ? 1 : 0,
2751 V_SQ_ALU_SRC_LITERAL, 8);
2752 if (r)
2753 return r;
2754
2755
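		/* two LDS_WRITE_REL ops, each storing a pair of dwords:
		 * (x,y) at the computed offset and (z,w) at that offset + 8 */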
2756 for (j = 0; j < 2; j++) {
2757 int chan = (j == 1) ? 2 : (param ? 1 : 0);
2758 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2759 alu.op = LDS_OP3_LDS_WRITE_REL;
2760 alu.src[0].sel = temp_reg;
2761 alu.src[0].chan = chan;
2762 alu.src[1].sel = ctx->shader->output[i].gpr;
2763 alu.src[1].chan = j * 2;
2764 alu.src[2].sel = ctx->shader->output[i].gpr;
2765 alu.src[2].chan = (j * 2) + 1;
2766 alu.last = 1;
2767 alu.dst.chan = 0;
2768 alu.lds_idx = 1;
2769 alu.is_lds_idx_op = true;
2770 r = r600_bytecode_add_alu(ctx->bc, &alu);
2771 if (r)
2772 return r;
2773 }
2774 }
2775 return 0;
2776 }
2777
2778 static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
2779 {
2780 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2781 const struct tgsi_full_dst_register *dst = &inst->Dst[0];
2782 int i, r, lasti;
2783 int temp_reg = r600_get_temp(ctx);
2784 struct r600_bytecode_alu alu;
2785 unsigned write_mask = dst->Register.WriteMask;
2786
2787 if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
2788 return 0;
2789
2790 	r = get_lds_offset0(ctx, 1, temp_reg, !dst->Register.Dimension);
2791 if (r)
2792 return r;
2793
2794 /* the base address is now in temp.x */
2795 r = r600_get_byte_address(ctx, temp_reg,
2796 &inst->Dst[0], NULL, ctx->tess_output_info, 1);
2797 if (r)
2798 return r;
2799
2800 /* LDS write */
2801 lasti = tgsi_last_instruction(write_mask);
2802 for (i = 1; i <= lasti; i++) {
2803
2804 if (!(write_mask & (1 << i)))
2805 continue;
2806 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2807 temp_reg, i,
2808 temp_reg, 0,
2809 V_SQ_ALU_SRC_LITERAL, 4 * i);
2810 if (r)
2811 return r;
2812 }
2813
2814 for (i = 0; i <= lasti; i++) {
2815 if (!(write_mask & (1 << i)))
2816 continue;
2817
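		/* when both halves of a pair (xy or zw) are enabled, store the
		 * two dwords with a single LDS_WRITE_REL */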
2818 if ((i == 0 && ((write_mask & 3) == 3)) ||
2819 (i == 2 && ((write_mask & 0xc) == 0xc))) {
2820 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2821 alu.op = LDS_OP3_LDS_WRITE_REL;
2822 alu.src[0].sel = temp_reg;
2823 alu.src[0].chan = i;
2824
2825 alu.src[1].sel = dst->Register.Index;
2826 alu.src[1].sel += ctx->file_offset[dst->Register.File];
2827 alu.src[1].chan = i;
2828
2829 alu.src[2].sel = dst->Register.Index;
2830 alu.src[2].sel += ctx->file_offset[dst->Register.File];
2831 alu.src[2].chan = i + 1;
2832 alu.lds_idx = 1;
2833 alu.dst.chan = 0;
2834 alu.last = 1;
2835 alu.is_lds_idx_op = true;
2836 r = r600_bytecode_add_alu(ctx->bc, &alu);
2837 if (r)
2838 return r;
2839 i += 1;
2840 continue;
2841 }
2842 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2843 alu.op = LDS_OP2_LDS_WRITE;
2844 alu.src[0].sel = temp_reg;
2845 alu.src[0].chan = i;
2846
2847 alu.src[1].sel = dst->Register.Index;
2848 alu.src[1].sel += ctx->file_offset[dst->Register.File];
2849 alu.src[1].chan = i;
2850
2851 alu.src[2].sel = V_SQ_ALU_SRC_0;
2852 alu.dst.chan = 0;
2853 alu.last = 1;
2854 alu.is_lds_idx_op = true;
2855 r = r600_bytecode_add_alu(ctx->bc, &alu);
2856 if (r)
2857 return r;
2858 }
2859 return 0;
2860 }
2861
2862 static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
2863 int output_idx, int nc)
2864 {
2865 int param;
2866 unsigned temp_reg = r600_get_temp(ctx);
2867 unsigned name = ctx->shader->output[output_idx].name;
2868 int dreg = ctx->shader->output[output_idx].gpr;
2869 int r;
2870
2871 param = r600_get_lds_unique_index(name, 0);
2872 r = get_lds_offset0(ctx, 1, temp_reg, true);
2873 if (r)
2874 return r;
2875
2876 if (param) {
2877 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2878 temp_reg, 0,
2879 temp_reg, 0,
2880 V_SQ_ALU_SRC_LITERAL, param * 16);
2881 if (r)
2882 return r;
2883 }
2884
2885 	r = do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));
	if (r)
		return r;
2886 	return 0;
2887 }
2888
2889 static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
2890 {
2891 int stride, outer_comps, inner_comps;
2892 int tessinner_idx = -1, tessouter_idx = -1;
2893 int i, r;
2894 unsigned j;
2895 int temp_reg = r600_get_temp(ctx);
2896 int treg[3] = {-1, -1, -1};
2897 struct r600_bytecode_alu alu;
2898 struct r600_bytecode_cf *cf_jump, *cf_pop;
2899
2900 /* only execute factor emission for invocation 0 */
2901 	/* PRED_SETE_INT __, R0.z, 0 */
2902 memset(&alu, 0, sizeof(alu));
2903 alu.op = ALU_OP2_PRED_SETE_INT;
2904 alu.src[0].chan = 2;
2905 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2906 alu.execute_mask = 1;
2907 alu.update_pred = 1;
2908 alu.last = 1;
2909 r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);
2910
2911 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
2912 cf_jump = ctx->bc->cf_last;
2913
2914 treg[0] = r600_get_temp(ctx);
2915 switch (ctx->shader->tcs_prim_mode) {
2916 case PIPE_PRIM_LINES:
2917 stride = 8; /* 2 dwords, 1 vec2 store */
2918 outer_comps = 2;
2919 inner_comps = 0;
2920 break;
2921 case PIPE_PRIM_TRIANGLES:
2922 stride = 16; /* 4 dwords, 1 vec4 store */
2923 outer_comps = 3;
2924 inner_comps = 1;
2925 treg[1] = r600_get_temp(ctx);
2926 break;
2927 case PIPE_PRIM_QUADS:
2928 stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
2929 outer_comps = 4;
2930 inner_comps = 2;
2931 treg[1] = r600_get_temp(ctx);
2932 treg[2] = r600_get_temp(ctx);
2933 break;
2934 default:
2935 assert(0);
2936 return -1;
2937 }
2938
2939 	/* R0: .x = PatchID, .y = RelPatchID, .z = InvocationID, .w = tf_base */
2940 /* TF_WRITE takes index in R.x, value in R.y */
2941 for (j = 0; j < ctx->shader->noutput; j++) {
2942 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
2943 tessinner_idx = j;
2944 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
2945 tessouter_idx = j;
2946 }
2947
2948 if (tessouter_idx == -1)
2949 return -1;
2950
2951 if (tessinner_idx == -1 && inner_comps)
2952 return -1;
2953
2954 if (tessouter_idx != -1) {
2955 r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps);
2956 if (r)
2957 return r;
2958 }
2959
2960 if (tessinner_idx != -1) {
2961 r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps);
2962 if (r)
2963 return r;
2964 }
2965
2966 	/* temp.x = tf_base (r0.w) + RelPatchID (r0.y) * stride,
2967 	 * computed with a single MULADD_UINT24 below */
2968 
2971 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
2972 temp_reg, 0,
2973 0, 1,
2974 V_SQ_ALU_SRC_LITERAL, stride,
2975 0, 3);
2976 if (r)
2977 return r;
2978
2979 for (i = 0; i < outer_comps + inner_comps; i++) {
2980 int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
2981 int out_comp = i >= outer_comps ? i - outer_comps : i;
2982
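		/* editor's note: for isolines the hardware apparently expects the
		 * two outer factors in swapped order, hence the exchange below */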
2983 if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {
2984 if (out_comp == 1)
2985 out_comp = 0;
2986 else if (out_comp == 0)
2987 out_comp = 1;
2988 }
2989
2990 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2991 treg[i / 2], (2 * (i % 2)),
2992 temp_reg, 0,
2993 V_SQ_ALU_SRC_LITERAL, 4 * i);
2994 if (r)
2995 return r;
2996 r = single_alu_op2(ctx, ALU_OP1_MOV,
2997 treg[i / 2], 1 + (2 * (i%2)),
2998 ctx->shader->output[out_idx].gpr, out_comp,
2999 0, 0);
3000 if (r)
3001 return r;
3002 }
3003 for (i = 0; i < outer_comps + inner_comps; i++) {
3004 struct r600_bytecode_gds gds;
3005
3006 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
3007 gds.src_gpr = treg[i / 2];
3008 gds.src_sel_x = 2 * (i % 2);
3009 gds.src_sel_y = 1 + (2 * (i % 2));
3010 gds.src_sel_z = 4;
3011 gds.dst_sel_x = 7;
3012 gds.dst_sel_y = 7;
3013 gds.dst_sel_z = 7;
3014 gds.dst_sel_w = 7;
3015 gds.op = FETCH_OP_TF_WRITE;
3016 r = r600_bytecode_add_gds(ctx->bc, &gds);
3017 if (r)
3018 return r;
3019 }
3020
3021 // Patch up jump label
3022 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
3023 cf_pop = ctx->bc->cf_last;
3024
3025 cf_jump->cf_addr = cf_pop->id + 2;
3026 cf_jump->pop_count = 1;
3027 cf_pop->cf_addr = cf_pop->id + 2;
3028 cf_pop->pop_count = 1;
3029
3030 return 0;
3031 }
3032
3033 /*
3034  * We have to work out the thread ID for load and atomic
3035  * operations, which store the returned value to an index
3036  * in an intermediate buffer.
3037  * The index is calculated from the thread id, which is
3038  * computed with the MBCNT instructions.
3039  * The shader engine ID is multiplied by 256,
3040  * and the wave id is added.
3041  * Then the result is multiplied by 64 and the thread id
3042  * is added.
3043  */
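/* i.e. thread_id_gpr.y = (SE_ID * 256 + HW_WAVE_ID) * 64 + mbcnt(~0),
 * computed below with two MULADD_UINT24 ops */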
3044 static int load_thread_id_gpr(struct r600_shader_ctx *ctx)
3045 {
3046 struct r600_bytecode_alu alu;
3047 int r;
3048
3049 if (ctx->thread_id_gpr_loaded)
3050 return 0;
3051
3052 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3053 alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
3054 alu.dst.sel = ctx->temp_reg;
3055 alu.dst.chan = 0;
3056 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3057 alu.src[0].value = 0xffffffff;
3058 alu.dst.write = 1;
3059 r = r600_bytecode_add_alu(ctx->bc, &alu);
3060 if (r)
3061 return r;
3062
3063 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3064 alu.op = ALU_OP1_MBCNT_32HI_INT;
3065 alu.dst.sel = ctx->temp_reg;
3066 alu.dst.chan = 1;
3067 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3068 alu.src[0].value = 0xffffffff;
3069 alu.dst.write = 1;
3070 r = r600_bytecode_add_alu(ctx->bc, &alu);
3071 if (r)
3072 return r;
3073
3074 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3075 alu.op = ALU_OP3_MULADD_UINT24;
3076 alu.dst.sel = ctx->temp_reg;
3077 alu.dst.chan = 2;
3078 alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;
3079 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3080 alu.src[1].value = 256;
3081 alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;
3082 alu.dst.write = 1;
3083 alu.is_op3 = 1;
3084 alu.last = 1;
3085 r = r600_bytecode_add_alu(ctx->bc, &alu);
3086 if (r)
3087 return r;
3088
3089 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
3090 ctx->thread_id_gpr, 1,
3091 ctx->temp_reg, 2,
3092 V_SQ_ALU_SRC_LITERAL, 0x40,
3093 ctx->temp_reg, 0);
3094 if (r)
3095 return r;
3096 ctx->thread_id_gpr_loaded = true;
3097 return 0;
3098 }
3099
3100 static int r600_shader_from_tgsi(struct r600_context *rctx,
3101 struct r600_pipe_shader *pipeshader,
3102 union r600_shader_key key)
3103 {
3104 struct r600_screen *rscreen = rctx->screen;
3105 struct r600_shader *shader = &pipeshader->shader;
3106 struct tgsi_token *tokens = pipeshader->selector->tokens;
3107 struct pipe_stream_output_info so = pipeshader->selector->so;
3108 struct tgsi_full_immediate *immediate;
3109 struct r600_shader_ctx ctx;
3110 struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
3111 unsigned output_done, noutput;
3112 unsigned opcode;
3113 int j, k, r = 0;
3114 unsigned i;
3115 int next_param_base = 0, next_clip_base;
3116 int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
3117 bool indirect_gprs;
3118 bool ring_outputs = false;
3119 bool lds_outputs = false;
3120 bool lds_inputs = false;
3121 bool pos_emitted = false;
3122
3123 ctx.bc = &shader->bc;
3124 ctx.shader = shader;
3125
3126 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
3127 rscreen->has_compressed_msaa_texturing);
3128 ctx.tokens = tokens;
3129 tgsi_scan_shader(tokens, &ctx.info);
3130 shader->indirect_files = ctx.info.indirect_files;
3131
3132 shader->uses_doubles = ctx.info.uses_doubles;
3133 shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
3134 shader->nsys_inputs = 0;
3135
3136 shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 ||
3137 ctx.info.file_count[TGSI_FILE_BUFFER] > 0;
3138 indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
3139 tgsi_parse_init(&ctx.parse, tokens);
3140 ctx.type = ctx.info.processor;
3141 shader->processor_type = ctx.type;
3142 ctx.bc->type = shader->processor_type;
3143
3144 switch (ctx.type) {
3145 case PIPE_SHADER_VERTEX:
3146 shader->vs_as_gs_a = key.vs.as_gs_a;
3147 shader->vs_as_es = key.vs.as_es;
3148 shader->vs_as_ls = key.vs.as_ls;
3149 shader->atomic_base = key.vs.first_atomic_counter;
3150 if (shader->vs_as_es)
3151 ring_outputs = true;
3152 if (shader->vs_as_ls)
3153 lds_outputs = true;
3154 break;
3155 case PIPE_SHADER_GEOMETRY:
3156 ring_outputs = true;
3157 shader->atomic_base = key.gs.first_atomic_counter;
3158 shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix;
3159 break;
3160 case PIPE_SHADER_TESS_CTRL:
3161 shader->tcs_prim_mode = key.tcs.prim_mode;
3162 shader->atomic_base = key.tcs.first_atomic_counter;
3163 lds_outputs = true;
3164 lds_inputs = true;
3165 break;
3166 case PIPE_SHADER_TESS_EVAL:
3167 shader->tes_as_es = key.tes.as_es;
3168 shader->atomic_base = key.tes.first_atomic_counter;
3169 lds_inputs = true;
3170 if (shader->tes_as_es)
3171 ring_outputs = true;
3172 break;
3173 case PIPE_SHADER_FRAGMENT:
3174 shader->two_side = key.ps.color_two_side;
3175 shader->atomic_base = key.ps.first_atomic_counter;
3176 shader->rat_base = key.ps.nr_cbufs;
3177 shader->image_size_const_offset = key.ps.image_size_const_offset;
3178 break;
3179 case PIPE_SHADER_COMPUTE:
3180 shader->rat_base = 0;
3181 shader->image_size_const_offset = 0;
3182 break;
3183 default:
3184 break;
3185 }
3186
3187 if (shader->vs_as_es || shader->tes_as_es) {
3188 ctx.gs_for_vs = &rctx->gs_shader->current->shader;
3189 } else {
3190 ctx.gs_for_vs = NULL;
3191 }
3192
3193 ctx.next_ring_offset = 0;
3194 ctx.gs_out_ring_offset = 0;
3195 ctx.gs_next_vertex = 0;
3196 ctx.gs_stream_output_info = &so;
3197
3198 ctx.face_gpr = -1;
3199 ctx.fixed_pt_position_gpr = -1;
3200 ctx.fragcoord_input = -1;
3201 ctx.colors_used = 0;
3202 ctx.clip_vertex_write = 0;
3203 ctx.thread_id_gpr_loaded = false;
3204
3205 ctx.cs_block_size_reg = -1;
3206 ctx.cs_grid_size_reg = -1;
3207 ctx.cs_block_size_loaded = false;
3208 ctx.cs_grid_size_loaded = false;
3209
3210 shader->nr_ps_color_exports = 0;
3211 shader->nr_ps_max_color_exports = 0;
3212
3213
3214 /* register allocations */
3215 /* Values [0,127] correspond to GPR[0..127].
3216 * Values [128,159] correspond to constant buffer bank 0
3217 * Values [160,191] correspond to constant buffer bank 1
3218 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
3219 * Values [256,287] correspond to constant buffer bank 2 (EG)
3220 * Values [288,319] correspond to constant buffer bank 3 (EG)
3221 * Other special values are shown in the list below.
3222 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
3223 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
3224 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
3225 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
3226 * 248 SQ_ALU_SRC_0: special constant 0.0.
3227 * 249 SQ_ALU_SRC_1: special constant 1.0 float.
3228 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
3229 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
3230 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
3231 * 253 SQ_ALU_SRC_LITERAL: literal constant.
3232 * 254 SQ_ALU_SRC_PV: previous vector result.
3233 * 255 SQ_ALU_SRC_PS: previous scalar result.
3234 */
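	/* e.g. setting alu.src[n].sel = V_SQ_ALU_SRC_LITERAL (253) together with
	 * alu.src[n].value supplies an inline literal constant, as done
	 * throughout this file */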
3235 for (i = 0; i < TGSI_FILE_COUNT; i++) {
3236 ctx.file_offset[i] = 0;
3237 }
3238
3239 if (ctx.type == PIPE_SHADER_VERTEX) {
3240
3241 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3242 if (ctx.info.num_inputs)
3243 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
3244 }
3245 if (ctx.type == PIPE_SHADER_FRAGMENT) {
3246 if (ctx.bc->chip_class >= EVERGREEN)
3247 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
3248 else
3249 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
3250 }
3251 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3252 		/* FIXME 1 would be enough in some cases (3 or fewer input vertices) */
3253 ctx.file_offset[TGSI_FILE_INPUT] = 2;
3254 }
3255 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3256 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3257 if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3258 bool add_tesscoord = false, add_tess_inout = false;
3259 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3260 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3261 /* if we have tesscoord save one reg */
3262 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
3263 add_tesscoord = true;
3264 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
3265 ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
3266 add_tess_inout = true;
3267 }
3268 if (add_tesscoord || add_tess_inout)
3269 ctx.file_offset[TGSI_FILE_INPUT]++;
3270 if (add_tess_inout)
3271 ctx.file_offset[TGSI_FILE_INPUT]+=2;
3272 }
3273 if (ctx.type == PIPE_SHADER_COMPUTE) {
3274 ctx.file_offset[TGSI_FILE_INPUT] = 2;
3275 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3276 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE)
3277 ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3278 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE)
3279 ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3280 }
3281 }
3282
3283 ctx.file_offset[TGSI_FILE_OUTPUT] =
3284 ctx.file_offset[TGSI_FILE_INPUT] +
3285 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3286 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
3287 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
3288
3289 /* Outside the GPR range. This will be translated to one of the
3290 * kcache banks later. */
3291 ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
3292
3293 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
3294 ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
3295 ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
3296 ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1;
3297 ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;
3298
3299 if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3300 ctx.tess_input_info = ctx.bc->ar_reg + 3;
3301 ctx.tess_output_info = ctx.bc->ar_reg + 4;
3302 ctx.temp_reg = ctx.bc->ar_reg + 5;
3303 } else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3304 ctx.tess_input_info = 0;
3305 ctx.tess_output_info = ctx.bc->ar_reg + 3;
3306 ctx.temp_reg = ctx.bc->ar_reg + 4;
3307 } else if (ctx.type == PIPE_SHADER_GEOMETRY) {
3308 ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
3309 ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
3310 ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5;
3311 ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6;
3312 ctx.temp_reg = ctx.bc->ar_reg + 7;
3313 if (ctx.shader->gs_tri_strip_adj_fix) {
3314 ctx.gs_rotated_input[0] = ctx.bc->ar_reg + 7;
3315 ctx.gs_rotated_input[1] = ctx.bc->ar_reg + 8;
3316 ctx.temp_reg += 2;
3317 } else {
3318 ctx.gs_rotated_input[0] = 0;
3319 ctx.gs_rotated_input[1] = 1;
3320 }
3321 } else {
3322 ctx.temp_reg = ctx.bc->ar_reg + 3;
3323 }
3324
3325 if (shader->uses_images) {
3326 ctx.thread_id_gpr = ctx.temp_reg++;
3327 ctx.thread_id_gpr_loaded = false;
3328 }
3329
3330 shader->max_arrays = 0;
3331 shader->num_arrays = 0;
3332 if (indirect_gprs) {
3333
3334 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
3335 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
3336 ctx.file_offset[TGSI_FILE_OUTPUT] -
3337 ctx.file_offset[TGSI_FILE_INPUT],
3338 0x0F);
3339 }
3340 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
3341 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
3342 ctx.file_offset[TGSI_FILE_TEMPORARY] -
3343 ctx.file_offset[TGSI_FILE_OUTPUT],
3344 0x0F);
3345 }
3346 }
3347
3348 ctx.nliterals = 0;
3349 ctx.literals = NULL;
3350 ctx.max_driver_temp_used = 0;
3351
3352 shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
3353 ctx.info.colors_written == 1;
3354 shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
3355 shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
3356
3357 if (ctx.type == PIPE_SHADER_VERTEX ||
3358 ctx.type == PIPE_SHADER_GEOMETRY ||
3359 ctx.type == PIPE_SHADER_TESS_EVAL) {
3360 shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] +
3361 ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1;
3362 shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1;
3363 shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED];
3364 }
3365
3366 if (shader->vs_as_gs_a)
3367 vs_add_primid_output(&ctx, key.vs.prim_id_out);
3368
3369 if (ctx.type == PIPE_SHADER_TESS_EVAL)
3370 r600_fetch_tess_io_info(&ctx);
3371
3372 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3373 tgsi_parse_token(&ctx.parse);
3374 switch (ctx.parse.FullToken.Token.Type) {
3375 case TGSI_TOKEN_TYPE_IMMEDIATE:
3376 immediate = &ctx.parse.FullToken.FullImmediate;
3377 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
3378 if(ctx.literals == NULL) {
3379 r = -ENOMEM;
3380 goto out_err;
3381 }
3382 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
3383 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
3384 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
3385 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
3386 ctx.nliterals++;
3387 break;
3388 case TGSI_TOKEN_TYPE_DECLARATION:
3389 r = tgsi_declaration(&ctx);
3390 if (r)
3391 goto out_err;
3392 break;
3393 case TGSI_TOKEN_TYPE_INSTRUCTION:
3394 case TGSI_TOKEN_TYPE_PROPERTY:
3395 break;
3396 default:
3397 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
3398 r = -EINVAL;
3399 goto out_err;
3400 }
3401 }
3402
3403 shader->ring_item_sizes[0] = ctx.next_ring_offset;
3404 shader->ring_item_sizes[1] = 0;
3405 shader->ring_item_sizes[2] = 0;
3406 shader->ring_item_sizes[3] = 0;
3407
3408 /* Process two side if needed */
3409 if (shader->two_side && ctx.colors_used) {
3410 int i, count = ctx.shader->ninput;
3411 unsigned next_lds_loc = ctx.shader->nlds;
3412
3413 		/* additional inputs will be allocated right after the existing inputs;
3414 		 * we won't need them after the color selection, so we don't need to
3415 		 * reserve these gprs for the rest of the shader code or to adjust
3416 		 * output offsets etc. */
3417 int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
3418 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3419
3420 		/* if two-sided and neither face nor sample mask is used by the shader, ensure face_gpr is emitted */
3421 if (ctx.face_gpr == -1) {
3422 i = ctx.shader->ninput++;
3423 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
3424 ctx.shader->input[i].spi_sid = 0;
3425 ctx.shader->input[i].gpr = gpr++;
3426 ctx.face_gpr = ctx.shader->input[i].gpr;
3427 }
3428
3429 for (i = 0; i < count; i++) {
3430 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
3431 int ni = ctx.shader->ninput++;
3432 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
3433 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
3434 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
3435 ctx.shader->input[ni].gpr = gpr++;
3436 // TGSI to LLVM needs to know the lds position of inputs.
3437 				// Non-LLVM path computes it later (in process_twoside_color_inputs)
3438 ctx.shader->input[ni].lds_pos = next_lds_loc++;
3439 ctx.shader->input[i].back_color_input = ni;
3440 if (ctx.bc->chip_class >= EVERGREEN) {
3441 if ((r = evergreen_interp_input(&ctx, ni)))
3442 return r;
3443 }
3444 }
3445 }
3446 }
3447
3448 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
3449 shader->nr_ps_max_color_exports = 8;
3450
3451 if (ctx.fragcoord_input >= 0) {
		if (ctx.bc->chip_class == CAYMAN) {
			for (j = 0; j < 4; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;
				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
				alu.src[0].chan = 3;

				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
				alu.dst.chan = j;
				alu.dst.write = (j == 3);
				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
					return r;
			}
		} else {
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_IEEE;
			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
			alu.src[0].chan = 3;

			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
			alu.dst.chan = 3;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
				return r;
		}
	}

	if (ctx.type == PIPE_SHADER_GEOMETRY) {
		struct r600_bytecode_alu alu;
		int r;

		/* GS thread with no output workaround - emit a cut at start of GS */
		if (ctx.bc->chip_class == R600)
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);

		for (j = 0; j < 4; j++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = 0;
			alu.dst.sel = ctx.gs_export_gpr_tregs[j];
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx.bc, &alu);
			if (r)
				return r;
		}

		if (ctx.shader->gs_tri_strip_adj_fix) {
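			/* For triangle strips with adjacency, the six
			 * per-vertex offsets appear to need rotation by four
			 * on every other primitive: bit 0 of r0.z (the AND
			 * below) selects between the original offset and the
			 * (i + 4) % 6 one via CNDE_INT. */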
			r = single_alu_op2(&ctx, ALU_OP2_AND_INT,
					   ctx.gs_rotated_input[0], 2,
					   0, 2,
					   V_SQ_ALU_SRC_LITERAL, 1);
			if (r)
				return r;

			for (i = 0; i < 6; i++) {
				int rotated = (i + 4) % 6;
				int offset_reg = i / 3;
				int offset_chan = i % 3;
				int rotated_offset_reg = rotated / 3;
				int rotated_offset_chan = rotated % 3;

				if (offset_reg == 0 && offset_chan == 2)
					offset_chan = 3;
				if (rotated_offset_reg == 0 && rotated_offset_chan == 2)
					rotated_offset_chan = 3;

				r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT,
						   ctx.gs_rotated_input[offset_reg], offset_chan,
						   ctx.gs_rotated_input[0], 2,
						   offset_reg, offset_chan,
						   rotated_offset_reg, rotated_offset_chan);
				if (r)
					return r;
			}
		}
	}

	if (ctx.type == PIPE_SHADER_TESS_CTRL)
		r600_fetch_tess_io_info(&ctx);

	if (shader->two_side && ctx.colors_used) {
		if ((r = process_twoside_color_inputs(&ctx)))
			return r;
	}

	tgsi_parse_init(&ctx.parse, tokens);
	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
		tgsi_parse_token(&ctx.parse);
		switch (ctx.parse.FullToken.Token.Type) {
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			r = tgsi_is_supported(&ctx);
			if (r)
				goto out_err;
			ctx.max_driver_temp_used = 0;
			/* reserve first tmp for everyone */
			r600_get_temp(&ctx);

			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
			if ((r = tgsi_split_constant(&ctx)))
				goto out_err;
			if ((r = tgsi_split_literal_constant(&ctx)))
				goto out_err;
			if (ctx.type == PIPE_SHADER_GEOMETRY) {
				if ((r = tgsi_split_gs_inputs(&ctx)))
					goto out_err;
			} else if (lds_inputs) {
				if ((r = tgsi_split_lds_inputs(&ctx)))
					goto out_err;
			}
			if (ctx.bc->chip_class == CAYMAN)
				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
			else if (ctx.bc->chip_class >= EVERGREEN)
				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
			else
				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
			r = ctx.inst_info->process(&ctx);
			if (r)
				goto out_err;

			if (ctx.type == PIPE_SHADER_TESS_CTRL) {
				r = r600_store_tcs_output(&ctx);
				if (r)
					goto out_err;
			}
			break;
		default:
			break;
		}
	}

	/* Reset the temporary register counter. */
	ctx.max_driver_temp_used = 0;

	noutput = shader->noutput;

	if (!ring_outputs && ctx.clip_vertex_write) {
		unsigned clipdist_temp[2];

		clipdist_temp[0] = r600_get_temp(&ctx);
		clipdist_temp[1] = r600_get_temp(&ctx);

		/* need to convert a clipvertex write into clipdistance writes
		 * and not export the clip vertex anymore */
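		/* Each enabled clip distance becomes a DOT4 of the clip
		 * vertex with the corresponding user clip plane, read from
		 * the const buffer at 512 + i (see the loop below). */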

		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
		shader->output[noutput].gpr = clipdist_temp[0];
		noutput++;
		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
		shader->output[noutput].gpr = clipdist_temp[1];
		noutput++;

		/* reset spi_sid for clipvertex output to avoid confusing spi */
		shader->output[ctx.cv_output].spi_sid = 0;

		shader->clip_dist_write = 0xFF;
		shader->cc_dist_mask = 0xFF;

		for (i = 0; i < 8; i++) {
			int oreg = i >> 2;
			int ochan = i & 3;

			for (j = 0; j < 4; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_DOT4;
				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
				alu.src[0].chan = j;

				alu.src[1].sel = 512 + i;
				alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
				alu.src[1].chan = j;

				alu.dst.sel = clipdist_temp[oreg];
				alu.dst.chan = j;
				alu.dst.write = (j == ochan);
				if (j == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx.bc, &alu);
				if (r)
					return r;
			}
		}
	}

	/* Add stream outputs. */
	if (so.num_outputs) {
		bool emit = false;
		if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX)
			emit = true;
		if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL)
			emit = true;
		if (emit)
			emit_streamout(&ctx, &so, -1, NULL);
	}
	pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
	convert_edgeflag_to_int(&ctx);

	if (ctx.type == PIPE_SHADER_TESS_CTRL)
		r600_emit_tess_factor(&ctx);

	if (lds_outputs) {
		if (ctx.type == PIPE_SHADER_VERTEX) {
			if (ctx.shader->noutput)
				emit_lds_vs_writes(&ctx);
		}
	} else if (ring_outputs) {
		if (shader->vs_as_es || shader->tes_as_es) {
			ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
			ctx.gs_export_gpr_tregs[1] = -1;
			ctx.gs_export_gpr_tregs[2] = -1;
			ctx.gs_export_gpr_tregs[3] = -1;

			emit_gs_ring_writes(&ctx, &so, -1, FALSE);
		}
	} else {
		/* Export output */
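		/* Export swizzle encoding used below: 0-3 select .xyzw,
		 * 4 emits the constant 0, 5 emits the constant 1, and 7
		 * masks the channel (see the inline annotations). */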
		next_clip_base = shader->vs_out_misc_write ? 62 : 61;

		for (i = 0, j = 0; i < noutput; i++, j++) {
			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
			output[j].gpr = shader->output[i].gpr;
			output[j].elem_size = 3;
			output[j].swizzle_x = 0;
			output[j].swizzle_y = 1;
			output[j].swizzle_z = 2;
			output[j].swizzle_w = 3;
			output[j].burst_count = 1;
			output[j].type = 0xffffffff;
			output[j].op = CF_OP_EXPORT;
			switch (ctx.type) {
			case PIPE_SHADER_VERTEX:
			case PIPE_SHADER_TESS_EVAL:
				switch (shader->output[i].name) {
				case TGSI_SEMANTIC_POSITION:
					output[j].array_base = 60;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
					pos_emitted = true;
					break;

				case TGSI_SEMANTIC_PSIZE:
					output[j].array_base = 61;
					output[j].swizzle_y = 7;
					output[j].swizzle_z = 7;
					output[j].swizzle_w = 7;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
					pos_emitted = true;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					output[j].array_base = 61;
					output[j].swizzle_x = 7;
					output[j].swizzle_y = 0;
					output[j].swizzle_z = 7;
					output[j].swizzle_w = 7;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
					pos_emitted = true;
					break;
				case TGSI_SEMANTIC_LAYER:
					/* spi_sid is 0 for outputs that are
					 * not consumed by PS */
					if (shader->output[i].spi_sid) {
						output[j].array_base = next_param_base++;
						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
						j++;
						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
					}
					output[j].array_base = 61;
					output[j].swizzle_x = 7;
					output[j].swizzle_y = 7;
					output[j].swizzle_z = 0;
					output[j].swizzle_w = 7;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
					pos_emitted = true;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					/* spi_sid is 0 for outputs that are
					 * not consumed by PS */
					if (shader->output[i].spi_sid) {
						output[j].array_base = next_param_base++;
						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
						j++;
						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
					}
					output[j].array_base = 61;
					output[j].swizzle_x = 7;
					output[j].swizzle_y = 7;
					output[j].swizzle_z = 7;
					output[j].swizzle_w = 0;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
					pos_emitted = true;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					j--;
					break;
				case TGSI_SEMANTIC_CLIPDIST:
					output[j].array_base = next_clip_base++;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
					pos_emitted = true;
					/* spi_sid is 0 for clipdistance outputs that were generated
					 * for clipvertex - we don't need to pass them to PS */
					if (shader->output[i].spi_sid) {
						j++;
						/* duplicate it as PARAM to pass to the pixel shader */
						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
						output[j].array_base = next_param_base++;
						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
					}
					break;
				case TGSI_SEMANTIC_FOG:
					output[j].swizzle_y = 4; /* 0 */
					output[j].swizzle_z = 4; /* 0 */
					output[j].swizzle_w = 5; /* 1 */
					break;
				case TGSI_SEMANTIC_PRIMID:
					output[j].swizzle_x = 2;
					output[j].swizzle_y = 4; /* 0 */
					output[j].swizzle_z = 4; /* 0 */
					output[j].swizzle_w = 4; /* 0 */
					break;
				}

				break;
			case PIPE_SHADER_FRAGMENT:
				if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
					/* never export more colors than the number of CBs */
					if (shader->output[i].sid >= max_color_exports) {
						/* skip export */
						j--;
						continue;
					}
					output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
					output[j].array_base = shader->output[i].sid;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
					shader->nr_ps_color_exports++;
					if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
						for (k = 1; k < max_color_exports; k++) {
							j++;
							memset(&output[j], 0, sizeof(struct r600_bytecode_output));
							output[j].gpr = shader->output[i].gpr;
							output[j].elem_size = 3;
							output[j].swizzle_x = 0;
							output[j].swizzle_y = 1;
							output[j].swizzle_z = 2;
							output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
							output[j].burst_count = 1;
							output[j].array_base = k;
							output[j].op = CF_OP_EXPORT;
							output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
							shader->nr_ps_color_exports++;
						}
					}
				} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
					output[j].array_base = 61;
					output[j].swizzle_x = 2;
					output[j].swizzle_y = 7;
					output[j].swizzle_z = output[j].swizzle_w = 7;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
				} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
					output[j].array_base = 61;
					output[j].swizzle_x = 7;
					output[j].swizzle_y = 1;
					output[j].swizzle_z = output[j].swizzle_w = 7;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
				} else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
					output[j].array_base = 61;
					output[j].swizzle_x = 7;
					output[j].swizzle_y = 7;
					output[j].swizzle_z = 0;
					output[j].swizzle_w = 7;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
				} else {
					R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
					r = -EINVAL;
					goto out_err;
				}
				break;
			case PIPE_SHADER_TESS_CTRL:
				break;
			default:
				R600_ERR("unsupported processor type %d\n", ctx.type);
				r = -EINVAL;
				goto out_err;
			}

			if (output[j].type == 0xffffffff) {
				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
				output[j].array_base = next_param_base++;
			}
		}

		/* add fake position export */
		if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) {
			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
			output[j].gpr = 0;
			output[j].elem_size = 3;
			output[j].swizzle_x = 7;
			output[j].swizzle_y = 7;
			output[j].swizzle_z = 7;
			output[j].swizzle_w = 7;
			output[j].burst_count = 1;
			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output[j].array_base = 60;
			output[j].op = CF_OP_EXPORT;
			j++;
		}

		/* add fake param output for vertex shader if no param is exported */
		if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) {
			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
			output[j].gpr = 0;
			output[j].elem_size = 3;
			output[j].swizzle_x = 7;
			output[j].swizzle_y = 7;
			output[j].swizzle_z = 7;
			output[j].swizzle_w = 7;
			output[j].burst_count = 1;
			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
			output[j].array_base = 0;
			output[j].op = CF_OP_EXPORT;
			j++;
		}

		/* add fake pixel export */
		if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) {
			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
			output[j].gpr = 0;
			output[j].elem_size = 3;
			output[j].swizzle_x = 7;
			output[j].swizzle_y = 7;
			output[j].swizzle_z = 7;
			output[j].swizzle_w = 7;
			output[j].burst_count = 1;
			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
			output[j].array_base = 0;
			output[j].op = CF_OP_EXPORT;
			j++;
			shader->nr_ps_color_exports++;
		}

		noutput = j;

		/* set export done on last export of each type */
		for (k = noutput - 1, output_done = 0; k >= 0; k--) {
			if (!(output_done & (1 << output[k].type))) {
				output_done |= (1 << output[k].type);
				output[k].op = CF_OP_EXPORT_DONE;
			}
		}
		/* add output to bytecode */
		for (i = 0; i < noutput; i++) {
			r = r600_bytecode_add_output(ctx.bc, &output[i]);
			if (r)
				goto out_err;
		}
	}

	/* add program end */
	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		const struct cf_op_info *last = NULL;

		if (ctx.bc->cf_last)
			last = r600_isa_cf(ctx.bc->cf_last->op);

		/* alu clause instructions don't have EOP bit, so add NOP */
		if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP)
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);

		ctx.bc->cf_last->end_of_program = 1;
	}

	/* check GPR limit - we have 124 = 128 - 4
	 * (4 are reserved as alu clause temporary registers) */
	if (ctx.bc->ngpr > 124) {
		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
		r = -ENOMEM;
		goto out_err;
	}

	if (ctx.type == PIPE_SHADER_GEOMETRY) {
		if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
			return r;
	}

	free(ctx.literals);
	tgsi_parse_free(&ctx.parse);
	return 0;
out_err:
	free(ctx.literals);
	tgsi_parse_free(&ctx.parse);
	return r;
}

static int tgsi_unsupported(struct r600_shader_ctx *ctx)
{
	const unsigned tgsi_opcode =
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
	R600_ERR("%s tgsi opcode unsupported\n",
		 tgsi_get_opcode_name(tgsi_opcode));
	return -EINVAL;
}

static int tgsi_end(struct r600_shader_ctx *ctx UNUSED)
{
	return 0;
}

static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
			      const struct r600_shader_src *shader_src,
			      unsigned chan)
{
	bc_src->sel = shader_src->sel;
	bc_src->chan = shader_src->swizzle[chan];
	bc_src->neg = shader_src->neg;
	bc_src->abs = shader_src->abs;
	bc_src->rel = shader_src->rel;
	bc_src->value = shader_src->value[bc_src->chan];
	bc_src->kc_bank = shader_src->kc_bank;
	bc_src->kc_rel = shader_src->kc_rel;
}

static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->abs = 1;
	bc_src->neg = 0;
}

static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->neg = !bc_src->neg;
}

static void tgsi_dst(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_dst_register *tgsi_dst,
		     unsigned swizzle,
		     struct r600_bytecode_alu_dst *r600_dst)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	r600_dst->sel = tgsi_dst->Register.Index;
	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
	r600_dst->chan = swizzle;
	r600_dst->write = 1;
	if (inst->Instruction.Saturate) {
		r600_dst->clamp = 1;
	}
	if (ctx->type == PIPE_SHADER_TESS_CTRL) {
		if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
			return;
		}
	}
	if (tgsi_dst->Register.Indirect)
		r600_dst->rel = V_SQ_REL_RELATIVE;
}

static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	int use_tmp = 0;
	int swizzle_x = inst->Src[0].Register.SwizzleX;

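	/* Doubles occupy channel pairs: XY holds one double, ZW another.
	 * For a single-channel destination the switch below widens the
	 * mask to the pair that is actually computed and records in
	 * use_tmp which temp channel (use_tmp - 1) holds the wanted
	 * 32-bit half for the final move. */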
	if (singledest) {
		switch (write_mask) {
		case 0x1:
			if (swizzle_x == 2) {
				write_mask = 0xc;
				use_tmp = 3;
			} else
				write_mask = 0x3;
			break;
		case 0x2:
			if (swizzle_x == 2) {
				write_mask = 0xc;
				use_tmp = 3;
			} else {
				write_mask = 0x3;
				use_tmp = 1;
			}
			break;
		case 0x4:
			if (swizzle_x == 0) {
				write_mask = 0x3;
				use_tmp = 1;
			} else
				write_mask = 0xc;
			break;
		case 0x8:
			if (swizzle_x == 0) {
				write_mask = 0x3;
				use_tmp = 1;
			} else {
				write_mask = 0xc;
				use_tmp = 3;
			}
			break;
		}
	}

	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			if (use_tmp || dest_temp) {
				alu.dst.sel = use_tmp ? ctx->temp_reg : dest_temp;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else {
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			}
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op_override ? op_override : ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases */
		if (i == 1 || i == 3) {
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_DABS:
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		write_mask = inst->Dst[0].Register.WriteMask;

		lasti = tgsi_last_instruction(write_mask);
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;

			if (dest_temp) {
				alu.dst.sel = dest_temp;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}

static int tgsi_op2_64(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	/* confirm writemasking */
	if ((write_mask & 0x3) != 0x3 &&
	    (write_mask & 0xc) != 0xc) {
		fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
		return -1;
	}
	return tgsi_op2_64_params(ctx, false, false, 0, 0);
}

static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false, 0, 0);
}

static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true, 0, 0);
}

static int tgsi_op3_64(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = 3;
	int tmp = r600_get_temp(ctx);

	for (i = 0; i < lasti + 1; i++) {

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
		}

		if (inst->Dst[0].Register.WriteMask & (1 << i))
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		else
			alu.dst.sel = tmp;

		alu.dst.chan = i;
		alu.is_op3 = 1;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));
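	/* trans_only ops run on the scalar (t) slot, which yields one
	 * result per instruction group, so each component below is emitted
	 * as its own group (alu.last on every iteration) and staged through
	 * the temp when several components are written. */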
	unsigned op = ctx->inst_info->op;

	if (op == ALU_OP2_MUL_IEEE &&
	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
		op = ALU_OP2_MUL;

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}

static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}

static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}

static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}

static int tgsi_ineg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {

		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.src[0].sel = V_SQ_ALU_SRC_0;

		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int tgsi_dneg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {

		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);

		if (i == 1 || i == 3)
			r600_bytecode_src_toggle_neg(&alu.src[0]);
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;

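	/* As consumed below, the temp ends up holding the exponent in
	 * channel 1 and the significand halves in channels 2/3. */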
	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Replicate significand result across channels. */
	for (i = 0; i <= 3; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].chan = (i & 1) + 2;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			/* MOV the exponent (channel 1) to the first channel
			 * written in dst1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}

static int egcm_int_to_double(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
	       inst->Instruction.Opcode == TGSI_OPCODE_U2D);

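	/* First convert each source int to a 32-bit float, then widen with
	 * FLT32_TO_FLT64: the even channel of each result pair takes the
	 * converted float, the odd channel a literal 0 for the low bits. */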
	for (i = 0; i <= (lasti+1)/2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_FLT32_TO_FLT64;

		alu.src[0].chan = i/2;
		if (i%2 == 0)
			alu.src[0].sel = ctx->temp_reg;
		else {
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = 0x0;
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

static int egcm_double_to_int(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int treg = r600_get_temp(ctx);
	assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
	       inst->Instruction.Opcode == TGSI_OPCODE_D2U);

	/* do a 64->32 into a temp register */
	r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32);
	if (r)
		return r;

	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.src[0].chan = i;
		alu.src[0].sel = treg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = (i == lasti);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
					unsigned op,
					int dst_reg,
					struct r600_shader_src *src,
					bool abs)
{
	struct r600_bytecode_alu alu;
	const int last_slot = 3;
	int r;

	/* these ops write their result to the X/Y channel pair */
	for (int i = 0; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;

		r600_bytecode_src(&alu.src[0], src, 1);
		r600_bytecode_src(&alu.src[1], src, 0);

		if (abs)
			r600_bytecode_src_set_abs(&alu.src[1]);

		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		alu.dst.write = (i == 0 || i == 1);

		if (bc->chip_class != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* should only be one src reg */
	assert(inst->Instruction.NumSrcRegs == 1);

	/* only support one double at a time */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	r = cayman_emit_unary_double_raw(
		ctx->bc, ctx->inst_info->op, t1,
		&ctx->src[0],
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
	if (r)
		return r;

	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	for (i = 0; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);

			/* RSQ should take the absolute value of src */
			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
				r600_bytecode_src_set_abs(&alu.src[j]);
			}
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	for (k = 0; k <= lasti; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			alu.dst.sel = t1;
			alu.dst.chan = i;
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* t1 would get overwritten below if we actually tried to
	 * multiply two pairs of doubles at a time. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/*
 * Emit RECIP_64 + MUL_64 to implement division.
 */
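/*
 * That is, dst = src0 * (1.0 / src1): RECIP_64 leaves the 64-bit
 * reciprocal in t1.xy, and the MUL_64 loop below multiplies it by the
 * selected src0 channel pair.
 */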
static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	struct r600_bytecode_alu alu;
	int t1 = ctx->temp_reg;
	int k;

	/* Only support one double at a time. This is the same constraint as
	 * in DMUL lowering. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
	if (r)
		return r;

	for (int i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL_64;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));

		alu.src[1].sel = t1;
		alu.src[1].chan = (i == 3) ? 0 : 1;

		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (int i = 0; i < 2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
		alu.dst.write = 1;
		if (i == 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/*
 * r600 - trunc to -PI..PI range
 * r700 - normalize by dividing by 2PI
 * see fdo bug 27901
 */
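/*
 * Concretely, the sequence below computes t = fract(x * 1/(2*PI) + 0.5)
 * and then remaps it: on r600, result = t * 2*PI - PI, giving [-PI, PI);
 * on later chips, result = t - 0.5, i.e. the [-0.5, 0.5) turn range that
 * the HW SIN/COS appears to expect there.
 */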
static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
	alu.src[2].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FRACT;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[2].chan = 0;

	if (ctx->bc->chip_class == R600) {
		alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
		alu.src[2].value = u_bitcast_f2u(-M_PI);
	} else {
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
		alu.src[2].neg = 1;
	}

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

static int cayman_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
	int i, r;

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;

	for (i = 0; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int tgsi_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* replicate result */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = ctx->temp_reg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int tgsi_kill(struct r600_shader_ctx *ctx)
{
	const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

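	/* For unconditional KILL the comparison below is 0 vs -1.0, which
	 * with the greater-than style kill op always fires; KILL_IF
	 * instead compares against each source channel. */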
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.chan = i;

		alu.src[0].sel = V_SQ_ALU_SRC_0;

		if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
			alu.src[1].sel = V_SQ_ALU_SRC_1;
			alu.src[1].neg = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		if (i == 3) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* kill must be last in ALU */
	ctx->bc->force_add_cf = 1;
	ctx->shader->uses_kill = TRUE;
	return 0;
}

static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		unsigned i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = MUL_LIT(tmp.z, src.w, src.x) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}

static int tgsi_rsq(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	alu.op = ALU_OP1_RECIPSQRT_IEEE;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
		r600_bytecode_src_set_abs(&alu.src[i]);
	}
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* replicate result */
	return tgsi_helper_tempx_replicate(ctx);
}

static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.src[0].sel = ctx->temp_reg;
		alu.op = ALU_OP1_MOV;
		alu.dst.chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
	}
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* replicate result */
	return tgsi_helper_tempx_replicate(ctx);
}

static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	for (i = 0; i < 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_LOG_IEEE;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 2)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	for (i = 0; i < last_slot; i++) {
		/* POW(a,b) = EXP2(b * LOG2(a)) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_EXP_IEEE;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int tgsi_pow(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	/* LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_LOG_IEEE;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* POW(a,b) = EXP2(b * LOG2(a)) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_EXP_IEEE;
	alu.src[0].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return tgsi_helper_tempx_replicate(ctx);
}

static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int tmp0 = ctx->temp_reg;
	int tmp1 = r600_get_temp(ctx);
	int tmp2 = r600_get_temp(ctx);
	int tmp3 = r600_get_temp(ctx);
	/* Unsigned path:
	 *
	 * we need to represent src1 as src2*q + r, where q is the quotient
	 * and r is the remainder
	 *
	 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
	 * 2. tmp0.z = lo (tmp0.x * src2)
	 * 3. tmp0.w = -tmp0.z
	 * 4. tmp0.y = hi (tmp0.x * src2)
	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
	 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
	 * 7. tmp1.x = tmp0.x - tmp0.w
	 * 8. tmp1.y = tmp0.x + tmp0.w
	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
	 * 10. tmp0.z = hi(tmp0.x * src1) = q
	 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
	 *
	 * 12. tmp0.w = src1 - tmp0.y = r
	 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
	 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
	 *
	 * if DIV
	 *
	 * 15. tmp1.z = tmp0.z + 1 = q + 1
	 * 16. tmp1.w = tmp0.z - 1 = q - 1
	 *
	 * else MOD
	 *
	 * 15. tmp1.z = tmp0.w - src2 = r - src2
	 * 16. tmp1.w = tmp0.w + src2 = r + src2
	 *
	 * endif
	 *
	 * 17. tmp1.x = tmp1.x & tmp1.y
	 *
	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
	 *
	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
	 *
	 * Signed path:
	 *
	 * Same as unsigned, using abs values of the operands,
	 * and fixing the sign of the result in the end.
	 */
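
	/* A worked sketch of the unsigned DIV path, with src1 = 7 and
	 * src2 = 3 (the error-correction steps only matter when the rcp
	 * estimate is off):
	 *   rcp ~= 2^32/3 = 0x55555555
	 *   q    = hi(rcp * 7) = 2
	 *   r    = 7 - 2*3 = 1
	 * Steps 13-19 then leave q = 2, since r < src2 and r >= 0 mean no
	 * +1/-1 correction is applied. */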

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		if (signed_op) {

			/* tmp2.x = -src0 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = -src1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.z sign bit is set if src0 and src1 signs are different;
			 * it will be the sign of the quotient */
			if (!mod) {

				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_XOR_INT;

				alu.dst.sel = tmp2;
				alu.dst.chan = 2;
				alu.dst.write = 1;

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			/* tmp2.x = |src0| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = |src1| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 1;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
5388 if (ctx->bc->chip_class == CAYMAN) {
5389 /* tmp3.x = u2f(src2) */
5390 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5391 alu.op = ALU_OP1_UINT_TO_FLT;
5392
5393 alu.dst.sel = tmp3;
5394 alu.dst.chan = 0;
5395 alu.dst.write = 1;
5396
5397 if (signed_op) {
5398 alu.src[0].sel = tmp2;
5399 alu.src[0].chan = 1;
5400 } else {
5401 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5402 }
5403
5404 alu.last = 1;
5405 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5406 return r;
5407
5408 /* tmp0.x = recip(tmp3.x) */
5409 for (j = 0 ; j < 3; j++) {
5410 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5411 alu.op = ALU_OP1_RECIP_IEEE;
5412
5413 alu.dst.sel = tmp0;
5414 alu.dst.chan = j;
5415 alu.dst.write = (j == 0);
5416
5417 alu.src[0].sel = tmp3;
5418 alu.src[0].chan = 0;
5419
5420 if (j == 2)
5421 alu.last = 1;
5422 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5423 return r;
5424 }
5425
5426 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5427 alu.op = ALU_OP2_MUL;
5428
5429 alu.src[0].sel = tmp0;
5430 alu.src[0].chan = 0;
5431
5432 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5433 alu.src[1].value = 0x4f800000;
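/* 0x4f800000 == 4294967296.0f == 2^32: scaling by it lets the FLT_TO_UINT
 * below turn the float reciprocal into a 0.32 fixed-point value */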
5434
5435 alu.dst.sel = tmp3;
5436 alu.dst.write = 1;
5437 alu.last = 1;
5438 r = r600_bytecode_add_alu(ctx->bc, &alu);
5439 if (r)
5440 return r;
5441
5442 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5443 alu.op = ALU_OP1_FLT_TO_UINT;
5444
5445 alu.dst.sel = tmp0;
5446 alu.dst.chan = 0;
5447 alu.dst.write = 1;
5448
5449 alu.src[0].sel = tmp3;
5450 alu.src[0].chan = 0;
5451
5452 alu.last = 1;
5453 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5454 return r;
5455
5456 } else {
5457 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5458 alu.op = ALU_OP1_RECIP_UINT;
5459
5460 alu.dst.sel = tmp0;
5461 alu.dst.chan = 0;
5462 alu.dst.write = 1;
5463
5464 if (signed_op) {
5465 alu.src[0].sel = tmp2;
5466 alu.src[0].chan = 1;
5467 } else {
5468 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5469 }
5470
5471 alu.last = 1;
5472 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5473 return r;
5474 }
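/* Pre-Cayman chips have a native RECIP_UINT, so the u2f/RECIP_IEEE/f2u
 * round-trip above is only needed on Cayman, which lacks that t-slot op. */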
5475
5476 /* 2. tmp0.z = lo (tmp0.x * src2) */
5477 if (ctx->bc->chip_class == CAYMAN) {
5478 for (j = 0 ; j < 4; j++) {
5479 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5480 alu.op = ALU_OP2_MULLO_UINT;
5481
5482 alu.dst.sel = tmp0;
5483 alu.dst.chan = j;
5484 alu.dst.write = (j == 2);
5485
5486 alu.src[0].sel = tmp0;
5487 alu.src[0].chan = 0;
5488 if (signed_op) {
5489 alu.src[1].sel = tmp2;
5490 alu.src[1].chan = 1;
5491 } else {
5492 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5493 }
5494
5495 alu.last = (j == 3);
5496 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5497 return r;
5498 }
5499 } else {
5500 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5501 alu.op = ALU_OP2_MULLO_UINT;
5502
5503 alu.dst.sel = tmp0;
5504 alu.dst.chan = 2;
5505 alu.dst.write = 1;
5506
5507 alu.src[0].sel = tmp0;
5508 alu.src[0].chan = 0;
5509 if (signed_op) {
5510 alu.src[1].sel = tmp2;
5511 alu.src[1].chan = 1;
5512 } else {
5513 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5514 }
5515
5516 alu.last = 1;
5517 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5518 return r;
5519 }
5520
5521 /* 3. tmp0.w = -tmp0.z */
5522 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5523 alu.op = ALU_OP2_SUB_INT;
5524
5525 alu.dst.sel = tmp0;
5526 alu.dst.chan = 3;
5527 alu.dst.write = 1;
5528
5529 alu.src[0].sel = V_SQ_ALU_SRC_0;
5530 alu.src[1].sel = tmp0;
5531 alu.src[1].chan = 2;
5532
5533 alu.last = 1;
5534 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5535 return r;
5536
5537 /* 4. tmp0.y = hi (tmp0.x * src2) */
5538 if (ctx->bc->chip_class == CAYMAN) {
5539 for (j = 0 ; j < 4; j++) {
5540 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5541 alu.op = ALU_OP2_MULHI_UINT;
5542
5543 alu.dst.sel = tmp0;
5544 alu.dst.chan = j;
5545 alu.dst.write = (j == 1);
5546
5547 alu.src[0].sel = tmp0;
5548 alu.src[0].chan = 0;
5549
5550 if (signed_op) {
5551 alu.src[1].sel = tmp2;
5552 alu.src[1].chan = 1;
5553 } else {
5554 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5555 }
5556 alu.last = (j == 3);
5557 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5558 return r;
5559 }
5560 } else {
5561 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5562 alu.op = ALU_OP2_MULHI_UINT;
5563
5564 alu.dst.sel = tmp0;
5565 alu.dst.chan = 1;
5566 alu.dst.write = 1;
5567
5568 alu.src[0].sel = tmp0;
5569 alu.src[0].chan = 0;
5570
5571 if (signed_op) {
5572 alu.src[1].sel = tmp2;
5573 alu.src[1].chan = 1;
5574 } else {
5575 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5576 }
5577
5578 alu.last = 1;
5579 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5580 return r;
5581 }
5582
5583 /* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2)) */
5584 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5585 alu.op = ALU_OP3_CNDE_INT;
5586 alu.is_op3 = 1;
5587
5588 alu.dst.sel = tmp0;
5589 alu.dst.chan = 2;
5590 alu.dst.write = 1;
5591
5592 alu.src[0].sel = tmp0;
5593 alu.src[0].chan = 1;
5594 alu.src[1].sel = tmp0;
5595 alu.src[1].chan = 3;
5596 alu.src[2].sel = tmp0;
5597 alu.src[2].chan = 2;
5598
5599 alu.last = 1;
5600 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5601 return r;
5602
5603 /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
5604 if (ctx->bc->chip_class == CAYMAN) {
5605 for (j = 0 ; j < 4; j++) {
5606 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5607 alu.op = ALU_OP2_MULHI_UINT;
5608
5609 alu.dst.sel = tmp0;
5610 alu.dst.chan = j;
5611 alu.dst.write = (j == 3);
5612
5613 alu.src[0].sel = tmp0;
5614 alu.src[0].chan = 2;
5615
5616 alu.src[1].sel = tmp0;
5617 alu.src[1].chan = 0;
5618
5619 alu.last = (j == 3);
5620 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5621 return r;
5622 }
5623 } else {
5624 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5625 alu.op = ALU_OP2_MULHI_UINT;
5626
5627 alu.dst.sel = tmp0;
5628 alu.dst.chan = 3;
5629 alu.dst.write = 1;
5630
5631 alu.src[0].sel = tmp0;
5632 alu.src[0].chan = 2;
5633
5634 alu.src[1].sel = tmp0;
5635 alu.src[1].chan = 0;
5636
5637 alu.last = 1;
5638 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5639 return r;
5640 }
5641
5642 /* 7. tmp1.x = tmp0.x - tmp0.w */
5643 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5644 alu.op = ALU_OP2_SUB_INT;
5645
5646 alu.dst.sel = tmp1;
5647 alu.dst.chan = 0;
5648 alu.dst.write = 1;
5649
5650 alu.src[0].sel = tmp0;
5651 alu.src[0].chan = 0;
5652 alu.src[1].sel = tmp0;
5653 alu.src[1].chan = 3;
5654
5655 alu.last = 1;
5656 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5657 return r;
5658
5659 /* 8. tmp1.y = tmp0.x + tmp0.w */
5660 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5661 alu.op = ALU_OP2_ADD_INT;
5662
5663 alu.dst.sel = tmp1;
5664 alu.dst.chan = 1;
5665 alu.dst.write = 1;
5666
5667 alu.src[0].sel = tmp0;
5668 alu.src[0].chan = 0;
5669 alu.src[1].sel = tmp0;
5670 alu.src[1].chan = 3;
5671
5672 alu.last = 1;
5673 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5674 return r;
5675
5676 /* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
5677 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5678 alu.op = ALU_OP3_CNDE_INT;
5679 alu.is_op3 = 1;
5680
5681 alu.dst.sel = tmp0;
5682 alu.dst.chan = 0;
5683 alu.dst.write = 1;
5684
5685 alu.src[0].sel = tmp0;
5686 alu.src[0].chan = 1;
5687 alu.src[1].sel = tmp1;
5688 alu.src[1].chan = 1;
5689 alu.src[2].sel = tmp1;
5690 alu.src[2].chan = 0;
5691
5692 alu.last = 1;
5693 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5694 return r;
5695
5696 /* 10. tmp0.z = hi(tmp0.x * src1) = q */
5697 if (ctx->bc->chip_class == CAYMAN) {
5698 for (j = 0 ; j < 4; j++) {
5699 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5700 alu.op = ALU_OP2_MULHI_UINT;
5701
5702 alu.dst.sel = tmp0;
5703 alu.dst.chan = j;
5704 alu.dst.write = (j == 2);
5705
5706 alu.src[0].sel = tmp0;
5707 alu.src[0].chan = 0;
5708
5709 if (signed_op) {
5710 alu.src[1].sel = tmp2;
5711 alu.src[1].chan = 0;
5712 } else {
5713 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5714 }
5715
5716 alu.last = (j == 3);
5717 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5718 return r;
5719 }
5720 } else {
5721 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5722 alu.op = ALU_OP2_MULHI_UINT;
5723
5724 alu.dst.sel = tmp0;
5725 alu.dst.chan = 2;
5726 alu.dst.write = 1;
5727
5728 alu.src[0].sel = tmp0;
5729 alu.src[0].chan = 0;
5730
5731 if (signed_op) {
5732 alu.src[1].sel = tmp2;
5733 alu.src[1].chan = 0;
5734 } else {
5735 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5736 }
5737
5738 alu.last = 1;
5739 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5740 return r;
5741 }
5742
5743 /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
5744 if (ctx->bc->chip_class == CAYMAN) {
5745 for (j = 0 ; j < 4; j++) {
5746 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5747 alu.op = ALU_OP2_MULLO_UINT;
5748
5749 alu.dst.sel = tmp0;
5750 alu.dst.chan = j;
5751 alu.dst.write = (j == 1);
5752
5753 if (signed_op) {
5754 alu.src[0].sel = tmp2;
5755 alu.src[0].chan = 1;
5756 } else {
5757 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5758 }
5759
5760 alu.src[1].sel = tmp0;
5761 alu.src[1].chan = 2;
5762
5763 alu.last = (j == 3);
5764 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5765 return r;
5766 }
5767 } else {
5768 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5769 alu.op = ALU_OP2_MULLO_UINT;
5770
5771 alu.dst.sel = tmp0;
5772 alu.dst.chan = 1;
5773 alu.dst.write = 1;
5774
5775 if (signed_op) {
5776 alu.src[0].sel = tmp2;
5777 alu.src[0].chan = 1;
5778 } else {
5779 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5780 }
5781
5782 alu.src[1].sel = tmp0;
5783 alu.src[1].chan = 2;
5784
5785 alu.last = 1;
5786 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5787 return r;
5788 }
5789
5790 /* 12. tmp0.w = src1 - tmp0.y = r */
5791 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5792 alu.op = ALU_OP2_SUB_INT;
5793
5794 alu.dst.sel = tmp0;
5795 alu.dst.chan = 3;
5796 alu.dst.write = 1;
5797
5798 if (signed_op) {
5799 alu.src[0].sel = tmp2;
5800 alu.src[0].chan = 0;
5801 } else {
5802 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5803 }
5804
5805 alu.src[1].sel = tmp0;
5806 alu.src[1].chan = 1;
5807
5808 alu.last = 1;
5809 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5810 return r;
5811
5812 /* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */
5813 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5814 alu.op = ALU_OP2_SETGE_UINT;
5815
5816 alu.dst.sel = tmp1;
5817 alu.dst.chan = 0;
5818 alu.dst.write = 1;
5819
5820 alu.src[0].sel = tmp0;
5821 alu.src[0].chan = 3;
5822 if (signed_op) {
5823 alu.src[1].sel = tmp2;
5824 alu.src[1].chan = 1;
5825 } else {
5826 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5827 }
5828
5829 alu.last = 1;
5830 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5831 return r;
5832
5833 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */
5834 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5835 alu.op = ALU_OP2_SETGE_UINT;
5836
5837 alu.dst.sel = tmp1;
5838 alu.dst.chan = 1;
5839 alu.dst.write = 1;
5840
5841 if (signed_op) {
5842 alu.src[0].sel = tmp2;
5843 alu.src[0].chan = 0;
5844 } else {
5845 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5846 }
5847
5848 alu.src[1].sel = tmp0;
5849 alu.src[1].chan = 1;
5850
5851 alu.last = 1;
5852 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5853 return r;
5854
5855 if (mod) { /* UMOD */
5856
5857 /* 15. tmp1.z = tmp0.w - src2 = r - src2 */
5858 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5859 alu.op = ALU_OP2_SUB_INT;
5860
5861 alu.dst.sel = tmp1;
5862 alu.dst.chan = 2;
5863 alu.dst.write = 1;
5864
5865 alu.src[0].sel = tmp0;
5866 alu.src[0].chan = 3;
5867
5868 if (signed_op) {
5869 alu.src[1].sel = tmp2;
5870 alu.src[1].chan = 1;
5871 } else {
5872 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5873 }
5874
5875 alu.last = 1;
5876 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5877 return r;
5878
5879 /* 16. tmp1.w = tmp0.w + src2 = r + src2 */
5880 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5881 alu.op = ALU_OP2_ADD_INT;
5882
5883 alu.dst.sel = tmp1;
5884 alu.dst.chan = 3;
5885 alu.dst.write = 1;
5886
5887 alu.src[0].sel = tmp0;
5888 alu.src[0].chan = 3;
5889 if (signed_op) {
5890 alu.src[1].sel = tmp2;
5891 alu.src[1].chan = 1;
5892 } else {
5893 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5894 }
5895
5896 alu.last = 1;
5897 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5898 return r;
5899
5900 } else { /* UDIV */
5901
5902 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */
5903 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5904 alu.op = ALU_OP2_ADD_INT;
5905
5906 alu.dst.sel = tmp1;
5907 alu.dst.chan = 2;
5908 alu.dst.write = 1;
5909
5910 alu.src[0].sel = tmp0;
5911 alu.src[0].chan = 2;
5912 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
5913
5914 alu.last = 1;
5915 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5916 return r;
5917
5918 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */
5919 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5920 alu.op = ALU_OP2_ADD_INT;
5921
5922 alu.dst.sel = tmp1;
5923 alu.dst.chan = 3;
5924 alu.dst.write = 1;
5925
5926 alu.src[0].sel = tmp0;
5927 alu.src[0].chan = 2;
5928 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
5929
5930 alu.last = 1;
5931 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5932 return r;
5933
5934 }
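/* tmp1.z/tmp1.w now hold the "one too small"/"one too large" correction
 * candidates: q+1/q-1 for DIV, r-src2/r+src2 for MOD; steps 17-19 select
 * between them and the uncorrected result. */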
5935
5936 /* 17. tmp1.x = tmp1.x & tmp1.y */
5937 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5938 alu.op = ALU_OP2_AND_INT;
5939
5940 alu.dst.sel = tmp1;
5941 alu.dst.chan = 0;
5942 alu.dst.write = 1;
5943
5944 alu.src[0].sel = tmp1;
5945 alu.src[0].chan = 0;
5946 alu.src[1].sel = tmp1;
5947 alu.src[1].chan = 1;
5948
5949 alu.last = 1;
5950 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5951 return r;
5952
5953 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */
5954 /* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */
5955 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5956 alu.op = ALU_OP3_CNDE_INT;
5957 alu.is_op3 = 1;
5958
5959 alu.dst.sel = tmp0;
5960 alu.dst.chan = 2;
5961 alu.dst.write = 1;
5962
5963 alu.src[0].sel = tmp1;
5964 alu.src[0].chan = 0;
5965 alu.src[1].sel = tmp0;
5966 alu.src[1].chan = mod ? 3 : 2;
5967 alu.src[2].sel = tmp1;
5968 alu.src[2].chan = 2;
5969
5970 alu.last = 1;
5971 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5972 return r;
5973
5974 /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
5975 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5976 alu.op = ALU_OP3_CNDE_INT;
5977 alu.is_op3 = 1;
5978
5979 if (signed_op) {
5980 alu.dst.sel = tmp0;
5981 alu.dst.chan = 2;
5982 alu.dst.write = 1;
5983 } else {
5984 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5985 }
5986
5987 alu.src[0].sel = tmp1;
5988 alu.src[0].chan = 1;
5989 alu.src[1].sel = tmp1;
5990 alu.src[1].chan = 3;
5991 alu.src[2].sel = tmp0;
5992 alu.src[2].chan = 2;
5993
5994 alu.last = 1;
5995 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5996 return r;
5997
5998 if (signed_op) {
5999
6000 /* fix the sign of the result */
6001
6002 if (mod) {
6003
6004 /* tmp0.x = -tmp0.z */
6005 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6006 alu.op = ALU_OP2_SUB_INT;
6007
6008 alu.dst.sel = tmp0;
6009 alu.dst.chan = 0;
6010 alu.dst.write = 1;
6011
6012 alu.src[0].sel = V_SQ_ALU_SRC_0;
6013 alu.src[1].sel = tmp0;
6014 alu.src[1].chan = 2;
6015
6016 alu.last = 1;
6017 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6018 return r;
6019
6020 /* sign of the remainder is the same as the sign of src0 */
6021 /* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
6022 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6023 alu.op = ALU_OP3_CNDGE_INT;
6024 alu.is_op3 = 1;
6025
6026 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6027
6028 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6029 alu.src[1].sel = tmp0;
6030 alu.src[1].chan = 2;
6031 alu.src[2].sel = tmp0;
6032 alu.src[2].chan = 0;
6033
6034 alu.last = 1;
6035 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6036 return r;
6037
6038 } else {
6039
6040 /* tmp0.x = -tmp0.z */
6041 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6042 alu.op = ALU_OP2_SUB_INT;
6043
6044 alu.dst.sel = tmp0;
6045 alu.dst.chan = 0;
6046 alu.dst.write = 1;
6047
6048 alu.src[0].sel = V_SQ_ALU_SRC_0;
6049 alu.src[1].sel = tmp0;
6050 alu.src[1].chan = 2;
6051
6052 alu.last = 1;
6053 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6054 return r;
6055
6056 /* fix the quotient sign (same as the sign of src0*src1) */
6057 /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
6058 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6059 alu.op = ALU_OP3_CNDGE_INT;
6060 alu.is_op3 = 1;
6061
6062 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6063
6064 alu.src[0].sel = tmp2;
6065 alu.src[0].chan = 2;
6066 alu.src[1].sel = tmp0;
6067 alu.src[1].chan = 2;
6068 alu.src[2].sel = tmp0;
6069 alu.src[2].chan = 0;
6070
6071 alu.last = 1;
6072 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6073 return r;
6074 }
6075 }
6076 }
6077 return 0;
6078 }
6079
6080 static int tgsi_udiv(struct r600_shader_ctx *ctx)
6081 {
6082 return tgsi_divmod(ctx, 0, 0);
6083 }
6084
6085 static int tgsi_umod(struct r600_shader_ctx *ctx)
6086 {
6087 return tgsi_divmod(ctx, 1, 0);
6088 }
6089
6090 static int tgsi_idiv(struct r600_shader_ctx *ctx)
6091 {
6092 return tgsi_divmod(ctx, 0, 1);
6093 }
6094
6095 static int tgsi_imod(struct r600_shader_ctx *ctx)
6096 {
6097 return tgsi_divmod(ctx, 1, 1);
6098 }
6099
6100
6101 static int tgsi_f2i(struct r600_shader_ctx *ctx)
6102 {
6103 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6104 struct r600_bytecode_alu alu;
6105 int i, r;
6106 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6107 int last_inst = tgsi_last_instruction(write_mask);
6108
6109 for (i = 0; i < 4; i++) {
6110 if (!(write_mask & (1<<i)))
6111 continue;
6112
6113 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6114 alu.op = ALU_OP1_TRUNC;
6115
6116 alu.dst.sel = ctx->temp_reg;
6117 alu.dst.chan = i;
6118 alu.dst.write = 1;
6119
6120 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6121 if (i == last_inst)
6122 alu.last = 1;
6123 r = r600_bytecode_add_alu(ctx->bc, &alu);
6124 if (r)
6125 return r;
6126 }
6127
6128 for (i = 0; i < 4; i++) {
6129 if (!(write_mask & (1<<i)))
6130 continue;
6131
6132 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6133 alu.op = ctx->inst_info->op;
6134
6135 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6136
6137 alu.src[0].sel = ctx->temp_reg;
6138 alu.src[0].chan = i;
6139
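/* FLT_TO_UINT appears to be a trans-unit-only op on these chips, so each
 * instance must close its own ALU group, not just the last channel. */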
6140 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
6141 alu.last = 1;
6142 r = r600_bytecode_add_alu(ctx->bc, &alu);
6143 if (r)
6144 return r;
6145 }
6146
6147 return 0;
6148 }
6149
6150 static int tgsi_iabs(struct r600_shader_ctx *ctx)
6151 {
6152 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6153 struct r600_bytecode_alu alu;
6154 int i, r;
6155 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6156 int last_inst = tgsi_last_instruction(write_mask);
6157
6158 /* tmp = -src */
6159 for (i = 0; i < 4; i++) {
6160 if (!(write_mask & (1<<i)))
6161 continue;
6162
6163 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6164 alu.op = ALU_OP2_SUB_INT;
6165
6166 alu.dst.sel = ctx->temp_reg;
6167 alu.dst.chan = i;
6168 alu.dst.write = 1;
6169
6170 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6171 alu.src[0].sel = V_SQ_ALU_SRC_0;
6172
6173 if (i == last_inst)
6174 alu.last = 1;
6175 r = r600_bytecode_add_alu(ctx->bc, &alu);
6176 if (r)
6177 return r;
6178 }
6179
6180 /* dst = (src >= 0 ? src : tmp) */
6181 for (i = 0; i < 4; i++) {
6182 if (!(write_mask & (1<<i)))
6183 continue;
6184
6185 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6186 alu.op = ALU_OP3_CNDGE_INT;
6187 alu.is_op3 = 1;
6188 alu.dst.write = 1;
6189
6190 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6191
6192 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6193 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6194 alu.src[2].sel = ctx->temp_reg;
6195 alu.src[2].chan = i;
6196
6197 if (i == last_inst)
6198 alu.last = 1;
6199 r = r600_bytecode_add_alu(ctx->bc, &alu);
6200 if (r)
6201 return r;
6202 }
6203 return 0;
6204 }
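/* Scalar sketch of the above: iabs(s) = (s >= 0) ? s : (0 - s);
 * SUB_INT forms the negation, CNDGE_INT does the select. */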
6205
6206 static int tgsi_issg(struct r600_shader_ctx *ctx)
6207 {
6208 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6209 struct r600_bytecode_alu alu;
6210 int i, r;
6211 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6212 int last_inst = tgsi_last_instruction(write_mask);
6213
6214 /* tmp = (src >= 0 ? src : -1) */
6215 for (i = 0; i < 4; i++) {
6216 if (!(write_mask & (1<<i)))
6217 continue;
6218
6219 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6220 alu.op = ALU_OP3_CNDGE_INT;
6221 alu.is_op3 = 1;
6222
6223 alu.dst.sel = ctx->temp_reg;
6224 alu.dst.chan = i;
6225 alu.dst.write = 1;
6226
6227 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6228 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6229 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
6230
6231 if (i == last_inst)
6232 alu.last = 1;
6233 r = r600_bytecode_add_alu(ctx->bc, &alu);
6234 if (r)
6235 return r;
6236 }
6237
6238 /* dst = (tmp > 0 ? 1 : tmp) */
6239 for (i = 0; i < 4; i++) {
6240 if (!(write_mask & (1<<i)))
6241 continue;
6242
6243 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6244 alu.op = ALU_OP3_CNDGT_INT;
6245 alu.is_op3 = 1;
6246 alu.dst.write = 1;
6247
6248 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6249
6250 alu.src[0].sel = ctx->temp_reg;
6251 alu.src[0].chan = i;
6252
6253 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6254
6255 alu.src[2].sel = ctx->temp_reg;
6256 alu.src[2].chan = i;
6257
6258 if (i == last_inst)
6259 alu.last = 1;
6260 r = r600_bytecode_add_alu(ctx->bc, &alu);
6261 if (r)
6262 return r;
6263 }
6264 return 0;
6265 }
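/* Scalar sketch: t = (s >= 0) ? s : -1; dst = (t > 0) ? 1 : t;
 * i.e. ISSG(s) = (s > 0) - (s < 0). */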
6266
6267
6268
6269 static int tgsi_ssg(struct r600_shader_ctx *ctx)
6270 {
6271 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6272 struct r600_bytecode_alu alu;
6273 int i, r;
6274
6275 /* tmp = (src > 0 ? 1 : src) */
6276 for (i = 0; i < 4; i++) {
6277 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6278 alu.op = ALU_OP3_CNDGT;
6279 alu.is_op3 = 1;
6280
6281 alu.dst.sel = ctx->temp_reg;
6282 alu.dst.chan = i;
6283
6284 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6285 alu.src[1].sel = V_SQ_ALU_SRC_1;
6286 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6287
6288 if (i == 3)
6289 alu.last = 1;
6290 r = r600_bytecode_add_alu(ctx->bc, &alu);
6291 if (r)
6292 return r;
6293 }
6294
6295 /* dst = (-tmp > 0 ? -1 : tmp) */
6296 for (i = 0; i < 4; i++) {
6297 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6298 alu.op = ALU_OP3_CNDGT;
6299 alu.is_op3 = 1;
6300 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6301
6302 alu.src[0].sel = ctx->temp_reg;
6303 alu.src[0].chan = i;
6304 alu.src[0].neg = 1;
6305
6306 alu.src[1].sel = V_SQ_ALU_SRC_1;
6307 alu.src[1].neg = 1;
6308
6309 alu.src[2].sel = ctx->temp_reg;
6310 alu.src[2].chan = i;
6311
6312 if (i == 3)
6313 alu.last = 1;
6314 r = r600_bytecode_add_alu(ctx->bc, &alu);
6315 if (r)
6316 return r;
6317 }
6318 return 0;
6319 }
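/* Float variant of the same clamp: t = (s > 0) ? 1.0f : s;
 * dst = (-t > 0) ? -1.0f : t, yielding -1.0/0.0/+1.0. */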
6320
6321 static int tgsi_bfi(struct r600_shader_ctx *ctx)
6322 {
6323 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6324 struct r600_bytecode_alu alu;
6325 int i, r, t1, t2;
6326
6327 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6328 int last_inst = tgsi_last_instruction(write_mask);
6329
6330 t1 = r600_get_temp(ctx);
6331
6332 for (i = 0; i < 4; i++) {
6333 if (!(write_mask & (1<<i)))
6334 continue;
6335
6336 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6337 alu.op = ALU_OP2_SETGE_INT;
6338 r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
6339 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6340 alu.src[1].value = 32;
6341 alu.dst.sel = ctx->temp_reg;
6342 alu.dst.chan = i;
6343 alu.dst.write = 1;
6344 alu.last = i == last_inst;
6345 r = r600_bytecode_add_alu(ctx->bc, &alu);
6346 if (r)
6347 return r;
6348 }
6349
6350 for (i = 0; i < 4; i++) {
6351 if (!(write_mask & (1<<i)))
6352 continue;
6353
6354 /* create mask tmp */
6355 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6356 alu.op = ALU_OP2_BFM_INT;
6357 alu.dst.sel = t1;
6358 alu.dst.chan = i;
6359 alu.dst.write = 1;
6360 alu.last = i == last_inst;
6361
6362 r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
6363 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6364
6365 r = r600_bytecode_add_alu(ctx->bc, &alu);
6366 if (r)
6367 return r;
6368 }
6369
6370 t2 = r600_get_temp(ctx);
6371
6372 for (i = 0; i < 4; i++) {
6373 if (!(write_mask & (1<<i)))
6374 continue;
6375
6376 /* shift insert left */
6377 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6378 alu.op = ALU_OP2_LSHL_INT;
6379 alu.dst.sel = t2;
6380 alu.dst.chan = i;
6381 alu.dst.write = 1;
6382 alu.last = i == last_inst;
6383
6384 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6385 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6386
6387 r = r600_bytecode_add_alu(ctx->bc, &alu);
6388 if (r)
6389 return r;
6390 }
6391
6392 for (i = 0; i < 4; i++) {
6393 if (!(write_mask & (1<<i)))
6394 continue;
6395
6396 /* actual bitfield insert */
6397 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6398 alu.op = ALU_OP3_BFI_INT;
6399 alu.is_op3 = 1;
6400 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6401 alu.dst.chan = i;
6402 alu.dst.write = 1;
6403 alu.last = i == last_inst;
6404
6405 alu.src[0].sel = t1;
6406 alu.src[0].chan = i;
6407 alu.src[1].sel = t2;
6408 alu.src[1].chan = i;
6409 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6410
6411 r = r600_bytecode_add_alu(ctx->bc, &alu);
6412 if (r)
6413 return r;
6414 }
6415
6416 for (i = 0; i < 4; i++) {
6417 if (!(write_mask & (1<<i)))
6418 continue;
6419 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6420 alu.op = ALU_OP3_CNDE_INT;
6421 alu.is_op3 = 1;
6422 alu.src[0].sel = ctx->temp_reg;
6423 alu.src[0].chan = i;
6424 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
6425
6426 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6427
6428 alu.src[1].sel = alu.dst.sel;
6429 alu.src[1].chan = i;
6430
6431 alu.last = i == last_inst;
6432 r = r600_bytecode_add_alu(ctx->bc, &alu);
6433 if (r)
6434 return r;
6435 }
6436 return 0;
6437 }
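/* Scalar sketch of the whole sequence, with src0..src3 = base, insert,
 * offset, width:
 *
 *   mask = ((1u << width) - 1) << offset;                 // BFM_INT
 *   dst  = (mask & (insert << offset)) | (~mask & base);  // BFI_INT
 *   if (width >= 32) dst = insert;   // BFM can't express a full 32-bit mask
 */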
6438
6439 static int tgsi_msb(struct r600_shader_ctx *ctx)
6440 {
6441 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6442 struct r600_bytecode_alu alu;
6443 int i, r, t1, t2;
6444
6445 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6446 int last_inst = tgsi_last_instruction(write_mask);
6447
6448 assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
6449 ctx->inst_info->op == ALU_OP1_FFBH_UINT);
6450
6451 t1 = ctx->temp_reg;
6452
6453 /* bit position is indexed from lsb by TGSI, and from msb by the hardware */
6454 for (i = 0; i < 4; i++) {
6455 if (!(write_mask & (1<<i)))
6456 continue;
6457
6458 /* t1 = FFBH_INT / FFBH_UINT */
6459 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6460 alu.op = ctx->inst_info->op;
6461 alu.dst.sel = t1;
6462 alu.dst.chan = i;
6463 alu.dst.write = 1;
6464 alu.last = i == last_inst;
6465
6466 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6467
6468 r = r600_bytecode_add_alu(ctx->bc, &alu);
6469 if (r)
6470 return r;
6471 }
6472
6473 t2 = r600_get_temp(ctx);
6474
6475 for (i = 0; i < 4; i++) {
6476 if (!(write_mask & (1<<i)))
6477 continue;
6478
6479 /* t2 = 31 - t1 */
6480 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6481 alu.op = ALU_OP2_SUB_INT;
6482 alu.dst.sel = t2;
6483 alu.dst.chan = i;
6484 alu.dst.write = 1;
6485 alu.last = i == last_inst;
6486
6487 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
6488 alu.src[0].value = 31;
6489 alu.src[1].sel = t1;
6490 alu.src[1].chan = i;
6491
6492 r = r600_bytecode_add_alu(ctx->bc, &alu);
6493 if (r)
6494 return r;
6495 }
6496
6497 for (i = 0; i < 4; i++) {
6498 if (!(write_mask & (1<<i)))
6499 continue;
6500
6501 /* result = t1 >= 0 ? t2 : t1 */
6502 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6503 alu.op = ALU_OP3_CNDGE_INT;
6504 alu.is_op3 = 1;
6505 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6506 alu.dst.chan = i;
6507 alu.dst.write = 1;
6508 alu.last = i == last_inst;
6509
6510 alu.src[0].sel = t1;
6511 alu.src[0].chan = i;
6512 alu.src[1].sel = t2;
6513 alu.src[1].chan = i;
6514 alu.src[2].sel = t1;
6515 alu.src[2].chan = i;
6516
6517 r = r600_bytecode_add_alu(ctx->bc, &alu);
6518 if (r)
6519 return r;
6520 }
6521
6522 return 0;
6523 }
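/* Net effect: FFBH counts from bit 31 down, so a found bit becomes 31 - t1,
 * while negative "not found" results pass through unchanged via CNDGE_INT. */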
6524
6525 static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
6526 {
6527 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6528 struct r600_bytecode_alu alu;
6529 int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
6530 unsigned location;
6531 const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;
6532
6533 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
6534
6535 /* Interpolators have been marked for use already by allocate_system_value_inputs */
6536 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6537 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6538 location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
6539 }
6540 else {
6541 location = TGSI_INTERPOLATE_LOC_CENTROID;
6542 }
6543
6544 k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
6545 if (k < 0)
6546 k = 0;
6547 interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
6548 interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);
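/* Each interpolator GPR packs two i/j pairs: ij_index / 2 picks the GPR,
 * (ij_index % 2) * 2 the base channel within it. */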
6549
6550 /* NOTE: currently offset is not perspective correct */
6551 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6552 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6553 int sample_gpr = -1;
6554 int gradientsH, gradientsV;
6555 struct r600_bytecode_tex tex;
6556
6557 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6558 sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
6559 }
6560
6561 gradientsH = r600_get_temp(ctx);
6562 gradientsV = r600_get_temp(ctx);
6563 for (i = 0; i < 2; i++) {
6564 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6565 tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
6566 tex.src_gpr = interp_gpr;
6567 tex.src_sel_x = interp_base_chan + 0;
6568 tex.src_sel_y = interp_base_chan + 1;
6569 tex.src_sel_z = 0;
6570 tex.src_sel_w = 0;
6571 tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
6572 tex.dst_sel_x = 0;
6573 tex.dst_sel_y = 1;
6574 tex.dst_sel_z = 7;
6575 tex.dst_sel_w = 7;
6576 tex.inst_mod = 1; // Use per pixel gradient calculation
6577 tex.sampler_id = 0;
6578 tex.resource_id = tex.sampler_id;
6579 r = r600_bytecode_add_tex(ctx->bc, &tex);
6580 if (r)
6581 return r;
6582 }
6583
6584 for (i = 0; i < 2; i++) {
6585 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6586 alu.op = ALU_OP3_MULADD;
6587 alu.is_op3 = 1;
6588 alu.src[0].sel = gradientsH;
6589 alu.src[0].chan = i;
6590 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6591 alu.src[1].sel = sample_gpr;
6592 alu.src[1].chan = 2;
6593 }
6594 else {
6595 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
6596 }
6597 alu.src[2].sel = interp_gpr;
6598 alu.src[2].chan = interp_base_chan + i;
6599 alu.dst.sel = ctx->temp_reg;
6600 alu.dst.chan = i;
6601 alu.last = i == 1;
6602
6603 r = r600_bytecode_add_alu(ctx->bc, &alu);
6604 if (r)
6605 return r;
6606 }
6607
6608 for (i = 0; i < 2; i++) {
6609 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6610 alu.op = ALU_OP3_MULADD;
6611 alu.is_op3 = 1;
6612 alu.src[0].sel = gradientsV;
6613 alu.src[0].chan = i;
6614 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6615 alu.src[1].sel = sample_gpr;
6616 alu.src[1].chan = 3;
6617 }
6618 else {
6619 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
6620 }
6621 alu.src[2].sel = ctx->temp_reg;
6622 alu.src[2].chan = i;
6623 alu.dst.sel = ctx->temp_reg;
6624 alu.dst.chan = i;
6625 alu.last = i == 1;
6626
6627 r = r600_bytecode_add_alu(ctx->bc, &alu);
6628 if (r)
6629 return r;
6630 }
6631 }
6632
6633 tmp = r600_get_temp(ctx);
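/* Two 4-slot interpolation groups follow: INTERP_ZW first, then INTERP_XY.
 * Every slot of a group reads the same i/j pair (swizzled j,i per slot),
 * but only the channels each op actually produces are kept, hence the
 * i > 1 && i < 6 write-enable window below. */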
6634 for (i = 0; i < 8; i++) {
6635 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6636 alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;
6637
6638 alu.dst.sel = tmp;
6639 if ((i > 1 && i < 6)) {
6640 alu.dst.write = 1;
6641 }
6642 else {
6643 alu.dst.write = 0;
6644 }
6645 alu.dst.chan = i % 4;
6646
6647 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6648 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6649 alu.src[0].sel = ctx->temp_reg;
6650 alu.src[0].chan = 1 - (i % 2);
6651 } else {
6652 alu.src[0].sel = interp_gpr;
6653 alu.src[0].chan = interp_base_chan + 1 - (i % 2);
6654 }
6655 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
6656 alu.src[1].chan = 0;
6657
6658 alu.last = i % 4 == 3;
6659 alu.bank_swizzle_force = SQ_ALU_VEC_210;
6660
6661 r = r600_bytecode_add_alu(ctx->bc, &alu);
6662 if (r)
6663 return r;
6664 }
6665
6666 // INTERP can't swizzle dst
6667 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6668 for (i = 0; i <= lasti; i++) {
6669 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6670 continue;
6671
6672 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6673 alu.op = ALU_OP1_MOV;
6674 alu.src[0].sel = tmp;
6675 alu.src[0].chan = ctx->src[0].swizzle[i];
6676 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6677 alu.dst.write = 1;
6678 alu.last = i == lasti;
6679 r = r600_bytecode_add_alu(ctx->bc, &alu);
6680 if (r)
6681 return r;
6682 }
6683
6684 return 0;
6685 }
6686
6687
6688 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
6689 {
6690 struct r600_bytecode_alu alu;
6691 int i, r;
6692
6693 for (i = 0; i < 4; i++) {
6694 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6695 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
6696 alu.op = ALU_OP0_NOP;
6697 alu.dst.chan = i;
6698 } else {
6699 alu.op = ALU_OP1_MOV;
6700 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6701 alu.src[0].sel = ctx->temp_reg;
6702 alu.src[0].chan = i;
6703 }
6704 if (i == 3) {
6705 alu.last = 1;
6706 }
6707 r = r600_bytecode_add_alu(ctx->bc, &alu);
6708 if (r)
6709 return r;
6710 }
6711 return 0;
6712 }
6713
6714 static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
6715 unsigned temp, int chan,
6716 struct r600_bytecode_alu_src *bc_src,
6717 const struct r600_shader_src *shader_src)
6718 {
6719 struct r600_bytecode_alu alu;
6720 int r;
6721
6722 r600_bytecode_src(bc_src, shader_src, chan);
6723
6724 /* op3 operands don't support abs modifier */
6725 if (bc_src->abs) {
6726 assert(temp!=0); /* we actually need the extra register, make sure it is allocated. */
6727 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6728 alu.op = ALU_OP1_MOV;
6729 alu.dst.sel = temp;
6730 alu.dst.chan = chan;
6731 alu.dst.write = 1;
6732
6733 alu.src[0] = *bc_src;
6734 alu.last = true; // sufficient?
6735 r = r600_bytecode_add_alu(ctx->bc, &alu);
6736 if (r)
6737 return r;
6738
6739 memset(bc_src, 0, sizeof(*bc_src));
6740 bc_src->sel = temp;
6741 bc_src->chan = chan;
6742 }
6743 return 0;
6744 }
6745
6746 static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst)
6747 {
6748 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6749 struct r600_bytecode_alu alu;
6750 int i, j, r;
6751 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6752 int temp_regs[4];
6753 unsigned op = ctx->inst_info->op;
6754
6755 if (op == ALU_OP3_MULADD_IEEE &&
6756 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
6757 op = ALU_OP3_MULADD;
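/* MUL_ZERO_WINS shaders rely on 0 * x == 0 even for Inf/NaN operands,
 * which the non-IEEE MULADD (and DOT4 in tgsi_dp below) provide. */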
6758
6759 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6760 temp_regs[j] = 0;
6761 if (ctx->src[j].abs)
6762 temp_regs[j] = r600_get_temp(ctx);
6763 }
6764 for (i = 0; i < lasti + 1; i++) {
6765 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6766 continue;
6767
6768 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6769 alu.op = op;
6770 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6771 r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
6772 if (r)
6773 return r;
6774 }
6775
6776 if (dst == -1) {
6777 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6778 } else {
6779 alu.dst.sel = dst;
6780 }
6781 alu.dst.chan = i;
6782 alu.dst.write = 1;
6783 alu.is_op3 = 1;
6784 if (i == lasti) {
6785 alu.last = 1;
6786 }
6787 r = r600_bytecode_add_alu(ctx->bc, &alu);
6788 if (r)
6789 return r;
6790 }
6791 return 0;
6792 }
6793
6794 static int tgsi_op3(struct r600_shader_ctx *ctx)
6795 {
6796 return tgsi_op3_dst(ctx, -1);
6797 }
6798
6799 static int tgsi_dp(struct r600_shader_ctx *ctx)
6800 {
6801 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6802 struct r600_bytecode_alu alu;
6803 int i, j, r;
6804 unsigned op = ctx->inst_info->op;
6805 if (op == ALU_OP2_DOT4_IEEE &&
6806 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
6807 op = ALU_OP2_DOT4;
6808
6809 for (i = 0; i < 4; i++) {
6810 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6811 alu.op = op;
6812 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6813 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
6814 }
6815
6816 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6817 alu.dst.chan = i;
6818 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
6819 /* handle some special cases */
6820 switch (inst->Instruction.Opcode) {
6821 case TGSI_OPCODE_DP2:
6822 if (i > 1) {
6823 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
6824 alu.src[0].chan = alu.src[1].chan = 0;
6825 }
6826 break;
6827 case TGSI_OPCODE_DP3:
6828 if (i > 2) {
6829 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
6830 alu.src[0].chan = alu.src[1].chan = 0;
6831 }
6832 break;
6833 default:
6834 break;
6835 }
6836 if (i == 3) {
6837 alu.last = 1;
6838 }
6839 r = r600_bytecode_add_alu(ctx->bc, &alu);
6840 if (r)
6841 return r;
6842 }
6843 return 0;
6844 }
6845
6846 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
6847 unsigned index)
6848 {
6849 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6850 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
6851 inst->Src[index].Register.File != TGSI_FILE_INPUT &&
6852 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
6853 ctx->src[index].neg || ctx->src[index].abs ||
6854 (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
6855 }
6856
6857 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
6858 unsigned index)
6859 {
6860 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6861 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
6862 }
6863
6864 static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
6865 {
6866 struct r600_bytecode_vtx vtx;
6867 struct r600_bytecode_alu alu;
6868 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6869 int src_gpr, r, i;
6870 int id = tgsi_tex_get_src_gpr(ctx, 1);
6871 int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
6872
6873 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
6874 if (src_requires_loading) {
6875 for (i = 0; i < 4; i++) {
6876 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6877 alu.op = ALU_OP1_MOV;
6878 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6879 alu.dst.sel = ctx->temp_reg;
6880 alu.dst.chan = i;
6881 if (i == 3)
6882 alu.last = 1;
6883 alu.dst.write = 1;
6884 r = r600_bytecode_add_alu(ctx->bc, &alu);
6885 if (r)
6886 return r;
6887 }
6888 src_gpr = ctx->temp_reg;
6889 }
6890
6891 memset(&vtx, 0, sizeof(vtx));
6892 vtx.op = FETCH_OP_VFETCH;
6893 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
6894 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
6895 vtx.src_gpr = src_gpr;
6896 vtx.mega_fetch_count = 16;
6897 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6898 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
6899 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
6900 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
6901 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
6902 vtx.use_const_fields = 1;
6903 vtx.buffer_index_mode = sampler_index_mode;
6904
6905 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
6906 return r;
6907
6908 if (ctx->bc->chip_class >= EVERGREEN)
6909 return 0;
6910
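/* Pre-Evergreen fixup: the fetched components are not masked by the
 * hardware itself, so AND each enabled channel with a per-buffer mask word
 * and OR the alpha channel with a per-buffer constant, both read from
 * R600_BUFFER_INFO_CONST_BUFFER. */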
6911 for (i = 0; i < 4; i++) {
6912 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6913 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6914 continue;
6915
6916 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6917 alu.op = ALU_OP2_AND_INT;
6918
6919 alu.dst.chan = i;
6920 alu.dst.sel = vtx.dst_gpr;
6921 alu.dst.write = 1;
6922
6923 alu.src[0].sel = vtx.dst_gpr;
6924 alu.src[0].chan = i;
6925
6926 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
6927 alu.src[1].sel += (id * 2);
6928 alu.src[1].chan = i % 4;
6929 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6930
6931 if (i == lasti)
6932 alu.last = 1;
6933 r = r600_bytecode_add_alu(ctx->bc, &alu);
6934 if (r)
6935 return r;
6936 }
6937
6938 if (inst->Dst[0].Register.WriteMask & 3) {
6939 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6940 alu.op = ALU_OP2_OR_INT;
6941
6942 alu.dst.chan = 3;
6943 alu.dst.sel = vtx.dst_gpr;
6944 alu.dst.write = 1;
6945
6946 alu.src[0].sel = vtx.dst_gpr;
6947 alu.src[0].chan = 3;
6948
6949 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
6950 alu.src[1].chan = 0;
6951 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6952
6953 alu.last = 1;
6954 r = r600_bytecode_add_alu(ctx->bc, &alu);
6955 if (r)
6956 return r;
6957 }
6958 return 0;
6959 }
6960
6961 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset)
6962 {
6963 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6964 int r;
6965 int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
6966 int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
6967
6968 if (ctx->bc->chip_class < EVERGREEN) {
6969 struct r600_bytecode_alu alu;
6970 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6971 alu.op = ALU_OP1_MOV;
6972 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
6973 /* on r600 the buffer-size info lives in the second channel (y) of the second dword */
6974 alu.src[0].sel += (id * 2) + 1;
6975 alu.src[0].chan = 1;
6976 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6977 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
6978 alu.last = 1;
6979 r = r600_bytecode_add_alu(ctx->bc, &alu);
6980 if (r)
6981 return r;
6982 return 0;
6983 } else {
6984 struct r600_bytecode_vtx vtx;
6985 memset(&vtx, 0, sizeof(vtx));
6986 vtx.op = FETCH_OP_GDS_MIN_UINT; /* aka GET_BUFFER_RESINFO */
6987 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
6988 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
6989 vtx.src_gpr = 0;
6990 vtx.mega_fetch_count = 16; /* no idea here really... */
6991 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6992 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
6993 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7; /* SEL_Y */
6994 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7; /* SEL_Z */
6995 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 4 : 7; /* SEL_W */
6996 vtx.data_format = FMT_32_32_32_32;
6997 vtx.buffer_index_mode = sampler_index_mode;
6998
6999 if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
7000 return r;
7001 return 0;
7002 }
7003 }
7004
7005
7006 static int tgsi_tex(struct r600_shader_ctx *ctx)
7007 {
7008 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7009 struct r600_bytecode_tex tex;
7010 struct r600_bytecode_alu alu;
7011 unsigned src_gpr;
7012 int r, i, j;
7013 int opcode;
7014 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
7015 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7016 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
7017 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
7018
7019 bool txf_add_offsets = inst->Texture.NumOffsets &&
7020 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7021 inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
7022
7023 /* Texture fetch instructions can only use gprs as source.
7024 * Also they cannot negate the source or take the absolute value */
7025 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
7026 tgsi_tex_src_requires_loading(ctx, 0)) ||
7027 read_compressed_msaa || txf_add_offsets;
7028
7029 boolean src_loaded = FALSE;
7030 unsigned sampler_src_reg = 1;
7031 int8_t offset_x = 0, offset_y = 0, offset_z = 0;
7032 boolean has_txq_cube_array_z = false;
7033 unsigned sampler_index_mode;
7034
7035 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
7036 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7037 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
7038 if (inst->Dst[0].Register.WriteMask & 4) {
7039 ctx->shader->has_txq_cube_array_z_comp = true;
7040 has_txq_cube_array_z = true;
7041 }
7042
7043 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
7044 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7045 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
7046 inst->Instruction.Opcode == TGSI_OPCODE_TG4)
7047 sampler_src_reg = 2;
7048
7049 /* TGSI moves the sampler to src reg 3 for TXD */
7050 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
7051 sampler_src_reg = 3;
7052
7053 sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7054
7055 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7056
7057 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
7058 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
7059 if (ctx->bc->chip_class < EVERGREEN)
7060 ctx->shader->uses_tex_buffers = true;
7061 return r600_do_buffer_txq(ctx, 1, 0);
7062 }
7063 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
7064 if (ctx->bc->chip_class < EVERGREEN)
7065 ctx->shader->uses_tex_buffers = true;
7066 return do_vtx_fetch_inst(ctx, src_requires_loading);
7067 }
7068 }
7069
7070 if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
7071 int out_chan;
7072 /* Add perspective divide */
7073 if (ctx->bc->chip_class == CAYMAN) {
7074 out_chan = 2;
7075 for (i = 0; i < 3; i++) {
7076 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7077 alu.op = ALU_OP1_RECIP_IEEE;
7078 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7079
7080 alu.dst.sel = ctx->temp_reg;
7081 alu.dst.chan = i;
7082 if (i == 2)
7083 alu.last = 1;
7084 if (out_chan == i)
7085 alu.dst.write = 1;
7086 r = r600_bytecode_add_alu(ctx->bc, &alu);
7087 if (r)
7088 return r;
7089 }
7090
7091 } else {
7092 out_chan = 3;
7093 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7094 alu.op = ALU_OP1_RECIP_IEEE;
7095 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7096
7097 alu.dst.sel = ctx->temp_reg;
7098 alu.dst.chan = out_chan;
7099 alu.last = 1;
7100 alu.dst.write = 1;
7101 r = r600_bytecode_add_alu(ctx->bc, &alu);
7102 if (r)
7103 return r;
7104 }
7105
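/* Multiply x/y/z by 1/w and force w to 1.0: the projective divide TXP
 * requires. */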
7106 for (i = 0; i < 3; i++) {
7107 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7108 alu.op = ALU_OP2_MUL;
7109 alu.src[0].sel = ctx->temp_reg;
7110 alu.src[0].chan = out_chan;
7111 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
7112 alu.dst.sel = ctx->temp_reg;
7113 alu.dst.chan = i;
7114 alu.dst.write = 1;
7115 r = r600_bytecode_add_alu(ctx->bc, &alu);
7116 if (r)
7117 return r;
7118 }
7119 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7120 alu.op = ALU_OP1_MOV;
7121 alu.src[0].sel = V_SQ_ALU_SRC_1;
7122 alu.src[0].chan = 0;
7123 alu.dst.sel = ctx->temp_reg;
7124 alu.dst.chan = 3;
7125 alu.last = 1;
7126 alu.dst.write = 1;
7127 r = r600_bytecode_add_alu(ctx->bc, &alu);
7128 if (r)
7129 return r;
7130 src_loaded = TRUE;
7131 src_gpr = ctx->temp_reg;
7132 }
7133
7134
7135 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7136 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7137 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7138 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7139 inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
7140
7141 static const unsigned src0_swizzle[] = {2, 2, 0, 1};
7142 static const unsigned src1_swizzle[] = {1, 0, 2, 2};
7143
7144 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
7145 for (i = 0; i < 4; i++) {
7146 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7147 alu.op = ALU_OP2_CUBE;
7148 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
7149 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
7150 alu.dst.sel = ctx->temp_reg;
7151 alu.dst.chan = i;
7152 if (i == 3)
7153 alu.last = 1;
7154 alu.dst.write = 1;
7155 r = r600_bytecode_add_alu(ctx->bc, &alu);
7156 if (r)
7157 return r;
7158 }
7159
7160 /* tmp1.z = RCP_e(|tmp1.z|) */
7161 if (ctx->bc->chip_class == CAYMAN) {
7162 for (i = 0; i < 3; i++) {
7163 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7164 alu.op = ALU_OP1_RECIP_IEEE;
7165 alu.src[0].sel = ctx->temp_reg;
7166 alu.src[0].chan = 2;
7167 alu.src[0].abs = 1;
7168 alu.dst.sel = ctx->temp_reg;
7169 alu.dst.chan = i;
7170 if (i == 2)
7171 alu.dst.write = 1;
7172 if (i == 2)
7173 alu.last = 1;
7174 r = r600_bytecode_add_alu(ctx->bc, &alu);
7175 if (r)
7176 return r;
7177 }
7178 } else {
7179 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7180 alu.op = ALU_OP1_RECIP_IEEE;
7181 alu.src[0].sel = ctx->temp_reg;
7182 alu.src[0].chan = 2;
7183 alu.src[0].abs = 1;
7184 alu.dst.sel = ctx->temp_reg;
7185 alu.dst.chan = 2;
7186 alu.dst.write = 1;
7187 alu.last = 1;
7188 r = r600_bytecode_add_alu(ctx->bc, &alu);
7189 if (r)
7190 return r;
7191 }
7192
7193 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x
7194 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x
7195 * muladd has no writemask, have to use another temp
7196 */
7197 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7198 alu.op = ALU_OP3_MULADD;
7199 alu.is_op3 = 1;
7200
7201 alu.src[0].sel = ctx->temp_reg;
7202 alu.src[0].chan = 0;
7203 alu.src[1].sel = ctx->temp_reg;
7204 alu.src[1].chan = 2;
7205
7206 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7207 alu.src[2].chan = 0;
7208 alu.src[2].value = u_bitcast_f2u(1.5f);
7209
7210 alu.dst.sel = ctx->temp_reg;
7211 alu.dst.chan = 0;
7212 alu.dst.write = 1;
7213
7214 r = r600_bytecode_add_alu(ctx->bc, &alu);
7215 if (r)
7216 return r;
7217
7218 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7219 alu.op = ALU_OP3_MULADD;
7220 alu.is_op3 = 1;
7221
7222 alu.src[0].sel = ctx->temp_reg;
7223 alu.src[0].chan = 1;
7224 alu.src[1].sel = ctx->temp_reg;
7225 alu.src[1].chan = 2;
7226
7227 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7228 alu.src[2].chan = 0;
7229 alu.src[2].value = u_bitcast_f2u(1.5f);
7230
7231 alu.dst.sel = ctx->temp_reg;
7232 alu.dst.chan = 1;
7233 alu.dst.write = 1;
7234
7235 alu.last = 1;
7236 r = r600_bytecode_add_alu(ctx->bc, &alu);
7237 if (r)
7238 return r;
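/* temp.xy now hold the face coordinates s,t = coord * (1/|major|) + 1.5;
 * the 1.5 bias appears to map CUBE's [-1,1] face range into the [1,2]
 * window the sampler expects. */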
7239 /* write initial compare value into Z component
7240 - W src 0 for shadow cube
7241 - X src 1 for shadow cube array */
7242 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7243 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7244 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7245 alu.op = ALU_OP1_MOV;
7246 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
7247 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7248 else
7249 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7250 alu.dst.sel = ctx->temp_reg;
7251 alu.dst.chan = 2;
7252 alu.dst.write = 1;
7253 alu.last = 1;
7254 r = r600_bytecode_add_alu(ctx->bc, &alu);
7255 if (r)
7256 return r;
7257 }
7258
7259 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7260 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7261 if (ctx->bc->chip_class >= EVERGREEN) {
7262 int mytmp = r600_get_temp(ctx);
7263 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7264 alu.op = ALU_OP1_MOV;
7265 alu.src[0].sel = ctx->temp_reg;
7266 alu.src[0].chan = 3;
7267 alu.dst.sel = mytmp;
7268 alu.dst.chan = 0;
7269 alu.dst.write = 1;
7270 alu.last = 1;
7271 r = r600_bytecode_add_alu(ctx->bc, &alu);
7272 if (r)
7273 return r;
7274
7275 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */
7276 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7277 alu.op = ALU_OP3_MULADD;
7278 alu.is_op3 = 1;
7279 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7280 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7281 alu.src[1].chan = 0;
7282 alu.src[1].value = u_bitcast_f2u(8.0f);
7283 alu.src[2].sel = mytmp;
7284 alu.src[2].chan = 0;
7285 alu.dst.sel = ctx->temp_reg;
7286 alu.dst.chan = 3;
7287 alu.dst.write = 1;
7288 alu.last = 1;
7289 r = r600_bytecode_add_alu(ctx->bc, &alu);
7290 if (r)
7291 return r;
7292 } else if (ctx->bc->chip_class < EVERGREEN) {
7293 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7294 tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
7295 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7296 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7297 tex.src_gpr = r600_get_temp(ctx);
7298 tex.src_sel_x = 0;
7299 tex.src_sel_y = 0;
7300 tex.src_sel_z = 0;
7301 tex.src_sel_w = 0;
7302 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7303 tex.coord_type_x = 1;
7304 tex.coord_type_y = 1;
7305 tex.coord_type_z = 1;
7306 tex.coord_type_w = 1;
7307 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7308 alu.op = ALU_OP1_MOV;
7309 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7310 alu.dst.sel = tex.src_gpr;
7311 alu.dst.chan = 0;
7312 alu.last = 1;
7313 alu.dst.write = 1;
7314 r = r600_bytecode_add_alu(ctx->bc, &alu);
7315 if (r)
7316 return r;
7317
7318 r = r600_bytecode_add_tex(ctx->bc, &tex);
7319 if (r)
7320 return r;
7321 }
7322
7323 }
7324
7325 /* for cube forms of lod and bias we need to route things */
7326 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
7327 inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
7328 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7329 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
7330 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7331 alu.op = ALU_OP1_MOV;
7332 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7333 inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
7334 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7335 else
7336 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7337 alu.dst.sel = ctx->temp_reg;
7338 alu.dst.chan = 2;
7339 alu.last = 1;
7340 alu.dst.write = 1;
7341 r = r600_bytecode_add_alu(ctx->bc, &alu);
7342 if (r)
7343 return r;
7344 }
7345
7346 src_loaded = TRUE;
7347 src_gpr = ctx->temp_reg;
7348 }
7349
7350 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
7351 int temp_h = 0, temp_v = 0;
7352 int start_val = 0;
7353
7354 /* if we've already loaded the src (i.e. for CUBE), don't reload it. */
7355 if (src_loaded == TRUE)
7356 start_val = 1;
7357 else
7358 src_loaded = TRUE;
7359 for (i = start_val; i < 3; i++) {
7360 int treg = r600_get_temp(ctx);
7361
7362 if (i == 0)
7363 src_gpr = treg;
7364 else if (i == 1)
7365 temp_h = treg;
7366 else
7367 temp_v = treg;
7368
7369 for (j = 0; j < 4; j++) {
7370 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7371 alu.op = ALU_OP1_MOV;
7372 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
7373 alu.dst.sel = treg;
7374 alu.dst.chan = j;
7375 if (j == 3)
7376 alu.last = 1;
7377 alu.dst.write = 1;
7378 r = r600_bytecode_add_alu(ctx->bc, &alu);
7379 if (r)
7380 return r;
7381 }
7382 }
7383 for (i = 1; i < 3; i++) {
7384 /* set gradients h/v */
7385 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7386 tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
7387 FETCH_OP_SET_GRADIENTS_V;
7388 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7389 tex.sampler_index_mode = sampler_index_mode;
7390 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7391 tex.resource_index_mode = sampler_index_mode;
7392
7393 tex.src_gpr = (i == 1) ? temp_h : temp_v;
7394 tex.src_sel_x = 0;
7395 tex.src_sel_y = 1;
7396 tex.src_sel_z = 2;
7397 tex.src_sel_w = 3;
7398
7399 tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
7400 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7401 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
7402 tex.coord_type_x = 1;
7403 tex.coord_type_y = 1;
7404 tex.coord_type_z = 1;
7405 tex.coord_type_w = 1;
7406 }
7407 r = r600_bytecode_add_tex(ctx->bc, &tex);
7408 if (r)
7409 return r;
7410 }
7411 }
7412
7413 if (src_requires_loading && !src_loaded) {
7414 for (i = 0; i < 4; i++) {
7415 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7416 alu.op = ALU_OP1_MOV;
7417 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7418 alu.dst.sel = ctx->temp_reg;
7419 alu.dst.chan = i;
7420 if (i == 3)
7421 alu.last = 1;
7422 alu.dst.write = 1;
7423 r = r600_bytecode_add_alu(ctx->bc, &alu);
7424 if (r)
7425 return r;
7426 }
7427 src_loaded = TRUE;
7428 src_gpr = ctx->temp_reg;
7429 }
7430
7431 /* get offset values */
7432 if (inst->Texture.NumOffsets) {
7433 assert(inst->Texture.NumOffsets == 1);
7434
7435 /* The texture offset feature doesn't work with the TXF instruction
7436 * and must be emulated by adding the offset to the texture coordinates. */
7437 if (txf_add_offsets) {
7438 const struct tgsi_texture_offset *off = inst->TexOffsets;
7439
7440 switch (inst->Texture.Texture) {
7441 case TGSI_TEXTURE_3D:
7442 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7443 alu.op = ALU_OP2_ADD_INT;
7444 alu.src[0].sel = src_gpr;
7445 alu.src[0].chan = 2;
7446 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7447 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
7448 alu.dst.sel = src_gpr;
7449 alu.dst.chan = 2;
7450 alu.dst.write = 1;
7451 alu.last = 1;
7452 r = r600_bytecode_add_alu(ctx->bc, &alu);
7453 if (r)
7454 return r;
7455 /* fall through */
7456
7457 case TGSI_TEXTURE_2D:
7458 case TGSI_TEXTURE_SHADOW2D:
7459 case TGSI_TEXTURE_RECT:
7460 case TGSI_TEXTURE_SHADOWRECT:
7461 case TGSI_TEXTURE_2D_ARRAY:
7462 case TGSI_TEXTURE_SHADOW2D_ARRAY:
7463 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7464 alu.op = ALU_OP2_ADD_INT;
7465 alu.src[0].sel = src_gpr;
7466 alu.src[0].chan = 1;
7467 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7468 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
7469 alu.dst.sel = src_gpr;
7470 alu.dst.chan = 1;
7471 alu.dst.write = 1;
7472 alu.last = 1;
7473 r = r600_bytecode_add_alu(ctx->bc, &alu);
7474 if (r)
7475 return r;
7476 /* fall through */
7477
7478 case TGSI_TEXTURE_1D:
7479 case TGSI_TEXTURE_SHADOW1D:
7480 case TGSI_TEXTURE_1D_ARRAY:
7481 case TGSI_TEXTURE_SHADOW1D_ARRAY:
7482 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7483 alu.op = ALU_OP2_ADD_INT;
7484 alu.src[0].sel = src_gpr;
7485 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7486 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
7487 alu.dst.sel = src_gpr;
7488 alu.dst.write = 1;
7489 alu.last = 1;
7490 r = r600_bytecode_add_alu(ctx->bc, &alu);
7491 if (r)
7492 return r;
7493 break;
7494 /* texture offsets do not apply to other texture targets */
7495 }
7496 } else {
7497 switch (inst->Texture.Texture) {
7498 case TGSI_TEXTURE_3D:
7499 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
7500 /* fallthrough */
7501 case TGSI_TEXTURE_2D:
7502 case TGSI_TEXTURE_SHADOW2D:
7503 case TGSI_TEXTURE_RECT:
7504 case TGSI_TEXTURE_SHADOWRECT:
7505 case TGSI_TEXTURE_2D_ARRAY:
7506 case TGSI_TEXTURE_SHADOW2D_ARRAY:
7507 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
7508 /* fallthrough */
7509 case TGSI_TEXTURE_1D:
7510 case TGSI_TEXTURE_SHADOW1D:
7511 case TGSI_TEXTURE_1D_ARRAY:
7512 case TGSI_TEXTURE_SHADOW1D_ARRAY:
7513 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
7514 }
7515 }
7516 }
7517
7518 /* Obtain the sample index for reading a compressed MSAA color texture.
7519 * To read the FMASK, we use the ldfptr instruction, which tells us
7520 * where the samples are stored.
7521 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
7522 * which is the identity mapping. Each nibble says which physical sample
7523 * should be fetched to get that sample.
7524 *
7525	 * Assume src.w contains the sample index. It should be modified like this:
7526	 *   src.w = (ldfptr() >> (src.w * 4)) & 0xF;
7527 * Then fetch the texel with src.
7528 */
7529 if (read_compressed_msaa) {
7530 unsigned sample_chan = 3;
7531 unsigned temp = r600_get_temp(ctx);
7532 assert(src_loaded);
7533
7534 /* temp.w = ldfptr() */
7535 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7536 tex.op = FETCH_OP_LD;
7537 tex.inst_mod = 1; /* to indicate this is ldfptr */
7538 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7539 tex.sampler_index_mode = sampler_index_mode;
7540 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7541 tex.resource_index_mode = sampler_index_mode;
7542 tex.src_gpr = src_gpr;
7543 tex.dst_gpr = temp;
7544 tex.dst_sel_x = 7; /* mask out these components */
7545 tex.dst_sel_y = 7;
7546 tex.dst_sel_z = 7;
7547 tex.dst_sel_w = 0; /* store X */
7548 tex.src_sel_x = 0;
7549 tex.src_sel_y = 1;
7550 tex.src_sel_z = 2;
7551 tex.src_sel_w = 3;
7552 tex.offset_x = offset_x;
7553 tex.offset_y = offset_y;
7554 tex.offset_z = offset_z;
7555 r = r600_bytecode_add_tex(ctx->bc, &tex);
7556 if (r)
7557 return r;
7558
7559 /* temp.x = sample_index*4 */
7560 if (ctx->bc->chip_class == CAYMAN) {
7561 for (i = 0 ; i < 4; i++) {
7562 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7563 alu.op = ALU_OP2_MULLO_INT;
7564 alu.src[0].sel = src_gpr;
7565 alu.src[0].chan = sample_chan;
7566 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7567 alu.src[1].value = 4;
7568 alu.dst.sel = temp;
7569 alu.dst.chan = i;
7570 alu.dst.write = i == 0;
7571 if (i == 3)
7572 alu.last = 1;
7573 r = r600_bytecode_add_alu(ctx->bc, &alu);
7574 if (r)
7575 return r;
7576 }
7577 } else {
7578 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7579 alu.op = ALU_OP2_MULLO_INT;
7580 alu.src[0].sel = src_gpr;
7581 alu.src[0].chan = sample_chan;
7582 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7583 alu.src[1].value = 4;
7584 alu.dst.sel = temp;
7585 alu.dst.chan = 0;
7586 alu.dst.write = 1;
7587 alu.last = 1;
7588 r = r600_bytecode_add_alu(ctx->bc, &alu);
7589 if (r)
7590 return r;
7591 }
7592
7593 /* sample_index = temp.w >> temp.x */
7594 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7595 alu.op = ALU_OP2_LSHR_INT;
7596 alu.src[0].sel = temp;
7597 alu.src[0].chan = 3;
7598 alu.src[1].sel = temp;
7599 alu.src[1].chan = 0;
7600 alu.dst.sel = src_gpr;
7601 alu.dst.chan = sample_chan;
7602 alu.dst.write = 1;
7603 alu.last = 1;
7604 r = r600_bytecode_add_alu(ctx->bc, &alu);
7605 if (r)
7606 return r;
7607
7608 /* sample_index & 0xF */
7609 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7610 alu.op = ALU_OP2_AND_INT;
7611 alu.src[0].sel = src_gpr;
7612 alu.src[0].chan = sample_chan;
7613 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7614 alu.src[1].value = 0xF;
7615 alu.dst.sel = src_gpr;
7616 alu.dst.chan = sample_chan;
7617 alu.dst.write = 1;
7618 alu.last = 1;
7619 r = r600_bytecode_add_alu(ctx->bc, &alu);
7620 if (r)
7621 return r;
7622 #if 0
7623 /* visualize the FMASK */
7624 for (i = 0; i < 4; i++) {
7625 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7626 alu.op = ALU_OP1_INT_TO_FLT;
7627 alu.src[0].sel = src_gpr;
7628 alu.src[0].chan = sample_chan;
7629 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7630 alu.dst.chan = i;
7631 alu.dst.write = 1;
7632 alu.last = 1;
7633 r = r600_bytecode_add_alu(ctx->bc, &alu);
7634 if (r)
7635 return r;
7636 }
7637 return 0;
7638 #endif
7639 }
7640
7641	/* does this shader want the number of layers from TXQ for a cube array? */
7642 if (has_txq_cube_array_z) {
7643 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7644
7645 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7646 alu.op = ALU_OP1_MOV;
7647
7648 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
7649 if (ctx->bc->chip_class >= EVERGREEN) {
7650			/* on evergreen each dword is the number of cubes */
7651 alu.src[0].sel += id / 4;
7652 alu.src[0].chan = id % 4;
7653 } else {
7654			/* on r600 it's at channel 2 of the second dword */
7655 alu.src[0].sel += (id * 2) + 1;
7656 alu.src[0].chan = 2;
7657 }
7658 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7659 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
7660 alu.last = 1;
7661 r = r600_bytecode_add_alu(ctx->bc, &alu);
7662 if (r)
7663 return r;
7664 /* disable writemask from texture instruction */
7665 inst->Dst[0].Register.WriteMask &= ~4;
7666 }
7667
7668 opcode = ctx->inst_info->op;
7669 if (opcode == FETCH_OP_GATHER4 &&
7670 inst->TexOffsets[0].File != TGSI_FILE_NULL &&
7671 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
7672 opcode = FETCH_OP_GATHER4_O;
7673
7674		/* GATHER4_O/GATHER4_C_O use the offset values loaded by the
7675		   SET_TEXTURE_OFFSETS instruction; the immediate offset values
7676		   encoded in the instruction itself are ignored. */
7677 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7678 tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
7679 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7680 tex.sampler_index_mode = sampler_index_mode;
7681 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7682 tex.resource_index_mode = sampler_index_mode;
7683
7684 tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
7685 tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
7686 tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
7687 tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
7688 tex.src_sel_w = 4;
7689
7690 tex.dst_sel_x = 7;
7691 tex.dst_sel_y = 7;
7692 tex.dst_sel_z = 7;
7693 tex.dst_sel_w = 7;
7694
7695 r = r600_bytecode_add_tex(ctx->bc, &tex);
7696 if (r)
7697 return r;
7698 }
7699
7700 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
7701 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7702 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
7703 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7704 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
7705 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
7706 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7707 switch (opcode) {
7708 case FETCH_OP_SAMPLE:
7709 opcode = FETCH_OP_SAMPLE_C;
7710 break;
7711 case FETCH_OP_SAMPLE_L:
7712 opcode = FETCH_OP_SAMPLE_C_L;
7713 break;
7714 case FETCH_OP_SAMPLE_LB:
7715 opcode = FETCH_OP_SAMPLE_C_LB;
7716 break;
7717 case FETCH_OP_SAMPLE_G:
7718 opcode = FETCH_OP_SAMPLE_C_G;
7719 break;
7720 /* Texture gather variants */
7721 case FETCH_OP_GATHER4:
7722 opcode = FETCH_OP_GATHER4_C;
7723 break;
7724 case FETCH_OP_GATHER4_O:
7725 opcode = FETCH_OP_GATHER4_C_O;
7726 break;
7727 }
7728 }
7729
7730 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7731 tex.op = opcode;
7732
7733 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7734 tex.sampler_index_mode = sampler_index_mode;
7735 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7736 tex.resource_index_mode = sampler_index_mode;
7737 tex.src_gpr = src_gpr;
7738 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7739
7740 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
7741 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
7742 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
7743 }
7744
7745 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7746 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
7747 tex.inst_mod = texture_component_select;
7748
7749 if (ctx->bc->chip_class == CAYMAN) {
7750 /* GATHER4 result order is different from TGSI TG4 */
7751 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7;
7752 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7;
7753 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7;
7754 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7755 } else {
7756 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7757 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
7758 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7759 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7760 }
7761 }
7762 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
7763 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7764 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7765 tex.dst_sel_z = 7;
7766 tex.dst_sel_w = 7;
7767 }
7768 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
7769 tex.dst_sel_x = 3;
7770 tex.dst_sel_y = 7;
7771 tex.dst_sel_z = 7;
7772 tex.dst_sel_w = 7;
7773 }
7774 else {
7775 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7776 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7777 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
7778 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7779 }
7780
7781
7782 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
7783 tex.src_sel_x = 4;
7784 tex.src_sel_y = 4;
7785 tex.src_sel_z = 4;
7786 tex.src_sel_w = 4;
7787 } else if (src_loaded) {
7788 tex.src_sel_x = 0;
7789 tex.src_sel_y = 1;
7790 tex.src_sel_z = 2;
7791 tex.src_sel_w = 3;
7792 } else {
7793 tex.src_sel_x = ctx->src[0].swizzle[0];
7794 tex.src_sel_y = ctx->src[0].swizzle[1];
7795 tex.src_sel_z = ctx->src[0].swizzle[2];
7796 tex.src_sel_w = ctx->src[0].swizzle[3];
7797 tex.src_rel = ctx->src[0].rel;
7798 }
7799
7800 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7801 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7802 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7803 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7804 tex.src_sel_x = 1;
7805 tex.src_sel_y = 0;
7806 tex.src_sel_z = 3;
7807 tex.src_sel_w = 2; /* route Z compare or Lod value into W */
7808 }
7809
7810 if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
7811 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
7812 tex.coord_type_x = 1;
7813 tex.coord_type_y = 1;
7814 }
7815 tex.coord_type_z = 1;
7816 tex.coord_type_w = 1;
7817
7818 tex.offset_x = offset_x;
7819 tex.offset_y = offset_y;
7820 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
7821 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7822 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
7823 tex.offset_z = 0;
7824 }
7825 else {
7826 tex.offset_z = offset_z;
7827 }
7828
7829 /* Put the depth for comparison in W.
7830 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
7831 * Some instructions expect the depth in Z. */
7832 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
7833 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7834 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
7835 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
7836 opcode != FETCH_OP_SAMPLE_C_L &&
7837 opcode != FETCH_OP_SAMPLE_C_LB) {
7838 tex.src_sel_w = tex.src_sel_z;
7839 }
7840
7841 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
7842 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
7843 if (opcode == FETCH_OP_SAMPLE_C_L ||
7844 opcode == FETCH_OP_SAMPLE_C_LB) {
7845 /* the array index is read from Y */
7846 tex.coord_type_y = 0;
7847 } else {
7848 /* the array index is read from Z */
7849 tex.coord_type_z = 0;
7850 tex.src_sel_z = tex.src_sel_y;
7851 }
7852 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7853 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
7854 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7855 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7856 (ctx->bc->chip_class >= EVERGREEN)))
7857 /* the array index is read from Z */
7858 tex.coord_type_z = 0;
7859
7860 /* mask unused source components */
7861 if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
7862 switch (inst->Texture.Texture) {
7863 case TGSI_TEXTURE_2D:
7864 case TGSI_TEXTURE_RECT:
7865 tex.src_sel_z = 7;
7866 tex.src_sel_w = 7;
7867 break;
7868 case TGSI_TEXTURE_1D_ARRAY:
7869 tex.src_sel_y = 7;
7870 tex.src_sel_w = 7;
7871 break;
7872 case TGSI_TEXTURE_1D:
7873 tex.src_sel_y = 7;
7874 tex.src_sel_z = 7;
7875 tex.src_sel_w = 7;
7876 break;
7877 }
7878 }
7879
7880 r = r600_bytecode_add_tex(ctx->bc, &tex);
7881 if (r)
7882 return r;
7883
7884	/* TODO: add shadow ambient support - gallium doesn't do it yet */
7885 return 0;
7886 }
7887
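/* map a TGSI hw atomic counter source to its flattened hw counter index:
 * indirect accesses match on the array id, direct ones on the buffer id
 * and index range. */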
7888 static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
7889 struct tgsi_full_src_register *src)
7890 {
7891 unsigned i;
7892
7893 if (src->Register.Indirect) {
7894 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
7895 if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)
7896 return ctx->shader->atomics[i].hw_idx;
7897 }
7898 } else {
7899 uint32_t index = src->Register.Index;
7900 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
7901 if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
7902 continue;
7903 if (index > ctx->shader->atomics[i].end)
7904 continue;
7905 if (index < ctx->shader->atomics[i].start)
7906 continue;
7907 uint32_t offset = (index - ctx->shader->atomics[i].start);
7908 return ctx->shader->atomics[i].hw_idx + offset;
7909 }
7910 }
7911 assert(0);
7912 return -1;
7913 }
7914
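/* work out the uav id/index mode for a GDS atomic counter access; on
 * cayman the counter's byte offset (uav_id * 4, plus any indirect index
 * scaled by 4) appears to be passed through the source GPR instead, so
 * it is loaded into temp_reg.x here. */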
7915 static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
7916 int *uav_id_p, int *uav_index_mode_p)
7917 {
7918 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7919 int uav_id, uav_index_mode = 0;
7920 int r;
7921 bool is_cm = (ctx->bc->chip_class == CAYMAN);
7922
7923 uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
7924
7925 if (inst->Src[0].Register.Indirect) {
7926 if (is_cm) {
7927 struct r600_bytecode_alu alu;
7928 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7929 alu.op = ALU_OP2_LSHL_INT;
7930 alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
7931 alu.src[0].chan = 0;
7932 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7933 alu.src[1].value = 2;
7934 alu.dst.sel = ctx->temp_reg;
7935 alu.dst.chan = 0;
7936 alu.dst.write = 1;
7937 alu.last = 1;
7938 r = r600_bytecode_add_alu(ctx->bc, &alu);
7939 if (r)
7940 return r;
7941
7942 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
7943 ctx->temp_reg, 0,
7944 ctx->temp_reg, 0,
7945 V_SQ_ALU_SRC_LITERAL, uav_id * 4);
7946 if (r)
7947 return r;
7948 } else
7949 uav_index_mode = 2;
7950 } else if (is_cm) {
7951 r = single_alu_op2(ctx, ALU_OP1_MOV,
7952 ctx->temp_reg, 0,
7953 V_SQ_ALU_SRC_LITERAL, uav_id * 4,
7954 0, 0);
7955 if (r)
7956 return r;
7957 }
7958 *uav_id_p = uav_id;
7959 *uav_index_mode_p = uav_index_mode;
7960 return 0;
7961 }
7962
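/* read an atomic counter value with a GDS read-with-return */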
7963 static int tgsi_load_gds(struct r600_shader_ctx *ctx)
7964 {
7965 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7966 int r;
7967 struct r600_bytecode_gds gds;
7968 int uav_id = 0;
7969 int uav_index_mode = 0;
7970 bool is_cm = (ctx->bc->chip_class == CAYMAN);
7971
7972 r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
7973 if (r)
7974 return r;
7975
7976 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
7977 gds.op = FETCH_OP_GDS_READ_RET;
7978 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7979 gds.uav_id = is_cm ? 0 : uav_id;
7980 gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
7981 gds.src_gpr = ctx->temp_reg;
7982 gds.src_sel_x = (is_cm) ? 0 : 4;
7983 gds.src_sel_y = 4;
7984 gds.src_sel_z = 4;
7985 gds.dst_sel_x = 0;
7986 gds.dst_sel_y = 7;
7987 gds.dst_sel_z = 7;
7988 gds.dst_sel_w = 7;
7989 gds.src_gpr2 = 0;
7990 gds.alloc_consume = !is_cm;
7991 r = r600_bytecode_add_gds(ctx->bc, &gds);
7992 if (r)
7993 return r;
7994
7995 ctx->bc->cf_last->vpm = 1;
7996 return 0;
7997 }
7998
7999 /* build the image coordinates in a temp, zero-filling unused components;
   this also fixes up 1D arrays (the layer index moves from .y to .z) */
8000 static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
8001 {
8002 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8003 int r, i;
8004 struct r600_bytecode_alu alu;
8005 int temp_reg = r600_get_temp(ctx);
8006
8007 for (i = 0; i < 4; i++) {
8008 bool def_val = true, write_zero = false;
8009 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8010 alu.op = ALU_OP1_MOV;
8011 alu.dst.sel = temp_reg;
8012 alu.dst.chan = i;
8013
8014 switch (inst->Memory.Texture) {
8015 case TGSI_TEXTURE_BUFFER:
8016 case TGSI_TEXTURE_1D:
8017 if (i == 1 || i == 2 || i == 3) {
8018 write_zero = true;
8019 }
8020 break;
8021 case TGSI_TEXTURE_1D_ARRAY:
8022 if (i == 1 || i == 3)
8023 write_zero = true;
8024 else if (i == 2) {
8025 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
8026 def_val = false;
8027 }
8028 break;
8029 case TGSI_TEXTURE_2D:
8030 if (i == 2 || i == 3)
8031 write_zero = true;
8032 break;
8033 default:
8034 if (i == 3)
8035 write_zero = true;
8036 break;
8037 }
8038
8039 if (write_zero) {
8040 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
8041 alu.src[0].value = 0;
8042 } else if (def_val) {
8043 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
8044 }
8045
8046 if (i == 3)
8047 alu.last = 1;
8048 alu.dst.write = 1;
8049 r = r600_bytecode_add_alu(ctx->bc, &alu);
8050 if (r)
8051 return r;
8052 }
8053 *idx_gpr = temp_reg;
8054 return 0;
8055 }
8056
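/* turn a buffer byte offset into a dword element index (>> 2); for
 * immediate sources the shift is folded into the literal. */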
8057 static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx,
8058 int temp_reg)
8059 {
8060 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8061 int r;
8062 if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) {
8063 int value = (ctx->literals[4 * inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]);
8064 r = single_alu_op2(ctx, ALU_OP1_MOV,
8065 temp_reg, 0,
8066 V_SQ_ALU_SRC_LITERAL, value >> 2,
8067 0, 0);
8068 if (r)
8069 return r;
8070 } else {
8071 struct r600_bytecode_alu alu;
8072 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8073 alu.op = ALU_OP2_LSHR_INT;
8074 r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0);
8075 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8076 alu.src[1].value = 2;
8077 alu.dst.sel = temp_reg;
8078 alu.dst.write = 1;
8079 alu.last = 1;
8080 r = r600_bytecode_add_alu(ctx->bc, &alu);
8081 if (r)
8082 return r;
8083 }
8084 return 0;
8085 }
8086
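/* LOAD from a buffer resource: a single vertex fetch, with the fetch
 * format chosen from the highest enabled writemask component. */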
8087 static int tgsi_load_buffer(struct r600_shader_ctx *ctx)
8088 {
8089 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8090 /* have to work out the offset into the RAT immediate return buffer */
8091 struct r600_bytecode_vtx vtx;
8092 struct r600_bytecode_cf *cf;
8093 int r;
8094 int temp_reg = r600_get_temp(ctx);
8095 unsigned rat_index_mode;
8096 unsigned base;
8097
8098 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8099 base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE];
8100
8101 r = load_buffer_coord(ctx, 1, temp_reg);
8102 if (r)
8103 return r;
8104 ctx->bc->cf_last->barrier = 1;
8105 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8106 vtx.op = FETCH_OP_VFETCH;
8107 vtx.buffer_id = inst->Src[0].Register.Index + base;
8108 vtx.buffer_index_mode = rat_index_mode;
8109 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8110 vtx.src_gpr = temp_reg;
8111 vtx.src_sel_x = 0;
8112 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8113 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
8114 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
8115 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
8116 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
8117 vtx.num_format_all = 1;
8118 vtx.format_comp_all = 1;
8119 vtx.srf_mode_all = 0;
8120
8121 if (inst->Dst[0].Register.WriteMask & 8) {
8122 vtx.data_format = FMT_32_32_32_32;
8123 vtx.use_const_fields = 0;
8124 } else if (inst->Dst[0].Register.WriteMask & 4) {
8125 vtx.data_format = FMT_32_32_32;
8126 vtx.use_const_fields = 0;
8127 } else if (inst->Dst[0].Register.WriteMask & 2) {
8128 vtx.data_format = FMT_32_32;
8129 vtx.use_const_fields = 0;
8130 } else {
8131 vtx.data_format = FMT_32;
8132 vtx.use_const_fields = 0;
8133 }
8134
8135 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8136 if (r)
8137 return r;
8138 cf = ctx->bc->cf_last;
8139 cf->barrier = 1;
8140 return 0;
8141 }
8142
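/* LOAD from an image: issue a RAT NOP-with-return to pull the texel
 * into the RAT return buffer, then vertex-fetch it back with the
 * format's component swizzle applied. */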
8143 static int tgsi_load_rat(struct r600_shader_ctx *ctx)
8144 {
8145 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8146 /* have to work out the offset into the RAT immediate return buffer */
8147 struct r600_bytecode_vtx vtx;
8148 struct r600_bytecode_cf *cf;
8149 int r;
8150 int idx_gpr;
8151 unsigned format, num_format, format_comp, endian;
8152 const struct util_format_description *desc;
8153 unsigned rat_index_mode;
8154 unsigned immed_base;
8155
8156 r = load_thread_id_gpr(ctx);
8157 if (r)
8158 return r;
8159
8160 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8161
8162 immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
8163 r = load_index_src(ctx, 1, &idx_gpr);
8164 if (r)
8165 return r;
8166
8167 if (rat_index_mode)
8168 egcm_load_index_reg(ctx->bc, 1, false);
8169
8170 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8171 cf = ctx->bc->cf_last;
8172
8173 cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
8174 cf->rat.inst = V_RAT_INST_NOP_RTN;
8175 cf->rat.index_mode = rat_index_mode;
8176 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
8177 cf->output.gpr = ctx->thread_id_gpr;
8178 cf->output.index_gpr = idx_gpr;
8179 cf->output.comp_mask = 0xf;
8180 cf->output.burst_count = 1;
8181 cf->vpm = 1;
8182 cf->barrier = 1;
8183 cf->mark = 1;
8184 cf->output.elem_size = 0;
8185
8186 r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
8187 cf = ctx->bc->cf_last;
8188 cf->barrier = 1;
8189
8190 desc = util_format_description(inst->Memory.Format);
8191 r600_vertex_data_type(inst->Memory.Format,
8192 &format, &num_format, &format_comp, &endian);
8193 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8194 vtx.op = FETCH_OP_VFETCH;
8195 vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
8196 vtx.buffer_index_mode = rat_index_mode;
8197 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8198 vtx.src_gpr = ctx->thread_id_gpr;
8199 vtx.src_sel_x = 1;
8200 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8201 vtx.dst_sel_x = desc->swizzle[0];
8202 vtx.dst_sel_y = desc->swizzle[1];
8203 vtx.dst_sel_z = desc->swizzle[2];
8204 vtx.dst_sel_w = desc->swizzle[3];
8205 vtx.srf_mode_all = 1;
8206 vtx.data_format = format;
8207 vtx.num_format_all = num_format;
8208 vtx.format_comp_all = format_comp;
8209 vtx.endian = endian;
8210 vtx.offset = 0;
8211 vtx.mega_fetch_count = 3;
8212 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8213 if (r)
8214 return r;
8215 cf = ctx->bc->cf_last;
8216 cf->barrier = 1;
8217 return 0;
8218 }
8219
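/* LOAD from shared memory: copy the address into a temp, then fetch the
 * write-masked components from the LDS. */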
8220 static int tgsi_load_lds(struct r600_shader_ctx *ctx)
8221 {
8222 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8223 struct r600_bytecode_alu alu;
8224 int r;
8225 int temp_reg = r600_get_temp(ctx);
8226
8227 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8228 alu.op = ALU_OP1_MOV;
8229 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
8230 alu.dst.sel = temp_reg;
8231 alu.dst.write = 1;
8232 alu.last = 1;
8233 r = r600_bytecode_add_alu(ctx->bc, &alu);
8234 if (r)
8235 return r;
8236
8237 r = do_lds_fetch_values(ctx, temp_reg,
8238 ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask);
8239 if (r)
8240 return r;
8241 return 0;
8242 }
8243
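/* dispatch TGSI LOAD according to the source register file */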
8244 static int tgsi_load(struct r600_shader_ctx *ctx)
8245 {
8246 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8247 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
8248 return tgsi_load_rat(ctx);
8249 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
8250 return tgsi_load_gds(ctx);
8251 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
8252 return tgsi_load_buffer(ctx);
8253 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
8254 return tgsi_load_lds(ctx);
8255 return 0;
8256 }
8257
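/* STORE to a buffer: emit one typed RAT store per enabled component, at
 * consecutive dword offsets from the base element index. */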
8258 static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
8259 {
8260 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8261 struct r600_bytecode_cf *cf;
8262 int r, i;
8263 unsigned rat_index_mode;
8264 int lasti;
8265 int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx);
8266
8267 r = load_buffer_coord(ctx, 0, treg2);
8268 if (r)
8269 return r;
8270
8271 rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8272 if (rat_index_mode)
8273 egcm_load_index_reg(ctx->bc, 1, false);
8274
8275 for (i = 0; i <= 3; i++) {
8276 struct r600_bytecode_alu alu;
8277 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8278 alu.op = ALU_OP1_MOV;
8279 alu.dst.sel = temp_reg;
8280 alu.dst.chan = i;
8281 alu.src[0].sel = V_SQ_ALU_SRC_0;
8282 alu.last = (i == 3);
8283 alu.dst.write = 1;
8284 r = r600_bytecode_add_alu(ctx->bc, &alu);
8285 if (r)
8286 return r;
8287 }
8288
8289 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8290 for (i = 0; i <= lasti; i++) {
8291 struct r600_bytecode_alu alu;
8292 if (!((1 << i) & inst->Dst[0].Register.WriteMask))
8293 continue;
8294
8295 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
8296 temp_reg, 0,
8297 treg2, 0,
8298 V_SQ_ALU_SRC_LITERAL, i);
8299 if (r)
8300 return r;
8301
8302 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8303 alu.op = ALU_OP1_MOV;
8304 alu.dst.sel = ctx->temp_reg;
8305 alu.dst.chan = 0;
8306
8307 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
8308 alu.last = 1;
8309 alu.dst.write = 1;
8310 r = r600_bytecode_add_alu(ctx->bc, &alu);
8311 if (r)
8312 return r;
8313
8314 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8315 cf = ctx->bc->cf_last;
8316
8317 cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE];
8318 cf->rat.inst = V_RAT_INST_STORE_TYPED;
8319 cf->rat.index_mode = rat_index_mode;
8320 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
8321 cf->output.gpr = ctx->temp_reg;
8322 cf->output.index_gpr = temp_reg;
8323 cf->output.comp_mask = 1;
8324 cf->output.burst_count = 1;
8325 cf->vpm = 1;
8326 cf->barrier = 1;
8327 cf->output.elem_size = 0;
8328 }
8329 return 0;
8330 }
8331
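/* STORE to an image: a typed RAT store of the value GPR at the
 * coordinates built by load_index_src. */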
8332 static int tgsi_store_rat(struct r600_shader_ctx *ctx)
8333 {
8334 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8335 struct r600_bytecode_cf *cf;
8336 bool src_requires_loading = false;
8337 int val_gpr, idx_gpr;
8338 int r, i;
8339 unsigned rat_index_mode;
8340
8341 rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8342
8343 r = load_index_src(ctx, 0, &idx_gpr);
8344 if (r)
8345 return r;
8346
8347 if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)
8348 src_requires_loading = true;
8349
8350 if (src_requires_loading) {
8351 struct r600_bytecode_alu alu;
8352 for (i = 0; i < 4; i++) {
8353 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8354 alu.op = ALU_OP1_MOV;
8355 alu.dst.sel = ctx->temp_reg;
8356 alu.dst.chan = i;
8357
8358 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
8359 if (i == 3)
8360 alu.last = 1;
8361 alu.dst.write = 1;
8362 r = r600_bytecode_add_alu(ctx->bc, &alu);
8363 if (r)
8364 return r;
8365 }
8366 val_gpr = ctx->temp_reg;
8367 } else
8368 val_gpr = tgsi_tex_get_src_gpr(ctx, 1);
8369 if (rat_index_mode)
8370 egcm_load_index_reg(ctx->bc, 1, false);
8371
8372 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8373 cf = ctx->bc->cf_last;
8374
8375 cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
8376 cf->rat.inst = V_RAT_INST_STORE_TYPED;
8377 cf->rat.index_mode = rat_index_mode;
8378 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
8379 cf->output.gpr = val_gpr;
8380 cf->output.index_gpr = idx_gpr;
8381 cf->output.comp_mask = 0xf;
8382 cf->output.burst_count = 1;
8383 cf->vpm = 1;
8384 cf->barrier = 1;
8385 cf->output.elem_size = 0;
8386 return 0;
8387 }
8388
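/* STORE to shared memory: compute per-component addresses, then write,
 * using LDS_WRITE_REL to pack two adjacent components into one
 * instruction where the write mask allows. */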
8389 static int tgsi_store_lds(struct r600_shader_ctx *ctx)
8390 {
8391 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8392 struct r600_bytecode_alu alu;
8393 int r, i, lasti;
8394 int write_mask = inst->Dst[0].Register.WriteMask;
8395 int temp_reg = r600_get_temp(ctx);
8396
8397 /* LDS write */
8398 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8399 alu.op = ALU_OP1_MOV;
8400 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8401 alu.dst.sel = temp_reg;
8402 alu.dst.write = 1;
8403 alu.last = 1;
8404 r = r600_bytecode_add_alu(ctx->bc, &alu);
8405 if (r)
8406 return r;
8407
8408 lasti = tgsi_last_instruction(write_mask);
8409 for (i = 1; i <= lasti; i++) {
8410 if (!(write_mask & (1 << i)))
8411 continue;
8412 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
8413 temp_reg, i,
8414 temp_reg, 0,
8415 V_SQ_ALU_SRC_LITERAL, 4 * i);
8416 if (r)
8417 return r;
8418 }
8419 for (i = 0; i <= lasti; i++) {
8420 if (!(write_mask & (1 << i)))
8421 continue;
8422
8423 if ((i == 0 && ((write_mask & 3) == 3)) ||
8424 (i == 2 && ((write_mask & 0xc) == 0xc))) {
8425 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8426 alu.op = LDS_OP3_LDS_WRITE_REL;
8427
8428 alu.src[0].sel = temp_reg;
8429 alu.src[0].chan = i;
8430 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
8431 r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1);
8432 alu.last = 1;
8433 alu.is_lds_idx_op = true;
8434 alu.lds_idx = 1;
8435 r = r600_bytecode_add_alu(ctx->bc, &alu);
8436 if (r)
8437 return r;
8438 i += 1;
8439 continue;
8440 }
8441 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8442 alu.op = LDS_OP2_LDS_WRITE;
8443
8444 alu.src[0].sel = temp_reg;
8445 alu.src[0].chan = i;
8446 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
8447
8448 alu.last = 1;
8449 alu.is_lds_idx_op = true;
8450
8451 r = r600_bytecode_add_alu(ctx->bc, &alu);
8452 if (r)
8453 return r;
8454 }
8455 return 0;
8456 }
8457
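/* dispatch TGSI STORE according to the destination register file */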
8458 static int tgsi_store(struct r600_shader_ctx *ctx)
8459 {
8460 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8461 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
8462 return tgsi_store_buffer_rat(ctx);
8463 else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
8464 return tgsi_store_lds(ctx);
8465 else
8466 return tgsi_store_rat(ctx);
8467 }
8468
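/* atomics on RATs (images and buffers): stage the source value (and the
 * compare value for CMPXCHG) in the thread id GPR, run the RAT op with
 * return, then vertex-fetch the previous value back. */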
8469 static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
8470 {
8471 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8472 /* have to work out the offset into the RAT immediate return buffer */
8473 struct r600_bytecode_alu alu;
8474 struct r600_bytecode_vtx vtx;
8475 struct r600_bytecode_cf *cf;
8476 int r;
8477 int idx_gpr;
8478 unsigned format, num_format, format_comp, endian;
8479 const struct util_format_description *desc;
8480 unsigned rat_index_mode;
8481 unsigned immed_base;
8482 unsigned rat_base;
8483
8484 immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
8485 rat_base = ctx->shader->rat_base;
8486
8487 r = load_thread_id_gpr(ctx);
8488 if (r)
8489 return r;
8490
8491 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
8492 immed_base += ctx->info.file_count[TGSI_FILE_IMAGE];
8493 rat_base += ctx->info.file_count[TGSI_FILE_IMAGE];
8494
8495 r = load_buffer_coord(ctx, 1, ctx->temp_reg);
8496 if (r)
8497 return r;
8498 idx_gpr = ctx->temp_reg;
8499 } else {
8500 r = load_index_src(ctx, 1, &idx_gpr);
8501 if (r)
8502 return r;
8503 }
8504
8505 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8506
8507 if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {
8508 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8509 alu.op = ALU_OP1_MOV;
8510 alu.dst.sel = ctx->thread_id_gpr;
8511 alu.dst.chan = 0;
8512 alu.dst.write = 1;
8513 r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
8514 alu.last = 1;
8515 r = r600_bytecode_add_alu(ctx->bc, &alu);
8516 if (r)
8517 return r;
8518
8519 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8520 alu.op = ALU_OP1_MOV;
8521 alu.dst.sel = ctx->thread_id_gpr;
8522 if (ctx->bc->chip_class == CAYMAN)
8523 alu.dst.chan = 2;
8524 else
8525 alu.dst.chan = 3;
8526 alu.dst.write = 1;
8527 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
8528 alu.last = 1;
8529 r = r600_bytecode_add_alu(ctx->bc, &alu);
8530 if (r)
8531 return r;
8532 } else {
8533 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8534 alu.op = ALU_OP1_MOV;
8535 alu.dst.sel = ctx->thread_id_gpr;
8536 alu.dst.chan = 0;
8537 alu.dst.write = 1;
8538 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
8539 alu.last = 1;
8540 r = r600_bytecode_add_alu(ctx->bc, &alu);
8541 if (r)
8542 return r;
8543 }
8544
8545 if (rat_index_mode)
8546 egcm_load_index_reg(ctx->bc, 1, false);
8547 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8548 cf = ctx->bc->cf_last;
8549
8550 cf->rat.id = rat_base + inst->Src[0].Register.Index;
8551 cf->rat.inst = ctx->inst_info->op;
8552 cf->rat.index_mode = rat_index_mode;
8553 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
8554 cf->output.gpr = ctx->thread_id_gpr;
8555 cf->output.index_gpr = idx_gpr;
8556 cf->output.comp_mask = 0xf;
8557 cf->output.burst_count = 1;
8558 cf->vpm = 1;
8559 cf->barrier = 1;
8560 cf->mark = 1;
8561 cf->output.elem_size = 0;
8562 r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
8563 cf = ctx->bc->cf_last;
8564 cf->barrier = 1;
8565 cf->cf_addr = 1;
8566
8567 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8568 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
8569 desc = util_format_description(inst->Memory.Format);
8570 r600_vertex_data_type(inst->Memory.Format,
8571 &format, &num_format, &format_comp, &endian);
8572 vtx.dst_sel_x = desc->swizzle[0];
8573 } else {
8574 format = FMT_32;
8575 num_format = 1;
8576 format_comp = 0;
8577 endian = 0;
8578 vtx.dst_sel_x = 0;
8579 }
8580 vtx.op = FETCH_OP_VFETCH;
8581 vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
8582 vtx.buffer_index_mode = rat_index_mode;
8583 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8584 vtx.src_gpr = ctx->thread_id_gpr;
8585 vtx.src_sel_x = 1;
8586 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8587 vtx.dst_sel_y = 7;
8588 vtx.dst_sel_z = 7;
8589 vtx.dst_sel_w = 7;
8590 vtx.use_const_fields = 0;
8591 vtx.srf_mode_all = 1;
8592 vtx.data_format = format;
8593 vtx.num_format_all = num_format;
8594 vtx.format_comp_all = format_comp;
8595 vtx.endian = endian;
8596 vtx.offset = 0;
8597 vtx.mega_fetch_count = 0xf;
8598 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8599 if (r)
8600 return r;
8601 cf = ctx->bc->cf_last;
8602 cf->vpm = 1;
8603 cf->barrier = 1;
8604 return 0;
8605 }
8606
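/* map a TGSI atomic opcode to the matching GDS op (with return) */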
8607 static int get_gds_op(int opcode)
8608 {
8609 switch (opcode) {
8610 case TGSI_OPCODE_ATOMUADD:
8611 return FETCH_OP_GDS_ADD_RET;
8612 case TGSI_OPCODE_ATOMAND:
8613 return FETCH_OP_GDS_AND_RET;
8614 case TGSI_OPCODE_ATOMOR:
8615 return FETCH_OP_GDS_OR_RET;
8616 case TGSI_OPCODE_ATOMXOR:
8617 return FETCH_OP_GDS_XOR_RET;
8618 case TGSI_OPCODE_ATOMUMIN:
8619 return FETCH_OP_GDS_MIN_UINT_RET;
8620 case TGSI_OPCODE_ATOMUMAX:
8621 return FETCH_OP_GDS_MAX_UINT_RET;
8622 case TGSI_OPCODE_ATOMXCHG:
8623 return FETCH_OP_GDS_XCHG_RET;
8624 case TGSI_OPCODE_ATOMCAS:
8625 return FETCH_OP_GDS_CMP_XCHG_RET;
8626 default:
8627 return -1;
8628 }
8629 }
8630
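/* atomics on GDS atomic counters; an ADD of a negative immediate is
 * rewritten as a GDS_SUB of its absolute value. */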
8631 static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
8632 {
8633 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8634 struct r600_bytecode_gds gds;
8635 struct r600_bytecode_alu alu;
8636 int gds_op = get_gds_op(inst->Instruction.Opcode);
8637 int r;
8638 int uav_id = 0;
8639 int uav_index_mode = 0;
8640 bool is_cm = (ctx->bc->chip_class == CAYMAN);
8641
8642 if (gds_op == -1) {
8643 fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
8644 return -1;
8645 }
8646
8647 r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
8648 if (r)
8649 return r;
8650
8651 if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
8652 int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
8653 int abs_value = abs(value);
8654 if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
8655 gds_op = FETCH_OP_GDS_SUB_RET;
8656 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8657 alu.op = ALU_OP1_MOV;
8658 alu.dst.sel = ctx->temp_reg;
8659 alu.dst.chan = is_cm ? 1 : 0;
8660 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
8661 alu.src[0].value = abs_value;
8662 alu.last = 1;
8663 alu.dst.write = 1;
8664 r = r600_bytecode_add_alu(ctx->bc, &alu);
8665 if (r)
8666 return r;
8667 } else {
8668 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8669 alu.op = ALU_OP1_MOV;
8670 alu.dst.sel = ctx->temp_reg;
8671 alu.dst.chan = is_cm ? 1 : 0;
8672 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
8673 alu.last = 1;
8674 alu.dst.write = 1;
8675 r = r600_bytecode_add_alu(ctx->bc, &alu);
8676 if (r)
8677 return r;
8678 }
8679
8680
8681 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
8682 gds.op = gds_op;
8683 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8684 gds.uav_id = is_cm ? 0 : uav_id;
8685 gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
8686 gds.src_gpr = ctx->temp_reg;
8687 gds.src_gpr2 = 0;
8688 gds.src_sel_x = is_cm ? 0 : 4;
8689 gds.src_sel_y = is_cm ? 1 : 0;
8690 gds.src_sel_z = 7;
8691 gds.dst_sel_x = 0;
8692 gds.dst_sel_y = 7;
8693 gds.dst_sel_z = 7;
8694 gds.dst_sel_w = 7;
8695 gds.alloc_consume = !is_cm;
8696
8697 r = r600_bytecode_add_gds(ctx->bc, &gds);
8698 if (r)
8699 return r;
8700 ctx->bc->cf_last->vpm = 1;
8701 return 0;
8702 }
8703
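/* map a TGSI atomic opcode to the matching LDS op (with return) */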
8704 static int get_lds_op(int opcode)
8705 {
8706 switch (opcode) {
8707 case TGSI_OPCODE_ATOMUADD:
8708 return LDS_OP2_LDS_ADD_RET;
8709 case TGSI_OPCODE_ATOMAND:
8710 return LDS_OP2_LDS_AND_RET;
8711 case TGSI_OPCODE_ATOMOR:
8712 return LDS_OP2_LDS_OR_RET;
8713 case TGSI_OPCODE_ATOMXOR:
8714 return LDS_OP2_LDS_XOR_RET;
8715 case TGSI_OPCODE_ATOMUMIN:
8716 return LDS_OP2_LDS_MIN_UINT_RET;
8717 case TGSI_OPCODE_ATOMUMAX:
8718 return LDS_OP2_LDS_MAX_UINT_RET;
8719 case TGSI_OPCODE_ATOMIMIN:
8720 return LDS_OP2_LDS_MIN_INT_RET;
8721 case TGSI_OPCODE_ATOMIMAX:
8722 return LDS_OP2_LDS_MAX_INT_RET;
8723 case TGSI_OPCODE_ATOMXCHG:
8724 return LDS_OP2_LDS_XCHG_RET;
8725 case TGSI_OPCODE_ATOMCAS:
8726 return LDS_OP3_LDS_CMP_XCHG_RET;
8727 default:
8728 return -1;
8729 }
8730 }
8731
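/* atomics on shared memory: issue the LDS op, then pop the previous
 * value from the LDS output queue (LDS_OQ_A_POP) into the destination. */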
8732 static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx)
8733 {
8734 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8735 int lds_op = get_lds_op(inst->Instruction.Opcode);
8736 int r;
8737
8738 struct r600_bytecode_alu alu;
8739 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8740 alu.op = lds_op;
8741 alu.is_lds_idx_op = true;
8742 alu.last = 1;
8743 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
8744 r600_bytecode_src(&alu.src[1], &ctx->src[2], 0);
8745 if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET)
8746 r600_bytecode_src(&alu.src[2], &ctx->src[3], 0);
8747 else
8748 alu.src[2].sel = V_SQ_ALU_SRC_0;
8749 r = r600_bytecode_add_alu(ctx->bc, &alu);
8750 if (r)
8751 return r;
8752
8753 /* then read from LDS_OQ_A_POP */
8754 memset(&alu, 0, sizeof(alu));
8755
8756 alu.op = ALU_OP1_MOV;
8757 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
8758 alu.src[0].chan = 0;
8759 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
8760 alu.dst.write = 1;
8761 alu.last = 1;
8762 r = r600_bytecode_add_alu(ctx->bc, &alu);
8763 if (r)
8764 return r;
8765
8766 return 0;
8767 }
8768
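/* dispatch TGSI atomic ops according to the source register file */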
8769 static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
8770 {
8771 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8772 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
8773 return tgsi_atomic_op_rat(ctx);
8774 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
8775 return tgsi_atomic_op_gds(ctx);
8776 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
8777 return tgsi_atomic_op_rat(ctx);
8778 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
8779 return tgsi_atomic_op_lds(ctx);
8780 return 0;
8781 }
8782
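/* RESQ: query a resource's dimensions; buffers are answered from the
 * buffer size constants, images with a TXQ-style fetch. */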
8783 static int tgsi_resq(struct r600_shader_ctx *ctx)
8784 {
8785 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8786 unsigned sampler_index_mode;
8787 struct r600_bytecode_tex tex;
8788 int r;
8789 boolean has_txq_cube_array_z = false;
8790
8791 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
8792 (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
8793 if (ctx->bc->chip_class < EVERGREEN)
8794 ctx->shader->uses_tex_buffers = true;
8795 return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset);
8796 }
8797
8798 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&
8799 inst->Dst[0].Register.WriteMask & 4) {
8800 ctx->shader->has_txq_cube_array_z_comp = true;
8801 has_txq_cube_array_z = true;
8802 }
8803
8804 sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8805 if (sampler_index_mode)
8806 egcm_load_index_reg(ctx->bc, 1, false);
8807
8808
8809	/* does this shader want the number of layers from TXQ for a cube array? */
8810 if (has_txq_cube_array_z) {
8811 int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset;
8812 struct r600_bytecode_alu alu;
8813
8814 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8815 alu.op = ALU_OP1_MOV;
8816
8817 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
8818		/* with eg each dword is the number of cubes */
8819 alu.src[0].sel += id / 4;
8820 alu.src[0].chan = id % 4;
8821 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
8822 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
8823 alu.last = 1;
8824 r = r600_bytecode_add_alu(ctx->bc, &alu);
8825 if (r)
8826 return r;
8827 /* disable writemask from texture instruction */
8828 inst->Dst[0].Register.WriteMask &= ~4;
8829 }
8830 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8831 tex.op = ctx->inst_info->op;
8832 tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index;
8833 tex.sampler_index_mode = sampler_index_mode;
8834 tex.resource_id = tex.sampler_id;
8835 tex.resource_index_mode = sampler_index_mode;
8836 tex.src_sel_x = 4;
8837 tex.src_sel_y = 4;
8838 tex.src_sel_z = 4;
8839 tex.src_sel_w = 4;
8840 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8841 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8842 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8843 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8844 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8845 r = r600_bytecode_add_tex(ctx->bc, &tex);
8846 if (r)
8847 return r;
8848
8849 return 0;
8850 }
8851
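/* LRP: dst = src0 * src1 + (1 - src0) * src2 */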
8852 static int tgsi_lrp(struct r600_shader_ctx *ctx)
8853 {
8854 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8855 struct r600_bytecode_alu alu;
8856 unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8857 unsigned i, temp_regs[2];
8858 int r;
8859
8860	/* optimize the equal-balance case (src0 == 0.5): dst = (src1 + src2) / 2 via the omod divide-by-two */
8861 if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
8862 for (i = 0; i < lasti + 1; i++) {
8863 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8864 continue;
8865
8866 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8867 alu.op = ALU_OP2_ADD;
8868 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
8869 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
8870 alu.omod = 3;
8871 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8872 alu.dst.chan = i;
8873 if (i == lasti) {
8874 alu.last = 1;
8875 }
8876 r = r600_bytecode_add_alu(ctx->bc, &alu);
8877 if (r)
8878 return r;
8879 }
8880 return 0;
8881 }
8882
8883 /* 1 - src0 */
8884 for (i = 0; i < lasti + 1; i++) {
8885 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8886 continue;
8887
8888 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8889 alu.op = ALU_OP2_ADD;
8890 alu.src[0].sel = V_SQ_ALU_SRC_1;
8891 alu.src[0].chan = 0;
8892 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
8893 r600_bytecode_src_toggle_neg(&alu.src[1]);
8894 alu.dst.sel = ctx->temp_reg;
8895 alu.dst.chan = i;
8896 if (i == lasti) {
8897 alu.last = 1;
8898 }
8899 alu.dst.write = 1;
8900 r = r600_bytecode_add_alu(ctx->bc, &alu);
8901 if (r)
8902 return r;
8903 }
8904
8905 /* (1 - src0) * src2 */
8906 for (i = 0; i < lasti + 1; i++) {
8907 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8908 continue;
8909
8910 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8911 alu.op = ALU_OP2_MUL;
8912 alu.src[0].sel = ctx->temp_reg;
8913 alu.src[0].chan = i;
8914 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
8915 alu.dst.sel = ctx->temp_reg;
8916 alu.dst.chan = i;
8917 if (i == lasti) {
8918 alu.last = 1;
8919 }
8920 alu.dst.write = 1;
8921 r = r600_bytecode_add_alu(ctx->bc, &alu);
8922 if (r)
8923 return r;
8924 }
8925
8926 /* src0 * src1 + (1 - src0) * src2 */
8927 if (ctx->src[0].abs)
8928 temp_regs[0] = r600_get_temp(ctx);
8929 else
8930 temp_regs[0] = 0;
8931 if (ctx->src[1].abs)
8932 temp_regs[1] = r600_get_temp(ctx);
8933 else
8934 temp_regs[1] = 0;
8935
8936 for (i = 0; i < lasti + 1; i++) {
8937 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8938 continue;
8939
8940 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8941 alu.op = ALU_OP3_MULADD;
8942 alu.is_op3 = 1;
8943 r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
8944 if (r)
8945 return r;
8946 r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
8947 if (r)
8948 return r;
8949 alu.src[2].sel = ctx->temp_reg;
8950 alu.src[2].chan = i;
8951
8952 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8953 alu.dst.chan = i;
8954 if (i == lasti) {
8955 alu.last = 1;
8956 }
8957 r = r600_bytecode_add_alu(ctx->bc, &alu);
8958 if (r)
8959 return r;
8960 }
8961 return 0;
8962 }
8963
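/* CMP: dst = (src0 < 0) ? src1 : src2, done as CNDGE with src1/src2
 * swapped; the -|x| case uses CNDE instead, since -|x| >= 0 only when
 * x == 0. */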
8964 static int tgsi_cmp(struct r600_shader_ctx *ctx)
8965 {
8966 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8967 struct r600_bytecode_alu alu;
8968 int i, r, j;
8969 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8970 int temp_regs[3];
8971 unsigned op;
8972
8973 if (ctx->src[0].abs && ctx->src[0].neg) {
8974 op = ALU_OP3_CNDE;
8975 ctx->src[0].abs = 0;
8976 ctx->src[0].neg = 0;
8977 } else {
8978 op = ALU_OP3_CNDGE;
8979 }
8980
8981 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
8982 temp_regs[j] = 0;
8983 if (ctx->src[j].abs)
8984 temp_regs[j] = r600_get_temp(ctx);
8985 }
8986
8987 for (i = 0; i < lasti + 1; i++) {
8988 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8989 continue;
8990
8991 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8992 alu.op = op;
8993 r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
8994 if (r)
8995 return r;
8996 r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
8997 if (r)
8998 return r;
8999 r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
9000 if (r)
9001 return r;
9002 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9003 alu.dst.chan = i;
9004 alu.dst.write = 1;
9005 alu.is_op3 = 1;
9006 if (i == lasti)
9007 alu.last = 1;
9008 r = r600_bytecode_add_alu(ctx->bc, &alu);
9009 if (r)
9010 return r;
9011 }
9012 return 0;
9013 }
9014
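/* UCMP: dst = src0 ? src1 : src2, via CNDE_INT with src1/src2 swapped */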
9015 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
9016 {
9017 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9018 struct r600_bytecode_alu alu;
9019 int i, r;
9020 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9021
9022 for (i = 0; i < lasti + 1; i++) {
9023 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9024 continue;
9025
9026 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9027 alu.op = ALU_OP3_CNDE_INT;
9028 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9029 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9030 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
9031 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9032 alu.dst.chan = i;
9033 alu.dst.write = 1;
9034 alu.is_op3 = 1;
9035 if (i == lasti)
9036 alu.last = 1;
9037 r = r600_bytecode_add_alu(ctx->bc, &alu);
9038 if (r)
9039 return r;
9040 }
9041 return 0;
9042 }
9043
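/* EXP: dst.x = 2^floor(src.x), dst.y = fract(src.x),
 * dst.z = 2^src.x, dst.w = 1.0 */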
9044 static int tgsi_exp(struct r600_shader_ctx *ctx)
9045 {
9046 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9047 struct r600_bytecode_alu alu;
9048 int r;
9049 unsigned i;
9050
9051 /* result.x = 2^floor(src); */
9052 if (inst->Dst[0].Register.WriteMask & 1) {
9053 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9054
9055 alu.op = ALU_OP1_FLOOR;
9056 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9057
9058 alu.dst.sel = ctx->temp_reg;
9059 alu.dst.chan = 0;
9060 alu.dst.write = 1;
9061 alu.last = 1;
9062 r = r600_bytecode_add_alu(ctx->bc, &alu);
9063 if (r)
9064 return r;
9065
9066 if (ctx->bc->chip_class == CAYMAN) {
9067 for (i = 0; i < 3; i++) {
9068 alu.op = ALU_OP1_EXP_IEEE;
9069 alu.src[0].sel = ctx->temp_reg;
9070 alu.src[0].chan = 0;
9071
9072 alu.dst.sel = ctx->temp_reg;
9073 alu.dst.chan = i;
9074 alu.dst.write = i == 0;
9075 alu.last = i == 2;
9076 r = r600_bytecode_add_alu(ctx->bc, &alu);
9077 if (r)
9078 return r;
9079 }
9080 } else {
9081 alu.op = ALU_OP1_EXP_IEEE;
9082 alu.src[0].sel = ctx->temp_reg;
9083 alu.src[0].chan = 0;
9084
9085 alu.dst.sel = ctx->temp_reg;
9086 alu.dst.chan = 0;
9087 alu.dst.write = 1;
9088 alu.last = 1;
9089 r = r600_bytecode_add_alu(ctx->bc, &alu);
9090 if (r)
9091 return r;
9092 }
9093 }
9094
9095 /* result.y = tmp - floor(tmp); */
9096 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
9097 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9098
9099 alu.op = ALU_OP1_FRACT;
9100 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9101
9102 alu.dst.sel = ctx->temp_reg;
9103 #if 0
9104 r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9105 if (r)
9106 return r;
9107 #endif
9108 alu.dst.write = 1;
9109 alu.dst.chan = 1;
9110
9111 alu.last = 1;
9112
9113 r = r600_bytecode_add_alu(ctx->bc, &alu);
9114 if (r)
9115 return r;
9116 }
9117
9118	/* result.z = RoughApprox2ToX(tmp); */
9119 if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
9120 if (ctx->bc->chip_class == CAYMAN) {
9121 for (i = 0; i < 3; i++) {
9122 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9123 alu.op = ALU_OP1_EXP_IEEE;
9124 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9125
9126 alu.dst.sel = ctx->temp_reg;
9127 alu.dst.chan = i;
9128 if (i == 2) {
9129 alu.dst.write = 1;
9130 alu.last = 1;
9131 }
9132
9133 r = r600_bytecode_add_alu(ctx->bc, &alu);
9134 if (r)
9135 return r;
9136 }
9137 } else {
9138 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9139 alu.op = ALU_OP1_EXP_IEEE;
9140 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9141
9142 alu.dst.sel = ctx->temp_reg;
9143 alu.dst.write = 1;
9144 alu.dst.chan = 2;
9145
9146 alu.last = 1;
9147
9148 r = r600_bytecode_add_alu(ctx->bc, &alu);
9149 if (r)
9150 return r;
9151 }
9152 }
9153
9154	/* result.w = 1.0; */
9155 if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
9156 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9157
9158 alu.op = ALU_OP1_MOV;
9159 alu.src[0].sel = V_SQ_ALU_SRC_1;
9160 alu.src[0].chan = 0;
9161
9162 alu.dst.sel = ctx->temp_reg;
9163 alu.dst.chan = 3;
9164 alu.dst.write = 1;
9165 alu.last = 1;
9166 r = r600_bytecode_add_alu(ctx->bc, &alu);
9167 if (r)
9168 return r;
9169 }
9170 return tgsi_helper_copy(ctx, inst);
9171 }
9172
9173 static int tgsi_log(struct r600_shader_ctx *ctx)
9174 {
9175 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9176 struct r600_bytecode_alu alu;
9177 int r;
9178 unsigned i;
9179
9180 /* result.x = floor(log2(|src|)); */
9181 if (inst->Dst[0].Register.WriteMask & 1) {
9182 if (ctx->bc->chip_class == CAYMAN) {
9183 for (i = 0; i < 3; i++) {
9184 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9185
9186 alu.op = ALU_OP1_LOG_IEEE;
9187 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9188 r600_bytecode_src_set_abs(&alu.src[0]);
9189
9190 alu.dst.sel = ctx->temp_reg;
9191 alu.dst.chan = i;
9192 if (i == 0)
9193 alu.dst.write = 1;
9194 if (i == 2)
9195 alu.last = 1;
9196 r = r600_bytecode_add_alu(ctx->bc, &alu);
9197 if (r)
9198 return r;
9199 }
9200
9201 } else {
9202 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9203
9204 alu.op = ALU_OP1_LOG_IEEE;
9205 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9206 r600_bytecode_src_set_abs(&alu.src[0]);
9207
9208 alu.dst.sel = ctx->temp_reg;
9209 alu.dst.chan = 0;
9210 alu.dst.write = 1;
9211 alu.last = 1;
9212 r = r600_bytecode_add_alu(ctx->bc, &alu);
9213 if (r)
9214 return r;
9215 }
9216
9217 alu.op = ALU_OP1_FLOOR;
9218 alu.src[0].sel = ctx->temp_reg;
9219 alu.src[0].chan = 0;
9220
9221 alu.dst.sel = ctx->temp_reg;
9222 alu.dst.chan = 0;
9223 alu.dst.write = 1;
9224 alu.last = 1;
9225
9226 r = r600_bytecode_add_alu(ctx->bc, &alu);
9227 if (r)
9228 return r;
9229 }
9230
9231 /* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
9232 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
9233
9234 if (ctx->bc->chip_class == CAYMAN) {
9235 for (i = 0; i < 3; i++) {
9236 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9237
9238 alu.op = ALU_OP1_LOG_IEEE;
9239 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9240 r600_bytecode_src_set_abs(&alu.src[0]);
9241
9242 alu.dst.sel = ctx->temp_reg;
9243 alu.dst.chan = i;
9244 if (i == 1)
9245 alu.dst.write = 1;
9246 if (i == 2)
9247 alu.last = 1;
9248
9249 r = r600_bytecode_add_alu(ctx->bc, &alu);
9250 if (r)
9251 return r;
9252 }
9253 } else {
9254 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9255
9256 alu.op = ALU_OP1_LOG_IEEE;
9257 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9258 r600_bytecode_src_set_abs(&alu.src[0]);
9259
9260 alu.dst.sel = ctx->temp_reg;
9261 alu.dst.chan = 1;
9262 alu.dst.write = 1;
9263 alu.last = 1;
9264
9265 r = r600_bytecode_add_alu(ctx->bc, &alu);
9266 if (r)
9267 return r;
9268 }
9269
9270 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9271
9272 alu.op = ALU_OP1_FLOOR;
9273 alu.src[0].sel = ctx->temp_reg;
9274 alu.src[0].chan = 1;
9275
9276 alu.dst.sel = ctx->temp_reg;
9277 alu.dst.chan = 1;
9278 alu.dst.write = 1;
9279 alu.last = 1;
9280
9281 r = r600_bytecode_add_alu(ctx->bc, &alu);
9282 if (r)
9283 return r;
9284
9285 if (ctx->bc->chip_class == CAYMAN) {
9286 for (i = 0; i < 3; i++) {
9287 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9288 alu.op = ALU_OP1_EXP_IEEE;
9289 alu.src[0].sel = ctx->temp_reg;
9290 alu.src[0].chan = 1;
9291
9292 alu.dst.sel = ctx->temp_reg;
9293 alu.dst.chan = i;
9294 if (i == 1)
9295 alu.dst.write = 1;
9296 if (i == 2)
9297 alu.last = 1;
9298
9299 r = r600_bytecode_add_alu(ctx->bc, &alu);
9300 if (r)
9301 return r;
9302 }
9303 } else {
9304 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9305 alu.op = ALU_OP1_EXP_IEEE;
9306 alu.src[0].sel = ctx->temp_reg;
9307 alu.src[0].chan = 1;
9308
9309 alu.dst.sel = ctx->temp_reg;
9310 alu.dst.chan = 1;
9311 alu.dst.write = 1;
9312 alu.last = 1;
9313
9314 r = r600_bytecode_add_alu(ctx->bc, &alu);
9315 if (r)
9316 return r;
9317 }
9318
9319 if (ctx->bc->chip_class == CAYMAN) {
9320 for (i = 0; i < 3; i++) {
9321 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9322 alu.op = ALU_OP1_RECIP_IEEE;
9323 alu.src[0].sel = ctx->temp_reg;
9324 alu.src[0].chan = 1;
9325
9326 alu.dst.sel = ctx->temp_reg;
9327 alu.dst.chan = i;
9328 if (i == 1)
9329 alu.dst.write = 1;
9330 if (i == 2)
9331 alu.last = 1;
9332
9333 r = r600_bytecode_add_alu(ctx->bc, &alu);
9334 if (r)
9335 return r;
9336 }
9337 } else {
9338 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9339 alu.op = ALU_OP1_RECIP_IEEE;
9340 alu.src[0].sel = ctx->temp_reg;
9341 alu.src[0].chan = 1;
9342
9343 alu.dst.sel = ctx->temp_reg;
9344 alu.dst.chan = 1;
9345 alu.dst.write = 1;
9346 alu.last = 1;
9347
9348 r = r600_bytecode_add_alu(ctx->bc, &alu);
9349 if (r)
9350 return r;
9351 }
9352
9353 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9354
9355 alu.op = ALU_OP2_MUL;
9356
9357 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9358 r600_bytecode_src_set_abs(&alu.src[0]);
9359
9360 alu.src[1].sel = ctx->temp_reg;
9361 alu.src[1].chan = 1;
9362
9363 alu.dst.sel = ctx->temp_reg;
9364 alu.dst.chan = 1;
9365 alu.dst.write = 1;
9366 alu.last = 1;
9367
9368 r = r600_bytecode_add_alu(ctx->bc, &alu);
9369 if (r)
9370 return r;
9371 }
9372
9373 	/* result.z = log2(|src|); */
9374 if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
9375 if (ctx->bc->chip_class == CAYMAN) {
9376 for (i = 0; i < 3; i++) {
9377 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9378
9379 alu.op = ALU_OP1_LOG_IEEE;
9380 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9381 r600_bytecode_src_set_abs(&alu.src[0]);
9382
9383 alu.dst.sel = ctx->temp_reg;
9384 if (i == 2)
9385 alu.dst.write = 1;
9386 alu.dst.chan = i;
9387 if (i == 2)
9388 alu.last = 1;
9389
9390 r = r600_bytecode_add_alu(ctx->bc, &alu);
9391 if (r)
9392 return r;
9393 }
9394 } else {
9395 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9396
9397 alu.op = ALU_OP1_LOG_IEEE;
9398 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9399 r600_bytecode_src_set_abs(&alu.src[0]);
9400
9401 alu.dst.sel = ctx->temp_reg;
9402 alu.dst.write = 1;
9403 alu.dst.chan = 2;
9404 alu.last = 1;
9405
9406 r = r600_bytecode_add_alu(ctx->bc, &alu);
9407 if (r)
9408 return r;
9409 }
9410 }
9411
9412 /* result.w = 1.0; */
9413 if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
9414 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9415
9416 alu.op = ALU_OP1_MOV;
9417 alu.src[0].sel = V_SQ_ALU_SRC_1;
9418 alu.src[0].chan = 0;
9419
9420 alu.dst.sel = ctx->temp_reg;
9421 alu.dst.chan = 3;
9422 alu.dst.write = 1;
9423 alu.last = 1;
9424
9425 r = r600_bytecode_add_alu(ctx->bc, &alu);
9426 if (r)
9427 return r;
9428 }
9429
9430 return tgsi_helper_copy(ctx, inst);
9431 }
9432
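/* Address-register load for Evergreen+: ARL floors the source
 * (FLT_TO_INT_FLOOR), ARR converts with FLT_TO_INT, and UARL is a plain
 * MOV. The result lands in the selected address/index register, and the
 * cached AR/index state is invalidated so it is reloaded before use.
 */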
9433 static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
9434 {
9435 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9436 struct r600_bytecode_alu alu;
9437 int r;
9438 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9439 unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
9440
9441 assert(inst->Dst[0].Register.Index < 3);
9442 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9443
9444 switch (inst->Instruction.Opcode) {
9445 case TGSI_OPCODE_ARL:
9446 alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
9447 break;
9448 case TGSI_OPCODE_ARR:
9449 alu.op = ALU_OP1_FLT_TO_INT;
9450 break;
9451 case TGSI_OPCODE_UARL:
9452 alu.op = ALU_OP1_MOV;
9453 break;
9454 default:
9455 assert(0);
9456 return -1;
9457 }
9458
9459 for (i = 0; i <= lasti; ++i) {
9460 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9461 continue;
9462 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9463 alu.last = i == lasti;
9464 alu.dst.sel = reg;
9465 alu.dst.chan = i;
9466 alu.dst.write = 1;
9467 r = r600_bytecode_add_alu(ctx->bc, &alu);
9468 if (r)
9469 return r;
9470 }
9471
9472 if (inst->Dst[0].Register.Index > 0)
9473 ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
9474 else
9475 ctx->bc->ar_loaded = 0;
9476
9477 return 0;
9478 }
9479 static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
9480 {
9481 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9482 struct r600_bytecode_alu alu;
9483 int r;
9484 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9485
9486 switch (inst->Instruction.Opcode) {
9487 case TGSI_OPCODE_ARL:
9488 memset(&alu, 0, sizeof(alu));
9489 alu.op = ALU_OP1_FLOOR;
9490 alu.dst.sel = ctx->bc->ar_reg;
9491 alu.dst.write = 1;
9492 for (i = 0; i <= lasti; ++i) {
9493 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
9494 alu.dst.chan = i;
9495 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9496 alu.last = i == lasti;
9497 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
9498 return r;
9499 }
9500 }
9501
9502 memset(&alu, 0, sizeof(alu));
9503 alu.op = ALU_OP1_FLT_TO_INT;
9504 alu.src[0].sel = ctx->bc->ar_reg;
9505 alu.dst.sel = ctx->bc->ar_reg;
9506 alu.dst.write = 1;
9507 /* FLT_TO_INT is trans-only on r600/r700 */
9508 alu.last = TRUE;
9509 for (i = 0; i <= lasti; ++i) {
9510 alu.dst.chan = i;
9511 alu.src[0].chan = i;
9512 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
9513 return r;
9514 }
9515 break;
9516 case TGSI_OPCODE_ARR:
9517 memset(&alu, 0, sizeof(alu));
9518 alu.op = ALU_OP1_FLT_TO_INT;
9519 alu.dst.sel = ctx->bc->ar_reg;
9520 alu.dst.write = 1;
9521 /* FLT_TO_INT is trans-only on r600/r700 */
9522 alu.last = TRUE;
9523 for (i = 0; i <= lasti; ++i) {
9524 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
9525 alu.dst.chan = i;
9526 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9527 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
9528 return r;
9529 }
9530 }
9531 break;
9532 case TGSI_OPCODE_UARL:
9533 memset(&alu, 0, sizeof(alu));
9534 alu.op = ALU_OP1_MOV;
9535 alu.dst.sel = ctx->bc->ar_reg;
9536 alu.dst.write = 1;
9537 for (i = 0; i <= lasti; ++i) {
9538 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
9539 alu.dst.chan = i;
9540 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9541 alu.last = i == lasti;
9542 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
9543 return r;
9544 }
9545 }
9546 break;
9547 default:
9548 assert(0);
9549 return -1;
9550 }
9551
9552 ctx->bc->ar_loaded = 0;
9553 return 0;
9554 }
9555
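/* DST: dst = (1, src0.y * src1.y, src0.z, src1.w), implemented as four
 * MULs with V_SQ_ALU_SRC_1 substituted for the constant-1 operands.
 */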
9556 static int tgsi_opdst(struct r600_shader_ctx *ctx)
9557 {
9558 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9559 struct r600_bytecode_alu alu;
9560 int i, r = 0;
9561
9562 for (i = 0; i < 4; i++) {
9563 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9564
9565 alu.op = ALU_OP2_MUL;
9566 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9567
9568 if (i == 0 || i == 3) {
9569 alu.src[0].sel = V_SQ_ALU_SRC_1;
9570 } else {
9571 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9572 }
9573
9574 if (i == 0 || i == 2) {
9575 alu.src[1].sel = V_SQ_ALU_SRC_1;
9576 } else {
9577 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
9578 }
9579 if (i == 3)
9580 alu.last = 1;
9581 r = r600_bytecode_add_alu(ctx->bc, &alu);
9582 if (r)
9583 return r;
9584 }
9585 return 0;
9586 }
9587
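/* Emit the predicate-setting ALU op for an IF: compare src0 against 0
 * with the given PRED_SET* opcode, updating the execute mask and the
 * predicate in a single instruction.
 */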
9588 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
9589 {
9590 struct r600_bytecode_alu alu;
9591 int r;
9592
9593 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9594 alu.op = opcode;
9595 alu.execute_mask = 1;
9596 alu.update_pred = 1;
9597
9598 alu.dst.sel = ctx->temp_reg;
9599 alu.dst.write = 1;
9600 alu.dst.chan = 0;
9601
9602 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9603 alu.src[1].sel = V_SQ_ALU_SRC_0;
9604 alu.src[1].chan = 0;
9605
9606 alu.last = 1;
9607
9608 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
9609 if (r)
9610 return r;
9611 return 0;
9612 }
9613
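/* Pop 'pops' branch-stack entries. If the pops can be folded into the
 * preceding ALU clause they become ALU_POP_AFTER/ALU_POP2_AFTER;
 * otherwise a standalone POP CF instruction with an explicit pop_count
 * is emitted.
 */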
9614 static int pops(struct r600_shader_ctx *ctx, int pops)
9615 {
9616 unsigned force_pop = ctx->bc->force_add_cf;
9617
9618 if (!force_pop) {
9619 int alu_pop = 3;
9620 if (ctx->bc->cf_last) {
9621 if (ctx->bc->cf_last->op == CF_OP_ALU)
9622 alu_pop = 0;
9623 else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
9624 alu_pop = 1;
9625 }
9626 alu_pop += pops;
9627 if (alu_pop == 1) {
9628 ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
9629 ctx->bc->force_add_cf = 1;
9630 } else if (alu_pop == 2) {
9631 ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
9632 ctx->bc->force_add_cf = 1;
9633 } else {
9634 force_pop = 1;
9635 }
9636 }
9637
9638 if (force_pop) {
9639 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
9640 ctx->bc->cf_last->pop_count = pops;
9641 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
9642 }
9643
9644 return 0;
9645 }
9646
9647 static inline int callstack_update_max_depth(struct r600_shader_ctx *ctx,
9648 unsigned reason)
9649 {
9650 struct r600_stack_info *stack = &ctx->bc->stack;
9651 unsigned elements;
9652 int entries;
9653
9654 unsigned entry_size = stack->entry_size;
9655
9656 	elements = (stack->loop + stack->push_wqm) * entry_size;
9657 elements += stack->push;
9658
9659 switch (ctx->bc->chip_class) {
9660 case R600:
9661 case R700:
9662 /* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
9663 * the stack must be reserved to hold the current active/continue
9664 * masks */
9665 if (reason == FC_PUSH_VPM || stack->push > 0) {
9666 elements += 2;
9667 }
9668 break;
9669
9670 case CAYMAN:
9671 		/* r9xx: any stack operation on an empty stack consumes 2 additional
9672 * elements */
9673 elements += 2;
9674
9675 /* fallthrough */
9676 /* FIXME: do the two elements added above cover the cases for the
9677 * r8xx+ below? */
9678
9679 case EVERGREEN:
9680 /* r8xx+: 2 extra elements are not always required, but one extra
9681 * element must be added for each of the following cases:
9682 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
9683 * stack usage.
9684 * (Currently we don't use ALU_ELSE_AFTER.)
9685 		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
9686 		 *    PUSH instruction is executed.
9687 		 *
9688 		 * NOTE: it seems we also need to reserve an additional element in some
9689 		 * other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
9690 		 * then STACK_SIZE should be 2 instead of 1. */
9691 if (reason == FC_PUSH_VPM || stack->push > 0) {
9692 elements += 1;
9693 }
9694 break;
9695
9696 default:
9697 assert(0);
9698 break;
9699 }
9700
9701 /* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
9702 * for all chips, so we use 4 in the final formula, not the real entry_size
9703 * for the chip */
9704 entry_size = 4;
9705
9706 entries = (elements + (entry_size - 1)) / entry_size;
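	/* e.g. elements = 5, entry_size = 4 -> entries = (5 + 3) / 4 = 2 */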
9707
9708 if (entries > stack->max_entries)
9709 stack->max_entries = entries;
9710 return elements;
9711 }
9712
9713 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
9714 {
9715 switch(reason) {
9716 case FC_PUSH_VPM:
9717 --ctx->bc->stack.push;
9718 assert(ctx->bc->stack.push >= 0);
9719 break;
9720 case FC_PUSH_WQM:
9721 --ctx->bc->stack.push_wqm;
9722 assert(ctx->bc->stack.push_wqm >= 0);
9723 break;
9724 case FC_LOOP:
9725 --ctx->bc->stack.loop;
9726 assert(ctx->bc->stack.loop >= 0);
9727 break;
9728 default:
9729 assert(0);
9730 break;
9731 }
9732 }
9733
9734 static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
9735 {
9736 switch (reason) {
9737 case FC_PUSH_VPM:
9738 ++ctx->bc->stack.push;
9739 break;
9740 case FC_PUSH_WQM:
9741 ++ctx->bc->stack.push_wqm;
9742 break;
9743 case FC_LOOP:
9744 ++ctx->bc->stack.loop;
9745 break;
9746 default:
9747 assert(0);
9748 }
9749
9750 return callstack_update_max_depth(ctx, reason);
9751 }
9752
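/* Record the current CF instruction (ELSE, or BREAK/CONTINUE inside a
 * loop) as a "mid" entry of the flow-control frame at fc_sp; tgsi_endif
 * and tgsi_endloop patch the jump addresses of these entries later.
 */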
9753 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
9754 {
9755 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
9756
9757 sp->mid = realloc((void *)sp->mid,
9758 sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
9759 sp->mid[sp->num_mid] = ctx->bc->cf_last;
9760 sp->num_mid++;
9761 }
9762
9763 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
9764 {
9765 assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));
9766 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
9767 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
9768 ctx->bc->fc_sp++;
9769 }
9770
9771 static void fc_poplevel(struct r600_shader_ctx *ctx)
9772 {
9773 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
9774 free(sp->mid);
9775 sp->mid = NULL;
9776 sp->num_mid = 0;
9777 sp->start = NULL;
9778 sp->type = 0;
9779 ctx->bc->fc_sp--;
9780 }
9781
9782 #if 0
9783 static int emit_return(struct r600_shader_ctx *ctx)
9784 {
9785 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN);
9786 return 0;
9787 }
9788
9789 static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
9790 {
9791
9792 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
9793 ctx->bc->cf_last->pop_count = pops;
9794 /* XXX work out offset */
9795 return 0;
9796 }
9797
9798 static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
9799 {
9800 return 0;
9801 }
9802
9803 static void emit_testflag(struct r600_shader_ctx *ctx)
9804 {
9805
9806 }
9807
9808 static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
9809 {
9810 emit_testflag(ctx);
9811 emit_jump_to_offset(ctx, 1, 4);
9812 emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
9813 pops(ctx, ifidx + 1);
9814 emit_return(ctx);
9815 }
9816
9817 static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
9818 {
9819 emit_testflag(ctx);
9820
9821 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
9822 ctx->bc->cf_last->pop_count = 1;
9823
9824 fc_set_mid(ctx, fc_sp);
9825
9826 pops(ctx, 1);
9827 }
9828 #endif
9829
9830 static int emit_if(struct r600_shader_ctx *ctx, int opcode)
9831 {
9832 int alu_type = CF_OP_ALU_PUSH_BEFORE;
9833 bool needs_workaround = false;
9834 int elems = callstack_push(ctx, FC_PUSH_VPM);
9835
9836 if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1)
9837 needs_workaround = true;
9838
9839 if (ctx->bc->chip_class == EVERGREEN && ctx_needs_stack_workaround_8xx(ctx)) {
9840 unsigned dmod1 = (elems - 1) % ctx->bc->stack.entry_size;
9841 unsigned dmod2 = (elems) % ctx->bc->stack.entry_size;
9842
9843 if (elems && (!dmod1 || !dmod2))
9844 needs_workaround = true;
9845 }
9846
9847 /* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
9848 * LOOP_STARTxxx for nested loops may put the branch stack into a state
9849 	 * such that ALU_PUSH_BEFORE doesn't work as expected. Work around this
9850 	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU. */
9851 if (needs_workaround) {
9852 r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
9853 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
9854 alu_type = CF_OP_ALU;
9855 }
9856
9857 emit_logic_pred(ctx, opcode, alu_type);
9858
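	/* The JUMP target is not known yet; tgsi_else/tgsi_endif patch its
	 * cf_addr once the matching ELSE/ENDIF is reached. */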
9859 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
9860
9861 fc_pushlevel(ctx, FC_IF);
9862
9863 return 0;
9864 }
9865
9866 static int tgsi_if(struct r600_shader_ctx *ctx)
9867 {
9868 return emit_if(ctx, ALU_OP2_PRED_SETNE);
9869 }
9870
9871 static int tgsi_uif(struct r600_shader_ctx *ctx)
9872 {
9873 return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
9874 }
9875
9876 static int tgsi_else(struct r600_shader_ctx *ctx)
9877 {
9878 r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
9879 ctx->bc->cf_last->pop_count = 1;
9880
9881 fc_set_mid(ctx, ctx->bc->fc_sp - 1);
9882 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;
9883 return 0;
9884 }
9885
9886 static int tgsi_endif(struct r600_shader_ctx *ctx)
9887 {
9888 int offset = 2;
9889 pops(ctx, 1);
9890 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {
9891 R600_ERR("if/endif unbalanced in shader\n");
9892 return -1;
9893 }
9894
9895 	/* ALU_EXTENDED needs 4 DWords instead of 2; adjust the jump target offset accordingly */
9896 if (ctx->bc->cf_last->eg_alu_extended)
9897 offset += 2;
9898
9899 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) {
9900 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + offset;
9901 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;
9902 } else {
9903 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + offset;
9904 }
9905 fc_poplevel(ctx);
9906
9907 callstack_pop(ctx, FC_PUSH_VPM);
9908 return 0;
9909 }
9910
9911 static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
9912 {
9913 /* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
9914 * limited to 4096 iterations, like the other LOOP_* instructions. */
9915 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
9916
9917 fc_pushlevel(ctx, FC_LOOP);
9918
9919 /* check stack depth */
9920 callstack_push(ctx, FC_LOOP);
9921 return 0;
9922 }
9923
9924 static int tgsi_endloop(struct r600_shader_ctx *ctx)
9925 {
9926 int i;
9927
9928 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
9929
9930 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
9931 R600_ERR("loop/endloop in shader code are not paired.\n");
9932 return -EINVAL;
9933 }
9934
9935 /* fixup loop pointers - from r600isa
9936 LOOP END points to CF after LOOP START,
9937 	   LOOP START points to CF after LOOP END
9938 BRK/CONT point to LOOP END CF
9939 */
9940 ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;
9941
9942 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
9943
9944 for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
9945 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
9946 }
9947 /* XXX add LOOPRET support */
9948 fc_poplevel(ctx);
9949 callstack_pop(ctx, FC_LOOP);
9950 return 0;
9951 }
9952
9953 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
9954 {
9955 unsigned int fscp;
9956
9957 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
9958 {
9959 if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
9960 break;
9961 }
9962
9963 if (fscp == 0) {
9964 R600_ERR("Break not inside loop/endloop pair\n");
9965 return -EINVAL;
9966 }
9967
9968 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
9969
9970 fc_set_mid(ctx, fscp - 1);
9971
9972 return 0;
9973 }
9974
9975 static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
9976 {
9977 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
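	/* The stream index is an immediate operand; read it from the literal pool. */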
9978 int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
9979 int r;
9980
9981 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
9982 emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
9983
9984 r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
9985 if (!r) {
9986 		ctx->bc->cf_last->count = stream; /* the count field for CUT/EMIT_VERTEX selects the stream */
9987 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
9988 return emit_inc_ring_offset(ctx, stream, TRUE);
9989 }
9990 return r;
9991 }
9992
9993 static int tgsi_umad(struct r600_shader_ctx *ctx)
9994 {
9995 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9996 struct r600_bytecode_alu alu;
9997 int i, j, k, r;
9998 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9999
10000 /* src0 * src1 */
10001 for (i = 0; i < lasti + 1; i++) {
10002 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10003 continue;
10004
10005 if (ctx->bc->chip_class == CAYMAN) {
10006 			for (j = 0; j < 4; j++) {
10007 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10008
10009 alu.op = ALU_OP2_MULLO_UINT;
10010 for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
10011 r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
10012 }
10013 alu.dst.chan = j;
10014 alu.dst.sel = ctx->temp_reg;
10015 alu.dst.write = (j == i);
10016 if (j == 3)
10017 alu.last = 1;
10018 r = r600_bytecode_add_alu(ctx->bc, &alu);
10019 if (r)
10020 return r;
10021 }
10022 } else {
10023 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10024
10025 alu.dst.chan = i;
10026 alu.dst.sel = ctx->temp_reg;
10027 alu.dst.write = 1;
10028
10029 alu.op = ALU_OP2_MULLO_UINT;
10030 for (j = 0; j < 2; j++) {
10031 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
10032 }
10033
10034 alu.last = 1;
10035 r = r600_bytecode_add_alu(ctx->bc, &alu);
10036 if (r)
10037 return r;
10038 }
10039 }
10040
10041
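	/* dst = (src0 * src1) + src2 */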
10042 for (i = 0; i < lasti + 1; i++) {
10043 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10044 continue;
10045
10046 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10047 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10048
10049 alu.op = ALU_OP2_ADD_INT;
10050
10051 alu.src[0].sel = ctx->temp_reg;
10052 alu.src[0].chan = i;
10053
10054 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
10055 if (i == lasti) {
10056 alu.last = 1;
10057 }
10058 r = r600_bytecode_add_alu(ctx->bc, &alu);
10059 if (r)
10060 return r;
10061 }
10062 return 0;
10063 }
10064
10065 static int tgsi_pk2h(struct r600_shader_ctx *ctx)
10066 {
10067 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10068 struct r600_bytecode_alu alu;
10069 int r, i;
10070 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10071
10072 /* temp.xy = f32_to_f16(src) */
10073 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10074 alu.op = ALU_OP1_FLT32_TO_FLT16;
10075 alu.dst.chan = 0;
10076 alu.dst.sel = ctx->temp_reg;
10077 alu.dst.write = 1;
10078 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10079 r = r600_bytecode_add_alu(ctx->bc, &alu);
10080 if (r)
10081 return r;
10082 alu.dst.chan = 1;
10083 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
10084 alu.last = 1;
10085 r = r600_bytecode_add_alu(ctx->bc, &alu);
10086 if (r)
10087 return r;
10088
10089 /* dst.x = temp.y * 0x10000 + temp.x */
10090 for (i = 0; i < lasti + 1; i++) {
10091 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10092 continue;
10093
10094 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10095 alu.op = ALU_OP3_MULADD_UINT24;
10096 alu.is_op3 = 1;
10097 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10098 alu.last = i == lasti;
10099 alu.src[0].sel = ctx->temp_reg;
10100 alu.src[0].chan = 1;
10101 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10102 alu.src[1].value = 0x10000;
10103 alu.src[2].sel = ctx->temp_reg;
10104 alu.src[2].chan = 0;
10105 r = r600_bytecode_add_alu(ctx->bc, &alu);
10106 if (r)
10107 return r;
10108 }
10109
10110 return 0;
10111 }
10112
10113 static int tgsi_up2h(struct r600_shader_ctx *ctx)
10114 {
10115 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10116 struct r600_bytecode_alu alu;
10117 int r, i;
10118 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10119
10120 /* temp.x = src.x */
10121 /* note: no need to mask out the high bits */
10122 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10123 alu.op = ALU_OP1_MOV;
10124 alu.dst.chan = 0;
10125 alu.dst.sel = ctx->temp_reg;
10126 alu.dst.write = 1;
10127 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10128 r = r600_bytecode_add_alu(ctx->bc, &alu);
10129 if (r)
10130 return r;
10131
10132 /* temp.y = src.x >> 16 */
10133 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10134 alu.op = ALU_OP2_LSHR_INT;
10135 alu.dst.chan = 1;
10136 alu.dst.sel = ctx->temp_reg;
10137 alu.dst.write = 1;
10138 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10139 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10140 alu.src[1].value = 16;
10141 alu.last = 1;
10142 r = r600_bytecode_add_alu(ctx->bc, &alu);
10143 if (r)
10144 return r;
10145
10146 /* dst.wz = dst.xy = f16_to_f32(temp.xy) */
10147 for (i = 0; i < lasti + 1; i++) {
10148 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10149 continue;
10150 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10151 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10152 alu.op = ALU_OP1_FLT16_TO_FLT32;
10153 alu.src[0].sel = ctx->temp_reg;
10154 alu.src[0].chan = i % 2;
10155 alu.last = i == lasti;
10156 r = r600_bytecode_add_alu(ctx->bc, &alu);
10157 if (r)
10158 return r;
10159 }
10160
10161 return 0;
10162 }
10163
10164 static int tgsi_bfe(struct r600_shader_ctx *ctx)
10165 {
10166 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10167 struct r600_bytecode_alu alu;
10168 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10169 int r, i;
10170 int dst = -1;
10171
10172 if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&
10173 inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||
10174 (inst->Src[2].Register.File == inst->Dst[0].Register.File &&
10175 inst->Src[2].Register.Index == inst->Dst[0].Register.Index))
10176 dst = r600_get_temp(ctx);
10177
10178 r = tgsi_op3_dst(ctx, dst);
10179 if (r)
10180 return r;
10181
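	/* The hardware BFE result is undefined for bit widths >= 32, so flag
	 * those channels (temp = src2 >= 32) and select the raw source below.
	 */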
10182 for (i = 0; i < lasti + 1; i++) {
10183 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10184 alu.op = ALU_OP2_SETGE_INT;
10185 r600_bytecode_src(&alu.src[0], &ctx->src[2], i);
10186 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10187 alu.src[1].value = 32;
10188 alu.dst.sel = ctx->temp_reg;
10189 alu.dst.chan = i;
10190 alu.dst.write = 1;
10191 if (i == lasti)
10192 alu.last = 1;
10193 r = r600_bytecode_add_alu(ctx->bc, &alu);
10194 if (r)
10195 return r;
10196 }
10197
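	/* dst = (width >= 32) ? src0 : bfe_result */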
10198 for (i = 0; i < lasti + 1; i++) {
10199 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10200 alu.op = ALU_OP3_CNDE_INT;
10201 alu.is_op3 = 1;
10202 alu.src[0].sel = ctx->temp_reg;
10203 alu.src[0].chan = i;
10204
10205 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10206 if (dst != -1)
10207 alu.src[1].sel = dst;
10208 else
10209 alu.src[1].sel = alu.dst.sel;
10210 alu.src[1].chan = i;
10211 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
10212 alu.dst.write = 1;
10213 if (i == lasti)
10214 alu.last = 1;
10215 r = r600_bytecode_add_alu(ctx->bc, &alu);
10216 if (r)
10217 return r;
10218 }
10219
10220 return 0;
10221 }
10222
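/* Read the shader clock counter: TIME_LO into dst.x, TIME_HI into dst.y. */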
10223 static int tgsi_clock(struct r600_shader_ctx *ctx)
10224 {
10225 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10226 struct r600_bytecode_alu alu;
10227 int r;
10228
10229 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10230 alu.op = ALU_OP1_MOV;
10231 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
10232 alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO;
10233 r = r600_bytecode_add_alu(ctx->bc, &alu);
10234 if (r)
10235 return r;
10236 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10237 alu.op = ALU_OP1_MOV;
10238 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
10239 alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI;
10240 alu.last = 1;
10241 r = r600_bytecode_add_alu(ctx->bc, &alu);
10242 if (r)
10243 return r;
10244 return 0;
10245 }
10246
10247 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
10248 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_r600_arl},
10249 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
10250 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
10251
10252 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
10253
10254 [TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
10255 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
10256 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
10257 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
10258 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
10259 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10260 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10261 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
10262 	/* MIN_DX10 returns the non-NaN result if one src is NaN; MIN returns NaN */
10263 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
10264 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
10265 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
10266 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
10267 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
10268 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
10269 [TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported},
10270 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
10271 [21] = { ALU_OP0_NOP, tgsi_unsupported},
10272 [22] = { ALU_OP0_NOP, tgsi_unsupported},
10273 [23] = { ALU_OP0_NOP, tgsi_unsupported},
10274 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
10275 [25] = { ALU_OP0_NOP, tgsi_unsupported},
10276 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
10277 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
10278 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
10279 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
10280 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
10281 [31] = { ALU_OP0_NOP, tgsi_unsupported},
10282 [32] = { ALU_OP0_NOP, tgsi_unsupported},
10283 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_unsupported},
10284 [34] = { ALU_OP0_NOP, tgsi_unsupported},
10285 [35] = { ALU_OP0_NOP, tgsi_unsupported},
10286 [TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
10287 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
10288 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
10289 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
10290 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_unsupported},
10291 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
10292 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
10293 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
10294 [44] = { ALU_OP0_NOP, tgsi_unsupported},
10295 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
10296 [46] = { ALU_OP0_NOP, tgsi_unsupported},
10297 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
10298 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
10299 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
10300 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
10301 [51] = { ALU_OP0_NOP, tgsi_unsupported},
10302 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
10303 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
10304 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
10305 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_unsupported},
10306 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
10307 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
10308 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
10309 [59] = { ALU_OP0_NOP, tgsi_unsupported},
10310 [60] = { ALU_OP0_NOP, tgsi_unsupported},
10311 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_r600_arl},
10312 [62] = { ALU_OP0_NOP, tgsi_unsupported},
10313 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
10314 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
10315 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
10316 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
10317 [67] = { ALU_OP0_NOP, tgsi_unsupported},
10318 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
10319 [69] = { ALU_OP0_NOP, tgsi_unsupported},
10320 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
10321 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10322 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
10323 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
10324 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
10325 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
10326 [76] = { ALU_OP0_NOP, tgsi_unsupported},
10327 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
10328 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
10329 [TGSI_OPCODE_DDX_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
10330 [TGSI_OPCODE_DDY_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
10331 [81] = { ALU_OP0_NOP, tgsi_unsupported},
10332 [82] = { ALU_OP0_NOP, tgsi_unsupported},
10333 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
10334 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
10335 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
10336 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
10337 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2_trans},
10338 [88] = { ALU_OP0_NOP, tgsi_unsupported},
10339 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
10340 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
10341 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
10342 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
10343 [93] = { ALU_OP0_NOP, tgsi_unsupported},
10344 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
10345 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
10346 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
10347 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
10348 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
10349 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
10350 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
10351 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
10352 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
10353 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
10354 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
10355 [TGSI_OPCODE_RESQ] = { ALU_OP0_NOP, tgsi_unsupported},
10356 [106] = { ALU_OP0_NOP, tgsi_unsupported},
10357 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
10358 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
10359 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
10360 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
10361 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
10362 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_unsupported},
10363 [113] = { ALU_OP0_NOP, tgsi_unsupported},
10364 [114] = { ALU_OP0_NOP, tgsi_unsupported},
10365 [115] = { ALU_OP0_NOP, tgsi_unsupported},
10366 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
10367 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
10368 [TGSI_OPCODE_DFMA] = { ALU_OP0_NOP, tgsi_unsupported},
10369 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
10370 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
10371 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
10372 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
10373 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
10374 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
10375 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2_trans},
10376 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
10377 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
10378 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
10379 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
10380 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
10381 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
10382 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
10383 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
10384 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
10385 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
10386 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
10387 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
10388 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2_trans},
10389 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
10390 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2_swap},
10391 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
10392 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
10393 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
10394 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
10395 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
10396 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
10397 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
10398 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
10399 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
10400 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
10401 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
10402 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
10403 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
10404 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
10405 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
10406 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
10407 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_r600_arl},
10408 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
10409 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
10410 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
10411 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
10412 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
10413 [163] = { ALU_OP0_NOP, tgsi_unsupported},
10414 [164] = { ALU_OP0_NOP, tgsi_unsupported},
10415 [165] = { ALU_OP0_NOP, tgsi_unsupported},
10416 [TGSI_OPCODE_BARRIER] = { ALU_OP0_NOP, tgsi_unsupported},
10417 [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
10418 [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
10419 [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
10420 [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
10421 [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
10422 [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
10423 [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
10424 [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
10425 [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
10426 [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
10427 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
10428 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
10429 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
10430 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
10431 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
10432 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_unsupported},
10433 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_unsupported},
10434 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_unsupported},
10435 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_unsupported},
10436 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_unsupported},
10437 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_unsupported},
10438 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_unsupported},
10439 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_unsupported},
10440 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_unsupported},
10441 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_unsupported},
10442 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_unsupported},
10443 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_unsupported},
10444 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_unsupported},
10445 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
10446 };
10447
10448 static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
10449 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
10450 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
10451 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
10452 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
10453 [TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
10454 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
10455 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
10456 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
10457 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
10458 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10459 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10460 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
10461 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
10462 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
10463 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
10464 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
10465 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
10466 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
10467 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
10468 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
10469 [21] = { ALU_OP0_NOP, tgsi_unsupported},
10470 [22] = { ALU_OP0_NOP, tgsi_unsupported},
10471 [23] = { ALU_OP0_NOP, tgsi_unsupported},
10472 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
10473 [25] = { ALU_OP0_NOP, tgsi_unsupported},
10474 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
10475 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
10476 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
10477 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
10478 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
10479 [31] = { ALU_OP0_NOP, tgsi_unsupported},
10480 [32] = { ALU_OP0_NOP, tgsi_unsupported},
10481 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},
10482 [34] = { ALU_OP0_NOP, tgsi_unsupported},
10483 [35] = { ALU_OP0_NOP, tgsi_unsupported},
10484 [TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
10485 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
10486 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
10487 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
10488 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
10489 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
10490 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
10491 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
10492 [44] = { ALU_OP0_NOP, tgsi_unsupported},
10493 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
10494 [46] = { ALU_OP0_NOP, tgsi_unsupported},
10495 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
10496 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
10497 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
10498 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
10499 [51] = { ALU_OP0_NOP, tgsi_unsupported},
10500 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
10501 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
10502 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
10503 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
10504 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
10505 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
10506 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
10507 [59] = { ALU_OP0_NOP, tgsi_unsupported},
10508 [60] = { ALU_OP0_NOP, tgsi_unsupported},
10509 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
10510 [62] = { ALU_OP0_NOP, tgsi_unsupported},
10511 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
10512 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
10513 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
10514 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
10515 [67] = { ALU_OP0_NOP, tgsi_unsupported},
10516 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
10517 [69] = { ALU_OP0_NOP, tgsi_unsupported},
10518 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
10519 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10520 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
10521 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
10522 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
10523 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
10524 [76] = { ALU_OP0_NOP, tgsi_unsupported},
10525 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
10526 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
10527 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
10528 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
10529 [82] = { ALU_OP0_NOP, tgsi_unsupported},
10530 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
10531 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
10532 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
10533 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
10534 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
10535 [88] = { ALU_OP0_NOP, tgsi_unsupported},
10536 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
10537 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
10538 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
10539 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
10540 [93] = { ALU_OP0_NOP, tgsi_unsupported},
10541 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
10542 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
10543 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
10544 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
10545 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
10546 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
10547 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
10548 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
10549 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
10550 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
10551 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
10552 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
10553 [106] = { ALU_OP0_NOP, tgsi_unsupported},
10554 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
10555 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
10556 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
10557 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
10558 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
10559 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
10560 [113] = { ALU_OP0_NOP, tgsi_unsupported},
10561 [114] = { ALU_OP0_NOP, tgsi_unsupported},
10562 [115] = { ALU_OP0_NOP, tgsi_unsupported},
10563 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
10564 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
10565 /* Refer below for TGSI_OPCODE_DFMA */
10566 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_f2i},
10567 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
10568 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
10569 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
10570 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
10571 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
10572 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
10573 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
10574 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
10575 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
10576 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
10577 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
10578 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
10579 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
10580 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
10581 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
10582 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
10583 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
10584 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
10585 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
10586 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
10587 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
10588 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
10589 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
10590 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
10591 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
10592 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
10593 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
10594 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
10595 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
10596 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
10597 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
10598 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
10599 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
10600 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
10601 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
10602 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
10603 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
10604 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
10605 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
10606 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
10607 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
10608 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
10609 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
10610 [163] = { ALU_OP0_NOP, tgsi_unsupported},
10611 [164] = { ALU_OP0_NOP, tgsi_unsupported},
10612 [165] = { ALU_OP0_NOP, tgsi_unsupported},
10613 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
10614 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
10615 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
10616 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
10617 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
10618 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
10619 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
10620 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
10621 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
10622 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
10623 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
10624 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
10625 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
10626 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
10627 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
10628 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
10629 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
10630 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
10631 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
10632 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
10633 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
10634 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
10635 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
10636 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
10637 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
10638 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
10639 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
10640 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
10641 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
10642 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
10643 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
10644 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
10645 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
10646 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
10647 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
10648 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
10649 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
10650 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
10651 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
10652 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
10653 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
10654 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
10655 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
10656 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
10657 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
10658 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
10659 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
10660 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
10661 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
10662 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
10663 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
10664 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
10665 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
10666 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
10667 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
10668 };
10669
static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
	[TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
	[21] = { ALU_OP0_NOP, tgsi_unsupported},
	[22] = { ALU_OP0_NOP, tgsi_unsupported},
	[23] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
	[25] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow},
	[31] = { ALU_OP0_NOP, tgsi_unsupported},
	[32] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},
	[34] = { ALU_OP0_NOP, tgsi_unsupported},
	[35] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig},
	[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
	[TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
	[TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[44] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
	[46] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig},
	[TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
	[51] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
	[TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[59] = { ALU_OP0_NOP, tgsi_unsupported},
	[60] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
	[62] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
	[67] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
	[76] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[82] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2},
	[TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
	[88] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
	[93] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
	[106] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[113] = { ALU_OP0_NOP, tgsi_unsupported},
	[114] = { ALU_OP0_NOP, tgsi_unsupported},
	[115] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
	[TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
	/* Refer below for TGSI_OPCODE_DFMA */
	[TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2},
	[TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2},
	[TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2},
	[TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
	[TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
	[TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
	[163] = { ALU_OP0_NOP, tgsi_unsupported},
	[164] = { ALU_OP0_NOP, tgsi_unsupported},
	[165] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
	[TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
	[TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
	[TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
	[TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
	[TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
};

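/* Both tables above use the TGSI opcode as the array index; the first
 * field is the hardware op to emit (ALU_OP0_NOP or 0 when the handler
 * builds the instruction sequence itself) and the second is the emit
 * callback.  A minimal sketch of how the per-chip-class table is
 * selected, assuming a context whose bytecode state carries a
 * chip_class field and an inst_info table pointer (names here are
 * illustrative; the base r600 table is assumed to be declared earlier
 * in this file):
 *
 *	if (ctx.bc->chip_class == CAYMAN)
 *		ctx.inst_info = cm_shader_tgsi_instruction;
 *	else if (ctx.bc->chip_class >= EVERGREEN)
 *		ctx.inst_info = eg_shader_tgsi_instruction;
 *	else
 *		ctx.inst_info = r600_shader_tgsi_instruction;
 */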