1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "util/u_math.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->eot = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
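/* Vec4 instructions run in SIMD4x2 mode by default: two vertices of four
 * components each, for an execution size of 8 channels.
 */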
58 this->exec_size = 8;
59 this->group = 0;
60 this->size_written = (dst.file == BAD_FILE ?
61 0 : this->exec_size * type_sz(dst.type));
62 this->annotation = NULL;
63 }
64
65 vec4_instruction *
66 vec4_visitor::emit(vec4_instruction *inst)
67 {
68 inst->ir = this->base_ir;
69 inst->annotation = this->current_annotation;
70
71 this->instructions.push_tail(inst);
72
73 return inst;
74 }
75
76 vec4_instruction *
77 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
78 vec4_instruction *new_inst)
79 {
80 new_inst->ir = inst->ir;
81 new_inst->annotation = inst->annotation;
82
83 inst->insert_before(block, new_inst);
84
85 return inst;
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
90 const src_reg &src1, const src_reg &src2)
91 {
92 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
93 }
94
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
98 const src_reg &src1)
99 {
100 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
101 }
102
103 vec4_instruction *
104 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
105 {
106 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
107 }
108
109 vec4_instruction *
110 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
111 {
112 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
113 }
114
115 vec4_instruction *
emit(enum opcode opcode)116 vec4_visitor::emit(enum opcode opcode)
117 {
118 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
119 }
120
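/* Convenience builders for ALU instructions. Each macro expands to a
 * vec4_visitor method that allocates a vec4_instruction for the matching
 * BRW opcode but does not emit it, so callers can still adjust fields such
 * as saturate or conditional_mod before passing it to emit().
 */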
121 #define ALU1(op) \
122 vec4_instruction * \
123 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
124 { \
125 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
126 }
127
128 #define ALU2(op) \
129 vec4_instruction * \
130 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
131 const src_reg &src1) \
132 { \
133 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
134 src0, src1); \
135 }
136
137 #define ALU2_ACC(op) \
138 vec4_instruction * \
139 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
140 const src_reg &src1) \
141 { \
142 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
143 BRW_OPCODE_##op, dst, src0, src1); \
144 inst->writes_accumulator = true; \
145 return inst; \
146 }
147
148 #define ALU3(op) \
149 vec4_instruction * \
150 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
151 const src_reg &src1, const src_reg &src2) \
152 { \
153 assert(devinfo->gen >= 6); \
154 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
155 src0, src1, src2); \
156 }
157
158 ALU1(NOT)
159 ALU1(MOV)
160 ALU1(FRC)
161 ALU1(RNDD)
162 ALU1(RNDE)
163 ALU1(RNDZ)
164 ALU1(F32TO16)
165 ALU1(F16TO32)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2_ACC(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(DP3)
173 ALU2(DP4)
174 ALU2(DPH)
175 ALU2(SHL)
176 ALU2(SHR)
177 ALU2(ASR)
178 ALU3(LRP)
179 ALU1(BFREV)
180 ALU3(BFE)
181 ALU2(BFI1)
182 ALU3(BFI2)
183 ALU1(FBH)
184 ALU1(FBL)
185 ALU1(CBIT)
186 ALU3(MAD)
187 ALU2_ACC(ADDC)
188 ALU2_ACC(SUBB)
189 ALU2(MAC)
190 ALU1(DIM)
191
192 /** Gen4 predicated IF. */
193 vec4_instruction *
194 vec4_visitor::IF(enum brw_predicate predicate)
195 {
196 vec4_instruction *inst;
197
198 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
199 inst->predicate = predicate;
200
201 return inst;
202 }
203
204 /** Gen6 IF with embedded comparison. */
205 vec4_instruction *
206 vec4_visitor::IF(src_reg src0, src_reg src1,
207 enum brw_conditional_mod condition)
208 {
209 assert(devinfo->gen == 6);
210
211 vec4_instruction *inst;
212
213 resolve_ud_negate(&src0);
214 resolve_ud_negate(&src1);
215
216 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
217 src0, src1);
218 inst->conditional_mod = condition;
219
220 return inst;
221 }
222
223 /**
224 * CMP: Sets the low bit of the destination channels with the result
225 * of the comparison, while the upper bits are undefined, and updates
226 * the flag register with the packed 16 bits of the result.
227 */
228 vec4_instruction *
229 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
230 enum brw_conditional_mod condition)
231 {
232 vec4_instruction *inst;
233
234 /* Take the instruction:
235 *
236 * CMP null<d> src0<f> src1<f>
237 *
238 * Original gen4 does type conversion to the destination type before
239 * comparison, producing garbage results for floating point comparisons.
240 *
241 * The destination type doesn't matter on newer generations, so we set the
242 * type to match src0 so we can compact the instruction.
243 */
244 dst.type = src0.type;
245
246 resolve_ud_negate(&src0);
247 resolve_ud_negate(&src1);
248
249 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
250 inst->conditional_mod = condition;
251
252 return inst;
253 }
254
255 vec4_instruction *
256 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
257 {
258 vec4_instruction *inst;
259
260 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
261 dst, index);
262 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
263 inst->mlen = 2;
264
265 return inst;
266 }
267
268 vec4_instruction *
269 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
270 const src_reg &index)
271 {
272 vec4_instruction *inst;
273
274 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
275 dst, src, index);
276 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
277 inst->mlen = 3;
278
279 return inst;
280 }
281
282 src_reg
283 vec4_visitor::fix_3src_operand(const src_reg &src)
284 {
285 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
286 * able to use vertical stride of zero to replicate the vec4 uniform, like
287 *
288 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
289 *
290 * But you can't, since vertical stride is always four in three-source
291 * instructions. Instead, insert a MOV instruction to do the replication so
292 * that the three-source instruction can consume it.
293 */
294
295 /* The MOV is only needed if the source is a uniform or immediate. */
296 if (src.file != UNIFORM && src.file != IMM)
297 return src;
298
299 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
300 return src;
301
302 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
303 expanded.type = src.type;
304 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
305 return src_reg(expanded);
306 }
307
308 src_reg
309 vec4_visitor::fix_math_operand(const src_reg &src)
310 {
311 if (devinfo->gen < 6 || src.file == BAD_FILE)
312 return src;
313
314 /* The gen6 math instruction ignores the source modifiers --
315 * swizzle, abs, negate, and at least some parts of the register
316 * region description.
317 *
318 * Rather than trying to enumerate all these cases, *always* expand the
319 * operand to a temp GRF for gen6.
320 *
321 * For gen7, keep the operand as-is, except if immediate, which gen7 still
322 * can't use.
323 */
324
325 if (devinfo->gen == 7 && src.file != IMM)
326 return src;
327
328 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
329 expanded.type = src.type;
330 emit(MOV(expanded, src));
331 return src_reg(expanded);
332 }
333
334 vec4_instruction *
335 vec4_visitor::emit_math(enum opcode opcode,
336 const dst_reg &dst,
337 const src_reg &src0, const src_reg &src1)
338 {
339 vec4_instruction *math =
340 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
341
342 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
343 /* MATH on Gen6 must be align1, so we can't do writemasks. */
344 math->dst = dst_reg(this, glsl_type::vec4_type);
345 math->dst.type = dst.type;
346 math = emit(MOV(dst, src_reg(math->dst)));
347 } else if (devinfo->gen < 6) {
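/* Pre-Gen6 math is a send to the shared math unit, so the operands have
 * to be staged in MRFs starting at base_mrf.
 */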
348 math->base_mrf = 1;
349 math->mlen = src1.file == BAD_FILE ? 1 : 2;
350 }
351
352 return math;
353 }
354
355 void
356 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
357 {
358 if (devinfo->gen < 7) {
359 unreachable("ir_unop_pack_half_2x16 should be lowered");
360 }
361
362 assert(dst.type == BRW_REGISTER_TYPE_UD);
363 assert(src0.type == BRW_REGISTER_TYPE_F);
364
365 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
366 *
367 * Because this instruction does not have a 16-bit floating-point type,
368 * the destination data type must be Word (W).
369 *
370 * The destination must be DWord-aligned and specify a horizontal stride
371 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
372 * each destination channel and the upper word is not modified.
373 *
374 * The above restriction implies that the f32to16 instruction must use
375 * align1 mode, because only in align1 mode is it possible to specify
376 * horizontal stride. We choose here to defy the hardware docs and emit
377 * align16 instructions.
378 *
379 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
380 * instructions. I was partially successful in that the code passed all
381 * tests. However, the code was dubiously correct and fragile, and the
382 * tests were not harsh enough to probe that frailty. Not trusting the
383 * code, I chose instead to remain in align16 mode in defiance of the hw
384 * docs).
385 *
386 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
387 * simulator, emitting a f32to16 in align16 mode with UD as destination
388 * data type is safe. The behavior differs from that specified in the PRM
389 * in that the upper word of each destination channel is cleared to 0.
390 */
391
392 dst_reg tmp_dst(this, glsl_type::uvec2_type);
393 src_reg tmp_src(tmp_dst);
394
395 #if 0
396 /* Verify the undocumented behavior on which the following instructions
397 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
398 * then the result of the bit-or instruction below will be incorrect.
399 *
400 * You should inspect the disasm output in order to verify that the MOV is
401 * not optimized away.
402 */
403 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
404 #endif
405
406 /* Give tmp the form below, where "." means untouched.
407 *
408 * w z y x w z y x
409 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
410 *
411 * That the upper word of each write-channel be 0 is required for the
412 * following bit-shift and bit-or instructions to work. Note that this
413 * relies on the undocumented hardware behavior mentioned above.
414 */
415 tmp_dst.writemask = WRITEMASK_XY;
416 emit(F32TO16(tmp_dst, src0));
417
418 /* Give the write-channels of dst the form:
419 * 0xhhhh0000
420 */
421 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
422 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
423
424 /* Finally, give the write-channels of dst the form of packHalf2x16's
425 * output:
426 * 0xhhhhllll
427 */
428 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
429 emit(OR(dst, src_reg(dst), tmp_src));
430 }
431
432 void
433 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
434 {
435 if (devinfo->gen < 7) {
436 unreachable("ir_unop_unpack_half_2x16 should be lowered");
437 }
438
439 assert(dst.type == BRW_REGISTER_TYPE_F);
440 assert(src0.type == BRW_REGISTER_TYPE_UD);
441
442 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
443 *
444 * Because this instruction does not have a 16-bit floating-point type,
445 * the source data type must be Word (W). The destination type must be
446 * F (Float).
447 *
448 * To use W as the source data type, we must adjust horizontal strides,
449 * which is only possible in align1 mode. All my [chadv] attempts at
450 * emitting align1 instructions for unpackHalf2x16 failed to pass the
451 * Piglit tests, so I gave up.
452 *
453 * I've verified that, on gen7 hardware and the simulator, it is safe to
454 * emit f16to32 in align16 mode with UD as source data type.
455 */
456
457 dst_reg tmp_dst(this, glsl_type::uvec2_type);
458 src_reg tmp_src(tmp_dst);
459
460 tmp_dst.writemask = WRITEMASK_X;
461 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
462
463 tmp_dst.writemask = WRITEMASK_Y;
464 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
465
466 dst.writemask = WRITEMASK_XY;
467 emit(F16TO32(dst, tmp_src));
468 }
469
470 void
471 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
472 {
473 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
474 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
475 * is not suitable to generate the shift values, but we can use the packed
476 * vector float and a type-converting MOV.
477 */
478 dst_reg shift(this, glsl_type::uvec4_type);
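/* In the 8-bit restricted-float encoding used by VF immediates, 0x00, 0x60,
 * 0x70 and 0x78 are 0.0, 8.0, 16.0 and 24.0, which the type-converting MOV
 * below turns into the per-channel shift counts <0, 8, 16, 24>.
 */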
479 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
480
481 dst_reg shifted(this, glsl_type::uvec4_type);
482 src0.swizzle = BRW_SWIZZLE_XXXX;
483 emit(SHR(shifted, src0, src_reg(shift)));
484
485 shifted.type = BRW_REGISTER_TYPE_UB;
486 dst_reg f(this, glsl_type::vec4_type);
487 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
488
489 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
490 }
491
492 void
493 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
494 {
495 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
496 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
497 * is not suitable to generate the shift values, but we can use the packed
498 * vector float and a type-converting MOV.
499 */
500 dst_reg shift(this, glsl_type::uvec4_type);
501 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
502
503 dst_reg shifted(this, glsl_type::uvec4_type);
504 src0.swizzle = BRW_SWIZZLE_XXXX;
505 emit(SHR(shifted, src0, src_reg(shift)));
506
507 shifted.type = BRW_REGISTER_TYPE_B;
508 dst_reg f(this, glsl_type::vec4_type);
509 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
510
511 dst_reg scaled(this, glsl_type::vec4_type);
512 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
513
514 dst_reg max(this, glsl_type::vec4_type);
515 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
516 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
517 }
518
519 void
520 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
521 {
522 dst_reg saturated(this, glsl_type::vec4_type);
523 vec4_instruction *inst = emit(MOV(saturated, src0));
524 inst->saturate = true;
525
526 dst_reg scaled(this, glsl_type::vec4_type);
527 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
528
529 dst_reg rounded(this, glsl_type::vec4_type);
530 emit(RNDE(rounded, src_reg(scaled)));
531
532 dst_reg u(this, glsl_type::uvec4_type);
533 emit(MOV(u, src_reg(rounded)));
534
535 src_reg bytes(u);
536 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
537 }
538
539 void
540 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
541 {
542 dst_reg max(this, glsl_type::vec4_type);
543 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
544
545 dst_reg min(this, glsl_type::vec4_type);
546 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
547
548 dst_reg scaled(this, glsl_type::vec4_type);
549 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
550
551 dst_reg rounded(this, glsl_type::vec4_type);
552 emit(RNDE(rounded, src_reg(scaled)));
553
554 dst_reg i(this, glsl_type::ivec4_type);
555 emit(MOV(i, src_reg(rounded)));
556
557 src_reg bytes(i);
558 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
559 }
560
561 /*
562 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
563 * false) elements needed to pack a type.
564 */
565 static int
566 type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless)
567 {
568 unsigned int i;
569 int size;
570
571 switch (type->base_type) {
572 case GLSL_TYPE_UINT:
573 case GLSL_TYPE_INT:
574 case GLSL_TYPE_FLOAT:
575 case GLSL_TYPE_FLOAT16:
576 case GLSL_TYPE_BOOL:
577 case GLSL_TYPE_DOUBLE:
578 case GLSL_TYPE_UINT16:
579 case GLSL_TYPE_INT16:
580 case GLSL_TYPE_UINT8:
581 case GLSL_TYPE_INT8:
582 case GLSL_TYPE_UINT64:
583 case GLSL_TYPE_INT64:
584 if (type->is_matrix()) {
585 const glsl_type *col_type = type->column_type();
586 unsigned col_slots =
587 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
588 return type->matrix_columns * col_slots;
589 } else {
590 /* Regardless of the size of the vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size_xvec4(type->fields.array, as_vec4, bindless) *
600 type->length;
601 case GLSL_TYPE_STRUCT:
602 case GLSL_TYPE_INTERFACE:
603 size = 0;
604 for (i = 0; i < type->length; i++) {
605 size += type_size_xvec4(type->fields.structure[i].type, as_vec4,
606 bindless);
607 }
608 return size;
609 case GLSL_TYPE_SUBROUTINE:
610 return 1;
611
612 case GLSL_TYPE_SAMPLER:
613 /* Samplers take up no register space, since they're baked in at
614 * link time.
615 */
616 return bindless ? 1 : 0;
617 case GLSL_TYPE_ATOMIC_UINT:
618 return 0;
619 case GLSL_TYPE_IMAGE:
620 return bindless ? 1 : DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
621 case GLSL_TYPE_VOID:
622 case GLSL_TYPE_ERROR:
623 case GLSL_TYPE_FUNCTION:
624 unreachable("not reached");
625 }
626
627 return 0;
628 }
629
630 /**
631 * Returns the minimum number of vec4 elements needed to pack a type.
632 *
633 * For simple types, it will return 1 (a single vec4); for matrices, the
634 * number of columns; for array and struct, the sum of the vec4_size of
635 * each of its elements; and for sampler and atomic, zero.
636 *
637 * This method is useful to calculate how much register space is needed to
638 * store a particular type.
639 */
640 extern "C" int
641 type_size_vec4(const struct glsl_type *type, bool bindless)
642 {
643 return type_size_xvec4(type, true, bindless);
644 }
645
646 /**
647 * Returns the minimum number of dvec4 elements needed to pack a type.
648 *
649 * For simple types, it will return 1 (a single dvec4); for matrices, the
650 * number of columns; for array and struct, the sum of the dvec4_size of
651 * each of its elements; and for sampler and atomic, zero.
652 *
653 * This method is useful to calculate how much register space is needed to
654 * store a particular type.
655 *
656 * Measuring double-precision vertex inputs as dvec4 is required because
657 * ARB_vertex_attrib_64bit states that they use the same number of locations
658 * as the single-precision version. That is, two consecutive dvec4s would be
659 * located in locations "x" and "x+1", not "x" and "x+2".
660 *
661 * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
662 * remap_vs_attrs() will take into account both the location and whether the
663 * type fits in one or two vec4 slots.
664 */
665 extern "C" int
666 type_size_dvec4(const struct glsl_type *type, bool bindless)
667 {
668 return type_size_xvec4(type, false, bindless);
669 }
670
671 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
672 {
673 init();
674
675 this->file = VGRF;
676 this->nr = v->alloc.allocate(type_size_vec4(type, false));
677
678 if (type->is_array() || type->is_struct()) {
679 this->swizzle = BRW_SWIZZLE_NOOP;
680 } else {
681 this->swizzle = brw_swizzle_for_size(type->vector_elements);
682 }
683
684 this->type = brw_type_for_base_type(type);
685 }
686
687 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
688 {
689 assert(size > 0);
690
691 init();
692
693 this->file = VGRF;
694 this->nr = v->alloc.allocate(type_size_vec4(type, false) * size);
695
696 this->swizzle = BRW_SWIZZLE_NOOP;
697
698 this->type = brw_type_for_base_type(type);
699 }
700
701 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
702 {
703 init();
704
705 this->file = VGRF;
706 this->nr = v->alloc.allocate(type_size_vec4(type, false));
707
708 if (type->is_array() || type->is_struct()) {
709 this->writemask = WRITEMASK_XYZW;
710 } else {
711 this->writemask = (1 << type->vector_elements) - 1;
712 }
713
714 this->type = brw_type_for_base_type(type);
715 }
716
717 vec4_instruction *
718 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
719 src_reg src0, src_reg src1)
720 {
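/* A SEL with a conditional modifier computes min/max directly:
 * BRW_CONDITIONAL_GE selects the larger operand (max) and BRW_CONDITIONAL_L
 * the smaller one (min).
 */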
721 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
722 inst->conditional_mod = conditionalmod;
723 return inst;
724 }
725
726 /**
727 * Emits the instructions needed to perform a pull constant load. before_block
728 * and before_inst can be NULL, in which case the instructions will be appended
729 * to the end of the instruction list.
730 */
731 void
732 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
733 src_reg surf_index,
734 src_reg offset_reg,
735 bblock_t *before_block,
736 vec4_instruction *before_inst)
737 {
738 assert((before_inst == NULL && before_block == NULL) ||
739 (before_inst && before_block));
740
741 vec4_instruction *pull;
742
743 if (devinfo->gen >= 7) {
744 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
745
746 grf_offset.type = offset_reg.type;
747
748 pull = MOV(grf_offset, offset_reg);
749
750 if (before_inst)
751 emit_before(before_block, before_inst, pull);
752 else
753 emit(pull);
754
755 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
756 dst,
757 surf_index,
758 src_reg(grf_offset));
759 pull->mlen = 1;
760 } else {
761 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
762 dst,
763 surf_index,
764 offset_reg);
765 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
766 pull->mlen = 1;
767 }
768
769 if (before_inst)
770 emit_before(before_block, before_inst, pull);
771 else
772 emit(pull);
773 }
774
775 src_reg
776 vec4_visitor::emit_uniformize(const src_reg &src)
777 {
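/* FIND_LIVE_CHANNEL returns the index of an enabled channel and BROADCAST
 * replicates that channel's value of src across dst, producing a value that
 * is the same for every channel (e.g. for dynamically uniform surface
 * indices).
 */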
778 const src_reg chan_index(this, glsl_type::uint_type);
779 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
780 src.type);
781
782 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
783 ->force_writemask_all = true;
784 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
785 ->force_writemask_all = true;
786
787 return src_reg(dst);
788 }
789
790 src_reg
791 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
792 src_reg coordinate, src_reg surface)
793 {
794 vec4_instruction *inst =
795 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
796 dst_reg(this, glsl_type::uvec4_type));
797 inst->base_mrf = 2;
798 inst->src[1] = surface;
799 inst->src[2] = brw_imm_ud(0); /* sampler */
800 inst->mlen = 1;
801
802 const int param_base = inst->base_mrf;
803
804 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
805 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
806 int zero_mask = 0xf & ~coord_mask;
807
808 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
809 coordinate));
810
811 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
812 brw_imm_d(0)));
813
814 emit(inst);
815 return src_reg(inst->dst);
816 }
817
818 bool
819 vec4_visitor::is_high_sampler(src_reg sampler)
820 {
821 if (!devinfo->is_haswell)
822 return false;
823
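/* Sampler state indices above 15 have to be passed through the message
 * header. A non-immediate sampler index must be treated as potentially
 * high, since its value is not known at compile time.
 */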
824 return sampler.file != IMM || sampler.ud >= 16;
825 }
826
827 void
828 vec4_visitor::emit_texture(ir_texture_opcode op,
829 dst_reg dest,
830 const glsl_type *dest_type,
831 src_reg coordinate,
832 int coord_components,
833 src_reg shadow_comparator,
834 src_reg lod, src_reg lod2,
835 src_reg sample_index,
836 uint32_t constant_offset,
837 src_reg offset_value,
838 src_reg mcs,
839 uint32_t surface,
840 src_reg surface_reg,
841 src_reg sampler_reg)
842 {
843 enum opcode opcode;
844 switch (op) {
845 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
846 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
847 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
848 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
849 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
850 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
851 case ir_tg4: opcode = offset_value.file != BAD_FILE
852 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
853 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
854 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
855 case ir_txb:
856 unreachable("TXB is not valid for vertex shaders.");
857 case ir_lod:
858 unreachable("LOD is not valid for vertex shaders.");
859 case ir_samples_identical: {
860 /* There are some challenges implementing this for vec4, and it seems
861 * unlikely to be used anyway. For now, just always return false.
862 */
863 emit(MOV(dest, brw_imm_ud(0u)));
864 return;
865 }
866 default:
867 unreachable("Unrecognized tex op");
868 }
869
870 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
871
872 inst->offset = constant_offset;
873
874 /* The message header is necessary for:
875 * - Gen4 (always)
876 * - Texel offsets
877 * - Gather channel selection
878 * - Sampler indices too large to fit in a 4-bit value.
879 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
880 */
881 inst->header_size =
882 (devinfo->gen < 5 ||
883 inst->offset != 0 || op == ir_tg4 ||
884 op == ir_texture_samples ||
885 is_high_sampler(sampler_reg)) ? 1 : 0;
886 inst->base_mrf = 2;
887 inst->mlen = inst->header_size;
888 inst->dst.writemask = WRITEMASK_XYZW;
889 inst->shadow_compare = shadow_comparator.file != BAD_FILE;
890
891 inst->src[1] = surface_reg;
892 inst->src[2] = sampler_reg;
893
894 /* MRF for the first parameter */
895 int param_base = inst->base_mrf + inst->header_size;
896
897 if (op == ir_txs || op == ir_query_levels) {
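/* The resinfo-style messages take a single LOD parameter, which goes in
 * .w of the first parameter register on Gen4 and in .x on Gen5+.
 */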
898 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
899 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
900 inst->mlen++;
901 } else if (op == ir_texture_samples) {
902 inst->dst.writemask = WRITEMASK_X;
903 } else {
904 /* Load the coordinate */
905 /* FINISHME: gl_clamp_mask and saturate */
906 int coord_mask = (1 << coord_components) - 1;
907 int zero_mask = 0xf & ~coord_mask;
908
909 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
910 coordinate));
911 inst->mlen++;
912
913 if (zero_mask != 0) {
914 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
915 brw_imm_d(0)));
916 }
917 /* Load the shadow comparator */
918 if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
919 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
920 WRITEMASK_X),
921 shadow_comparator));
922 inst->mlen++;
923 }
924
925 /* Load the LOD info */
926 if (op == ir_tex || op == ir_txl) {
927 int mrf, writemask;
928 if (devinfo->gen >= 5) {
929 mrf = param_base + 1;
930 if (shadow_comparator.file != BAD_FILE) {
931 writemask = WRITEMASK_Y;
932 /* mlen already incremented */
933 } else {
934 writemask = WRITEMASK_X;
935 inst->mlen++;
936 }
937 } else /* devinfo->gen == 4 */ {
938 mrf = param_base;
939 writemask = WRITEMASK_W;
940 }
941 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
942 } else if (op == ir_txf) {
943 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
944 } else if (op == ir_txf_ms) {
945 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
946 sample_index));
947 if (devinfo->gen >= 7) {
948 /* MCS data is in the first channel of `mcs`, but we need to get it into
949 * the .y channel of the second vec4 of params, so replicate .x across
950 * the whole vec4 and then mask off everything except .y
951 */
952 mcs.swizzle = BRW_SWIZZLE_XXXX;
953 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
954 mcs));
955 }
956 inst->mlen++;
957 } else if (op == ir_txd) {
958 const brw_reg_type type = lod.type;
959
960 if (devinfo->gen >= 5) {
961 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
962 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
963 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
964 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
965 inst->mlen++;
966
967 if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
968 lod.swizzle = BRW_SWIZZLE_ZZZZ;
969 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
970 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
971 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
972 inst->mlen++;
973
974 if (shadow_comparator.file != BAD_FILE) {
975 emit(MOV(dst_reg(MRF, param_base + 2,
976 shadow_comparator.type, WRITEMASK_Z),
977 shadow_comparator));
978 }
979 }
980 } else /* devinfo->gen == 4 */ {
981 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
982 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
983 inst->mlen += 2;
984 }
985 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
986 if (shadow_comparator.file != BAD_FILE) {
987 emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
988 shadow_comparator));
989 }
990
991 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
992 offset_value));
993 inst->mlen++;
994 }
995 }
996
997 emit(inst);
998
999 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1000 * spec requires layers.
1001 */
1002 if (op == ir_txs && devinfo->gen < 7) {
1003 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1004 emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1005 src_reg(inst->dst), brw_imm_d(1));
1006 }
1007
1008 if (devinfo->gen == 6 && op == ir_tg4) {
1009 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1010 }
1011
1012 if (op == ir_query_levels) {
1013 /* # levels is in .w */
1014 src_reg swizzled(dest);
1015 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1016 SWIZZLE_W, SWIZZLE_W);
1017 emit(MOV(dest, swizzled));
1018 }
1019 }
1020
1021 /**
1022 * Apply workarounds for Gen6 gather with UINT/SINT
1023 */
1024 void
1025 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1026 {
1027 if (!wa)
1028 return;
1029
1030 int width = (wa & WA_8BIT) ? 8 : 16;
1031 dst_reg dst_f = dst;
1032 dst_f.type = BRW_REGISTER_TYPE_F;
1033
1034 /* Convert from UNORM to UINT */
1035 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1036 emit(MOV(dst, src_reg(dst_f)));
1037
1038 if (wa & WA_SIGN) {
1039 /* Reinterpret the UINT value as a signed INT value by
1040 * shifting the sign bit into place, then shifting back
1041 * preserving sign.
1042 */
1043 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1044 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1045 }
1046 }
1047
1048 void
1049 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1050 {
1051 unreachable("not reached");
1052 }
1053
1054 void
1055 vec4_visitor::gs_end_primitive()
1056 {
1057 unreachable("not reached");
1058 }
1059
1060 void
1061 vec4_visitor::emit_ndc_computation()
1062 {
1063 if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1064 return;
1065
1066 /* Get the position */
1067 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1068
1069 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1070 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1071 output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1072 output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1073
1074 current_annotation = "NDC";
1075 dst_reg ndc_w = ndc;
1076 ndc_w.writemask = WRITEMASK_W;
1077 src_reg pos_w = pos;
1078 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1079 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1080
1081 dst_reg ndc_xyz = ndc;
1082 ndc_xyz.writemask = WRITEMASK_XYZ;
1083
1084 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1085 }
1086
1087 void
1088 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1089 {
1090 if (devinfo->gen < 6 &&
1091 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1092 output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1093 devinfo->has_negative_rhw_bug)) {
1094 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1095 dst_reg header1_w = header1;
1096 header1_w.writemask = WRITEMASK_W;
1097
1098 emit(MOV(header1, brw_imm_ud(0u)));
1099
1100 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1101 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1102
1103 current_annotation = "Point size";
1104 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1105 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1106 }
1107
1108 if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1109 current_annotation = "Clipping flags";
1110 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1111
1112 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1113 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1114 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1115 }
1116
1117 if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) {
1118 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1119 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1120 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1121 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1122 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1123 }
1124
1125 /* i965 clipping workaround:
1126 * 1) Test for -ve rhw
1127 * 2) If set,
1128 * set ndc = (0,0,0,0)
1129 * set ucp[6] = 1
1130 *
1131 * Later, clipping will detect ucp[6] and ensure the primitive is
1132 * clipped against all fixed planes.
1133 */
1134 if (devinfo->has_negative_rhw_bug &&
1135 output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1136 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1137 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1138 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1139 vec4_instruction *inst;
1140 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1141 inst->predicate = BRW_PREDICATE_NORMAL;
1142 output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1143 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1144 inst->predicate = BRW_PREDICATE_NORMAL;
1145 }
1146
1147 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1148 } else if (devinfo->gen < 6) {
1149 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1150 } else {
1151 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1152 if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
1153 dst_reg reg_w = reg;
1154 reg_w.writemask = WRITEMASK_W;
1155 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1156 reg_as_src.type = reg_w.type;
1157 reg_as_src.swizzle = brw_swizzle_for_size(1);
1158 emit(MOV(reg_w, reg_as_src));
1159 }
1160 if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
1161 dst_reg reg_y = reg;
1162 reg_y.writemask = WRITEMASK_Y;
1163 reg_y.type = BRW_REGISTER_TYPE_D;
1164 output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1165 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1166 }
1167 if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
1168 dst_reg reg_z = reg;
1169 reg_z.writemask = WRITEMASK_Z;
1170 reg_z.type = BRW_REGISTER_TYPE_D;
1171 output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1172 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1173 }
1174 }
1175 }
1176
1177 vec4_instruction *
1178 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1179 {
1180 assert(varying < VARYING_SLOT_MAX);
1181
1182 unsigned num_comps = output_num_components[varying][component];
1183 if (num_comps == 0)
1184 return NULL;
1185
1186 assert(output_reg[varying][component].type == reg.type);
1187 current_annotation = output_reg_annotation[varying];
1188 if (output_reg[varying][component].file != BAD_FILE) {
1189 src_reg src = src_reg(output_reg[varying][component]);
1190 src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1191 reg.writemask =
1192 brw_writemask_for_component_packing(num_comps, component);
1193 return emit(MOV(reg, src));
1194 }
1195 return NULL;
1196 }
1197
1198 void
1199 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1200 {
1201 reg.type = BRW_REGISTER_TYPE_F;
1202 output_reg[varying][0].type = reg.type;
1203
1204 switch (varying) {
1205 case VARYING_SLOT_PSIZ:
1206 {
1207 /* PSIZ is always in slot 0, and is coupled with other flags. */
1208 current_annotation = "indices, point width, clip flags";
1209 emit_psiz_and_flags(reg);
1210 break;
1211 }
1212 case BRW_VARYING_SLOT_NDC:
1213 current_annotation = "NDC";
1214 if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1215 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1216 break;
1217 case VARYING_SLOT_POS:
1218 current_annotation = "gl_Position";
1219 if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1220 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1221 break;
1222 case VARYING_SLOT_EDGE: {
1223 /* This is present when doing unfilled polygons. We're supposed to copy
1224 * the edge flag from the user-provided vertex array
1225 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1226 * of that attribute (starts as 1.0f). This is then used in clipping to
1227 * determine which edges should be drawn as wireframe.
1228 */
1229 current_annotation = "edge flag";
1230 int edge_attr = util_bitcount64(nir->info.inputs_read &
1231 BITFIELD64_MASK(VERT_ATTRIB_EDGEFLAG));
1232 emit(MOV(reg, src_reg(dst_reg(ATTR, edge_attr,
1233 glsl_type::float_type, WRITEMASK_XYZW))));
1234 break;
1235 }
1236 case BRW_VARYING_SLOT_PAD:
1237 /* No need to write to this slot */
1238 break;
1239 default:
1240 for (int i = 0; i < 4; i++) {
1241 emit_generic_urb_slot(reg, varying, i);
1242 }
1243 break;
1244 }
1245 }
1246
1247 static unsigned
1248 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, unsigned mlen)
1249 {
1250 if (devinfo->gen >= 6) {
1251 /* URB data written (does not include the message header reg) must
1252 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1253 * section 5.4.3.2.2: URB_INTERLEAVED.
1254 *
1255 * URB entries are allocated on a multiple of 1024 bits, so an
1256 * extra 128 bits written here to make the end align to 256 is
1257 * no problem.
1258 */
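/* mlen includes the message header register, so the URB payload is
 * mlen - 1 registers; rounding mlen up to an odd value keeps that
 * payload an even number of registers.
 */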
1259 if ((mlen % 2) != 1)
1260 mlen++;
1261 }
1262
1263 return mlen;
1264 }
1265
1266
1267 /**
1268 * Generates the VUE payload plus the necessary URB write instructions to
1269 * output it.
1270 *
1271 * The VUE layout is documented in Volume 2a.
1272 */
1273 void
1274 vec4_visitor::emit_vertex()
1275 {
1276 /* MRF 0 is reserved for the debugger, so start with message header
1277 * in MRF 1.
1278 */
1279 int base_mrf = 1;
1280 int mrf = base_mrf;
1281 /* In the process of generating our URB write message contents, we
1282 * may need to unspill a register or load from an array. Those
1283 * reads would use MRFs 14-15.
1284 */
1285 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1286
1287 /* The following assertion verifies that max_usable_mrf causes an
1288 * even-numbered amount of URB write data, which will meet gen6's
1289 * requirements for length alignment.
1290 */
1291 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1292
1293 /* First mrf is the g0-based message header containing URB handles and
1294 * such.
1295 */
1296 emit_urb_write_header(mrf++);
1297
1298 if (devinfo->gen < 6) {
1299 emit_ndc_computation();
1300 }
1301
1302 /* We may need to split this up into several URB writes, so do them in a
1303 * loop.
1304 */
1305 int slot = 0;
1306 bool complete = false;
1307 do {
1308 /* URB offset is in URB row increments, and each of our MRFs is half of
1309 * one of those, since we're doing interleaved writes.
1310 */
1311 int offset = slot / 2;
1312
1313 mrf = base_mrf + 1;
1314 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1315 emit_urb_slot(dst_reg(MRF, mrf++),
1316 prog_data->vue_map.slot_to_varying[slot]);
1317
1318 /* If this was max_usable_mrf, we can't fit anything more into this
1319 * URB WRITE. Same thing if we reached the maximum length available.
1320 */
1321 if (mrf > max_usable_mrf ||
1322 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1323 slot++;
1324 break;
1325 }
1326 }
1327
1328 complete = slot >= prog_data->vue_map.num_slots;
1329 current_annotation = "URB write";
1330 vec4_instruction *inst = emit_urb_write_opcode(complete);
1331 inst->base_mrf = base_mrf;
1332 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1333 inst->offset += offset;
1334 } while(!complete);
1335 }
1336
1337
1338 src_reg
1339 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1340 src_reg *reladdr, int reg_offset)
1341 {
1342 /* Because we store the values to scratch interleaved like our
1343 * vertex data, we need to scale the vec4 index by 2.
1344 */
1345 int message_header_scale = 2;
1346
1347 /* Pre-gen6, the message header uses byte offsets instead of vec4
1348 * (16-byte) offset units.
1349 */
1350 if (devinfo->gen < 6)
1351 message_header_scale *= 16;
1352
1353 if (reladdr) {
1354 /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1355 * to multiply the reladdr by 2. Notice that the reg_offset part
1356 * is in units of 16 bytes and is used to select the low/high 16-byte
1357 * chunk of a full dvec4, so we don't want to multiply that part.
1358 */
1359 src_reg index = src_reg(this, glsl_type::int_type);
1360 if (type_sz(inst->dst.type) < 8) {
1361 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1362 brw_imm_d(reg_offset)));
1363 emit_before(block, inst, MUL(dst_reg(index), index,
1364 brw_imm_d(message_header_scale)));
1365 } else {
1366 emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1367 brw_imm_d(message_header_scale * 2)));
1368 emit_before(block, inst, ADD(dst_reg(index), index,
1369 brw_imm_d(reg_offset * message_header_scale)));
1370 }
1371 return index;
1372 } else {
1373 return brw_imm_d(reg_offset * message_header_scale);
1374 }
1375 }
1376
1377 /**
1378 * Emits an instruction before @inst to load the value named by @orig_src
1379 * from scratch space at @base_offset to @temp.
1380 *
1381 * @base_offset is measured in 32-byte units (the size of a register).
1382 */
1383 void
1384 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1385 dst_reg temp, src_reg orig_src,
1386 int base_offset)
1387 {
1388 assert(orig_src.offset % REG_SIZE == 0);
1389 int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1390 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1391 reg_offset);
1392
1393 if (type_sz(orig_src.type) < 8) {
1394 emit_before(block, inst, SCRATCH_READ(temp, index));
1395 } else {
1396 dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1397 dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1398 emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1399 index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1400 vec4_instruction *last_read =
1401 SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1402 emit_before(block, inst, last_read);
1403 shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1404 }
1405 }
1406
1407 /**
1408 * Emits an instruction after @inst to store the value to be written
1409 * to @orig_dst to scratch space at @base_offset, from @temp.
1410 *
1411 * @base_offset is measured in 32-byte units (the size of a register).
1412 */
1413 void
1414 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1415 int base_offset)
1416 {
1417 assert(inst->dst.offset % REG_SIZE == 0);
1418 int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1419 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1420 reg_offset);
1421
1422 /* Create a temporary register to store *inst's result in.
1423 *
1424 * We have to be careful in MOVing from our temporary result register in
1425 * the scratch write. If we swizzle from channels of the temporary that
1426 * weren't initialized, it will confuse live interval analysis, which will
1427 * make spilling fail to make progress.
1428 */
1429 bool is_64bit = type_sz(inst->dst.type) == 8;
1430 const glsl_type *alloc_type =
1431 is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1432 const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1433 inst->dst.type),
1434 brw_swizzle_for_mask(inst->dst.writemask));
1435
1436 if (!is_64bit) {
1437 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1438 inst->dst.writemask));
1439 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1440 if (inst->opcode != BRW_OPCODE_SEL)
1441 write->predicate = inst->predicate;
1442 write->ir = inst->ir;
1443 write->annotation = inst->annotation;
1444 inst->insert_after(block, write);
1445 } else {
1446 dst_reg shuffled = dst_reg(this, alloc_type);
1447 vec4_instruction *last =
1448 shuffle_64bit_data(shuffled, temp, true, block, inst);
1449 src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1450
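/* After shuffling, each 64-bit channel occupies two 32-bit channels of the
 * float-typed payload: dst.x/y map to the XY/ZW halves of the first
 * register and dst.z/w to those of the second, so translate the original
 * writemask accordingly.
 */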
1451 uint8_t mask = 0;
1452 if (inst->dst.writemask & WRITEMASK_X)
1453 mask |= WRITEMASK_XY;
1454 if (inst->dst.writemask & WRITEMASK_Y)
1455 mask |= WRITEMASK_ZW;
1456 if (mask) {
1457 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1458
1459 vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1460 if (inst->opcode != BRW_OPCODE_SEL)
1461 write->predicate = inst->predicate;
1462 write->ir = inst->ir;
1463 write->annotation = inst->annotation;
1464 last->insert_after(block, write);
1465 }
1466
1467 mask = 0;
1468 if (inst->dst.writemask & WRITEMASK_Z)
1469 mask |= WRITEMASK_XY;
1470 if (inst->dst.writemask & WRITEMASK_W)
1471 mask |= WRITEMASK_ZW;
1472 if (mask) {
1473 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1474
1475 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1476 reg_offset + 1);
1477 vec4_instruction *write =
1478 SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1479 if (inst->opcode != BRW_OPCODE_SEL)
1480 write->predicate = inst->predicate;
1481 write->ir = inst->ir;
1482 write->annotation = inst->annotation;
1483 last->insert_after(block, write);
1484 }
1485 }
1486
1487 inst->dst.file = temp.file;
1488 inst->dst.nr = temp.nr;
1489 inst->dst.offset %= REG_SIZE;
1490 inst->dst.reladdr = NULL;
1491 }
1492
1493 /**
1494 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1495 * adds the scratch read(s) before \p inst. The function also checks for
1496 * recursive reladdr scratch accesses, issuing the corresponding scratch
1497 * loads and rewriting reladdr references accordingly.
1498 *
1499 * \return \p src if it did not require a scratch load, otherwise, the
1500 * register holding the result of the scratch load that the caller should
1501 * use to rewrite src.
1502 */
1503 src_reg
1504 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1505 vec4_instruction *inst, src_reg src)
1506 {
1507 /* Resolve recursive reladdr scratch access by calling ourselves
1508 * with src.reladdr
1509 */
1510 if (src.reladdr)
1511 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1512 *src.reladdr);
1513
1514 /* Now handle scratch access on src */
1515 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1516 dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1517 glsl_type::dvec4_type : glsl_type::vec4_type);
1518 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1519 src.nr = temp.nr;
1520 src.offset %= REG_SIZE;
1521 src.reladdr = NULL;
1522 }
1523
1524 return src;
1525 }
1526
1527 /**
1528 * We can't generally support array access in GRF space, because a
1529 * single instruction's destination can only span 2 contiguous
1530 * registers. So, we send all GRF arrays that get variable index
1531 * access to scratch space.
1532 */
1533 void
1534 vec4_visitor::move_grf_array_access_to_scratch()
1535 {
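/* scratch_loc[i] holds the scratch offset (in register-sized units)
 * assigned to virtual GRF i, or -1 if that GRF does not need to be moved
 * to scratch.
 */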
1536 int scratch_loc[this->alloc.count];
1537 memset(scratch_loc, -1, sizeof(scratch_loc));
1538
1539 /* First, calculate the set of virtual GRFs that need to be punted
1540 * to scratch due to having any array access on them, and where in
1541 * scratch.
1542 */
1543 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1544 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1545 if (scratch_loc[inst->dst.nr] == -1) {
1546 scratch_loc[inst->dst.nr] = last_scratch;
1547 last_scratch += this->alloc.sizes[inst->dst.nr];
1548 }
1549
1550 for (src_reg *iter = inst->dst.reladdr;
1551 iter->reladdr;
1552 iter = iter->reladdr) {
1553 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1554 scratch_loc[iter->nr] = last_scratch;
1555 last_scratch += this->alloc.sizes[iter->nr];
1556 }
1557 }
1558 }
1559
1560 for (int i = 0 ; i < 3; i++) {
1561 for (src_reg *iter = &inst->src[i];
1562 iter->reladdr;
1563 iter = iter->reladdr) {
1564 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1565 scratch_loc[iter->nr] = last_scratch;
1566 last_scratch += this->alloc.sizes[iter->nr];
1567 }
1568 }
1569 }
1570 }
1571
1572 /* Now, for anything that will be accessed through scratch, rewrite
1573 * it to load/store. Note that this is a _safe list walk, because
1574 * we may generate a new scratch_write instruction after the one
1575 * we're processing.
1576 */
1577 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1578 /* Set up the annotation tracking for new generated instructions. */
1579 base_ir = inst->ir;
1580 current_annotation = inst->annotation;
1581
1582 /* First handle scratch access on the dst. Notice we have to handle
1583 * the case where the dst's reladdr also points to scratch space.
1584 */
1585 if (inst->dst.reladdr)
1586 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1587 *inst->dst.reladdr);
1588
1589 /* Now that we have handled any (possibly recursive) reladdr scratch
1590 * accesses for dst we can safely do the scratch write for dst itself
1591 */
1592 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1593 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1594
1595 /* Now handle scratch access on any src. In this case, since inst->src[i]
1596 * already is a src_reg, we can just call emit_resolve_reladdr with
1597 * inst->src[i] and it will take care of handling scratch loads for
1598 * both src and src.reladdr (recursively).
1599 */
1600 for (int i = 0 ; i < 3; i++) {
1601 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1602 inst->src[i]);
1603 }
1604 }
1605 }
1606
1607 /**
1608 * Emits an instruction before @inst to load the value named by @orig_src
1609 * from the pull constant buffer (surface) at @base_offset to @temp.
1610 */
1611 void
1612 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1613 dst_reg temp, src_reg orig_src,
1614 int base_offset, src_reg indirect)
1615 {
1616 assert(orig_src.offset % 16 == 0);
1617 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1618
1619 /* For 64-bit loads we need to emit two 32-bit load messages, and we also
1620 * need to shuffle the 32-bit data result into proper 64-bit data. To do
1621 * that we emit the 32-bit loads into a temporary and we shuffle the result
1622 * into the original destination.
1623 */
1624 dst_reg orig_temp = temp;
1625 bool is_64bit = type_sz(orig_src.type) == 8;
1626 if (is_64bit) {
1627 assert(type_sz(temp.type) == 8);
1628 dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
1629 temp = retype(temp_df, BRW_REGISTER_TYPE_F);
1630 }
1631
1632 src_reg src = orig_src;
1633 for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
1634 int reg_offset = base_offset + src.offset / 16;
1635
1636 src_reg offset;
1637 if (indirect.file != BAD_FILE) {
1638 offset = src_reg(this, glsl_type::uint_type);
1639 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1640 brw_imm_ud(reg_offset * 16)));
1641 } else {
1642 offset = brw_imm_d(reg_offset * 16);
1643 }
1644
1645 emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
1646 brw_imm_ud(index),
1647 offset,
1648 block, inst);
1649
1650 src = byte_offset(src, 16);
1651 }
1652
1653 if (is_64bit) {
1654 temp = retype(temp, BRW_REGISTER_TYPE_DF);
1655 shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
1656 }
1657 }
1658
1659 /**
1660 * Implements array access of uniforms by inserting a
1661 * PULL_CONSTANT_LOAD instruction.
1662 *
1663 * Unlike temporary GRF array access (which we don't support, due to
1664 * the difficulty of doing relative addressing on instruction
1665 * destinations), we could potentially do array access of uniforms
1666 * that were loaded in GRF space as push constants. In real-world
1667 * usage we've seen, though, the arrays being used are always larger
1668 * than we could load as push constants, so just always move all
1669 * uniform array access out to a pull constant buffer.
1670 */
1671 void
1672 vec4_visitor::move_uniform_array_access_to_pull_constants()
1673 {
1674 /* The Vulkan driver doesn't support pull constants other than UBOs, so
1675 * everything has to be pushed regardless.
1676 */
1677 if (!compiler->supports_pull_constants) {
1678 split_uniform_registers();
1679 return;
1680 }
1681
1682 /* Allocate the pull_params array */
1683 assert(stage_prog_data->nr_pull_params == 0);
1684 stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
1685 this->uniforms * 4);
1686
1687 int pull_constant_loc[this->uniforms];
1688 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1689
1690 /* First, walk through the instructions and determine which things need to
1691 * be pulled. We mark something as needing to be pulled by setting
1692 * pull_constant_loc to 0.
1693 */
1694 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1695 /* We only care about MOV_INDIRECT of a uniform */
1696 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1697 inst->src[0].file != UNIFORM)
1698 continue;
1699
1700 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1701
1702 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1703 pull_constant_loc[uniform_nr + j] = 0;
1704 }
1705
1706 /* Next, we walk the list of uniforms and assign real pull constant
1707 * locations and set their corresponding entries in pull_param.
1708 */
1709 for (int j = 0; j < this->uniforms; j++) {
1710 if (pull_constant_loc[j] < 0)
1711 continue;
1712
1713 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1714
1715 for (int i = 0; i < 4; i++) {
1716 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1717 = stage_prog_data->param[j * 4 + i];
1718 }
1719 }
1720
1721 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1722 * instructions to actual uniform pulls.
1723 */
1724 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1725 /* We only care about MOV_INDIRECT of a uniform */
1726 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1727 inst->src[0].file != UNIFORM)
1728 continue;
1729
1730 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1731
1732 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1733
1734 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1735 pull_constant_loc[uniform_nr], inst->src[1]);
1736 inst->remove(block);
1737 }
1738
1739 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1740 * no need to track them as larger-than-vec4 objects. This will be
1741 * relied on in cutting out unused uniform vectors from push
1742 * constants.
1743 */
1744 split_uniform_registers();
1745 }
1746
1747 void
1748 vec4_visitor::resolve_ud_negate(src_reg *reg)
1749 {
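/* Materialize the negation of an unsigned source with an explicit MOV so
 * that later instructions (e.g. CMP or IF) never see a negate source
 * modifier on a UD operand.
 */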
1750 if (reg->type != BRW_REGISTER_TYPE_UD ||
1751 !reg->negate)
1752 return;
1753
1754 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1755 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1756 *reg = temp;
1757 }
1758
1759 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1760 void *log_data,
1761 const struct brw_sampler_prog_key_data *key_tex,
1762 struct brw_vue_prog_data *prog_data,
1763 const nir_shader *shader,
1764 void *mem_ctx,
1765 bool no_spills,
1766 int shader_time_index)
1767 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1768 key_tex(key_tex),
1769 prog_data(prog_data),
1770 fail_msg(NULL),
1771 first_non_payload_grf(0),
1772 live_analysis(this), performance_analysis(this),
1773 need_all_constants_in_pull_buffer(false),
1774 no_spills(no_spills),
1775 shader_time_index(shader_time_index),
1776 last_scratch(0)
1777 {
1778 this->failed = false;
1779
1780 this->base_ir = NULL;
1781 this->current_annotation = NULL;
1782 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1783
1784 memset(this->output_num_components, 0, sizeof(this->output_num_components));
1785
1786 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1787
1788 this->uniforms = 0;
1789
1790 this->nir_locals = NULL;
1791 this->nir_ssa_values = NULL;
1792 }
1793
1794
1795 void
1796 vec4_visitor::fail(const char *format, ...)
1797 {
1798 va_list va;
1799 char *msg;
1800
1801 if (failed)
1802 return;
1803
1804 failed = true;
1805
1806 va_start(va, format);
1807 msg = ralloc_vasprintf(mem_ctx, format, va);
1808 va_end(va);
1809 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1810
1811 this->fail_msg = msg;
1812
1813 if (debug_enabled) {
1814 fprintf(stderr, "%s", msg);
1815 }
1816 }
1817
1818 } /* namespace brw */
1819