1 /*
2  * Copyright © 2011 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "brw_program.h"
28 
29 namespace brw {
30 
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32                                    const src_reg &src0, const src_reg &src1,
33                                    const src_reg &src2)
34 {
35    this->opcode = opcode;
36    this->dst = dst;
37    this->src[0] = src0;
38    this->src[1] = src1;
39    this->src[2] = src2;
40    this->saturate = false;
41    this->force_writemask_all = false;
42    this->no_dd_clear = false;
43    this->no_dd_check = false;
44    this->writes_accumulator = false;
45    this->conditional_mod = BRW_CONDITIONAL_NONE;
46    this->predicate = BRW_PREDICATE_NONE;
47    this->predicate_inverse = false;
48    this->target = 0;
49    this->shadow_compare = false;
50    this->ir = NULL;
51    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52    this->header_size = 0;
53    this->flag_subreg = 0;
54    this->mlen = 0;
55    this->base_mrf = 0;
56    this->offset = 0;
57    this->exec_size = 8;
58    this->group = 0;
59    this->size_written = (dst.file == BAD_FILE ?
60                          0 : this->exec_size * type_sz(dst.type));
61    this->annotation = NULL;
62 }
63 
64 vec4_instruction *
65 vec4_visitor::emit(vec4_instruction *inst)
66 {
67    inst->ir = this->base_ir;
68    inst->annotation = this->current_annotation;
69 
70    this->instructions.push_tail(inst);
71 
72    return inst;
73 }
74 
75 vec4_instruction *
76 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
77                           vec4_instruction *new_inst)
78 {
79    new_inst->ir = inst->ir;
80    new_inst->annotation = inst->annotation;
81 
82    inst->insert_before(block, new_inst);
83 
84    return inst;
85 }
86 
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
89                    const src_reg &src1, const src_reg &src2)
90 {
91    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
92 }
93 
94 
95 vec4_instruction *
96 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
97                    const src_reg &src1)
98 {
99    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
100 }
101 
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
104 {
105    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
106 }
107 
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
110 {
111    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
112 }
113 
114 vec4_instruction *
emit(enum opcode opcode)115 vec4_visitor::emit(enum opcode opcode)
116 {
117    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
118 }
119 
120 #define ALU1(op)							\
121    vec4_instruction *							\
122    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)		\
123    {									\
124       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
125    }
126 
127 #define ALU2(op)							\
128    vec4_instruction *							\
129    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,		\
130                     const src_reg &src1)				\
131    {									\
132       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
133                                            src0, src1);                 \
134    }
135 
136 #define ALU2_ACC(op)							\
137    vec4_instruction *							\
138    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,		\
139                     const src_reg &src1)				\
140    {									\
141       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
142                        BRW_OPCODE_##op, dst, src0, src1);		\
143       inst->writes_accumulator = true;                                  \
144       return inst;                                                      \
145    }
146 
147 #define ALU3(op)							\
148    vec4_instruction *							\
149    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,		\
150                     const src_reg &src1, const src_reg &src2)		\
151    {									\
152       assert(devinfo->gen >= 6);						\
153       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,	\
154 					   src0, src1, src2);		\
155    }
156 
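/* Instantiate the one-, two- and three-source ALU builder helpers.  Each
 * macro expands to a vec4_visitor method of the same name (e.g. ADD, MUL)
 * that constructs, but does not emit, the corresponding instruction.
 */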
157 ALU1(NOT)
158 ALU1(MOV)
159 ALU1(FRC)
160 ALU1(RNDD)
161 ALU1(RNDE)
162 ALU1(RNDZ)
163 ALU1(F32TO16)
164 ALU1(F16TO32)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2_ACC(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(DP3)
172 ALU2(DP4)
173 ALU2(DPH)
174 ALU2(SHL)
175 ALU2(SHR)
176 ALU2(ASR)
177 ALU3(LRP)
178 ALU1(BFREV)
179 ALU3(BFE)
180 ALU2(BFI1)
181 ALU3(BFI2)
182 ALU1(FBH)
183 ALU1(FBL)
184 ALU1(CBIT)
185 ALU3(MAD)
186 ALU2_ACC(ADDC)
187 ALU2_ACC(SUBB)
188 ALU2(MAC)
189 ALU1(DIM)
190 
191 /** Gen4 predicated IF. */
192 vec4_instruction *
193 vec4_visitor::IF(enum brw_predicate predicate)
194 {
195    vec4_instruction *inst;
196 
197    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
198    inst->predicate = predicate;
199 
200    return inst;
201 }
202 
203 /** Gen6 IF with embedded comparison. */
204 vec4_instruction *
205 vec4_visitor::IF(src_reg src0, src_reg src1,
206                  enum brw_conditional_mod condition)
207 {
208    assert(devinfo->gen == 6);
209 
210    vec4_instruction *inst;
211 
212    resolve_ud_negate(&src0);
213    resolve_ud_negate(&src1);
214 
215    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
216 					src0, src1);
217    inst->conditional_mod = condition;
218 
219    return inst;
220 }
221 
222 /**
223  * CMP: Sets the low bit of each destination channel to the result of the
224  * comparison (the upper bits are undefined) and updates the flag register
225  * with the packed 16 bits of the result.
226  */
227 vec4_instruction *
228 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
229                   enum brw_conditional_mod condition)
230 {
231    vec4_instruction *inst;
232 
233    /* Take the instruction:
234     *
235     * CMP null<d> src0<f> src1<f>
236     *
237     * Original gen4 does type conversion to the destination type before
238     * comparison, producing garbage results for floating point comparisons.
239     *
240     * The destination type doesn't matter on newer generations, so we set the
241     * type to match src0 so we can compact the instruction.
242     */
243    dst.type = src0.type;
244 
245    resolve_ud_negate(&src0);
246    resolve_ud_negate(&src1);
247 
248    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
249    inst->conditional_mod = condition;
250 
251    return inst;
252 }
253 
254 vec4_instruction *
255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
256 {
257    vec4_instruction *inst;
258 
259    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
260 					dst, index);
261    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
262    inst->mlen = 2;
263 
264    return inst;
265 }
266 
267 vec4_instruction *
268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
269                             const src_reg &index)
270 {
271    vec4_instruction *inst;
272 
273    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
274 					dst, src, index);
275    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
276    inst->mlen = 3;
277 
278    return inst;
279 }
280 
281 src_reg
282 vec4_visitor::fix_3src_operand(const src_reg &src)
283 {
284    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285     * able to use vertical stride of zero to replicate the vec4 uniform, like
286     *
287     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288     *
289     * But you can't, since vertical stride is always four in three-source
290     * instructions. Instead, insert a MOV instruction to do the replication so
291     * that the three-source instruction can consume it.
292     */
293 
294    /* The MOV is only needed if the source is a uniform or immediate. */
295    if (src.file != UNIFORM && src.file != IMM)
296       return src;
297 
298    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299       return src;
300 
301    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302    expanded.type = src.type;
303    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
304    return src_reg(expanded);
305 }
306 
307 src_reg
308 vec4_visitor::resolve_source_modifiers(const src_reg &src)
309 {
310    if (!src.abs && !src.negate)
311       return src;
312 
313    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
314    resolved.type = src.type;
315    emit(MOV(resolved, src));
316 
317    return src_reg(resolved);
318 }
319 
320 src_reg
321 vec4_visitor::fix_math_operand(const src_reg &src)
322 {
323    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
324       return src;
325 
326    /* The gen6 math instruction ignores the source modifiers --
327     * swizzle, abs, negate, and at least some parts of the register
328     * region description.
329     *
330     * Rather than trying to enumerate all these cases, *always* expand the
331     * operand to a temp GRF for gen6.
332     *
333     * For gen7, keep the operand as-is, except if immediate, which gen7 still
334     * can't use.
335     */
336 
337    if (devinfo->gen == 7 && src.file != IMM)
338       return src;
339 
340    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
341    expanded.type = src.type;
342    emit(MOV(expanded, src));
343    return src_reg(expanded);
344 }
345 
346 vec4_instruction *
347 vec4_visitor::emit_math(enum opcode opcode,
348                         const dst_reg &dst,
349                         const src_reg &src0, const src_reg &src1)
350 {
351    vec4_instruction *math =
352       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
353 
354    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
355       /* MATH on Gen6 must be align1, so we can't do writemasks. */
356       math->dst = dst_reg(this, glsl_type::vec4_type);
357       math->dst.type = dst.type;
358       math = emit(MOV(dst, src_reg(math->dst)));
359    } else if (devinfo->gen < 6) {
360       math->base_mrf = 1;
361       math->mlen = src1.file == BAD_FILE ? 1 : 2;
362    }
363 
364    return math;
365 }
366 
367 void
368 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
369 {
370    if (devinfo->gen < 7) {
371       unreachable("ir_unop_pack_half_2x16 should be lowered");
372    }
373 
374    assert(dst.type == BRW_REGISTER_TYPE_UD);
375    assert(src0.type == BRW_REGISTER_TYPE_F);
376 
377    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
378     *
379     *   Because this instruction does not have a 16-bit floating-point type,
380     *   the destination data type must be Word (W).
381     *
382     *   The destination must be DWord-aligned and specify a horizontal stride
383     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
384     *   each destination channel and the upper word is not modified.
385     *
386     * The above restriction implies that the f32to16 instruction must use
387     * align1 mode, because only in align1 mode is it possible to specify
388     * horizontal stride.  We choose here to defy the hardware docs and emit
389     * align16 instructions.
390     *
391     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
392     * instructions. I was partially successful in that the code passed all
393     * tests.  However, the code was dubiously correct and fragile, and the
394     * tests were not harsh enough to probe that frailty. Not trusting the
395     * code, I chose instead to remain in align16 mode in defiance of the hw
396     * docs).
397     *
398     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
399     * simulator, emitting a f32to16 in align16 mode with UD as destination
400     * data type is safe. The behavior differs from that specified in the PRM
401     * in that the upper word of each destination channel is cleared to 0.
402     */
403 
404    dst_reg tmp_dst(this, glsl_type::uvec2_type);
405    src_reg tmp_src(tmp_dst);
406 
407 #if 0
408    /* Verify the undocumented behavior on which the following instructions
409     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
410     * then the result of the bit-or instruction below will be incorrect.
411     *
412     * You should inspect the disasm output in order to verify that the MOV is
413     * not optimized away.
414     */
415    emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
416 #endif
417 
418    /* Give tmp the form below, where "." means untouched.
419     *
420     *     w z          y          x w z          y          x
421     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
422     *
423     * That the upper word of each write-channel be 0 is required for the
424     * following bit-shift and bit-or instructions to work. Note that this
425     * relies on the undocumented hardware behavior mentioned above.
426     */
427    tmp_dst.writemask = WRITEMASK_XY;
428    emit(F32TO16(tmp_dst, src0));
429 
430    /* Give the write-channels of dst the form:
431     *   0xhhhh0000
432     */
433    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
434    emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
435 
436    /* Finally, give the write-channels of dst the form of packHalf2x16's
437     * output:
438     *   0xhhhhllll
439     */
440    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
441    emit(OR(dst, src_reg(dst), tmp_src));
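   /* For example, with src0 = (1.0f, 2.0f), F32TO16 leaves tmp.x = 0x00003c00
    * and tmp.y = 0x00004000, so each enabled channel of dst ends up as
    * (0x4000 << 16) | 0x3c00 = 0x40003c00, as packHalf2x16 requires.
    */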
442 }
443 
444 void
445 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
446 {
447    if (devinfo->gen < 7) {
448       unreachable("ir_unop_unpack_half_2x16 should be lowered");
449    }
450 
451    assert(dst.type == BRW_REGISTER_TYPE_F);
452    assert(src0.type == BRW_REGISTER_TYPE_UD);
453 
454    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
455     *
456     *   Because this instruction does not have a 16-bit floating-point type,
457     *   the source data type must be Word (W). The destination type must be
458     *   F (Float).
459     *
460     * To use W as the source data type, we must adjust horizontal strides,
461     * which is only possible in align1 mode. All my [chadv] attempts at
462     * emitting align1 instructions for unpackHalf2x16 failed to pass the
463     * Piglit tests, so I gave up.
464     *
465     * I've verified that, on gen7 hardware and the simulator, it is safe to
466     * emit f16to32 in align16 mode with UD as source data type.
467     */
468 
469    dst_reg tmp_dst(this, glsl_type::uvec2_type);
470    src_reg tmp_src(tmp_dst);
471 
472    tmp_dst.writemask = WRITEMASK_X;
473    emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
474 
475    tmp_dst.writemask = WRITEMASK_Y;
476    emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
477 
478    dst.writemask = WRITEMASK_XY;
479    emit(F16TO32(dst, tmp_src));
480 }
481 
482 void
483 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
484 {
485    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
486     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
487     * is not suitable to generate the shift values, but we can use the packed
488     * vector float and a type-converting MOV.
489     */
490    dst_reg shift(this, glsl_type::uvec4_type);
491    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
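   /* brw_imm_vf4(0x00, 0x60, 0x70, 0x78) is the restricted 8-bit vector-float
    * encoding of (0.0, 8.0, 16.0, 24.0); the type-converting MOV into the UD
    * register above yields the shift counts <0, 8, 16, 24>.
    */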
492 
493    dst_reg shifted(this, glsl_type::uvec4_type);
494    src0.swizzle = BRW_SWIZZLE_XXXX;
495    emit(SHR(shifted, src0, src_reg(shift)));
496 
497    shifted.type = BRW_REGISTER_TYPE_UB;
498    dst_reg f(this, glsl_type::vec4_type);
499    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
500 
501    emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
502 }
503 
504 void
505 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
506 {
507    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
508     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
509     * is not suitable to generate the shift values, but we can use the packed
510     * vector float and a type-converting MOV.
511     */
512    dst_reg shift(this, glsl_type::uvec4_type);
513    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
514 
515    dst_reg shifted(this, glsl_type::uvec4_type);
516    src0.swizzle = BRW_SWIZZLE_XXXX;
517    emit(SHR(shifted, src0, src_reg(shift)));
518 
519    shifted.type = BRW_REGISTER_TYPE_B;
520    dst_reg f(this, glsl_type::vec4_type);
521    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
522 
523    dst_reg scaled(this, glsl_type::vec4_type);
524    emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
525 
526    dst_reg max(this, glsl_type::vec4_type);
527    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
528    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
529 }
530 
531 void
532 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
533 {
534    dst_reg saturated(this, glsl_type::vec4_type);
535    vec4_instruction *inst = emit(MOV(saturated, src0));
536    inst->saturate = true;
537 
538    dst_reg scaled(this, glsl_type::vec4_type);
539    emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
540 
541    dst_reg rounded(this, glsl_type::vec4_type);
542    emit(RNDE(rounded, src_reg(scaled)));
543 
544    dst_reg u(this, glsl_type::uvec4_type);
545    emit(MOV(u, src_reg(rounded)));
546 
547    src_reg bytes(u);
548    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
549 }
550 
551 void
552 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
553 {
554    dst_reg max(this, glsl_type::vec4_type);
555    emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
556 
557    dst_reg min(this, glsl_type::vec4_type);
558    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
559 
560    dst_reg scaled(this, glsl_type::vec4_type);
561    emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
562 
563    dst_reg rounded(this, glsl_type::vec4_type);
564    emit(RNDE(rounded, src_reg(scaled)));
565 
566    dst_reg i(this, glsl_type::ivec4_type);
567    emit(MOV(i, src_reg(rounded)));
568 
569    src_reg bytes(i);
570    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
571 }
572 
573 /*
574  * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
575  * false) elements needed to pack a type.
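 *
 * For example, a float or vec3 still takes one slot, a mat3 takes three
 * (one per column), and a dvec4 takes two vec4 slots when as_vec4 is true
 * but a single dvec4 slot when it is false.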
576  */
577 static int
578 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
579 {
580    unsigned int i;
581    int size;
582 
583    switch (type->base_type) {
584    case GLSL_TYPE_UINT:
585    case GLSL_TYPE_INT:
586    case GLSL_TYPE_FLOAT:
587    case GLSL_TYPE_BOOL:
588    case GLSL_TYPE_DOUBLE:
589       if (type->is_matrix()) {
590          const glsl_type *col_type = type->column_type();
591          unsigned col_slots =
592             (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
593          return type->matrix_columns * col_slots;
594       } else {
595          /* Regardless of size of vector, it gets a vec4. This is bad
596           * packing for things like floats, but otherwise arrays become a
597           * mess.  Hopefully a later pass over the code can pack scalars
598           * down if appropriate.
599           */
600          return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
601       }
602    case GLSL_TYPE_ARRAY:
603       assert(type->length > 0);
604       return type_size_xvec4(type->fields.array, as_vec4) * type->length;
605    case GLSL_TYPE_STRUCT:
606       size = 0;
607       for (i = 0; i < type->length; i++) {
608 	 size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
609       }
610       return size;
611    case GLSL_TYPE_SUBROUTINE:
612       return 1;
613 
614    case GLSL_TYPE_SAMPLER:
615       /* Samplers take up no register space, since they're baked in at
616        * link time.
617        */
618       return 0;
619    case GLSL_TYPE_ATOMIC_UINT:
620       return 0;
621    case GLSL_TYPE_IMAGE:
622       return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
623    case GLSL_TYPE_VOID:
624    case GLSL_TYPE_ERROR:
625    case GLSL_TYPE_INTERFACE:
626    case GLSL_TYPE_FUNCTION:
627       unreachable("not reached");
628    }
629 
630    return 0;
631 }
632 
633 /**
634  * Returns the minimum number of vec4 elements needed to pack a type.
635  *
636  * For simple types, it will return 1 (a single vec4); for matrices, the
637  * number of columns; for array and struct, the sum of the vec4_size of
638  * each of its elements; and for sampler and atomic, zero.
639  *
640  * This method is useful to calculate how much register space is needed to
641  * store a particular type.
642  */
643 extern "C" int
644 type_size_vec4(const struct glsl_type *type)
645 {
646    return type_size_xvec4(type, true);
647 }
648 
649 /**
650  * Returns the minimum number of dvec4 elements needed to pack a type.
651  *
652  * For simple types, it will return 1 (a single dvec4); for matrices, the
653  * number of columns; for array and struct, the sum of the dvec4_size of
654  * each of its elements; and for sampler and atomic, zero.
655  *
656  * This method is useful to calculate how much register space is needed to
657  * store a particular type.
658  *
659  * Measuring double-precision vertex inputs as dvec4 is required because
660  * ARB_vertex_attrib_64bit states that they use the same number of locations
661  * as the single-precision version. That is, two consecutive dvec4s would be
662  * located at location "x" and location "x+1", not "x+2".
663  *
664  * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
665  * remap_vs_attrs() takes into account both the location and whether the
666  * type fits in one or two vec4 slots.
667  */
668 extern "C" int
669 type_size_dvec4(const struct glsl_type *type)
670 {
671    return type_size_xvec4(type, false);
672 }
673 
674 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
675 {
676    init();
677 
678    this->file = VGRF;
679    this->nr = v->alloc.allocate(type_size_vec4(type));
680 
681    if (type->is_array() || type->is_record()) {
682       this->swizzle = BRW_SWIZZLE_NOOP;
683    } else {
684       this->swizzle = brw_swizzle_for_size(type->vector_elements);
685    }
686 
687    this->type = brw_type_for_base_type(type);
688 }
689 
690 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
691 {
692    assert(size > 0);
693 
694    init();
695 
696    this->file = VGRF;
697    this->nr = v->alloc.allocate(type_size_vec4(type) * size);
698 
699    this->swizzle = BRW_SWIZZLE_NOOP;
700 
701    this->type = brw_type_for_base_type(type);
702 }
703 
704 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
705 {
706    init();
707 
708    this->file = VGRF;
709    this->nr = v->alloc.allocate(type_size_vec4(type));
710 
711    if (type->is_array() || type->is_record()) {
712       this->writemask = WRITEMASK_XYZW;
713    } else {
714       this->writemask = (1 << type->vector_elements) - 1;
715    }
716 
717    this->type = brw_type_for_base_type(type);
718 }
719 
720 vec4_instruction *
721 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
722                           src_reg src0, src_reg src1)
723 {
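   /* SEL with a conditional modifier picks src0 when the comparison holds, so
    * BRW_CONDITIONAL_GE yields max(src0, src1) and BRW_CONDITIONAL_L yields
    * min(src0, src1).
    */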
724    vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
725    inst->conditional_mod = conditionalmod;
726    return inst;
727 }
728 
729 vec4_instruction *
730 vec4_visitor::emit_lrp(const dst_reg &dst,
731                        const src_reg &x, const src_reg &y, const src_reg &a)
732 {
733    if (devinfo->gen >= 6) {
734       /* Note that the instruction's argument order is reversed from GLSL
735        * and the IR.
736        */
737      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
738                      fix_3src_operand(x)));
739    } else {
740       /* Earlier generations don't support three source operations, so we
741        * need to emit x*(1-a) + y*a.
742        */
743       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
744       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
745       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
746       y_times_a.writemask           = dst.writemask;
747       one_minus_a.writemask         = dst.writemask;
748       x_times_one_minus_a.writemask = dst.writemask;
749 
750       emit(MUL(y_times_a, y, a));
751       emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
752       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
753       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
754    }
755 }
756 
757 /**
758  * Emits the instructions needed to perform a pull constant load. before_block
759  * and before_inst can be NULL in which case the instruction will be appended
760  * to the end of the instruction list.
761  */
762 void
763 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
764                                           src_reg surf_index,
765                                           src_reg offset_reg,
766                                           bblock_t *before_block,
767                                           vec4_instruction *before_inst)
768 {
769    assert((before_inst == NULL && before_block == NULL) ||
770           (before_inst && before_block));
771 
772    vec4_instruction *pull;
773 
774    if (devinfo->gen >= 9) {
775       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
776       src_reg header(this, glsl_type::uvec4_type, 2);
777 
778       pull = new(mem_ctx)
779          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
780                           dst_reg(header));
781 
782       if (before_inst)
783          emit_before(before_block, before_inst, pull);
784       else
785          emit(pull);
786 
787       dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
788                                  offset_reg.type);
789       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
790 
791       if (before_inst)
792          emit_before(before_block, before_inst, pull);
793       else
794          emit(pull);
795 
796       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
797                                            dst,
798                                            surf_index,
799                                            header);
800       pull->mlen = 2;
801       pull->header_size = 1;
802    } else if (devinfo->gen >= 7) {
803       dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
804 
805       grf_offset.type = offset_reg.type;
806 
807       pull = MOV(grf_offset, offset_reg);
808 
809       if (before_inst)
810          emit_before(before_block, before_inst, pull);
811       else
812          emit(pull);
813 
814       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
815                                            dst,
816                                            surf_index,
817                                            src_reg(grf_offset));
818       pull->mlen = 1;
819    } else {
820       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
821                                            dst,
822                                            surf_index,
823                                            offset_reg);
824       pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
825       pull->mlen = 1;
826    }
827 
828    if (before_inst)
829       emit_before(before_block, before_inst, pull);
830    else
831       emit(pull);
832 }
833 
834 src_reg
835 vec4_visitor::emit_uniformize(const src_reg &src)
836 {
837    const src_reg chan_index(this, glsl_type::uint_type);
838    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
839                               src.type);
840 
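   /* FIND_LIVE_CHANNEL computes the index of the lowest enabled channel and
    * BROADCAST then replicates that channel's value of src into dst, making
    * the result dynamically uniform.
    */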
841    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
842       ->force_writemask_all = true;
843    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
844       ->force_writemask_all = true;
845 
846    return src_reg(dst);
847 }
848 
849 src_reg
850 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
851                              src_reg coordinate, src_reg surface)
852 {
853    vec4_instruction *inst =
854       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
855                                     dst_reg(this, glsl_type::uvec4_type));
856    inst->base_mrf = 2;
857    inst->src[1] = surface;
858    inst->src[2] = surface;
859 
860    int param_base;
861 
862    if (devinfo->gen >= 9) {
863       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
864       vec4_instruction *header_inst = new(mem_ctx)
865          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
866                           dst_reg(MRF, inst->base_mrf));
867 
868       emit(header_inst);
869 
870       inst->mlen = 2;
871       inst->header_size = 1;
872       param_base = inst->base_mrf + 1;
873    } else {
874       inst->mlen = 1;
875       param_base = inst->base_mrf;
876    }
877 
878    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
879    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
880    int zero_mask = 0xf & ~coord_mask;
881 
882    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
883             coordinate));
884 
885    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
886             brw_imm_d(0)));
887 
888    emit(inst);
889    return src_reg(inst->dst);
890 }
891 
892 bool
893 vec4_visitor::is_high_sampler(src_reg sampler)
894 {
895    if (devinfo->gen < 8 && !devinfo->is_haswell)
896       return false;
897 
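   /* Sampler indices that do not fit in the message descriptor's 4-bit field
    * (>= 16), or that are not known at compile time, need the message header
    * to select the sampler (see the header_size logic in emit_texture).
    */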
898    return sampler.file != IMM || sampler.ud >= 16;
899 }
900 
901 void
902 vec4_visitor::emit_texture(ir_texture_opcode op,
903                            dst_reg dest,
904                            const glsl_type *dest_type,
905                            src_reg coordinate,
906                            int coord_components,
907                            src_reg shadow_comparator,
908                            src_reg lod, src_reg lod2,
909                            src_reg sample_index,
910                            uint32_t constant_offset,
911                            src_reg offset_value,
912                            src_reg mcs,
913                            uint32_t surface,
914                            src_reg surface_reg,
915                            src_reg sampler_reg)
916 {
917    /* The sampler can only meaningfully compute LOD for fragment shader
918     * messages. For all other stages, we change the opcode to TXL and hardcode
919     * the LOD to 0.
920     *
921     * textureQueryLevels() is implemented in terms of TXS so we need to pass a
922     * valid LOD argument.
923     */
924    if (op == ir_tex || op == ir_query_levels) {
925       assert(lod.file == BAD_FILE);
926       lod = brw_imm_f(0.0f);
927    }
928 
929    enum opcode opcode;
930    switch (op) {
931    case ir_tex: opcode = SHADER_OPCODE_TXL; break;
932    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
933    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
934    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
935    case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
936                              SHADER_OPCODE_TXF_CMS); break;
937    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
938    case ir_tg4: opcode = offset_value.file != BAD_FILE
939                          ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
940    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
941    case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
942    case ir_txb:
943       unreachable("TXB is not valid for vertex shaders.");
944    case ir_lod:
945       unreachable("LOD is not valid for vertex shaders.");
946    case ir_samples_identical: {
947       /* There are some challenges implementing this for vec4, and it seems
948        * unlikely to be used anyway.  For now, just always return false.
949        */
950       emit(MOV(dest, brw_imm_ud(0u)));
951       return;
952    }
953    default:
954       unreachable("Unrecognized tex op");
955    }
956 
957    vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
958 
959    inst->offset = constant_offset;
960 
961    /* The message header is necessary for:
962     * - Gen4 (always)
963     * - Gen9+ for selecting SIMD4x2
964     * - Texel offsets
965     * - Gather channel selection
966     * - Sampler indices too large to fit in a 4-bit value.
967     * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
968     */
969    inst->header_size =
970       (devinfo->gen < 5 || devinfo->gen >= 9 ||
971        inst->offset != 0 || op == ir_tg4 ||
972        op == ir_texture_samples ||
973        is_high_sampler(sampler_reg)) ? 1 : 0;
974    inst->base_mrf = 2;
975    inst->mlen = inst->header_size;
976    inst->dst.writemask = WRITEMASK_XYZW;
977    inst->shadow_compare = shadow_comparator.file != BAD_FILE;
978 
979    inst->src[1] = surface_reg;
980    inst->src[2] = sampler_reg;
981 
982    /* MRF for the first parameter */
983    int param_base = inst->base_mrf + inst->header_size;
984 
985    if (op == ir_txs || op == ir_query_levels) {
986       int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
987       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
988       inst->mlen++;
989    } else if (op == ir_texture_samples) {
990       inst->dst.writemask = WRITEMASK_X;
991    } else {
992       /* Load the coordinate */
993       /* FINISHME: gl_clamp_mask and saturate */
994       int coord_mask = (1 << coord_components) - 1;
995       int zero_mask = 0xf & ~coord_mask;
996 
997       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
998                coordinate));
999       inst->mlen++;
1000 
1001       if (zero_mask != 0) {
1002          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
1003                   brw_imm_d(0)));
1004       }
1005       /* Load the shadow comparator */
1006       if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
1007 	 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
1008 			  WRITEMASK_X),
1009 		  shadow_comparator));
1010 	 inst->mlen++;
1011       }
1012 
1013       /* Load the LOD info */
1014       if (op == ir_tex || op == ir_txl) {
1015 	 int mrf, writemask;
1016 	 if (devinfo->gen >= 5) {
1017 	    mrf = param_base + 1;
1018 	    if (shadow_comparator.file != BAD_FILE) {
1019 	       writemask = WRITEMASK_Y;
1020 	       /* mlen already incremented */
1021 	    } else {
1022 	       writemask = WRITEMASK_X;
1023 	       inst->mlen++;
1024 	    }
1025 	 } else /* devinfo->gen == 4 */ {
1026 	    mrf = param_base;
1027 	    writemask = WRITEMASK_W;
1028 	 }
1029 	 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1030       } else if (op == ir_txf) {
1031          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1032       } else if (op == ir_txf_ms) {
1033          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1034                   sample_index));
1035          if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1036             /* MCS data is stored in the first two channels of ‘mcs’, but we
1037              * need to get it into the .y and .z channels of the second vec4
1038              * of params.
1039              */
1040             mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1041             emit(MOV(dst_reg(MRF, param_base + 1,
1042                              glsl_type::uint_type, WRITEMASK_YZ),
1043                      mcs));
1044          } else if (devinfo->gen >= 7) {
1045             /* MCS data is in the first channel of `mcs`, but we need to get it into
1046              * the .y channel of the second vec4 of params, so replicate .x across
1047              * the whole vec4 and then mask off everything except .y
1048              */
1049             mcs.swizzle = BRW_SWIZZLE_XXXX;
1050             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1051                      mcs));
1052          }
1053          inst->mlen++;
1054       } else if (op == ir_txd) {
1055          const brw_reg_type type = lod.type;
1056 
1057 	 if (devinfo->gen >= 5) {
1058 	    lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1059 	    lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1060 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1061 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1062 	    inst->mlen++;
1063 
1064 	    if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
1065 	       lod.swizzle = BRW_SWIZZLE_ZZZZ;
1066 	       lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1067 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1068 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1069 	       inst->mlen++;
1070 
1071                if (shadow_comparator.file != BAD_FILE) {
1072                   emit(MOV(dst_reg(MRF, param_base + 2,
1073                                    shadow_comparator.type, WRITEMASK_Z),
1074                            shadow_comparator));
1075                }
1076 	    }
1077 	 } else /* devinfo->gen == 4 */ {
1078 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1079 	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1080 	    inst->mlen += 2;
1081 	 }
1082       } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1083          if (shadow_comparator.file != BAD_FILE) {
1084             emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
1085                      shadow_comparator));
1086          }
1087 
1088          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1089                   offset_value));
1090          inst->mlen++;
1091       }
1092    }
1093 
1094    emit(inst);
1095 
1096    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1097     * spec requires layers.
1098     */
1099    if (op == ir_txs && devinfo->gen < 7) {
1100       /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1101       emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1102                   src_reg(inst->dst), brw_imm_d(1));
1103    }
1104 
1105    if (devinfo->gen == 6 && op == ir_tg4) {
1106       emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1107    }
1108 
1109    if (op == ir_query_levels) {
1110       /* # levels is in .w */
1111       src_reg swizzled(dest);
1112       swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1113                                       SWIZZLE_W, SWIZZLE_W);
1114       emit(MOV(dest, swizzled));
1115    }
1116 }
1117 
1118 /**
1119  * Apply workarounds for Gen6 gather with UINT/SINT
1120  */
1121 void
1122 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1123 {
1124    if (!wa)
1125       return;
1126 
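   /* The gathered value comes back as a normalized float, so rescale it to
    * the integer range given by the workaround's bit width and, for signed
    * formats, sign-extend the result.
    */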
1127    int width = (wa & WA_8BIT) ? 8 : 16;
1128    dst_reg dst_f = dst;
1129    dst_f.type = BRW_REGISTER_TYPE_F;
1130 
1131    /* Convert from UNORM to UINT */
1132    emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1133    emit(MOV(dst, src_reg(dst_f)));
1134 
1135    if (wa & WA_SIGN) {
1136       /* Reinterpret the UINT value as a signed INT value by
1137        * shifting the sign bit into place, then shifting back
1138        * preserving sign.
1139        */
1140       emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1141       emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1142    }
1143 }
1144 
1145 void
1146 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1147 {
1148    unreachable("not reached");
1149 }
1150 
1151 void
1152 vec4_visitor::gs_end_primitive()
1153 {
1154    unreachable("not reached");
1155 }
1156 
1157 void
1158 vec4_visitor::emit_ndc_computation()
1159 {
1160    if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1161       return;
1162 
1163    /* Get the position */
1164    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1165 
1166    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1167    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1168    output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1169    output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1170 
1171    current_annotation = "NDC";
1172    dst_reg ndc_w = ndc;
1173    ndc_w.writemask = WRITEMASK_W;
1174    src_reg pos_w = pos;
1175    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1176    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1177 
1178    dst_reg ndc_xyz = ndc;
1179    ndc_xyz.writemask = WRITEMASK_XYZ;
1180 
1181    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1182 }
1183 
1184 void
1185 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1186 {
1187    if (devinfo->gen < 6 &&
1188        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1189         output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1190         devinfo->has_negative_rhw_bug)) {
1191       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1192       dst_reg header1_w = header1;
1193       header1_w.writemask = WRITEMASK_W;
1194 
1195       emit(MOV(header1, brw_imm_ud(0u)));
1196 
1197       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1198 	 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1199 
1200 	 current_annotation = "Point size";
1201 	 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1202 	 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1203       }
1204 
1205       if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1206          current_annotation = "Clipping flags";
1207          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1208          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1209 
1210          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1211          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1212          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1213 
1214          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1215          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1216          emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1217          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1218       }
1219 
1220       /* i965 clipping workaround:
1221        * 1) Test for -ve rhw
1222        * 2) If set,
1223        *      set ndc = (0,0,0,0)
1224        *      set ucp[6] = 1
1225        *
1226        * Later, clipping will detect ucp[6] and ensure the primitive is
1227        * clipped against all fixed planes.
1228        */
1229       if (devinfo->has_negative_rhw_bug &&
1230           output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1231          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1232          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1233          emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1234          vec4_instruction *inst;
1235          inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1236          inst->predicate = BRW_PREDICATE_NORMAL;
1237          output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1238          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1239          inst->predicate = BRW_PREDICATE_NORMAL;
1240       }
1241 
1242       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1243    } else if (devinfo->gen < 6) {
1244       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1245    } else {
1246       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1247       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1248          dst_reg reg_w = reg;
1249          reg_w.writemask = WRITEMASK_W;
1250          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1251          reg_as_src.type = reg_w.type;
1252          reg_as_src.swizzle = brw_swizzle_for_size(1);
1253          emit(MOV(reg_w, reg_as_src));
1254       }
1255       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1256          dst_reg reg_y = reg;
1257          reg_y.writemask = WRITEMASK_Y;
1258          reg_y.type = BRW_REGISTER_TYPE_D;
1259          output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1260          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1261       }
1262       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1263          dst_reg reg_z = reg;
1264          reg_z.writemask = WRITEMASK_Z;
1265          reg_z.type = BRW_REGISTER_TYPE_D;
1266          output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1267          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1268       }
1269    }
1270 }
1271 
1272 vec4_instruction *
1273 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1274 {
1275    assert(varying < VARYING_SLOT_MAX);
1276 
1277    unsigned num_comps = output_num_components[varying][component];
1278    if (num_comps == 0)
1279       return NULL;
1280 
1281    assert(output_reg[varying][component].type == reg.type);
1282    current_annotation = output_reg_annotation[varying];
1283    if (output_reg[varying][component].file != BAD_FILE) {
1284       src_reg src = src_reg(output_reg[varying][component]);
1285       src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1286       reg.writemask =
1287          brw_writemask_for_component_packing(num_comps, component);
1288       return emit(MOV(reg, src));
1289    }
1290    return NULL;
1291 }
1292 
1293 void
1294 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1295 {
1296    reg.type = BRW_REGISTER_TYPE_F;
1297    output_reg[varying][0].type = reg.type;
1298 
1299    switch (varying) {
1300    case VARYING_SLOT_PSIZ:
1301    {
1302       /* PSIZ is always in slot 0, and is coupled with other flags. */
1303       current_annotation = "indices, point width, clip flags";
1304       emit_psiz_and_flags(reg);
1305       break;
1306    }
1307    case BRW_VARYING_SLOT_NDC:
1308       current_annotation = "NDC";
1309       if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1310          emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1311       break;
1312    case VARYING_SLOT_POS:
1313       current_annotation = "gl_Position";
1314       if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1315          emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1316       break;
1317    case VARYING_SLOT_EDGE:
1318       /* This is present when doing unfilled polygons.  We're supposed to copy
1319        * the edge flag from the user-provided vertex array
1320        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1321        * of that attribute (starts as 1.0f).  This is then used in clipping to
1322        * determine which edges should be drawn as wireframe.
1323        */
1324       current_annotation = "edge flag";
1325       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1326                                     glsl_type::float_type, WRITEMASK_XYZW))));
1327       break;
1328    case BRW_VARYING_SLOT_PAD:
1329       /* No need to write to this slot */
1330       break;
1331    default:
1332       for (int i = 0; i < 4; i++) {
1333          emit_generic_urb_slot(reg, varying, i);
1334       }
1335       break;
1336    }
1337 }
1338 
1339 static int
1340 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
1341 {
1342    if (devinfo->gen >= 6) {
1343       /* URB data written (does not include the message header reg) must
1344        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1345        * section 5.4.3.2.2: URB_INTERLEAVED.
1346        *
1347        * URB entries are allocated on a multiple of 1024 bits, so an
1348        * extra 128 bits written here to make the end align to 256 is
1349        * no problem.
1350        */
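      /* mlen includes the single message header register, so the data portion
       * is mlen - 1; rounding mlen up to an odd value makes that data length
       * even.
       */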
1351       if ((mlen % 2) != 1)
1352 	 mlen++;
1353    }
1354 
1355    return mlen;
1356 }
1357 
1358 
1359 /**
1360  * Generates the VUE payload plus the necessary URB write instructions to
1361  * output it.
1362  *
1363  * The VUE layout is documented in Volume 2a.
1364  */
1365 void
1366 vec4_visitor::emit_vertex()
1367 {
1368    /* MRF 0 is reserved for the debugger, so start with message header
1369     * in MRF 1.
1370     */
1371    int base_mrf = 1;
1372    int mrf = base_mrf;
1373    /* In the process of generating our URB write message contents, we
1374     * may need to unspill a register or load from an array.  Those
1375     * reads would use MRFs 14-15.
1376     */
1377    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1378 
1379    /* The following assertion verifies that max_usable_mrf causes an
1380     * even-numbered amount of URB write data, which will meet gen6's
1381     * requirements for length alignment.
1382     */
1383    assert ((max_usable_mrf - base_mrf) % 2 == 0);
1384 
1385    /* First mrf is the g0-based message header containing URB handles and
1386     * such.
1387     */
1388    emit_urb_write_header(mrf++);
1389 
1390    if (devinfo->gen < 6) {
1391       emit_ndc_computation();
1392    }
1393 
1394    /* We may need to split this up into several URB writes, so do them in a
1395     * loop.
1396     */
1397    int slot = 0;
1398    bool complete = false;
1399    do {
1400       /* URB offset is in URB row increments, and each of our MRFs is half of
1401        * one of those, since we're doing interleaved writes.
1402        */
1403       int offset = slot / 2;
1404 
1405       mrf = base_mrf + 1;
1406       for (; slot < prog_data->vue_map.num_slots; ++slot) {
1407          emit_urb_slot(dst_reg(MRF, mrf++),
1408                        prog_data->vue_map.slot_to_varying[slot]);
1409 
1410          /* If this was max_usable_mrf, we can't fit anything more into this
1411           * URB WRITE. Same thing if we reached the maximum length available.
1412           */
1413          if (mrf > max_usable_mrf ||
1414              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1415             slot++;
1416             break;
1417          }
1418       }
1419 
1420       complete = slot >= prog_data->vue_map.num_slots;
1421       current_annotation = "URB write";
1422       vec4_instruction *inst = emit_urb_write_opcode(complete);
1423       inst->base_mrf = base_mrf;
1424       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1425       inst->offset += offset;
1426    } while(!complete);
1427 }
1428 
1429 
1430 src_reg
1431 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1432 				 src_reg *reladdr, int reg_offset)
1433 {
1434    /* Because we store the values to scratch interleaved like our
1435     * vertex data, we need to scale the vec4 index by 2.
1436     */
1437    int message_header_scale = 2;
1438 
1439    /* Pre-gen6, the message header uses byte offsets instead of vec4
1440     * (16-byte) offset units.
1441     */
1442    if (devinfo->gen < 6)
1443       message_header_scale *= 16;
1444 
1445    if (reladdr) {
1446       /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1447        * to multiply the reladdr by 2. Notice that the reg_offset part
1448        * is in units of 16 bytes and is used to select the low/high 16-byte
1449        * chunk of a full dvec4, so we don't want to multiply that part.
1450        */
1451       src_reg index = src_reg(this, glsl_type::int_type);
1452       if (type_sz(inst->dst.type) < 8) {
1453          emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1454                                       brw_imm_d(reg_offset)));
1455          emit_before(block, inst, MUL(dst_reg(index), index,
1456                                       brw_imm_d(message_header_scale)));
1457       } else {
1458          emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1459                                       brw_imm_d(message_header_scale * 2)));
1460          emit_before(block, inst, ADD(dst_reg(index), index,
1461                                       brw_imm_d(reg_offset * message_header_scale)));
1462       }
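           /* Net result in the message header's offset units: for 32-bit
            * types, index = (reladdr + reg_offset) * message_header_scale;
            * for 64-bit types, index = reladdr * 2 * message_header_scale +
            * reg_offset * message_header_scale.
            */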
1463       return index;
1464    } else {
1465       return brw_imm_d(reg_offset * message_header_scale);
1466    }
1467 }
1468 
1469 /**
1470  * Emits an instruction before @inst to load the value named by @orig_src
1471  * from scratch space at @base_offset to @temp.
1472  *
1473  * @base_offset is measured in 32-byte units (the size of a register).
1474  */
1475 void
1476 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1477 				dst_reg temp, src_reg orig_src,
1478 				int base_offset)
1479 {
1480    assert(orig_src.offset % REG_SIZE == 0);
1481    int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1482    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1483                                       reg_offset);
1484 
1485    if (type_sz(orig_src.type) < 8) {
1486       emit_before(block, inst, SCRATCH_READ(temp, index));
1487    } else {
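           /* A dvec4 spans two registers, so emit two scratch reads into a
            * float-typed temporary and then shuffle the result into the
            * 64-bit layout expected by @temp.
            */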
1488       dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1489       dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1490       emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1491       index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1492       vec4_instruction *last_read =
1493          SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1494       emit_before(block, inst, last_read);
1495       shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1496    }
1497 }
1498 
1499 /**
1500  * Emits an instruction after @inst to store @inst's result to scratch
1501  * space at @base_offset, rewriting @inst's destination to a temporary.
1502  *
1503  * @base_offset is measured in 32-byte units (the size of a register).
1504  */
1505 void
1506 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1507                                  int base_offset)
1508 {
1509    assert(inst->dst.offset % REG_SIZE == 0);
1510    int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1511    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1512                                       reg_offset);
1513 
1514    /* Create a temporary register to store *inst's result in.
1515     *
1516     * We have to be careful in MOVing from our temporary result register in
1517     * the scratch write.  If we swizzle from channels of the temporary that
1518     * weren't initialized, it will confuse live interval analysis, which will
1519     * make spilling fail to make progress.
1520     */
1521    bool is_64bit = type_sz(inst->dst.type) == 8;
1522    const glsl_type *alloc_type =
1523       is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1524    const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1525                                        inst->dst.type),
1526                                 brw_swizzle_for_mask(inst->dst.writemask));
1527 
1528    if (!is_64bit) {
1529       dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1530 				          inst->dst.writemask));
1531       vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1532       if (inst->opcode != BRW_OPCODE_SEL)
1533          write->predicate = inst->predicate;
1534       write->ir = inst->ir;
1535       write->annotation = inst->annotation;
1536       inst->insert_after(block, write);
1537    } else {
1538       dst_reg shuffled = dst_reg(this, alloc_type);
1539       vec4_instruction *last =
1540          shuffle_64bit_data(shuffled, temp, true, block, inst);
1541       src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1542 
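           /* After shuffling for a write, each 64-bit channel occupies two
            * 32-bit channels: the first register holds DF channels X/Y and
            * the second holds Z/W, so each bit of the 64-bit writemask
            * expands to an XY or ZW pair in the scratch writes below.
            */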
1543       uint8_t mask = 0;
1544       if (inst->dst.writemask & WRITEMASK_X)
1545          mask |= WRITEMASK_XY;
1546       if (inst->dst.writemask & WRITEMASK_Y)
1547          mask |= WRITEMASK_ZW;
1548       if (mask) {
1549          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1550 
1551          vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1552          if (inst->opcode != BRW_OPCODE_SEL)
1553             write->predicate = inst->predicate;
1554          write->ir = inst->ir;
1555          write->annotation = inst->annotation;
1556          last->insert_after(block, write);
1557       }
1558 
1559       mask = 0;
1560       if (inst->dst.writemask & WRITEMASK_Z)
1561          mask |= WRITEMASK_XY;
1562       if (inst->dst.writemask & WRITEMASK_W)
1563          mask |= WRITEMASK_ZW;
1564       if (mask) {
1565          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1566 
1567          src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1568                                             reg_offset + 1);
1569          vec4_instruction *write =
1570             SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1571          if (inst->opcode != BRW_OPCODE_SEL)
1572             write->predicate = inst->predicate;
1573          write->ir = inst->ir;
1574          write->annotation = inst->annotation;
1575          last->insert_after(block, write);
1576       }
1577    }
1578 
1579    inst->dst.file = temp.file;
1580    inst->dst.nr = temp.nr;
1581    inst->dst.offset %= REG_SIZE;
1582    inst->dst.reladdr = NULL;
1583 }
1584 
1585 /**
1586  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1587  * adds the scratch read(s) before \p inst. The function also checks for
1588  * recursive reladdr scratch accesses, issuing the corresponding scratch
1589  * loads and rewriting reladdr references accordingly.
1590  *
1591  * \return \p src if it did not require a scratch load, otherwise, the
1592  * register holding the result of the scratch load that the caller should
1593  * use to rewrite src.
1594  */
1595 src_reg
1596 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1597                                    vec4_instruction *inst, src_reg src)
1598 {
1599    /* Resolve recursive reladdr scratch access by calling ourselves
1600     * with src.reladdr
1601     */
1602    if (src.reladdr)
1603       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1604                                           *src.reladdr);
1605 
1606    /* Now handle scratch access on src */
1607    if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1608       dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1609          glsl_type::dvec4_type : glsl_type::vec4_type);
1610       emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1611       src.nr = temp.nr;
1612       src.offset %= REG_SIZE;
1613       src.reladdr = NULL;
1614    }
1615 
1616    return src;
1617 }
1618 
1619 /**
1620  * We can't generally support array access in GRF space, because a
1621  * single instruction's destination can only span 2 contiguous
1622  * registers.  So, we send all GRF arrays that get variable index
1623  * access to scratch space.
1624  */
1625 void
1626 vec4_visitor::move_grf_array_access_to_scratch()
1627 {
1628    int scratch_loc[this->alloc.count];
1629    memset(scratch_loc, -1, sizeof(scratch_loc));
1630 
1631    /* First, calculate the set of virtual GRFs that need to be punted
1632     * to scratch due to having any array access on them, and where in
1633     * scratch.
1634     */
1635    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1636       if (inst->dst.file == VGRF && inst->dst.reladdr) {
1637          if (scratch_loc[inst->dst.nr] == -1) {
1638             scratch_loc[inst->dst.nr] = last_scratch;
1639             last_scratch += this->alloc.sizes[inst->dst.nr];
1640          }
1641 
1642          for (src_reg *iter = inst->dst.reladdr;
1643               iter->reladdr;
1644               iter = iter->reladdr) {
1645             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1646                scratch_loc[iter->nr] = last_scratch;
1647                last_scratch += this->alloc.sizes[iter->nr];
1648             }
1649          }
1650       }
1651 
1652       for (int i = 0; i < 3; i++) {
1653          for (src_reg *iter = &inst->src[i];
1654               iter->reladdr;
1655               iter = iter->reladdr) {
1656             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1657                scratch_loc[iter->nr] = last_scratch;
1658                last_scratch += this->alloc.sizes[iter->nr];
1659             }
1660          }
1661       }
1662    }
1663 
1664    /* Now, for anything that will be accessed through scratch, rewrite
1665     * it to load/store.  Note that this is a _safe list walk, because
1666     * we may generate a new scratch_write instruction after the one
1667     * we're processing.
1668     */
1669    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1670       /* Set up the annotation tracking for newly generated instructions. */
1671       base_ir = inst->ir;
1672       current_annotation = inst->annotation;
1673 
1674       /* First handle scratch access on the dst. Notice we have to handle
1675        * the case where the dst's reladdr also points to scratch space.
1676        */
1677       if (inst->dst.reladdr)
1678          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1679                                                    *inst->dst.reladdr);
1680 
1681       /* Now that we have handled any (possibly recursive) reladdr scratch
1682        * accesses for dst we can safely do the scratch write for dst itself
1683        */
1684       if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1685          emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1686 
1687       /* Now handle scratch access on any src. In this case, since inst->src[i]
1688        * already is a src_reg, we can just call emit_resolve_reladdr with
1689        * inst->src[i] and it will take care of handling scratch loads for
1690        * both src and src.reladdr (recursively).
1691        */
1692       for (int i = 0; i < 3; i++) {
1693          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1694                                              inst->src[i]);
1695       }
1696    }
1697 }
1698 
1699 /**
1700  * Emits an instruction before @inst to load the value named by @orig_src
1701  * from the pull constant buffer (surface) at @base_offset to @temp.
1702  */
1703 void
1704 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1705                                       dst_reg temp, src_reg orig_src,
1706                                       int base_offset, src_reg indirect)
1707 {
1708    assert(orig_src.offset % 16 == 0);
1709    const unsigned index = prog_data->base.binding_table.pull_constants_start;
1710 
1711    /* For 64-bit loads we need to emit two 32-bit load messages, and we also
1712     * need to shuffle the 32-bit result into proper 64-bit data. To do that,
1713     * we emit the 32-bit loads into a temporary and shuffle the result into
1714     * the original destination.
1715     */
1716    dst_reg orig_temp = temp;
1717    bool is_64bit = type_sz(orig_src.type) == 8;
1718    if (is_64bit) {
1719       assert(type_sz(temp.type) == 8);
1720       dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
1721       temp = retype(temp_df, BRW_REGISTER_TYPE_F);
1722    }
1723 
1724    src_reg src = orig_src;
1725    for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
1726       int reg_offset = base_offset + src.offset / 16;
1727 
1728       src_reg offset;
1729       if (indirect.file != BAD_FILE) {
1730          offset = src_reg(this, glsl_type::uint_type);
1731          emit_before(block, inst, ADD(dst_reg(offset), indirect,
1732                                       brw_imm_ud(reg_offset * 16)));
1733       } else if (devinfo->gen >= 8) {
1734          /* Store the offset in a GRF so we can send-from-GRF. */
1735          offset = src_reg(this, glsl_type::uint_type);
1736          emit_before(block, inst, MOV(dst_reg(offset),
1737                                       brw_imm_ud(reg_offset * 16)));
1738       } else {
1739          offset = brw_imm_d(reg_offset * 16);
1740       }
1741 
1742       emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
1743                                   brw_imm_ud(index),
1744                                   offset,
1745                                   block, inst);
1746 
1747       src = byte_offset(src, 16);
1748    }
1749 
1750    brw_mark_surface_used(&prog_data->base, index);
1751 
1752    if (is_64bit) {
1753       temp = retype(temp, BRW_REGISTER_TYPE_DF);
1754       shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
1755    }
1756 }
1757 
1758 /**
1759  * Implements array access of uniforms by inserting a
1760  * PULL_CONSTANT_LOAD instruction.
1761  *
1762  * Unlike temporary GRF array access (where we don't support it due to
1763  * the difficulty of doing relative addressing on instruction
1764  * destinations), we could potentially do array access of uniforms
1765  * that were loaded in GRF space as push constants.  In real-world
1766  * usage we've seen, though, the arrays being used are always larger
1767  * than we could load as push constants, so just always move all
1768  * uniform array access out to a pull constant buffer.
1769  */
1770 void
1771 vec4_visitor::move_uniform_array_access_to_pull_constants()
1772 {
1773    /* The Vulkan driver doesn't support pull constants other than UBOs, so
1774     * everything has to be pushed regardless.
1775     */
1776    if (stage_prog_data->pull_param == NULL) {
1777       split_uniform_registers();
1778       return;
1779    }
1780 
1781    int pull_constant_loc[this->uniforms];
1782    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1783 
1784    /* First, walk through the instructions and determine which things need to
1785     * be pulled.  We mark something as needing to be pulled by setting
1786     * pull_constant_loc to 0.
1787     */
1788    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1789       /* We only care about MOV_INDIRECT of a uniform */
1790       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1791           inst->src[0].file != UNIFORM)
1792          continue;
1793 
1794       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1795 
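           /* src[2] holds the size of the indirect read in bytes, so mark
            * every 16-byte (vec4) uniform slot the access could touch.
            */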
1796       for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1797          pull_constant_loc[uniform_nr + j] = 0;
1798    }
1799 
1800    /* Next, we walk the list of uniforms and assign real pull constant
1801     * locations and set their corresponding entries in pull_param.
1802     */
1803    for (int j = 0; j < this->uniforms; j++) {
1804       if (pull_constant_loc[j] < 0)
1805          continue;
1806 
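           /* pull_param holds one entry per 32-bit component, so each vec4
            * uniform occupies four consecutive entries and its location is
            * recorded in vec4 units.
            */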
1807       pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1808 
1809       for (int i = 0; i < 4; i++) {
1810          stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1811             = stage_prog_data->param[j * 4 + i];
1812       }
1813    }
1814 
1815    /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1816     * instructions to actual uniform pulls.
1817     */
1818    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1819       /* We only care about MOV_INDIRECT of a uniform */
1820       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1821           inst->src[0].file != UNIFORM)
1822          continue;
1823 
1824       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1825 
1826       assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1827 
1828       emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1829                               pull_constant_loc[uniform_nr], inst->src[1]);
1830       inst->remove(block);
1831    }
1832 
1833    /* Now there are no accesses of the UNIFORM file with a reladdr, so
1834     * no need to track them as larger-than-vec4 objects.  This will be
1835     * relied on in cutting out unused uniform vectors from push
1836     * constants.
1837     */
1838    split_uniform_registers();
1839 }
1840 
1841 void
1842 vec4_visitor::resolve_ud_negate(src_reg *reg)
1843 {
1844    if (reg->type != BRW_REGISTER_TYPE_UD ||
1845        !reg->negate)
1846       return;
1847 
1848    src_reg temp = src_reg(this, glsl_type::uvec4_type);
1849    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1850    *reg = temp;
1851 }
1852 
1853 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1854                            void *log_data,
1855                            const struct brw_sampler_prog_key_data *key_tex,
1856                            struct brw_vue_prog_data *prog_data,
1857                            const nir_shader *shader,
1858 			   void *mem_ctx,
1859                            bool no_spills,
1860                            int shader_time_index)
1861    : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1862      key_tex(key_tex),
1863      prog_data(prog_data),
1864      fail_msg(NULL),
1865      first_non_payload_grf(0),
1866      need_all_constants_in_pull_buffer(false),
1867      no_spills(no_spills),
1868      shader_time_index(shader_time_index),
1869      last_scratch(0)
1870 {
1871    this->failed = false;
1872 
1873    this->base_ir = NULL;
1874    this->current_annotation = NULL;
1875    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1876 
1877    memset(this->output_num_components, 0, sizeof(this->output_num_components));
1878 
1879    this->virtual_grf_start = NULL;
1880    this->virtual_grf_end = NULL;
1881    this->live_intervals = NULL;
1882 
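        /* Gen7+ has no real MRF registers; the top of the GRF file (starting
         * at GEN7_MRF_HACK_START) is reserved to stand in for them, so it is
         * not available for general allocation.
         */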
1883    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1884 
1885    this->uniforms = 0;
1886 }
1887 
1888 vec4_visitor::~vec4_visitor()
1889 {
1890 }
1891 
1892 
1893 void
1894 vec4_visitor::fail(const char *format, ...)
1895 {
1896    va_list va;
1897    char *msg;
1898 
1899    if (failed)
1900       return;
1901 
1902    failed = true;
1903 
1904    va_start(va, format);
1905    msg = ralloc_vasprintf(mem_ctx, format, va);
1906    va_end(va);
1907    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1908 
1909    this->fail_msg = msg;
1910 
1911    if (debug_enabled) {
1912       fprintf(stderr, "%s",  msg);
1913    }
1914 }
1915 
1916 } /* namespace brw */
1917