/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_eu.h"
#include "util/u_math.h"

namespace brw {

vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
                                   const src_reg &src0, const src_reg &src1,
                                   const src_reg &src2)
{
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;
   this->saturate = false;
   this->force_writemask_all = false;
   this->no_dd_clear = false;
   this->no_dd_check = false;
   this->writes_accumulator = false;
   this->conditional_mod = BRW_CONDITIONAL_NONE;
   this->predicate = BRW_PREDICATE_NONE;
   this->predicate_inverse = false;
   this->target = 0;
   this->shadow_compare = false;
   this->eot = false;
   this->ir = NULL;
   this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
   this->header_size = 0;
   this->flag_subreg = 0;
   this->mlen = 0;
   this->base_mrf = 0;
   this->offset = 0;
   this->exec_size = 8;
   this->group = 0;
   this->size_written = (dst.file == BAD_FILE ?
                         0 : this->exec_size * type_sz(dst.type));
   this->annotation = NULL;
}

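/**
 * Append an already-constructed instruction to the end of the current
 * instruction list, tagging it with the visitor's current IR pointer and
 * annotation so later debug output can be attributed to its source.
 */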
vec4_instruction *
vec4_visitor::emit(vec4_instruction *inst)
{
   inst->ir = this->base_ir;
   inst->annotation = this->current_annotation;

   this->instructions.push_tail(inst);

   return inst;
}

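/**
 * Insert @new_inst into the instruction stream immediately before @inst,
 * inheriting @inst's IR pointer and annotation.
 */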
vec4_instruction *
vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
                          vec4_instruction *new_inst)
{
   new_inst->ir = inst->ir;
   new_inst->annotation = inst->annotation;

   inst->insert_before(block, new_inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1, const src_reg &src2)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
}


vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
}

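/*
 * Convenience builders named after the hardware opcodes they wrap.  Each
 * returns a newly allocated vec4_instruction without emitting it, so callers
 * can tweak fields (predication, saturate, etc.) before passing it to emit().
 */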
#define ALU1(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
   }

#define ALU2(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1);                 \
   }

#define ALU2_ACC(op)                                                    \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
                       BRW_OPCODE_##op, dst, src0, src1);               \
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1, const src_reg &src2)           \
   {                                                                    \
      assert(devinfo->ver >= 6);                                        \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1, src2);           \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU1(F32TO16)
ALU1(F16TO32)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(MAC)
ALU1(DIM)

/** Gfx4 predicated IF. */
vec4_instruction *
vec4_visitor::IF(enum brw_predicate predicate)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
   inst->predicate = predicate;

   return inst;
}

/** Gfx6 IF with embedded comparison. */
vec4_instruction *
vec4_visitor::IF(src_reg src0, src_reg src1,
                 enum brw_conditional_mod condition)
{
   assert(devinfo->ver == 6);

   vec4_instruction *inst;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
                                        src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
                  enum brw_conditional_mod condition)
{
   vec4_instruction *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gfx4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    *
    * The destination type doesn't matter on newer generations, so we set the
    * type to match src0 so we can compact the instruction.
    */
   dst.type = src0.type;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GFX4_SCRATCH_READ,
                                        dst, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver) + 1;
   inst->mlen = 2;

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
                            const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GFX4_SCRATCH_WRITE,
                                        dst, src, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver);
   inst->mlen = 3;

   return inst;
}

src_reg
vec4_visitor::fix_3src_operand(const src_reg &src)
{
   /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
    * able to use vertical stride of zero to replicate the vec4 uniform, like
    *
    *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
    *
    * But you can't, since vertical stride is always four in three-source
    * instructions. Instead, insert a MOV instruction to do the replication so
    * that the three-source instruction can consume it.
    */

   /* The MOV is only needed if the source is a uniform or immediate. */
   if (src.file != UNIFORM && src.file != IMM)
      return src;

   if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
   return src_reg(expanded);
}

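/**
 * Expand a math-instruction operand into a temporary GRF when the current
 * hardware generation cannot consume it directly (see the gfx6/gfx7
 * restrictions described in the function body).
 */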
src_reg
vec4_visitor::fix_math_operand(const src_reg &src)
{
   if (devinfo->ver < 6 || src.file == BAD_FILE)
      return src;

   /* The gfx6 math instruction ignores the source modifiers --
    * swizzle, abs, negate, and at least some parts of the register
    * region description.
    *
    * Rather than trying to enumerate all these cases, *always* expand the
    * operand to a temp GRF for gfx6.
    *
    * For gfx7, keep the operand as-is, except if immediate, which gfx7 still
    * can't use.
    */

   if (devinfo->ver == 7 && src.file != IMM)
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(MOV(expanded, src));
   return src_reg(expanded);
}

vec4_instruction *
vec4_visitor::emit_math(enum opcode opcode,
                        const dst_reg &dst,
                        const src_reg &src0, const src_reg &src1)
{
   vec4_instruction *math =
      emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));

   if (devinfo->ver == 6 && dst.writemask != WRITEMASK_XYZW) {
      /* MATH on Gfx6 must be align1, so we can't do writemasks. */
      math->dst = dst_reg(this, glsl_type::vec4_type);
      math->dst.type = dst.type;
      math = emit(MOV(dst, src_reg(math->dst)));
   } else if (devinfo->ver < 6) {
      math->base_mrf = 1;
      math->mlen = src1.file == BAD_FILE ? 1 : 2;
   }

   return math;
}

void
vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->ver < 7) {
      unreachable("ir_unop_pack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    *
    * The above restriction implies that the f32to16 instruction must use
    * align1 mode, because only in align1 mode is it possible to specify
    * horizontal stride.  We choose here to defy the hardware docs and emit
    * align16 instructions.
    *
    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
    * instructions. I was partially successful in that the code passed all
    * tests.  However, the code was dubiously correct and fragile, and the
    * tests were not harsh enough to probe that frailty. Not trusting the
    * code, I chose instead to remain in align16 mode in defiance of the hw
    * docs).
    *
    * I've [chadv] experimentally confirmed that, on gfx7 hardware and the
    * simulator, emitting a f32to16 in align16 mode with UD as destination
    * data type is safe. The behavior differs from that specified in the PRM
    * in that the upper word of each destination channel is cleared to 0.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

#if 0
   /* Verify the undocumented behavior on which the following instructions
    * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
    * then the result of the bit-or instruction below will be incorrect.
    *
    * You should inspect the disasm output in order to verify that the MOV is
    * not optimized away.
    */
   emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
#endif

   /* Give tmp the form below, where "." means untouched.
    *
    *     w z          y          x w z          y          x
    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
    *
    * That the upper word of each write-channel be 0 is required for the
    * following bit-shift and bit-or instructions to work. Note that this
    * relies on the undocumented hardware behavior mentioned above.
    */
   tmp_dst.writemask = WRITEMASK_XY;
   emit(F32TO16(tmp_dst, src0));

   /* Give the write-channels of dst the form:
    *   0xhhhh0000
    */
   tmp_src.swizzle = BRW_SWIZZLE_YYYY;
   emit(SHL(dst, tmp_src, brw_imm_ud(16u)));

   /* Finally, give the write-channels of dst the form of packHalf2x16's
    * output:
    *   0xhhhhllll
    */
   tmp_src.swizzle = BRW_SWIZZLE_XXXX;
   emit(OR(dst, src_reg(dst), tmp_src));
}

void
vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->ver < 7) {
      unreachable("ir_unop_unpack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src0.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    *
    * To use W as the source data type, we must adjust horizontal strides,
    * which is only possible in align1 mode. All my [chadv] attempts at
    * emitting align1 instructions for unpackHalf2x16 failed to pass the
    * Piglit tests, so I gave up.
    *
    * I've verified that, on gfx7 hardware and the simulator, it is safe to
    * emit f16to32 in align16 mode with UD as source data type.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

   tmp_dst.writemask = WRITEMASK_X;
   emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));

   tmp_dst.writemask = WRITEMASK_Y;
   emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));

   dst.writemask = WRITEMASK_XY;
   emit(F16TO32(dst, tmp_src));
}

void
vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
   dst_reg shift(this, glsl_type::uvec4_type);
   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_type::uvec4_type);
   src0.swizzle = BRW_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = BRW_REGISTER_TYPE_UB;
   dst_reg f(this, glsl_type::vec4_type);
   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
}

void
vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
   dst_reg shift(this, glsl_type::uvec4_type);
   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_type::uvec4_type);
   src0.swizzle = BRW_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = BRW_REGISTER_TYPE_B;
   dst_reg f(this, glsl_type::vec4_type);
   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));

   dst_reg max(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
   emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
}

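/**
 * packUnorm4x8(): saturate to [0, 1], scale by 255, round, convert to
 * unsigned integers and pack the four resulting bytes into a single dword.
 */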
void
vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg saturated(this, glsl_type::vec4_type);
   vec4_instruction *inst = emit(MOV(saturated, src0));
   inst->saturate = true;

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));

   dst_reg rounded(this, glsl_type::vec4_type);
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg u(this, glsl_type::uvec4_type);
   emit(MOV(u, src_reg(rounded)));

   src_reg bytes(u);
   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
}

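/**
 * packSnorm4x8(): clamp to [-1, 1], scale by 127, round, convert to signed
 * integers and pack the four resulting bytes into a single dword.
 */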
void
vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg max(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));

   dst_reg min(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));

   dst_reg rounded(this, glsl_type::vec4_type);
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg i(this, glsl_type::ivec4_type);
   emit(MOV(i, src_reg(rounded)));

   src_reg bytes(i);
   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
}

/*
 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
 * false) elements needed to pack a type.
 */
static int
type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless)
{
   unsigned int i;
   int size;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_FLOAT16:
   case GLSL_TYPE_BOOL:
   case GLSL_TYPE_DOUBLE:
   case GLSL_TYPE_UINT16:
   case GLSL_TYPE_INT16:
   case GLSL_TYPE_UINT8:
   case GLSL_TYPE_INT8:
   case GLSL_TYPE_UINT64:
   case GLSL_TYPE_INT64:
      if (type->is_matrix()) {
         const glsl_type *col_type = type->column_type();
         unsigned col_slots =
            (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
         return type->matrix_columns * col_slots;
      } else {
         /* Regardless of size of vector, it gets a vec4. This is bad
          * packing for things like floats, but otherwise arrays become a
          * mess.  Hopefully a later pass over the code can pack scalars
          * down if appropriate.
          */
         return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
      }
   case GLSL_TYPE_ARRAY:
      assert(type->length > 0);
      return type_size_xvec4(type->fields.array, as_vec4, bindless) *
             type->length;
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_INTERFACE:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size_xvec4(type->fields.structure[i].type, as_vec4,
                                 bindless);
      }
      return size;
   case GLSL_TYPE_SUBROUTINE:
      return 1;

   case GLSL_TYPE_SAMPLER:
   case GLSL_TYPE_TEXTURE:
      /* Samplers and textures take up no register space, since they're baked
       * in at link time.
       */
      return bindless ? 1 : 0;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_IMAGE:
      return bindless ? 1 : DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_FUNCTION:
      unreachable("not reached");
   }

   return 0;
}

/**
 * Returns the minimum number of vec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single vec4); for matrices, the
 * number of columns; for array and struct, the sum of the vec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 */
extern "C" int
type_size_vec4(const struct glsl_type *type, bool bindless)
{
   return type_size_xvec4(type, true, bindless);
}

/**
 * Returns the minimum number of dvec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single dvec4); for matrices, the
 * number of columns; for array and struct, the sum of the dvec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 *
 * Measuring double-precision vertex inputs as dvec4 is required because
 * ARB_vertex_attrib_64bit states that these use the same number of locations
 * as the single-precision version. That is, two consecutive dvec4s would be
 * located in locations "x" and "x+1", not "x+2".
 *
 * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
 * remap_vs_attrs() will take into account both the location and whether the
 * type fits in one or two vec4 slots.
 */
extern "C" int
type_size_dvec4(const struct glsl_type *type, bool bindless)
{
   return type_size_xvec4(type, false, bindless);
}

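/**
 * Allocate a fresh VGRF sized for @type and initialize the register's type
 * and swizzle accordingly.
 */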
src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type, false));

   if (type->is_array() || type->is_struct()) {
      this->swizzle = BRW_SWIZZLE_NOOP;
   } else {
      this->swizzle = brw_swizzle_for_size(type->vector_elements);
   }

   this->type = brw_type_for_base_type(type);
}

src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
{
   assert(size > 0);

   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type, false) * size);

   this->swizzle = BRW_SWIZZLE_NOOP;

   this->type = brw_type_for_base_type(type);
}

dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type, false));

   if (type->is_array() || type->is_struct()) {
      this->writemask = WRITEMASK_XYZW;
   } else {
      this->writemask = (1 << type->vector_elements) - 1;
   }

   this->type = brw_type_for_base_type(type);
}

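/**
 * Emit MIN/MAX as a SEL instruction with the given conditional mod
 * (BRW_CONDITIONAL_L selects the minimum, BRW_CONDITIONAL_GE the maximum).
 */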
vec4_instruction *
vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                          src_reg src0, src_reg src1)
{
   vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
   inst->conditional_mod = conditionalmod;
   return inst;
}

/**
 * Emits the instructions needed to perform a pull constant load. before_block
 * and before_inst can be NULL, in which case the instruction will be appended
 * to the end of the instruction list.
 */
void
vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
                                          src_reg surf_index,
                                          src_reg offset_reg,
                                          bblock_t *before_block,
                                          vec4_instruction *before_inst)
{
   assert((before_inst == NULL && before_block == NULL) ||
          (before_inst && before_block));

   vec4_instruction *pull;

   if (devinfo->ver >= 7) {
      dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);

      grf_offset.type = offset_reg.type;

      pull = MOV(grf_offset, offset_reg);

      if (before_inst)
         emit_before(before_block, before_inst, pull);
      else
         emit(pull);

      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GFX7,
                                           dst,
                                           surf_index,
                                           src_reg(grf_offset));
      pull->mlen = 1;
   } else {
      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
                                           dst,
                                           surf_index,
                                           offset_reg);
      pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
      pull->mlen = 1;
   }

   if (before_inst)
      emit_before(before_block, before_inst, pull);
   else
      emit(pull);
}

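/**
 * Copy a possibly non-uniform value into a register that is guaranteed to be
 * uniform across channels, by broadcasting the value read from the first
 * live channel.
 */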
src_reg
vec4_visitor::emit_uniformize(const src_reg &src)
{
   const src_reg chan_index(this, glsl_type::uint_type);
   const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
                              src.type);

   emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
      ->force_writemask_all = true;
   emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
      ->force_writemask_all = true;

   return src_reg(dst);
}

void
vec4_visitor::gs_emit_vertex(int /* stream_id */)
{
   unreachable("not reached");
}

void
vec4_visitor::gs_end_primitive()
{
   unreachable("not reached");
}

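/**
 * Compute the normalized device coordinates (x/w, y/w, z/w, 1/w) from the
 * written gl_Position and stash them in the BRW_VARYING_SLOT_NDC output
 * register; emit_vertex() only calls this on pre-gfx6 platforms.
 */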
void
vec4_visitor::emit_ndc_computation()
{
   if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
      return;

   /* Get the position */
   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);

   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
   output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
   output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;

   current_annotation = "NDC";
   dst_reg ndc_w = ndc;
   ndc_w.writemask = WRITEMASK_W;
   src_reg pos_w = pos;
   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);

   dst_reg ndc_xyz = ndc;
   ndc_xyz.writemask = WRITEMASK_XYZ;

   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
}

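/**
 * Fill the VUE header slot written for VARYING_SLOT_PSIZ: the point size
 * plus, depending on generation, the clip-distance flags or the
 * layer/viewport indices, including the pre-gfx6 negative-rhw clipping
 * workaround where needed.
 */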
void
vec4_visitor::emit_psiz_and_flags(dst_reg reg)
{
   if (devinfo->ver < 6 &&
       ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
        output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
        devinfo->has_negative_rhw_bug)) {
      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
      dst_reg header1_w = header1;
      header1_w.writemask = WRITEMASK_W;

      emit(MOV(header1, brw_imm_ud(0u)));

      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);

         current_annotation = "Point size";
         emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
         emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
      }

      if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
         current_annotation = "Clipping flags";
         dst_reg flags0 = dst_reg(this, glsl_type::uint_type);

         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
      }

      if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) {
         dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
         emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (devinfo->has_negative_rhw_bug &&
          output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
         ndc_w.swizzle = BRW_SWIZZLE_WWWW;
         emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         vec4_instruction *inst;
         inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
         inst->predicate = BRW_PREDICATE_NORMAL;
         output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }

      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
   } else if (devinfo->ver < 6) {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
   } else {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
      if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
         dst_reg reg_w = reg;
         reg_w.writemask = WRITEMASK_W;
         src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
         reg_as_src.type = reg_w.type;
         reg_as_src.swizzle = brw_swizzle_for_size(1);
         emit(MOV(reg_w, reg_as_src));
      }
      if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
         dst_reg reg_y = reg;
         reg_y.writemask = WRITEMASK_Y;
         reg_y.type = BRW_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
         emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
      }
      if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
         dst_reg reg_z = reg;
         reg_z.writemask = WRITEMASK_Z;
         reg_z.type = BRW_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
         emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
      }
   }
}

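/**
 * MOV one component group of a generic varying into the MRF that will be
 * written to the URB, using the writemask appropriate for the number of
 * components packed at this location.
 */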
vec4_instruction *
vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
{
   assert(varying < VARYING_SLOT_MAX);

   unsigned num_comps = output_num_components[varying][component];
   if (num_comps == 0)
      return NULL;

   assert(output_reg[varying][component].type == reg.type);
   current_annotation = output_reg_annotation[varying];
   if (output_reg[varying][component].file != BAD_FILE) {
      src_reg src = src_reg(output_reg[varying][component]);
      src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
      reg.writemask =
         brw_writemask_for_component_packing(num_comps, component);
      return emit(MOV(reg, src));
   }
   return NULL;
}

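/**
 * Emit the MOVs that fill a single VUE slot, dispatching to the special
 * handling for PSIZ/flags, NDC and gl_Position and falling back to
 * emit_generic_urb_slot() for everything else.
 */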
void
vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
{
   reg.type = BRW_REGISTER_TYPE_F;
   output_reg[varying][0].type = reg.type;

   switch (varying) {
   case VARYING_SLOT_PSIZ:
   {
      /* PSIZ is always in slot 0, and is coupled with other flags. */
      current_annotation = "indices, point width, clip flags";
      emit_psiz_and_flags(reg);
      break;
   }
   case BRW_VARYING_SLOT_NDC:
      current_annotation = "NDC";
      if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
      break;
   case VARYING_SLOT_POS:
      current_annotation = "gl_Position";
      if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
      break;
   case BRW_VARYING_SLOT_PAD:
      /* No need to write to this slot */
      break;
   default:
      for (int i = 0; i < 4; i++) {
         emit_generic_urb_slot(reg, varying, i);
      }
      break;
   }
}

static unsigned
align_interleaved_urb_mlen(const struct intel_device_info *devinfo,
                           unsigned mlen)
{
   if (devinfo->ver >= 6) {
      /* URB data written (does not include the message header reg) must
       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
       * section 5.4.3.2.2: URB_INTERLEAVED.
       *
       * URB entries are allocated on a multiple of 1024 bits, so an
       * extra 128 bits written here to make the end align to 256 is
       * no problem.
       */
      if ((mlen % 2) != 1)
         mlen++;
   }

   return mlen;
}


/**
 * Generates the VUE payload plus the necessary URB write instructions to
 * output it.
 *
 * The VUE layout is documented in Volume 2a.
 */
void
vec4_visitor::emit_vertex()
{
   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;
   int mrf = base_mrf;
   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = FIRST_SPILL_MRF(devinfo->ver);

   /* The following assertion verifies that max_usable_mrf causes an
    * even-numbered amount of URB write data, which will meet gfx6's
    * requirements for length alignment.
    */
   assert ((max_usable_mrf - base_mrf) % 2 == 0);

   /* First mrf is the g0-based message header containing URB handles and
    * such.
    */
   emit_urb_write_header(mrf++);

   if (devinfo->ver < 6) {
      emit_ndc_computation();
   }

   /* We may need to split this up into several URB writes, so do them in a
    * loop.
    */
   int slot = 0;
   bool complete = false;
   do {
      /* URB offset is in URB row increments, and each of our MRFs is half of
       * one of those, since we're doing interleaved writes.
       */
      int offset = slot / 2;

      mrf = base_mrf + 1;
      for (; slot < prog_data->vue_map.num_slots; ++slot) {
         emit_urb_slot(dst_reg(MRF, mrf++),
                       prog_data->vue_map.slot_to_varying[slot]);

         /* If this was max_usable_mrf, we can't fit anything more into this
          * URB WRITE. Same thing if we reached the maximum length available.
          */
         if (mrf > max_usable_mrf ||
             align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
            slot++;
            break;
         }
      }

      complete = slot >= prog_data->vue_map.num_slots;
      current_annotation = "URB write";
      vec4_instruction *inst = emit_urb_write_opcode(complete);
      inst->base_mrf = base_mrf;
      inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
      inst->offset += offset;
   } while(!complete);
}

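/**
 * Build the scratch-message offset for a register at @reg_offset, scaling by
 * the interleaved-vec4 layout (and by byte units on pre-gfx6), and folding in
 * a dynamically computed index when @reladdr is non-NULL.
 */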
src_reg
vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
                                 src_reg *reladdr, int reg_offset)
{
   /* Because we store the values to scratch interleaved like our
    * vertex data, we need to scale the vec4 index by 2.
    */
   int message_header_scale = 2;

   /* Pre-gfx6, the message header uses byte offsets instead of vec4
    * (16-byte) offset units.
    */
   if (devinfo->ver < 6)
      message_header_scale *= 16;

   if (reladdr) {
      /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
       * to multiply the reladdr by 2. Notice that the reg_offset part
       * is in units of 16 bytes and is used to select the low/high 16-byte
       * chunk of a full dvec4, so we don't want to multiply that part.
       */
      src_reg index = src_reg(this, glsl_type::int_type);
      if (type_sz(inst->dst.type) < 8) {
         emit_before(block, inst, ADD(dst_reg(index), *reladdr,
                                      brw_imm_d(reg_offset)));
         emit_before(block, inst, MUL(dst_reg(index), index,
                                      brw_imm_d(message_header_scale)));
      } else {
         emit_before(block, inst, MUL(dst_reg(index), *reladdr,
                                      brw_imm_d(message_header_scale * 2)));
         emit_before(block, inst, ADD(dst_reg(index), index,
                                      brw_imm_d(reg_offset * message_header_scale)));
      }
      return index;
   } else {
      return brw_imm_d(reg_offset * message_header_scale);
   }
}

/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from scratch space at @base_offset to @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
                                dst_reg temp, src_reg orig_src,
                                int base_offset)
{
   assert(orig_src.offset % REG_SIZE == 0);
   int reg_offset = base_offset + orig_src.offset / REG_SIZE;
   src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
                                      reg_offset);

   if (type_sz(orig_src.type) < 8) {
      emit_before(block, inst, SCRATCH_READ(temp, index));
   } else {
      dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
      dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
      emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
      index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
      vec4_instruction *last_read =
         SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
      emit_before(block, inst, last_read);
      shuffle_64bit_data(temp, src_reg(shuffled), false, true, block, last_read);
   }
}

/**
 * Emits an instruction after @inst to store the value to be written
 * to @orig_dst to scratch space at @base_offset, from @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
                                 int base_offset)
{
   assert(inst->dst.offset % REG_SIZE == 0);
   int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
   src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                      reg_offset);

   /* Create a temporary register to store *inst's result in.
    *
    * We have to be careful in MOVing from our temporary result register in
    * the scratch write.  If we swizzle from channels of the temporary that
    * weren't initialized, it will confuse live interval analysis, which will
    * make spilling fail to make progress.
    */
   bool is_64bit = type_sz(inst->dst.type) == 8;
   const glsl_type *alloc_type =
      is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
   const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
                                       inst->dst.type),
                                brw_swizzle_for_mask(inst->dst.writemask));

   if (!is_64bit) {
      dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
                                          inst->dst.writemask));
      vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
      if (inst->opcode != BRW_OPCODE_SEL)
         write->predicate = inst->predicate;
      write->ir = inst->ir;
      write->annotation = inst->annotation;
      inst->insert_after(block, write);
   } else {
      dst_reg shuffled = dst_reg(this, alloc_type);
      vec4_instruction *last =
         shuffle_64bit_data(shuffled, temp, true, true, block, inst);
      src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));

      uint8_t mask = 0;
      if (inst->dst.writemask & WRITEMASK_X)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_Y)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

         vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
         if (inst->opcode != BRW_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }

      mask = 0;
      if (inst->dst.writemask & WRITEMASK_Z)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_W)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

         src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                            reg_offset + 1);
         vec4_instruction *write =
            SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
         if (inst->opcode != BRW_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }
   }

   inst->dst.file = temp.file;
   inst->dst.nr = temp.nr;
   inst->dst.offset %= REG_SIZE;
   inst->dst.reladdr = NULL;
}

/**
 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
 * adds the scratch read(s) before \p inst. The function also checks for
 * recursive reladdr scratch accesses, issuing the corresponding scratch
 * loads and rewriting reladdr references accordingly.
 *
 * \return \p src if it did not require a scratch load, otherwise, the
 * register holding the result of the scratch load that the caller should
 * use to rewrite src.
 */
src_reg
vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
                                   vec4_instruction *inst, src_reg src)
{
   /* Resolve recursive reladdr scratch access by calling ourselves
    * with src.reladdr
    */
   if (src.reladdr)
      *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                          *src.reladdr);

   /* Now handle scratch access on src */
   if (src.file == VGRF && scratch_loc[src.nr] != -1) {
      dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
         glsl_type::dvec4_type : glsl_type::vec4_type);
      emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
      src.nr = temp.nr;
      src.offset %= REG_SIZE;
      src.reladdr = NULL;
   }

   return src;
}

/**
 * We can't generally support array access in GRF space, because a
 * single instruction's destination can only span 2 contiguous
 * registers.  So, we send all GRF arrays that get variable index
 * access to scratch space.
 */
void
vec4_visitor::move_grf_array_access_to_scratch()
{
   int scratch_loc[this->alloc.count];
   memset(scratch_loc, -1, sizeof(scratch_loc));

   /* First, calculate the set of virtual GRFs that need to be punted
    * to scratch due to having any array access on them, and where in
    * scratch.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == VGRF && inst->dst.reladdr) {
         if (scratch_loc[inst->dst.nr] == -1) {
            scratch_loc[inst->dst.nr] = last_scratch;
            last_scratch += this->alloc.sizes[inst->dst.nr];
         }

         for (src_reg *iter = inst->dst.reladdr;
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }

      for (int i = 0 ; i < 3; i++) {
         for (src_reg *iter = &inst->src[i];
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }
   }

   /* Now, for anything that will be accessed through scratch, rewrite
    * it to load/store.  Note that this is a _safe list walk, because
    * we may generate a new scratch_write instruction after the one
    * we're processing.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      /* Set up the annotation tracking for new generated instructions. */
      base_ir = inst->ir;
      current_annotation = inst->annotation;

      /* First handle scratch access on the dst. Notice we have to handle
       * the case where the dst's reladdr also points to scratch space.
       */
      if (inst->dst.reladdr)
         *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                                   *inst->dst.reladdr);

      /* Now that we have handled any (possibly recursive) reladdr scratch
       * accesses for dst we can safely do the scratch write for dst itself
       */
      if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
         emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);

      /* Now handle scratch access on any src. In this case, since inst->src[i]
       * already is a src_reg, we can just call emit_resolve_reladdr with
       * inst->src[i] and it will take care of handling scratch loads for
       * both src and src.reladdr (recursively).
       */
      for (int i = 0 ; i < 3; i++) {
         inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
                                             inst->src[i]);
      }
   }
}

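/**
 * Materialize a negated unsigned (UD) source into a temporary via a MOV so
 * that the comparison-style instructions that call this helper never see a
 * negate modifier on a UD operand.
 */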
void
vec4_visitor::resolve_ud_negate(src_reg *reg)
{
   if (reg->type != BRW_REGISTER_TYPE_UD ||
       !reg->negate)
      return;

   src_reg temp = src_reg(this, glsl_type::uvec4_type);
   emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
   *reg = temp;
}

vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
                           void *log_data,
                           const struct brw_sampler_prog_key_data *key_tex,
                           struct brw_vue_prog_data *prog_data,
                           const nir_shader *shader,
                           void *mem_ctx,
                           bool no_spills,
                           bool debug_enabled)
   : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base,
                    debug_enabled),
     key_tex(key_tex),
     prog_data(prog_data),
     fail_msg(NULL),
     first_non_payload_grf(0),
     ubo_push_start(),
     push_length(0),
     live_analysis(this), performance_analysis(this),
     need_all_constants_in_pull_buffer(false),
     no_spills(no_spills),
     last_scratch(0)
{
   this->failed = false;

   this->base_ir = NULL;
   this->current_annotation = NULL;
   memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));

   memset(this->output_num_components, 0, sizeof(this->output_num_components));

   this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : BRW_MAX_GRF;

   this->uniforms = 0;

   this->nir_locals = NULL;
   this->nir_ssa_values = NULL;
}


void
vec4_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);

   this->fail_msg = msg;

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "%s",  msg);
   }
}

} /* namespace brw */