• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "elk_nir.h"
25 #include "elk_nir_private.h"
26 #include "elk_vec4.h"
27 #include "elk_vec4_builder.h"
28 #include "elk_vec4_surface_builder.h"
29 #include "elk_eu.h"
30 #include "nir.h"
31 #include "nir_intrinsics.h"
32 #include "nir_intrinsics_indices.h"
33 
34 using namespace elk;
35 using namespace elk::surface_access;
36 
37 namespace elk {
38 
39 void
emit_nir_code()40 vec4_visitor::emit_nir_code()
41 {
42    /* Globally set the rounding mode based on the float controls.  gen7 doesn't
43     * support 16-bit floats, and gen8 switches to scalar VS.  So we don't need
44     * to do any per-instruction mode switching the way the scalar FS handles.
45     */
46    emit_shader_float_controls_execution_mode();
47    if (nir->num_uniforms > 0)
48       nir_setup_uniforms();
49 
50    nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));
51 }
52 
53 void
nir_setup_uniforms()54 vec4_visitor::nir_setup_uniforms()
55 {
56    uniforms = nir->num_uniforms / 16;
57 }
58 
59 void
nir_emit_impl(nir_function_impl * impl)60 vec4_visitor::nir_emit_impl(nir_function_impl *impl)
61 {
62    nir_ssa_values = ralloc_array(mem_ctx, dst_reg, impl->ssa_alloc);
63 
64    nir_emit_cf_list(&impl->body);
65 }
66 
67 void
nir_emit_cf_list(exec_list * list)68 vec4_visitor::nir_emit_cf_list(exec_list *list)
69 {
70    exec_list_validate(list);
71    foreach_list_typed(nir_cf_node, node, node, list) {
72       switch (node->type) {
73       case nir_cf_node_if:
74          nir_emit_if(nir_cf_node_as_if(node));
75          break;
76 
77       case nir_cf_node_loop:
78          nir_emit_loop(nir_cf_node_as_loop(node));
79          break;
80 
81       case nir_cf_node_block:
82          nir_emit_block(nir_cf_node_as_block(node));
83          break;
84 
85       default:
86          unreachable("Invalid CFG node block");
87       }
88    }
89 }
90 
91 void
nir_emit_if(nir_if * if_stmt)92 vec4_visitor::nir_emit_if(nir_if *if_stmt)
93 {
94    /* First, put the condition in f0 */
95    src_reg condition = get_nir_src(if_stmt->condition, ELK_REGISTER_TYPE_D, 1);
96    vec4_instruction *inst = emit(MOV(dst_null_d(), condition));
97    inst->conditional_mod = ELK_CONDITIONAL_NZ;
98 
99    /* We can just predicate based on the X channel, as the condition only
100     * goes on its own line */
101    emit(IF(ELK_PREDICATE_ALIGN16_REPLICATE_X));
102 
103    nir_emit_cf_list(&if_stmt->then_list);
104 
105    if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
106       emit(ELK_OPCODE_ELSE);
107       nir_emit_cf_list(&if_stmt->else_list);
108    }
109 
110    emit(ELK_OPCODE_ENDIF);
111 }
112 
113 void
nir_emit_loop(nir_loop * loop)114 vec4_visitor::nir_emit_loop(nir_loop *loop)
115 {
116    assert(!nir_loop_has_continue_construct(loop));
117    emit(ELK_OPCODE_DO);
118 
119    nir_emit_cf_list(&loop->body);
120 
121    emit(ELK_OPCODE_WHILE);
122 }
123 
124 void
nir_emit_block(nir_block * block)125 vec4_visitor::nir_emit_block(nir_block *block)
126 {
127    nir_foreach_instr(instr, block) {
128       nir_emit_instr(instr);
129    }
130 }
131 
132 void
nir_emit_instr(nir_instr * instr)133 vec4_visitor::nir_emit_instr(nir_instr *instr)
134 {
135    base_ir = instr;
136 
137    switch (instr->type) {
138    case nir_instr_type_load_const:
139       nir_emit_load_const(nir_instr_as_load_const(instr));
140       break;
141 
142    case nir_instr_type_intrinsic:
143       nir_emit_intrinsic(nir_instr_as_intrinsic(instr));
144       break;
145 
146    case nir_instr_type_alu:
147       nir_emit_alu(nir_instr_as_alu(instr));
148       break;
149 
150    case nir_instr_type_jump:
151       nir_emit_jump(nir_instr_as_jump(instr));
152       break;
153 
154    case nir_instr_type_tex:
155       nir_emit_texture(nir_instr_as_tex(instr));
156       break;
157 
158    case nir_instr_type_undef:
159       nir_emit_undef(nir_instr_as_undef(instr));
160       break;
161 
162    default:
163       unreachable("VS instruction not yet implemented by NIR->vec4");
164    }
165 }
166 
167 static dst_reg
dst_reg_for_nir_reg(vec4_visitor * v,nir_def * handle,unsigned base_offset,nir_src * indirect)168 dst_reg_for_nir_reg(vec4_visitor *v, nir_def *handle,
169                     unsigned base_offset, nir_src *indirect)
170 {
171    nir_intrinsic_instr *decl = nir_reg_get_decl(handle);
172    dst_reg reg = v->nir_ssa_values[handle->index];
173    if (nir_intrinsic_bit_size(decl) == 64)
174       reg.type = ELK_REGISTER_TYPE_DF;
175 
176    reg = offset(reg, 8, base_offset);
177    if (indirect) {
178       reg.reladdr =
179          new(v->mem_ctx) src_reg(v->get_nir_src(*indirect,
180                                                 ELK_REGISTER_TYPE_D,
181                                                 1));
182    }
183    return reg;
184 }
185 
186 dst_reg
get_nir_def(const nir_def & def)187 vec4_visitor::get_nir_def(const nir_def &def)
188 {
189    nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
190    if (!store_reg) {
191       dst_reg dst =
192          dst_reg(VGRF, alloc.allocate(DIV_ROUND_UP(def.bit_size, 32)));
193       if (def.bit_size == 64)
194          dst.type = ELK_REGISTER_TYPE_DF;
195       nir_ssa_values[def.index] = dst;
196       return dst;
197    } else {
198       nir_src *indirect =
199          (store_reg->intrinsic == nir_intrinsic_store_reg_indirect) ?
200          &store_reg->src[2] : NULL;
201 
202       dst_reg dst = dst_reg_for_nir_reg(this, store_reg->src[1].ssa,
203                                         nir_intrinsic_base(store_reg),
204                                         indirect);
205       dst.writemask = nir_intrinsic_write_mask(store_reg);
206       return dst;
207    }
208 }
209 
210 dst_reg
get_nir_def(const nir_def & def,enum elk_reg_type type)211 vec4_visitor::get_nir_def(const nir_def &def, enum elk_reg_type type)
212 {
213    return retype(get_nir_def(def), type);
214 }
215 
216 dst_reg
get_nir_def(const nir_def & def,nir_alu_type type)217 vec4_visitor::get_nir_def(const nir_def &def, nir_alu_type type)
218 {
219    return get_nir_def(def, elk_type_for_nir_type(devinfo, type));
220 }
221 
222 src_reg
get_nir_src(const nir_src & src,enum elk_reg_type type,unsigned num_components)223 vec4_visitor::get_nir_src(const nir_src &src, enum elk_reg_type type,
224                           unsigned num_components)
225 {
226    nir_intrinsic_instr *load_reg = nir_load_reg_for_def(src.ssa);
227 
228    dst_reg reg;
229    if (load_reg) {
230       nir_src *indirect =
231          (load_reg->intrinsic == nir_intrinsic_load_reg_indirect) ?
232          &load_reg->src[1] : NULL;
233 
234       reg = dst_reg_for_nir_reg(this, load_reg->src[0].ssa,
235                                       nir_intrinsic_base(load_reg),
236                                       indirect);
237    } else {
238       reg = nir_ssa_values[src.ssa->index];
239    }
240 
241    reg = retype(reg, type);
242 
243    src_reg reg_as_src = src_reg(reg);
244    reg_as_src.swizzle = elk_swizzle_for_size(num_components);
245    return reg_as_src;
246 }
247 
248 src_reg
get_nir_src(const nir_src & src,nir_alu_type type,unsigned num_components)249 vec4_visitor::get_nir_src(const nir_src &src, nir_alu_type type,
250                           unsigned num_components)
251 {
252    return get_nir_src(src, elk_type_for_nir_type(devinfo, type),
253                       num_components);
254 }
255 
256 src_reg
get_nir_src(const nir_src & src,unsigned num_components)257 vec4_visitor::get_nir_src(const nir_src &src, unsigned num_components)
258 {
259    /* if type is not specified, default to signed int */
260    return get_nir_src(src, nir_type_int32, num_components);
261 }
262 
263 src_reg
get_nir_src_imm(const nir_src & src)264 vec4_visitor::get_nir_src_imm(const nir_src &src)
265 {
266    assert(nir_src_num_components(src) == 1);
267    assert(nir_src_bit_size(src) == 32);
268    return nir_src_is_const(src) ? src_reg(elk_imm_d(nir_src_as_int(src))) :
269                                   get_nir_src(src, 1);
270 }
271 
272 src_reg
get_indirect_offset(nir_intrinsic_instr * instr)273 vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
274 {
275    nir_src *offset_src = nir_get_io_offset_src(instr);
276 
277    if (nir_src_is_const(*offset_src)) {
278       /* The only constant offset we should find is 0.  elk_nir.c's
279        * add_const_offset_to_base() will fold other constant offsets
280        * into the base index.
281        */
282       assert(nir_src_as_uint(*offset_src) == 0);
283       return src_reg();
284    }
285 
286    return get_nir_src(*offset_src, ELK_REGISTER_TYPE_UD, 1);
287 }
288 
289 static src_reg
elk_setup_imm_df(const vec4_builder & bld,double v)290 elk_setup_imm_df(const vec4_builder &bld, double v)
291 {
292    const intel_device_info *devinfo = bld.shader->devinfo;
293    assert(devinfo->ver == 7);
294 
295    /* gfx7.5 does not support DF immediates straightforward but the DIM
296     * instruction allows to set the 64-bit immediate value.
297     */
298    if (devinfo->verx10 == 75) {
299       const vec4_builder ubld = bld.exec_all();
300       const dst_reg dst = bld.vgrf(ELK_REGISTER_TYPE_DF);
301       ubld.DIM(dst, elk_imm_df(v));
302       return swizzle(src_reg(dst), ELK_SWIZZLE_XXXX);
303    }
304 
305    /* gfx7 does not support DF immediates */
306    union {
307       double d;
308       struct {
309          uint32_t i1;
310          uint32_t i2;
311       };
312    } di;
313 
314    di.d = v;
315 
316    /* Write the low 32-bit of the constant to the X:UD channel and the
317     * high 32-bit to the Y:UD channel to build the constant in a VGRF.
318     * We have to do this twice (offset 0 and offset 1), since a DF VGRF takes
319     * two SIMD8 registers in SIMD4x2 execution. Finally, return a swizzle
320     * XXXX so any access to the VGRF only reads the constant data in these
321     * channels.
322     */
323    const dst_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD, 2);
324    for (unsigned n = 0; n < 2; n++) {
325       const vec4_builder ubld = bld.exec_all().group(4, n);
326       ubld.MOV(writemask(offset(tmp, 8, n), WRITEMASK_X), elk_imm_ud(di.i1));
327       ubld.MOV(writemask(offset(tmp, 8, n), WRITEMASK_Y), elk_imm_ud(di.i2));
328    }
329 
330    return swizzle(src_reg(retype(tmp, ELK_REGISTER_TYPE_DF)), ELK_SWIZZLE_XXXX);
331 }
332 
333 void
nir_emit_load_const(nir_load_const_instr * instr)334 vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
335 {
336    dst_reg reg;
337 
338    if (instr->def.bit_size == 64) {
339       reg = dst_reg(VGRF, alloc.allocate(2));
340       reg.type = ELK_REGISTER_TYPE_DF;
341    } else {
342       reg = dst_reg(VGRF, alloc.allocate(1));
343       reg.type = ELK_REGISTER_TYPE_D;
344    }
345 
346    const vec4_builder ibld = vec4_builder(this).at_end();
347    unsigned remaining = elk_writemask_for_size(instr->def.num_components);
348 
349    /* @FIXME: consider emitting vector operations to save some MOVs in
350     * cases where the components are representable in 8 bits.
351     * For now, we emit a MOV for each distinct value.
352     */
353    for (unsigned i = 0; i < instr->def.num_components; i++) {
354       unsigned writemask = 1 << i;
355 
356       if ((remaining & writemask) == 0)
357          continue;
358 
359       for (unsigned j = i; j < instr->def.num_components; j++) {
360          if ((instr->def.bit_size == 32 &&
361               instr->value[i].u32 == instr->value[j].u32) ||
362              (instr->def.bit_size == 64 &&
363               instr->value[i].f64 == instr->value[j].f64)) {
364             writemask |= 1 << j;
365          }
366       }
367 
368       reg.writemask = writemask;
369       if (instr->def.bit_size == 64) {
370          emit(MOV(reg, elk_setup_imm_df(ibld, instr->value[i].f64)));
371       } else {
372          emit(MOV(reg, elk_imm_d(instr->value[i].i32)));
373       }
374 
375       remaining &= ~writemask;
376    }
377 
378    /* Set final writemask */
379    reg.writemask = elk_writemask_for_size(instr->def.num_components);
380 
381    nir_ssa_values[instr->def.index] = reg;
382 }
383 
384 src_reg
get_nir_ssbo_intrinsic_index(nir_intrinsic_instr * instr)385 vec4_visitor::get_nir_ssbo_intrinsic_index(nir_intrinsic_instr *instr)
386 {
387    /* SSBO stores are weird in that their index is in src[1] */
388    const unsigned src = instr->intrinsic == nir_intrinsic_store_ssbo ? 1 : 0;
389 
390    if (nir_src_is_const(instr->src[src])) {
391       return elk_imm_ud(nir_src_as_uint(instr->src[src]));
392    } else {
393       return emit_uniformize(get_nir_src(instr->src[src]));
394    }
395 }
396 
397 void
nir_emit_intrinsic(nir_intrinsic_instr * instr)398 vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
399 {
400    dst_reg dest;
401    src_reg src;
402 
403    switch (instr->intrinsic) {
404    case nir_intrinsic_decl_reg: {
405       unsigned bit_size = nir_intrinsic_bit_size(instr);
406       unsigned array_elems = nir_intrinsic_num_array_elems(instr);
407       if (array_elems == 0)
408          array_elems = 1;
409 
410       const unsigned num_regs = array_elems * DIV_ROUND_UP(bit_size, 32);
411       dst_reg reg(VGRF, alloc.allocate(num_regs));
412       if (bit_size == 64)
413          reg.type = ELK_REGISTER_TYPE_DF;
414 
415       nir_ssa_values[instr->def.index] = reg;
416       break;
417    }
418 
419    case nir_intrinsic_load_reg:
420    case nir_intrinsic_load_reg_indirect:
421    case nir_intrinsic_store_reg:
422    case nir_intrinsic_store_reg_indirect:
423       /* Nothing to do with these. */
424       break;
425 
426    case nir_intrinsic_load_input:
427    case nir_intrinsic_load_per_primitive_input: {
428       assert(instr->def.bit_size == 32);
429       /* We set EmitNoIndirectInput for VS */
430       unsigned load_offset = nir_src_as_uint(instr->src[0]);
431 
432       dest = get_nir_def(instr->def);
433 
434       src = src_reg(ATTR, nir_intrinsic_base(instr) + load_offset,
435                     glsl_uvec4_type());
436       src = retype(src, dest.type);
437 
438       /* Swizzle source based on component layout qualifier */
439       src.swizzle = ELK_SWZ_COMP_INPUT(nir_intrinsic_component(instr));
440       emit(MOV(dest, src));
441       break;
442    }
443 
444    case nir_intrinsic_store_output: {
445       assert(nir_src_bit_size(instr->src[0]) == 32);
446       unsigned store_offset = nir_src_as_uint(instr->src[1]);
447       int varying = nir_intrinsic_base(instr) + store_offset;
448       src = get_nir_src(instr->src[0], ELK_REGISTER_TYPE_F,
449                         instr->num_components);
450 
451       unsigned c = nir_intrinsic_component(instr);
452       output_reg[varying][c] = dst_reg(src);
453       output_num_components[varying][c] = instr->num_components;
454       break;
455    }
456 
457    case nir_intrinsic_get_ssbo_size: {
458       assert(nir_src_num_components(instr->src[0]) == 1);
459       unsigned ssbo_index = nir_src_is_const(instr->src[0]) ?
460                             nir_src_as_uint(instr->src[0]) : 0;
461 
462       dst_reg result_dst = get_nir_def(instr->def);
463       vec4_instruction *inst = new(mem_ctx)
464          vec4_instruction(ELK_SHADER_OPCODE_GET_BUFFER_SIZE, result_dst);
465 
466       inst->base_mrf = 2;
467       inst->mlen = 1; /* always at least one */
468       inst->src[1] = elk_imm_ud(ssbo_index);
469 
470       /* MRF for the first parameter */
471       src_reg lod = elk_imm_d(0);
472       int param_base = inst->base_mrf;
473       int writemask = WRITEMASK_X;
474       emit(MOV(dst_reg(MRF, param_base, glsl_int_type(), writemask), lod));
475 
476       emit(inst);
477       break;
478    }
479 
480    case nir_intrinsic_store_ssbo: {
481       assert(devinfo->ver == 7);
482 
483       /* elk_nir_lower_mem_access_bit_sizes takes care of this */
484       assert(nir_src_bit_size(instr->src[0]) == 32);
485       assert(nir_intrinsic_write_mask(instr) ==
486              (1u << instr->num_components) - 1);
487 
488       src_reg surf_index = get_nir_ssbo_intrinsic_index(instr);
489       src_reg offset_reg = retype(get_nir_src_imm(instr->src[2]),
490                                   ELK_REGISTER_TYPE_UD);
491 
492       /* Value */
493       src_reg val_reg = get_nir_src(instr->src[0], ELK_REGISTER_TYPE_F, 4);
494 
495       /* IvyBridge does not have a native SIMD4x2 untyped write message so untyped
496        * writes will use SIMD8 mode. In order to hide this and keep symmetry across
497        * typed and untyped messages and across hardware platforms, the
498        * current implementation of the untyped messages will transparently convert
499        * the SIMD4x2 payload into an equivalent SIMD8 payload by transposing it
500        * and enabling only channel X on the SEND instruction.
501        *
502        * The above, works well for full vector writes, but not for partial writes
503        * where we want to write some channels and not others, like when we have
504        * code such as v.xyw = vec3(1,2,4). Because the untyped write messages are
505        * quite restrictive with regards to the channel enables we can configure in
506        * the message descriptor (not all combinations are allowed) we cannot simply
507        * implement these scenarios with a single message while keeping the
508        * aforementioned symmetry in the implementation. For now we de decided that
509        * it is better to keep the symmetry to reduce complexity, so in situations
510        * such as the one described we end up emitting two untyped write messages
511        * (one for xy and another for w).
512        *
513        * The code below packs consecutive channels into a single write message,
514        * detects gaps in the vector write and if needed, sends a second message
515        * with the remaining channels. If in the future we decide that we want to
516        * emit a single message at the expense of losing the symmetry in the
517        * implementation we can:
518        *
519        * 1) For IvyBridge: Only use the red channel of the untyped write SIMD8
520        *    message payload. In this mode we can write up to 8 offsets and dwords
521        *    to the red channel only (for the two vec4s in the SIMD4x2 execution)
522        *    and select which of the 8 channels carry data to write by setting the
523        *    appropriate writemask in the dst register of the SEND instruction.
524        *    It would require to write a new generator opcode specifically for
525        *    IvyBridge since we would need to prepare a SIMD8 payload that could
526        *    use any channel, not just X.
527        *
528        * 2) For Haswell+: Simply send a single write message but set the writemask
529        *    on the dst of the SEND instruction to select the channels we want to
530        *    write. It would require to modify the current messages to receive
531        *    and honor the writemask provided.
532        */
533       const vec4_builder bld = vec4_builder(this).at_end()
534                                .annotate(current_annotation, base_ir);
535 
536       emit_untyped_write(bld, surf_index, offset_reg, val_reg,
537                          1 /* dims */, instr->num_components /* size */,
538                          ELK_PREDICATE_NONE);
539       break;
540    }
541 
542    case nir_intrinsic_load_ssbo: {
543       assert(devinfo->ver == 7);
544 
545       /* elk_nir_lower_mem_access_bit_sizes takes care of this */
546       assert(instr->def.bit_size == 32);
547 
548       src_reg surf_index = get_nir_ssbo_intrinsic_index(instr);
549       src_reg offset_reg = retype(get_nir_src_imm(instr->src[1]),
550                                   ELK_REGISTER_TYPE_UD);
551 
552       /* Read the vector */
553       const vec4_builder bld = vec4_builder(this).at_end()
554          .annotate(current_annotation, base_ir);
555 
556       src_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
557                                               1 /* dims */, 4 /* size*/,
558                                               ELK_PREDICATE_NONE);
559       dst_reg dest = get_nir_def(instr->def);
560       read_result.type = dest.type;
561       read_result.swizzle = elk_swizzle_for_size(instr->num_components);
562       emit(MOV(dest, read_result));
563       break;
564    }
565 
566    case nir_intrinsic_ssbo_atomic:
567    case nir_intrinsic_ssbo_atomic_swap:
568       nir_emit_ssbo_atomic(lsc_op_to_legacy_atomic(elk_lsc_aop_for_nir_intrinsic(instr)), instr);
569       break;
570 
571    case nir_intrinsic_load_vertex_id:
572       unreachable("should be lowered by vertex_id_zero_based");
573 
574    case nir_intrinsic_load_vertex_id_zero_base:
575    case nir_intrinsic_load_base_vertex:
576    case nir_intrinsic_load_instance_id:
577    case nir_intrinsic_load_base_instance:
578    case nir_intrinsic_load_draw_id:
579    case nir_intrinsic_load_invocation_id:
580       unreachable("should be lowered by elk_nir_lower_vs_inputs()");
581 
582    case nir_intrinsic_load_uniform: {
583       /* Offsets are in bytes but they should always be multiples of 4 */
584       assert(nir_intrinsic_base(instr) % 4 == 0);
585 
586       dest = get_nir_def(instr->def);
587 
588       src = src_reg(dst_reg(UNIFORM, nir_intrinsic_base(instr) / 16));
589       src.type = dest.type;
590 
591       /* Uniforms don't actually have to be vec4 aligned.  In the case that
592        * it isn't, we have to use a swizzle to shift things around.  They
593        * do still have the std140 alignment requirement that vec2's have to
594        * be vec2-aligned and vec3's and vec4's have to be vec4-aligned.
595        *
596        * The swizzle also works in the indirect case as the generator adds
597        * the swizzle to the offset for us.
598        */
599       const int type_size = type_sz(src.type);
600       unsigned shift = (nir_intrinsic_base(instr) % 16) / type_size;
601       assert(shift + instr->num_components <= 4);
602 
603       if (nir_src_is_const(instr->src[0])) {
604          const unsigned load_offset = nir_src_as_uint(instr->src[0]);
605          /* Offsets are in bytes but they should always be multiples of 4 */
606          assert(load_offset % 4 == 0);
607 
608          src.swizzle = elk_swizzle_for_size(instr->num_components);
609          dest.writemask = elk_writemask_for_size(instr->num_components);
610          unsigned offset = load_offset + shift * type_size;
611          src.offset = ROUND_DOWN_TO(offset, 16);
612          shift = (offset % 16) / type_size;
613          assert(shift + instr->num_components <= 4);
614          src.swizzle += ELK_SWIZZLE4(shift, shift, shift, shift);
615 
616          emit(MOV(dest, src));
617       } else {
618          /* Uniform arrays are vec4 aligned, because of std140 alignment
619           * rules.
620           */
621          assert(shift == 0);
622 
623          src_reg indirect = get_nir_src(instr->src[0], ELK_REGISTER_TYPE_UD, 1);
624 
625          /* MOV_INDIRECT is going to stomp the whole thing anyway */
626          dest.writemask = WRITEMASK_XYZW;
627 
628          emit(ELK_SHADER_OPCODE_MOV_INDIRECT, dest, src,
629               indirect, elk_imm_ud(nir_intrinsic_range(instr)));
630       }
631       break;
632    }
633 
634    case nir_intrinsic_load_ubo: {
635       src_reg surf_index;
636 
637       dest = get_nir_def(instr->def);
638 
639       if (nir_src_is_const(instr->src[0])) {
640          /* The block index is a constant, so just emit the binding table entry
641           * as an immediate.
642           */
643          const unsigned index = nir_src_as_uint(instr->src[0]);
644          surf_index = elk_imm_ud(index);
645       } else {
646          /* The block index is not a constant. Evaluate the index expression
647           * per-channel and add the base UBO index; we have to select a value
648           * from any live channel.
649           */
650          surf_index = src_reg(this, glsl_uint_type());
651          emit(MOV(dst_reg(surf_index), get_nir_src(instr->src[0], nir_type_int32,
652                                                    instr->num_components)));
653          surf_index = emit_uniformize(surf_index);
654       }
655 
656       src_reg push_reg;
657       src_reg offset_reg;
658       if (nir_src_is_const(instr->src[1])) {
659          unsigned load_offset = nir_src_as_uint(instr->src[1]);
660          unsigned aligned_offset = load_offset & ~15;
661          offset_reg = elk_imm_ud(aligned_offset);
662 
663          /* See if we've selected this as a push constant candidate */
664          if (nir_src_is_const(instr->src[0])) {
665             const unsigned ubo_block = nir_src_as_uint(instr->src[0]);
666             const unsigned offset_256b = aligned_offset / 32;
667 
668             for (int i = 0; i < 4; i++) {
669                const struct elk_ubo_range *range = &prog_data->base.ubo_ranges[i];
670                if (range->block == ubo_block &&
671                    offset_256b >= range->start &&
672                    offset_256b < range->start + range->length) {
673 
674                   push_reg = src_reg(dst_reg(UNIFORM, UBO_START + i));
675                   push_reg.type = dest.type;
676                   push_reg.offset = aligned_offset - 32 * range->start;
677                   break;
678                }
679             }
680          }
681       } else {
682          offset_reg = src_reg(this, glsl_uint_type());
683          emit(MOV(dst_reg(offset_reg),
684                   get_nir_src(instr->src[1], nir_type_uint32, 1)));
685       }
686 
687       src_reg packed_consts;
688       if (push_reg.file != BAD_FILE) {
689          packed_consts = push_reg;
690       } else if (instr->def.bit_size == 32) {
691          packed_consts = src_reg(this, glsl_vec4_type());
692          emit_pull_constant_load_reg(dst_reg(packed_consts),
693                                      surf_index,
694                                      offset_reg,
695                                      NULL, NULL /* before_block/inst */);
696          prog_data->base.has_ubo_pull = true;
697       } else {
698          src_reg temp = src_reg(this, glsl_dvec4_type());
699          src_reg temp_float = retype(temp, ELK_REGISTER_TYPE_F);
700 
701          emit_pull_constant_load_reg(dst_reg(temp_float),
702                                      surf_index, offset_reg, NULL, NULL);
703          if (offset_reg.file == IMM)
704             offset_reg.ud += 16;
705          else
706             emit(ADD(dst_reg(offset_reg), offset_reg, elk_imm_ud(16u)));
707          emit_pull_constant_load_reg(dst_reg(byte_offset(temp_float, REG_SIZE)),
708                                      surf_index, offset_reg, NULL, NULL);
709          prog_data->base.has_ubo_pull = true;
710 
711          packed_consts = src_reg(this, glsl_dvec4_type());
712          shuffle_64bit_data(dst_reg(packed_consts), temp, false);
713       }
714 
715       packed_consts.swizzle = elk_swizzle_for_size(instr->num_components);
716       if (nir_src_is_const(instr->src[1])) {
717          unsigned load_offset = nir_src_as_uint(instr->src[1]);
718          unsigned type_size = type_sz(dest.type);
719          packed_consts.swizzle +=
720             ELK_SWIZZLE4(load_offset % 16 / type_size,
721                          load_offset % 16 / type_size,
722                          load_offset % 16 / type_size,
723                          load_offset % 16 / type_size);
724       }
725 
726       emit(MOV(dest, retype(packed_consts, dest.type)));
727 
728       break;
729    }
730 
731    case nir_intrinsic_barrier: {
732       if (nir_intrinsic_memory_scope(instr) == SCOPE_NONE)
733          break;
734       const vec4_builder bld =
735          vec4_builder(this).at_end().annotate(current_annotation, base_ir);
736       const dst_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
737       vec4_instruction *fence =
738          bld.emit(ELK_SHADER_OPCODE_MEMORY_FENCE, tmp, elk_vec8_grf(0, 0));
739       fence->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
740       break;
741    }
742 
743    case nir_intrinsic_shader_clock: {
744       /* We cannot do anything if there is an event, so ignore it for now */
745       const src_reg shader_clock = get_timestamp();
746       const enum elk_reg_type type = elk_type_for_base_type(glsl_uvec2_type());
747 
748       dest = get_nir_def(instr->def, type);
749       emit(MOV(dest, shader_clock));
750       break;
751    }
752 
753    default:
754       unreachable("Unknown intrinsic");
755    }
756 }
757 
758 void
nir_emit_ssbo_atomic(int op,nir_intrinsic_instr * instr)759 vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr)
760 {
761    dst_reg dest;
762    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
763       dest = get_nir_def(instr->def);
764 
765    src_reg surface = get_nir_ssbo_intrinsic_index(instr);
766    src_reg offset = get_nir_src(instr->src[1], 1);
767    src_reg data1;
768    if (op != ELK_AOP_INC && op != ELK_AOP_DEC && op != ELK_AOP_PREDEC)
769       data1 = get_nir_src(instr->src[2], 1);
770    src_reg data2;
771    if (op == ELK_AOP_CMPWR)
772       data2 = get_nir_src(instr->src[3], 1);
773 
774    /* Emit the actual atomic operation operation */
775    const vec4_builder bld =
776       vec4_builder(this).at_end().annotate(current_annotation, base_ir);
777 
778    src_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
779                                                data1, data2,
780                                                1 /* dims */, 1 /* rsize */,
781                                                op,
782                                                ELK_PREDICATE_NONE);
783    dest.type = atomic_result.type;
784    bld.MOV(dest, atomic_result);
785 }
786 
787 static unsigned
elk_swizzle_for_nir_swizzle(uint8_t swizzle[4])788 elk_swizzle_for_nir_swizzle(uint8_t swizzle[4])
789 {
790    return ELK_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
791 }
792 
793 bool
optimize_predicate(nir_alu_instr * instr,enum elk_predicate * predicate)794 vec4_visitor::optimize_predicate(nir_alu_instr *instr,
795                                  enum elk_predicate *predicate)
796 {
797    if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
798       return false;
799 
800    nir_alu_instr *cmp_instr =
801       nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
802 
803    switch (cmp_instr->op) {
804    case nir_op_b32any_fnequal2:
805    case nir_op_b32any_inequal2:
806    case nir_op_b32any_fnequal3:
807    case nir_op_b32any_inequal3:
808    case nir_op_b32any_fnequal4:
809    case nir_op_b32any_inequal4:
810       *predicate = ELK_PREDICATE_ALIGN16_ANY4H;
811       break;
812    case nir_op_b32all_fequal2:
813    case nir_op_b32all_iequal2:
814    case nir_op_b32all_fequal3:
815    case nir_op_b32all_iequal3:
816    case nir_op_b32all_fequal4:
817    case nir_op_b32all_iequal4:
818       *predicate = ELK_PREDICATE_ALIGN16_ALL4H;
819       break;
820    default:
821       return false;
822    }
823 
824    unsigned size_swizzle =
825       elk_swizzle_for_size(nir_op_infos[cmp_instr->op].input_sizes[0]);
826 
827    src_reg op[2];
828    assert(nir_op_infos[cmp_instr->op].num_inputs == 2);
829    for (unsigned i = 0; i < 2; i++) {
830       nir_alu_type type = nir_op_infos[cmp_instr->op].input_types[i];
831       unsigned bit_size = nir_src_bit_size(cmp_instr->src[i].src);
832       type = (nir_alu_type) (((unsigned) type) | bit_size);
833       op[i] = get_nir_src(cmp_instr->src[i].src, type, 4);
834       unsigned base_swizzle =
835          elk_swizzle_for_nir_swizzle(cmp_instr->src[i].swizzle);
836       op[i].swizzle = elk_compose_swizzle(size_swizzle, base_swizzle);
837    }
838 
839    emit(CMP(dst_null_d(), op[0], op[1],
840             elk_cmod_for_nir_comparison(cmp_instr->op)));
841 
842    return true;
843 }
844 
845 void
emit_conversion_from_double(dst_reg dst,src_reg src)846 vec4_visitor::emit_conversion_from_double(dst_reg dst, src_reg src)
847 {
848    enum elk_opcode op;
849    switch (dst.type) {
850    case ELK_REGISTER_TYPE_D:
851       op = ELK_VEC4_OPCODE_DOUBLE_TO_D32;
852       break;
853    case ELK_REGISTER_TYPE_UD:
854       op = ELK_VEC4_OPCODE_DOUBLE_TO_U32;
855       break;
856    case ELK_REGISTER_TYPE_F:
857       op = ELK_VEC4_OPCODE_DOUBLE_TO_F32;
858       break;
859    default:
860       unreachable("Unknown conversion");
861    }
862 
863    dst_reg temp = dst_reg(this, glsl_dvec4_type());
864    emit(MOV(temp, src));
865    dst_reg temp2 = dst_reg(this, glsl_dvec4_type());
866    emit(op, temp2, src_reg(temp));
867 
868    emit(ELK_VEC4_OPCODE_PICK_LOW_32BIT, retype(temp2, dst.type), src_reg(temp2));
869    emit(MOV(dst, src_reg(retype(temp2, dst.type))));
870 }
871 
872 void
emit_conversion_to_double(dst_reg dst,src_reg src)873 vec4_visitor::emit_conversion_to_double(dst_reg dst, src_reg src)
874 {
875    dst_reg tmp_dst = dst_reg(src_reg(this, glsl_dvec4_type()));
876    src_reg tmp_src = retype(src_reg(this, glsl_vec4_type()), src.type);
877    emit(MOV(dst_reg(tmp_src), src));
878    emit(ELK_VEC4_OPCODE_TO_DOUBLE, tmp_dst, tmp_src);
879    emit(MOV(dst, src_reg(tmp_dst)));
880 }
881 
882 /**
883  * Try to use an immediate value for a source
884  *
885  * In cases of flow control, constant propagation is sometimes unable to
886  * determine that a register contains a constant value.  To work around this,
887  * try to emit a literal as one of the sources.  If \c try_src0_also is set,
888  * \c op[0] will also be tried for an immediate value.
889  *
890  * If \c op[0] is modified, the operands will be exchanged so that \c op[1]
891  * will always be the immediate value.
892  *
893  * \return The index of the source that was modified, 0 or 1, if successful.
894  * Otherwise, -1.
895  *
896  * \param op - Operands to the instruction
897  * \param try_src0_also - True if \c op[0] should also be a candidate for
898  *                        getting an immediate value.  This should only be set
899  *                        for commutative operations.
900  */
901 static int
try_immediate_source(const nir_alu_instr * instr,src_reg * op,bool try_src0_also)902 try_immediate_source(const nir_alu_instr *instr, src_reg *op,
903                      bool try_src0_also)
904 {
905    unsigned idx;
906 
907    /* MOV should be the only single-source instruction passed to this
908     * function.  Any other unary instruction with a constant source should
909     * have been constant-folded away!
910     */
911    assert(nir_op_infos[instr->op].num_inputs > 1 ||
912           instr->op == nir_op_mov);
913 
914    if (instr->op != nir_op_mov &&
915        nir_src_bit_size(instr->src[1].src) == 32 &&
916        nir_src_is_const(instr->src[1].src)) {
917       idx = 1;
918    } else if (try_src0_also &&
919          nir_src_bit_size(instr->src[0].src) == 32 &&
920          nir_src_is_const(instr->src[0].src)) {
921       idx = 0;
922    } else {
923       return -1;
924    }
925 
926    const enum elk_reg_type old_type = op[idx].type;
927 
928    switch (old_type) {
929    case ELK_REGISTER_TYPE_D:
930    case ELK_REGISTER_TYPE_UD: {
931       int first_comp = -1;
932       int d = 0;
933 
934       for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) {
935          if (nir_alu_instr_channel_used(instr, idx, i)) {
936             if (first_comp < 0) {
937                first_comp = i;
938                d = nir_src_comp_as_int(instr->src[idx].src,
939                                        instr->src[idx].swizzle[i]);
940             } else if (d != nir_src_comp_as_int(instr->src[idx].src,
941                                                 instr->src[idx].swizzle[i])) {
942                return -1;
943             }
944          }
945       }
946 
947       assert(first_comp >= 0);
948 
949       if (op[idx].abs)
950          d = MAX2(-d, d);
951 
952       if (op[idx].negate)
953          d = -d;
954 
955       op[idx] = retype(src_reg(elk_imm_d(d)), old_type);
956       break;
957    }
958 
959    case ELK_REGISTER_TYPE_F: {
960       int first_comp = -1;
961       float f[NIR_MAX_VEC_COMPONENTS] = { 0.0f };
962       bool is_scalar = true;
963 
964       for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) {
965          if (nir_alu_instr_channel_used(instr, idx, i)) {
966             f[i] = nir_src_comp_as_float(instr->src[idx].src,
967                                          instr->src[idx].swizzle[i]);
968             if (first_comp < 0) {
969                first_comp = i;
970             } else if (f[first_comp] != f[i]) {
971                is_scalar = false;
972             }
973          }
974       }
975 
976       if (is_scalar) {
977          if (op[idx].abs)
978             f[first_comp] = fabs(f[first_comp]);
979 
980          if (op[idx].negate)
981             f[first_comp] = -f[first_comp];
982 
983          op[idx] = src_reg(elk_imm_f(f[first_comp]));
984          assert(op[idx].type == old_type);
985       } else {
986          uint8_t vf_values[4] = { 0, 0, 0, 0 };
987 
988          for (unsigned i = 0; i < ARRAY_SIZE(vf_values); i++) {
989 
990             if (op[idx].abs)
991                f[i] = fabs(f[i]);
992 
993             if (op[idx].negate)
994                f[i] = -f[i];
995 
996             const int vf = elk_float_to_vf(f[i]);
997             if (vf == -1)
998                return -1;
999 
1000             vf_values[i] = vf;
1001          }
1002 
1003          op[idx] = src_reg(elk_imm_vf4(vf_values[0], vf_values[1],
1004                                        vf_values[2], vf_values[3]));
1005       }
1006       break;
1007    }
1008 
1009    default:
1010       unreachable("Non-32bit type.");
1011    }
1012 
1013    /* If the instruction has more than one source, the instruction format only
1014     * allows source 1 to be an immediate value.  If the immediate value was
1015     * source 0, then the sources must be exchanged.
1016     */
1017    if (idx == 0 && instr->op != nir_op_mov) {
1018       src_reg tmp = op[0];
1019       op[0] = op[1];
1020       op[1] = tmp;
1021    }
1022 
1023    return idx;
1024 }
1025 
1026 void
fix_float_operands(src_reg op[3],nir_alu_instr * instr)1027 vec4_visitor::fix_float_operands(src_reg op[3], nir_alu_instr *instr)
1028 {
1029    bool fixed[3] = { false, false, false };
1030 
1031    for (unsigned i = 0; i < 2; i++) {
1032       if (!nir_src_is_const(instr->src[i].src))
1033          continue;
1034 
1035       for (unsigned j = i + 1; j < 3; j++) {
1036          if (fixed[j])
1037             continue;
1038 
1039          if (!nir_src_is_const(instr->src[j].src))
1040             continue;
1041 
1042          if (nir_alu_srcs_equal(instr, instr, i, j)) {
1043             if (!fixed[i])
1044                op[i] = fix_3src_operand(op[i]);
1045 
1046             op[j] = op[i];
1047 
1048             fixed[i] = true;
1049             fixed[j] = true;
1050          } else if (nir_alu_srcs_negative_equal(instr, instr, i, j)) {
1051             if (!fixed[i])
1052                op[i] = fix_3src_operand(op[i]);
1053 
1054             op[j] = op[i];
1055             op[j].negate = !op[j].negate;
1056 
1057             fixed[i] = true;
1058             fixed[j] = true;
1059          }
1060       }
1061    }
1062 
1063    for (unsigned i = 0; i < 3; i++) {
1064       if (!fixed[i])
1065          op[i] = fix_3src_operand(op[i]);
1066    }
1067 }
1068 
1069 static bool
const_src_fits_in_16_bits(const nir_src & src,elk_reg_type type)1070 const_src_fits_in_16_bits(const nir_src &src, elk_reg_type type)
1071 {
1072    assert(nir_src_is_const(src));
1073    if (elk_reg_type_is_unsigned_integer(type)) {
1074       return nir_src_comp_as_uint(src, 0) <= UINT16_MAX;
1075    } else {
1076       const int64_t c = nir_src_comp_as_int(src, 0);
1077       return c <= INT16_MAX && c >= INT16_MIN;
1078    }
1079 }
1080 
1081 void
nir_emit_alu(nir_alu_instr * instr)1082 vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
1083 {
1084    vec4_instruction *inst;
1085 
1086    nir_alu_type dst_type = (nir_alu_type) (nir_op_infos[instr->op].output_type |
1087                                            instr->def.bit_size);
1088    dst_reg dst = get_nir_def(instr->def, dst_type);
1089    dst.writemask &= nir_component_mask(instr->def.num_components);
1090 
1091    src_reg op[4];
1092    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1093       nir_alu_type src_type = (nir_alu_type)
1094          (nir_op_infos[instr->op].input_types[i] |
1095           nir_src_bit_size(instr->src[i].src));
1096       op[i] = get_nir_src(instr->src[i].src, src_type, 4);
1097       op[i].swizzle = elk_swizzle_for_nir_swizzle(instr->src[i].swizzle);
1098    }
1099 
1100 #ifndef NDEBUG
1101    /* On Gen7 and earlier, no functionality is exposed that should allow 8-bit
1102     * integer types to ever exist.
1103     */
1104    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1105       assert(type_sz(op[i].type) > 1);
1106 #endif
1107 
1108    switch (instr->op) {
1109    case nir_op_mov:
1110       try_immediate_source(instr, &op[0], true);
1111       inst = emit(MOV(dst, op[0]));
1112       break;
1113 
1114    case nir_op_vec2:
1115    case nir_op_vec3:
1116    case nir_op_vec4:
1117       unreachable("not reached: should be handled by lower_vec_to_movs()");
1118 
1119    case nir_op_i2f32:
1120    case nir_op_u2f32:
1121       inst = emit(MOV(dst, op[0]));
1122       break;
1123 
1124    case nir_op_f2f32:
1125    case nir_op_f2i32:
1126    case nir_op_f2u32:
1127       if (nir_src_bit_size(instr->src[0].src) == 64)
1128          emit_conversion_from_double(dst, op[0]);
1129       else
1130          inst = emit(MOV(dst, op[0]));
1131       break;
1132 
1133    case nir_op_f2f64:
1134    case nir_op_i2f64:
1135    case nir_op_u2f64:
1136       emit_conversion_to_double(dst, op[0]);
1137       break;
1138 
1139    case nir_op_fsat:
1140       inst = emit(MOV(dst, op[0]));
1141       inst->saturate = true;
1142       break;
1143 
1144    case nir_op_fneg:
1145    case nir_op_ineg:
1146       op[0].negate = true;
1147       inst = emit(MOV(dst, op[0]));
1148       break;
1149 
1150    case nir_op_fabs:
1151    case nir_op_iabs:
1152       op[0].negate = false;
1153       op[0].abs = true;
1154       inst = emit(MOV(dst, op[0]));
1155       break;
1156 
1157    case nir_op_iadd:
1158       assert(instr->def.bit_size < 64);
1159       FALLTHROUGH;
1160    case nir_op_fadd:
1161       try_immediate_source(instr, op, true);
1162       inst = emit(ADD(dst, op[0], op[1]));
1163       break;
1164 
1165    case nir_op_uadd_sat:
1166       assert(instr->def.bit_size < 64);
1167       inst = emit(ADD(dst, op[0], op[1]));
1168       inst->saturate = true;
1169       break;
1170 
1171    case nir_op_fmul:
1172       try_immediate_source(instr, op, true);
1173       inst = emit(MUL(dst, op[0], op[1]));
1174       break;
1175 
1176    case nir_op_imul: {
1177       assert(instr->def.bit_size < 64);
1178 
1179       /* For integer multiplication, the MUL uses the low 16 bits of one of
1180        * the operands (src0 through SNB, src1 on IVB and later). The MACH
1181        * accumulates in the contribution of the upper 16 bits of that
1182        * operand. If we can determine that one of the args is in the low
1183        * 16 bits, though, we can just emit a single MUL.
1184        */
1185       if (nir_src_is_const(instr->src[0].src) &&
1186           nir_alu_instr_src_read_mask(instr, 0) == 1 &&
1187           const_src_fits_in_16_bits(instr->src[0].src, op[0].type)) {
1188          if (devinfo->ver < 7)
1189             emit(MUL(dst, op[0], op[1]));
1190          else
1191             emit(MUL(dst, op[1], op[0]));
1192       } else if (nir_src_is_const(instr->src[1].src) &&
1193                  nir_alu_instr_src_read_mask(instr, 1) == 1 &&
1194                  const_src_fits_in_16_bits(instr->src[1].src, op[1].type)) {
1195          if (devinfo->ver < 7)
1196             emit(MUL(dst, op[1], op[0]));
1197          else
1198             emit(MUL(dst, op[0], op[1]));
1199       } else {
1200          struct elk_reg acc = retype(elk_acc_reg(8), dst.type);
1201 
1202          emit(MUL(acc, op[0], op[1]));
1203          emit(MACH(dst_null_d(), op[0], op[1]));
1204          emit(MOV(dst, src_reg(acc)));
1205       }
1206       break;
1207    }
1208 
1209    case nir_op_imul_high:
1210    case nir_op_umul_high: {
1211       assert(instr->def.bit_size < 64);
1212       struct elk_reg acc = retype(elk_acc_reg(8), dst.type);
1213 
1214       emit(MUL(acc, op[0], op[1]));
1215       emit(MACH(dst, op[0], op[1]));
1216       break;
1217    }
1218 
1219    case nir_op_frcp:
1220       inst = emit_math(ELK_SHADER_OPCODE_RCP, dst, op[0]);
1221       break;
1222 
1223    case nir_op_fexp2:
1224       inst = emit_math(ELK_SHADER_OPCODE_EXP2, dst, op[0]);
1225       break;
1226 
1227    case nir_op_flog2:
1228       inst = emit_math(ELK_SHADER_OPCODE_LOG2, dst, op[0]);
1229       break;
1230 
1231    case nir_op_fsin:
1232       inst = emit_math(ELK_SHADER_OPCODE_SIN, dst, op[0]);
1233       break;
1234 
1235    case nir_op_fcos:
1236       inst = emit_math(ELK_SHADER_OPCODE_COS, dst, op[0]);
1237       break;
1238 
1239    case nir_op_idiv:
1240    case nir_op_udiv:
1241       assert(instr->def.bit_size < 64);
1242       emit_math(ELK_SHADER_OPCODE_INT_QUOTIENT, dst, op[0], op[1]);
1243       break;
1244 
1245    case nir_op_umod:
1246    case nir_op_irem:
1247       /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1248        * appears that our hardware just does the right thing for signed
1249        * remainder.
1250        */
1251       assert(instr->def.bit_size < 64);
1252       emit_math(ELK_SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
1253       break;
1254 
1255    case nir_op_imod: {
1256       /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
1257       inst = emit_math(ELK_SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
1258 
1259       /* Math instructions don't support conditional mod */
1260       inst = emit(MOV(dst_null_d(), src_reg(dst)));
1261       inst->conditional_mod = ELK_CONDITIONAL_NZ;
1262 
1263       /* Now, we need to determine if signs of the sources are different.
1264        * When we XOR the sources, the top bit is 0 if they are the same and 1
1265        * if they are different.  We can then use a conditional modifier to
1266        * turn that into a predicate.  This leads us to an XOR.l instruction.
1267        *
1268        * Technically, according to the PRM, you're not allowed to use .l on a
1269        * XOR instruction.  However, empirical experiments and Curro's reading
1270        * of the simulator source both indicate that it's safe.
1271        */
1272       src_reg tmp = src_reg(this, glsl_ivec4_type());
1273       inst = emit(XOR(dst_reg(tmp), op[0], op[1]));
1274       inst->predicate = ELK_PREDICATE_NORMAL;
1275       inst->conditional_mod = ELK_CONDITIONAL_L;
1276 
1277       /* If the result of the initial remainder operation is non-zero and the
1278        * two sources have different signs, add in a copy of op[1] to get the
1279        * final integer modulus value.
1280        */
1281       inst = emit(ADD(dst, src_reg(dst), op[1]));
1282       inst->predicate = ELK_PREDICATE_NORMAL;
1283       break;
1284    }
1285 
1286    case nir_op_ldexp:
1287       unreachable("not reached: should be handled by ldexp_to_arith()");
1288 
1289    case nir_op_fsqrt:
1290       inst = emit_math(ELK_SHADER_OPCODE_SQRT, dst, op[0]);
1291       break;
1292 
1293    case nir_op_frsq:
1294       inst = emit_math(ELK_SHADER_OPCODE_RSQ, dst, op[0]);
1295       break;
1296 
1297    case nir_op_fpow:
1298       inst = emit_math(ELK_SHADER_OPCODE_POW, dst, op[0], op[1]);
1299       break;
1300 
1301    case nir_op_uadd_carry: {
1302       assert(instr->def.bit_size < 64);
1303       struct elk_reg acc = retype(elk_acc_reg(8), ELK_REGISTER_TYPE_UD);
1304 
1305       emit(ADDC(dst_null_ud(), op[0], op[1]));
1306       emit(MOV(dst, src_reg(acc)));
1307       break;
1308    }
1309 
1310    case nir_op_usub_borrow: {
1311       assert(instr->def.bit_size < 64);
1312       struct elk_reg acc = retype(elk_acc_reg(8), ELK_REGISTER_TYPE_UD);
1313 
1314       emit(SUBB(dst_null_ud(), op[0], op[1]));
1315       emit(MOV(dst, src_reg(acc)));
1316       break;
1317    }
1318 
1319    case nir_op_ftrunc:
1320       inst = emit(RNDZ(dst, op[0]));
1321       if (devinfo->ver < 6) {
1322          inst->conditional_mod = ELK_CONDITIONAL_R;
1323          inst = emit(ADD(dst, src_reg(dst), elk_imm_f(1.0f)));
1324          inst->predicate = ELK_PREDICATE_NORMAL;
1325          inst = emit(MOV(dst, src_reg(dst))); /* for potential saturation */
1326       }
1327       break;
1328 
1329    case nir_op_fceil: {
1330       src_reg tmp = src_reg(this, glsl_float_type());
1331       tmp.swizzle = elk_swizzle_for_size(nir_src_num_components(instr->src[0].src));
1332 
1333       op[0].negate = !op[0].negate;
1334       emit(RNDD(dst_reg(tmp), op[0]));
1335       tmp.negate = true;
1336       inst = emit(MOV(dst, tmp));
1337       break;
1338    }
1339 
1340    case nir_op_ffloor:
1341       inst = emit(RNDD(dst, op[0]));
1342       break;
1343 
1344    case nir_op_ffract:
1345       inst = emit(FRC(dst, op[0]));
1346       break;
1347 
1348    case nir_op_fround_even:
1349       inst = emit(RNDE(dst, op[0]));
1350       if (devinfo->ver < 6) {
1351          inst->conditional_mod = ELK_CONDITIONAL_R;
1352          inst = emit(ADD(dst, src_reg(dst), elk_imm_f(1.0f)));
1353          inst->predicate = ELK_PREDICATE_NORMAL;
1354          inst = emit(MOV(dst, src_reg(dst))); /* for potential saturation */
1355       }
1356       break;
1357 
1358    case nir_op_fquantize2f16: {
1359       /* See also vec4_visitor::emit_pack_half_2x16() */
1360       src_reg tmp16 = src_reg(this, glsl_uvec4_type());
1361       src_reg tmp32 = src_reg(this, glsl_vec4_type());
1362       src_reg zero = src_reg(this, glsl_vec4_type());
1363 
1364       /* Check for denormal */
1365       src_reg abs_src0 = op[0];
1366       abs_src0.abs = true;
1367       emit(CMP(dst_null_f(), abs_src0, elk_imm_f(ldexpf(1.0, -14)),
1368                ELK_CONDITIONAL_L));
1369       /* Get the appropriately signed zero */
1370       emit(AND(retype(dst_reg(zero), ELK_REGISTER_TYPE_UD),
1371                retype(op[0], ELK_REGISTER_TYPE_UD),
1372                elk_imm_ud(0x80000000)));
1373       /* Do the actual F32 -> F16 -> F32 conversion */
1374       emit(F32TO16(dst_reg(tmp16), op[0]));
1375       emit(F16TO32(dst_reg(tmp32), tmp16));
1376       /* Select that or zero based on normal status */
1377       inst = emit(ELK_OPCODE_SEL, dst, zero, tmp32);
1378       inst->predicate = ELK_PREDICATE_NORMAL;
1379       break;
1380    }
1381 
1382    case nir_op_imin:
1383    case nir_op_umin:
1384       assert(instr->def.bit_size < 64);
1385       FALLTHROUGH;
1386    case nir_op_fmin:
1387       try_immediate_source(instr, op, true);
1388       inst = emit_minmax(ELK_CONDITIONAL_L, dst, op[0], op[1]);
1389       break;
1390 
1391    case nir_op_imax:
1392    case nir_op_umax:
1393       assert(instr->def.bit_size < 64);
1394       FALLTHROUGH;
1395    case nir_op_fmax:
1396       try_immediate_source(instr, op, true);
1397       inst = emit_minmax(ELK_CONDITIONAL_GE, dst, op[0], op[1]);
1398       break;
1399 
1400    case nir_op_ilt32:
1401    case nir_op_ult32:
1402    case nir_op_ige32:
1403    case nir_op_uge32:
1404    case nir_op_ieq32:
1405    case nir_op_ine32:
1406       assert(instr->def.bit_size < 64);
1407       FALLTHROUGH;
1408    case nir_op_flt32:
1409    case nir_op_fge32:
1410    case nir_op_feq32:
1411    case nir_op_fneu32: {
1412       enum elk_conditional_mod conditional_mod =
1413          elk_cmod_for_nir_comparison(instr->op);
1414 
1415       if (nir_src_bit_size(instr->src[0].src) < 64) {
1416          /* If the order of the sources is changed due to an immediate value,
1417           * then the condition must also be changed.
1418           */
1419          if (try_immediate_source(instr, op, true) == 0)
1420             conditional_mod = elk_swap_cmod(conditional_mod);
1421 
1422          emit(CMP(dst, op[0], op[1], conditional_mod));
1423       } else {
1424          /* Produce a 32-bit boolean result from the DF comparison by selecting
1425           * only the low 32-bit in each DF produced. Do this in a temporary
1426           * so we can then move from there to the result using align16 again
1427           * to honor the original writemask.
1428           */
1429          dst_reg temp = dst_reg(this, glsl_dvec4_type());
1430          emit(CMP(temp, op[0], op[1], conditional_mod));
1431          dst_reg result = dst_reg(this, glsl_bvec4_type());
1432          emit(ELK_VEC4_OPCODE_PICK_LOW_32BIT, result, src_reg(temp));
1433          emit(MOV(dst, src_reg(result)));
1434       }
1435       break;
1436    }
1437 
1438    case nir_op_b32all_iequal2:
1439    case nir_op_b32all_iequal3:
1440    case nir_op_b32all_iequal4:
1441       assert(instr->def.bit_size < 64);
1442       FALLTHROUGH;
1443    case nir_op_b32all_fequal2:
1444    case nir_op_b32all_fequal3:
1445    case nir_op_b32all_fequal4: {
1446       unsigned swiz =
1447          elk_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
1448 
1449       emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
1450                elk_cmod_for_nir_comparison(instr->op)));
1451       emit(MOV(dst, elk_imm_d(0)));
1452       inst = emit(MOV(dst, elk_imm_d(~0)));
1453       inst->predicate = ELK_PREDICATE_ALIGN16_ALL4H;
1454       break;
1455    }
1456 
1457    case nir_op_b32any_inequal2:
1458    case nir_op_b32any_inequal3:
1459    case nir_op_b32any_inequal4:
1460       assert(instr->def.bit_size < 64);
1461       FALLTHROUGH;
1462    case nir_op_b32any_fnequal2:
1463    case nir_op_b32any_fnequal3:
1464    case nir_op_b32any_fnequal4: {
1465       unsigned swiz =
1466          elk_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
1467 
1468       emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
1469                elk_cmod_for_nir_comparison(instr->op)));
1470 
1471       emit(MOV(dst, elk_imm_d(0)));
1472       inst = emit(MOV(dst, elk_imm_d(~0)));
1473       inst->predicate = ELK_PREDICATE_ALIGN16_ANY4H;
1474       break;
1475    }
1476 
1477    case nir_op_inot:
1478       assert(instr->def.bit_size < 64);
1479       emit(NOT(dst, op[0]));
1480       break;
1481 
1482    case nir_op_ixor:
1483       assert(instr->def.bit_size < 64);
1484       try_immediate_source(instr, op, true);
1485       emit(XOR(dst, op[0], op[1]));
1486       break;
1487 
1488    case nir_op_ior:
1489       assert(instr->def.bit_size < 64);
1490       try_immediate_source(instr, op, true);
1491       emit(OR(dst, op[0], op[1]));
1492       break;
1493 
1494    case nir_op_iand:
1495       assert(instr->def.bit_size < 64);
1496       try_immediate_source(instr, op, true);
1497       emit(AND(dst, op[0], op[1]));
1498       break;
1499 
1500    case nir_op_b2i32:
1501    case nir_op_b2f32:
1502    case nir_op_b2f64:
1503       if (instr->def.bit_size > 32) {
1504          assert(dst.type == ELK_REGISTER_TYPE_DF);
1505          emit_conversion_to_double(dst, negate(op[0]));
1506       } else {
1507          emit(MOV(dst, negate(op[0])));
1508       }
1509       break;
1510 
1511    case nir_op_unpack_half_2x16_split_x:
1512    case nir_op_unpack_half_2x16_split_y:
1513    case nir_op_pack_half_2x16_split:
1514       unreachable("not reached: should not occur in vertex shader");
1515 
1516    case nir_op_unpack_snorm_2x16:
1517    case nir_op_unpack_unorm_2x16:
1518    case nir_op_pack_snorm_2x16:
1519    case nir_op_pack_unorm_2x16:
1520       unreachable("not reached: should be handled by lower_packing_builtins");
1521 
1522    case nir_op_pack_uvec4_to_uint:
1523       unreachable("not reached");
1524 
1525    case nir_op_pack_uvec2_to_uint: {
1526       dst_reg tmp1 = dst_reg(this, glsl_uint_type());
1527       tmp1.writemask = WRITEMASK_X;
1528       op[0].swizzle = ELK_SWIZZLE_YYYY;
1529       emit(SHL(tmp1, op[0], src_reg(elk_imm_ud(16u))));
1530 
1531       dst_reg tmp2 = dst_reg(this, glsl_uint_type());
1532       tmp2.writemask = WRITEMASK_X;
1533       op[0].swizzle = ELK_SWIZZLE_XXXX;
1534       emit(AND(tmp2, op[0], src_reg(elk_imm_ud(0xffffu))));
1535 
1536       emit(OR(dst, src_reg(tmp1), src_reg(tmp2)));
1537       break;
1538    }
1539 
1540    case nir_op_pack_64_2x32_split: {
1541       dst_reg result = dst_reg(this, glsl_dvec4_type());
1542       dst_reg tmp = dst_reg(this, glsl_uvec4_type());
1543       emit(MOV(tmp, retype(op[0], ELK_REGISTER_TYPE_UD)));
1544       emit(ELK_VEC4_OPCODE_SET_LOW_32BIT, result, src_reg(tmp));
1545       emit(MOV(tmp, retype(op[1], ELK_REGISTER_TYPE_UD)));
1546       emit(ELK_VEC4_OPCODE_SET_HIGH_32BIT, result, src_reg(tmp));
1547       emit(MOV(dst, src_reg(result)));
1548       break;
1549    }
1550 
1551    case nir_op_unpack_64_2x32_split_x:
1552    case nir_op_unpack_64_2x32_split_y: {
1553       enum elk_opcode oper = (instr->op == nir_op_unpack_64_2x32_split_x) ?
1554          ELK_VEC4_OPCODE_PICK_LOW_32BIT : ELK_VEC4_OPCODE_PICK_HIGH_32BIT;
1555       dst_reg tmp = dst_reg(this, glsl_dvec4_type());
1556       emit(MOV(tmp, op[0]));
1557       dst_reg tmp2 = dst_reg(this, glsl_uvec4_type());
1558       emit(oper, tmp2, src_reg(tmp));
1559       emit(MOV(dst, src_reg(tmp2)));
1560       break;
1561    }
1562 
1563    case nir_op_unpack_half_2x16:
1564       /* As NIR does not guarantee that we have a correct swizzle outside the
1565        * boundaries of a vector, and the implementation of emit_unpack_half_2x16
1566        * uses the source operand in an operation with WRITEMASK_Y while our
1567        * source operand has only size 1, it accessed incorrect data producing
1568        * regressions in Piglit. We repeat the swizzle of the first component on the
1569        * rest of components to avoid regressions. In the vec4_visitor IR code path
1570        * this is not needed because the operand has already the correct swizzle.
1571        */
1572       op[0].swizzle = elk_compose_swizzle(ELK_SWIZZLE_XXXX, op[0].swizzle);
1573       emit_unpack_half_2x16(dst, op[0]);
1574       break;
1575 
1576    case nir_op_pack_half_2x16:
1577       emit_pack_half_2x16(dst, op[0]);
1578       break;
1579 
1580    case nir_op_unpack_unorm_4x8:
1581       assert(instr->def.bit_size < 64);
1582       emit_unpack_unorm_4x8(dst, op[0]);
1583       break;
1584 
1585    case nir_op_pack_unorm_4x8:
1586       assert(instr->def.bit_size < 64);
1587       emit_pack_unorm_4x8(dst, op[0]);
1588       break;
1589 
1590    case nir_op_unpack_snorm_4x8:
1591       assert(instr->def.bit_size < 64);
1592       emit_unpack_snorm_4x8(dst, op[0]);
1593       break;
1594 
1595    case nir_op_pack_snorm_4x8:
1596       assert(instr->def.bit_size < 64);
1597       emit_pack_snorm_4x8(dst, op[0]);
1598       break;
1599 
1600    case nir_op_bitfield_reverse:
1601       assert(instr->def.bit_size == 32);
1602       assert(nir_src_bit_size(instr->src[0].src) == 32);
1603       emit(BFREV(dst, op[0]));
1604       break;
1605 
1606    case nir_op_bit_count:
1607       assert(instr->def.bit_size == 32);
1608       assert(nir_src_bit_size(instr->src[0].src) < 64);
1609       emit(CBIT(dst, op[0]));
1610       break;
1611 
1612    case nir_op_ifind_msb: {
1613       assert(instr->def.bit_size == 32);
1614       assert(nir_src_bit_size(instr->src[0].src) == 32);
1615       assert(devinfo->ver >= 7);
1616 
1617       vec4_builder bld = vec4_builder(this).at_end();
1618       src_reg src(dst);
1619 
1620       emit(FBH(retype(dst, ELK_REGISTER_TYPE_UD), op[0]));
1621 
1622       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1623        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1624        * subtract the result from 31 to convert the MSB count into an LSB
1625        * count.
1626        */
1627       bld.CMP(dst_null_d(), src, elk_imm_d(-1), ELK_CONDITIONAL_NZ);
1628 
1629       inst = bld.ADD(dst, src, elk_imm_d(31));
1630       inst->predicate = ELK_PREDICATE_NORMAL;
1631       inst->src[0].negate = true;
1632       break;
1633    }
1634 
1635    case nir_op_uclz:
1636       assert(instr->def.bit_size == 32);
1637       assert(nir_src_bit_size(instr->src[0].src) == 32);
1638       emit(LZD(dst, op[0]));
1639       break;
1640 
1641    case nir_op_find_lsb:
1642       assert(instr->def.bit_size == 32);
1643       assert(nir_src_bit_size(instr->src[0].src) == 32);
1644       assert(devinfo->ver >= 7);
1645       emit(FBL(dst, op[0]));
1646       break;
1647 
1648    case nir_op_ubitfield_extract:
1649    case nir_op_ibitfield_extract:
1650       unreachable("should have been lowered");
1651    case nir_op_ubfe:
1652    case nir_op_ibfe:
1653       assert(instr->def.bit_size < 64);
1654       op[0] = fix_3src_operand(op[0]);
1655       op[1] = fix_3src_operand(op[1]);
1656       op[2] = fix_3src_operand(op[2]);
1657 
1658       emit(BFE(dst, op[2], op[1], op[0]));
1659       break;
1660 
1661    case nir_op_bfm:
1662       assert(instr->def.bit_size < 64);
1663       emit(BFI1(dst, op[0], op[1]));
1664       break;
1665 
1666    case nir_op_bfi:
1667       assert(instr->def.bit_size < 64);
1668       op[0] = fix_3src_operand(op[0]);
1669       op[1] = fix_3src_operand(op[1]);
1670       op[2] = fix_3src_operand(op[2]);
1671 
1672       emit(BFI2(dst, op[0], op[1], op[2]));
1673       break;
1674 
1675    case nir_op_bitfield_insert:
1676       unreachable("not reached: should have been lowered");
1677 
1678    case nir_op_fsign:
1679        if (type_sz(op[0].type) < 8) {
1680          /* AND(val, 0x80000000) gives the sign bit.
1681           *
1682           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1683           * zero.
1684           */
1685          emit(CMP(dst_null_f(), op[0], elk_imm_f(0.0f), ELK_CONDITIONAL_NZ));
1686 
1687          op[0].type = ELK_REGISTER_TYPE_UD;
1688          dst.type = ELK_REGISTER_TYPE_UD;
1689          emit(AND(dst, op[0], elk_imm_ud(0x80000000u)));
1690 
1691          inst = emit(OR(dst, src_reg(dst), elk_imm_ud(0x3f800000u)));
1692          inst->predicate = ELK_PREDICATE_NORMAL;
1693          dst.type = ELK_REGISTER_TYPE_F;
1694       } else {
1695           unreachable("Should have been lowered by nir_opt_algebraic.");
1696       }
1697       break;
1698 
1699    case nir_op_ishl:
1700       assert(instr->def.bit_size < 64);
1701       try_immediate_source(instr, op, false);
1702       emit(SHL(dst, op[0], op[1]));
1703       break;
1704 
1705    case nir_op_ishr:
1706       assert(instr->def.bit_size < 64);
1707       try_immediate_source(instr, op, false);
1708       emit(ASR(dst, op[0], op[1]));
1709       break;
1710 
1711    case nir_op_ushr:
1712       assert(instr->def.bit_size < 64);
1713       try_immediate_source(instr, op, false);
1714       emit(SHR(dst, op[0], op[1]));
1715       break;
1716 
1717    case nir_op_ffma:
1718       if (type_sz(dst.type) == 8) {
1719          dst_reg mul_dst = dst_reg(this, glsl_dvec4_type());
1720          emit(MUL(mul_dst, op[1], op[0]));
1721          inst = emit(ADD(dst, src_reg(mul_dst), op[2]));
1722       } else {
1723          fix_float_operands(op, instr);
1724          inst = emit(MAD(dst, op[2], op[1], op[0]));
1725       }
1726       break;
1727 
1728    case nir_op_flrp:
1729       fix_float_operands(op, instr);
1730       inst = emit(LRP(dst, op[2], op[1], op[0]));
1731       break;
1732 
1733    case nir_op_b32csel:
1734       enum elk_predicate predicate;
1735       if (!optimize_predicate(instr, &predicate)) {
1736          emit(CMP(dst_null_d(), op[0], elk_imm_d(0), ELK_CONDITIONAL_NZ));
1737          switch (dst.writemask) {
1738          case WRITEMASK_X:
1739             predicate = ELK_PREDICATE_ALIGN16_REPLICATE_X;
1740             break;
1741          case WRITEMASK_Y:
1742             predicate = ELK_PREDICATE_ALIGN16_REPLICATE_Y;
1743             break;
1744          case WRITEMASK_Z:
1745             predicate = ELK_PREDICATE_ALIGN16_REPLICATE_Z;
1746             break;
1747          case WRITEMASK_W:
1748             predicate = ELK_PREDICATE_ALIGN16_REPLICATE_W;
1749             break;
1750          default:
1751             predicate = ELK_PREDICATE_NORMAL;
1752             break;
1753          }
1754       }
1755       inst = emit(ELK_OPCODE_SEL, dst, op[1], op[2]);
1756       inst->predicate = predicate;
1757       break;
1758 
1759    case nir_op_fdot2_replicated:
1760       try_immediate_source(instr, op, true);
1761       inst = emit(ELK_OPCODE_DP2, dst, op[0], op[1]);
1762       break;
1763 
1764    case nir_op_fdot3_replicated:
1765       try_immediate_source(instr, op, true);
1766       inst = emit(ELK_OPCODE_DP3, dst, op[0], op[1]);
1767       break;
1768 
1769    case nir_op_fdot4_replicated:
1770       try_immediate_source(instr, op, true);
1771       inst = emit(ELK_OPCODE_DP4, dst, op[0], op[1]);
1772       break;
1773 
1774    case nir_op_fdph_replicated:
1775       try_immediate_source(instr, op, false);
1776       inst = emit(ELK_OPCODE_DPH, dst, op[0], op[1]);
1777       break;
1778 
1779    case nir_op_fdiv:
1780       unreachable("not reached: should be lowered by lower_fdiv in the compiler");
1781 
1782    case nir_op_fmod:
1783       unreachable("not reached: should be lowered by lower_fmod in the compiler");
1784 
1785    case nir_op_fsub:
1786    case nir_op_isub:
1787       unreachable("not reached: should be handled by ir_sub_to_add_neg");
1788 
1789    default:
1790       unreachable("Unimplemented ALU operation");
1791    }
1792 
1793    /* If we need to do a boolean resolve, replace the result with -(x & 1)
1794     * to sign extend the low bit to 0/~0
1795     */
1796    if (devinfo->ver <= 5 &&
1797        (instr->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) ==
1798        ELK_NIR_BOOLEAN_NEEDS_RESOLVE) {
1799       dst_reg masked = dst_reg(this, glsl_int_type());
1800       masked.writemask = dst.writemask;
1801       emit(AND(masked, src_reg(dst), elk_imm_d(1)));
1802       src_reg masked_neg = src_reg(masked);
1803       masked_neg.negate = true;
1804       emit(MOV(retype(dst, ELK_REGISTER_TYPE_D), masked_neg));
1805    }
1806 }
1807 
1808 void
nir_emit_jump(nir_jump_instr * instr)1809 vec4_visitor::nir_emit_jump(nir_jump_instr *instr)
1810 {
1811    switch (instr->type) {
1812    case nir_jump_break:
1813       emit(ELK_OPCODE_BREAK);
1814       break;
1815 
1816    case nir_jump_continue:
1817       emit(ELK_OPCODE_CONTINUE);
1818       break;
1819 
1820    case nir_jump_return:
1821       FALLTHROUGH;
1822    default:
1823       unreachable("unknown jump");
1824    }
1825 }
1826 
1827 static bool
is_high_sampler(const struct intel_device_info * devinfo,src_reg sampler)1828 is_high_sampler(const struct intel_device_info *devinfo, src_reg sampler)
1829 {
1830    if (devinfo->verx10 != 75)
1831       return false;
1832 
1833    return sampler.file != IMM || sampler.ud >= 16;
1834 }
1835 
1836 void
nir_emit_texture(nir_tex_instr * instr)1837 vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
1838 {
1839    unsigned texture = instr->texture_index;
1840    unsigned sampler = instr->sampler_index;
1841    src_reg texture_reg = elk_imm_ud(texture);
1842    src_reg sampler_reg = elk_imm_ud(sampler);
1843    src_reg coordinate;
1844    const glsl_type *coord_type = NULL;
1845    src_reg shadow_comparator;
1846    src_reg offset_value;
1847    src_reg lod, lod2;
1848    src_reg sample_index;
1849    src_reg mcs;
1850 
1851    dst_reg dest = get_nir_def(instr->def, instr->dest_type);
1852 
1853    /* The hardware requires a LOD for buffer textures */
1854    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
1855       lod = elk_imm_d(0);
1856 
1857    /* Load the texture operation sources */
1858    uint32_t constant_offset = 0;
1859    for (unsigned i = 0; i < instr->num_srcs; i++) {
1860       switch (instr->src[i].src_type) {
1861       case nir_tex_src_comparator:
1862          shadow_comparator = get_nir_src(instr->src[i].src,
1863                                          ELK_REGISTER_TYPE_F, 1);
1864          break;
1865 
1866       case nir_tex_src_coord: {
1867          unsigned src_size = nir_tex_instr_src_size(instr, i);
1868 
1869          switch (instr->op) {
1870          case nir_texop_txf:
1871          case nir_texop_txf_ms:
1872          case nir_texop_samples_identical:
1873             coordinate = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_D,
1874                                      src_size);
1875             coord_type = glsl_ivec_type(src_size);
1876             break;
1877 
1878          default:
1879             coordinate = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_F,
1880                                      src_size);
1881             coord_type = glsl_vec_type(src_size);
1882             break;
1883          }
1884          break;
1885       }
1886 
1887       case nir_tex_src_ddx:
1888          lod = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_F,
1889                            nir_tex_instr_src_size(instr, i));
1890          break;
1891 
1892       case nir_tex_src_ddy:
1893          lod2 = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_F,
1894                            nir_tex_instr_src_size(instr, i));
1895          break;
1896 
1897       case nir_tex_src_lod:
1898          switch (instr->op) {
1899          case nir_texop_txs:
1900          case nir_texop_txf:
1901             lod = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_D, 1);
1902             break;
1903 
1904          default:
1905             lod = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_F, 1);
1906             break;
1907          }
1908          break;
1909 
1910       case nir_tex_src_ms_index: {
1911          sample_index = get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_D, 1);
1912          break;
1913       }
1914 
1915       case nir_tex_src_offset:
1916          if (!elk_texture_offset(instr, i, &constant_offset)) {
1917             offset_value =
1918                get_nir_src(instr->src[i].src, ELK_REGISTER_TYPE_D, 2);
1919          }
1920          break;
1921 
1922       case nir_tex_src_texture_offset: {
1923          assert(texture_reg.is_zero());
1924          texture_reg = emit_uniformize(get_nir_src(instr->src[i].src,
1925                                                    ELK_REGISTER_TYPE_UD, 1));
1926          break;
1927       }
1928 
1929       case nir_tex_src_sampler_offset: {
1930          assert(sampler_reg.is_zero());
1931          sampler_reg = emit_uniformize(get_nir_src(instr->src[i].src,
1932                                                    ELK_REGISTER_TYPE_UD, 1));
1933          break;
1934       }
1935 
1936       case nir_tex_src_projector:
1937          unreachable("Should be lowered by nir_lower_tex");
1938 
1939       case nir_tex_src_bias:
1940          unreachable("LOD bias is not valid for vertex shaders.\n");
1941 
1942       default:
1943          unreachable("unknown texture source");
1944       }
1945    }
1946 
1947    if (instr->op == nir_texop_txf_ms ||
1948        instr->op == nir_texop_samples_identical) {
1949       assert(coord_type != NULL);
1950       if (devinfo->ver >= 7) {
1951          mcs = emit_mcs_fetch(coord_type, coordinate, texture_reg);
1952       } else {
1953          mcs = elk_imm_ud(0u);
1954       }
1955    }
1956 
1957    /* Stuff the channel select bits in the top of the texture offset */
1958    if (instr->op == nir_texop_tg4) {
1959       if (instr->component == 1 &&
1960           (key_tex->gather_channel_quirk_mask & (1 << texture))) {
1961          /* gather4 sampler is broken for green channel on RG32F --
1962           * we must ask for blue instead.
1963           */
1964          constant_offset |= 2 << 16;
1965       } else {
1966          constant_offset |= instr->component << 16;
1967       }
1968    }
1969 
1970    enum elk_opcode opcode;
1971    switch (instr->op) {
1972    case nir_texop_tex:             opcode = ELK_SHADER_OPCODE_TXL;        break;
1973    case nir_texop_txl:             opcode = ELK_SHADER_OPCODE_TXL;        break;
1974    case nir_texop_txd:             opcode = ELK_SHADER_OPCODE_TXD;        break;
1975    case nir_texop_txf:             opcode = ELK_SHADER_OPCODE_TXF;        break;
1976    case nir_texop_txf_ms:          opcode = ELK_SHADER_OPCODE_TXF_CMS;    break;
1977    case nir_texop_txs:             opcode = ELK_SHADER_OPCODE_TXS;        break;
1978    case nir_texop_query_levels:    opcode = ELK_SHADER_OPCODE_TXS;        break;
1979    case nir_texop_texture_samples: opcode = ELK_SHADER_OPCODE_SAMPLEINFO; break;
1980    case nir_texop_tg4:
1981       opcode = offset_value.file != BAD_FILE ? ELK_SHADER_OPCODE_TG4_OFFSET
1982                                              : ELK_SHADER_OPCODE_TG4;
1983       break;
1984    case nir_texop_samples_identical: {
1985       /* There are some challenges implementing this for vec4, and it seems
1986        * unlikely to be used anyway.  For now, just return false ways.
1987        */
1988       emit(MOV(dest, elk_imm_ud(0u)));
1989       return;
1990    }
1991    case nir_texop_txb:
1992    case nir_texop_lod:
1993       unreachable("Implicit LOD is only valid inside fragment shaders.");
1994    default:
1995       unreachable("Unrecognized tex op");
1996    }
1997 
1998    vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
1999 
2000    inst->offset = constant_offset;
2001 
2002    /* The message header is necessary for:
2003     * - Gfx4 (always)
2004     * - Texel offsets
2005     * - Gather channel selection
2006     * - Sampler indices too large to fit in a 4-bit value.
2007     * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
2008     */
2009    inst->header_size =
2010       (devinfo->ver < 5 ||
2011        inst->offset != 0 ||
2012        opcode == ELK_SHADER_OPCODE_TG4 ||
2013        opcode == ELK_SHADER_OPCODE_TG4_OFFSET ||
2014        opcode == ELK_SHADER_OPCODE_SAMPLEINFO ||
2015        is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2016    inst->base_mrf = 2;
2017    inst->mlen = inst->header_size;
2018    inst->dst.writemask = WRITEMASK_XYZW;
2019    inst->shadow_compare = shadow_comparator.file != BAD_FILE;
2020 
2021    inst->src[1] = texture_reg;
2022    inst->src[2] = sampler_reg;
2023 
2024    /* MRF for the first parameter */
2025    int param_base = inst->base_mrf + inst->header_size;
2026 
2027    if (opcode == ELK_SHADER_OPCODE_TXS) {
2028       int writemask = devinfo->ver == 4 ? WRITEMASK_W : WRITEMASK_X;
2029       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2030       inst->mlen++;
2031    } else if (opcode == ELK_SHADER_OPCODE_SAMPLEINFO) {
2032       inst->dst.writemask = WRITEMASK_X;
2033    } else {
2034       /* Load the coordinate */
2035       /* FINISHME: gl_clamp_mask and saturate */
2036       int coord_mask = (1 << instr->coord_components) - 1;
2037       int zero_mask = 0xf & ~coord_mask;
2038 
2039       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2040                coordinate));
2041       inst->mlen++;
2042 
2043       if (zero_mask != 0) {
2044          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2045                   elk_imm_d(0)));
2046       }
2047       /* Load the shadow comparator */
2048       if (shadow_comparator.file != BAD_FILE &&
2049           opcode != ELK_SHADER_OPCODE_TXD &&
2050           opcode != ELK_SHADER_OPCODE_TG4_OFFSET) {
2051 	 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
2052 			  WRITEMASK_X),
2053 		  shadow_comparator));
2054 	 inst->mlen++;
2055       }
2056 
2057       /* Load the LOD info */
2058       switch (opcode) {
2059       case ELK_SHADER_OPCODE_TXL: {
2060 	 int mrf, writemask;
2061 	 if (devinfo->ver >= 5) {
2062 	    mrf = param_base + 1;
2063 	    if (shadow_comparator.file != BAD_FILE) {
2064 	       writemask = WRITEMASK_Y;
2065 	       /* mlen already incremented */
2066 	    } else {
2067 	       writemask = WRITEMASK_X;
2068 	       inst->mlen++;
2069 	    }
2070 	 } else /* devinfo->ver == 4 */ {
2071 	    mrf = param_base;
2072 	    writemask = WRITEMASK_W;
2073 	 }
2074 	 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2075          break;
2076       }
2077 
2078       case ELK_SHADER_OPCODE_TXF:
2079          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2080          break;
2081 
2082       case ELK_SHADER_OPCODE_TXF_CMS:
2083          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2084                   sample_index));
2085          if (devinfo->ver >= 7) {
2086             /* MCS data is in the first channel of `mcs`, but we need to get it into
2087              * the .y channel of the second vec4 of params, so replicate .x across
2088              * the whole vec4 and then mask off everything except .y
2089              */
2090             mcs.swizzle = ELK_SWIZZLE_XXXX;
2091             emit(MOV(dst_reg(MRF, param_base + 1, glsl_uint_type(), WRITEMASK_Y),
2092                      mcs));
2093          }
2094          inst->mlen++;
2095          break;
2096 
2097       case ELK_SHADER_OPCODE_TXD: {
2098          const elk_reg_type type = lod.type;
2099 
2100 	 if (devinfo->ver >= 5) {
2101 	    lod.swizzle = ELK_SWIZZLE4(ELK_SWIZZLE_X,ELK_SWIZZLE_X,ELK_SWIZZLE_Y,ELK_SWIZZLE_Y);
2102 	    lod2.swizzle = ELK_SWIZZLE4(ELK_SWIZZLE_X,ELK_SWIZZLE_X,ELK_SWIZZLE_Y,ELK_SWIZZLE_Y);
2103 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2104 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2105 	    inst->mlen++;
2106 
2107 	    if (nir_tex_instr_dest_size(instr) == 3 ||
2108                 shadow_comparator.file != BAD_FILE) {
2109 	       lod.swizzle = ELK_SWIZZLE_ZZZZ;
2110 	       lod2.swizzle = ELK_SWIZZLE_ZZZZ;
2111 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2112 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2113 	       inst->mlen++;
2114 
2115                if (shadow_comparator.file != BAD_FILE) {
2116                   emit(MOV(dst_reg(MRF, param_base + 2,
2117                                    shadow_comparator.type, WRITEMASK_Z),
2118                            shadow_comparator));
2119                }
2120 	    }
2121 	 } else /* devinfo->ver == 4 */ {
2122 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2123 	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2124 	    inst->mlen += 2;
2125 	 }
2126          break;
2127       }
2128 
2129       case ELK_SHADER_OPCODE_TG4_OFFSET:
2130          if (shadow_comparator.file != BAD_FILE) {
2131             emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
2132                      shadow_comparator));
2133          }
2134 
2135          emit(MOV(dst_reg(MRF, param_base + 1, glsl_ivec2_type(), WRITEMASK_XY),
2136                   offset_value));
2137          inst->mlen++;
2138          break;
2139 
2140       default:
2141          break;
2142       }
2143    }
2144 
2145    emit(inst);
2146 
2147    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2148     * spec requires layers.
2149     */
2150    if (instr->op == nir_texop_txs && devinfo->ver < 7) {
2151       /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */
2152       emit_minmax(ELK_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
2153                   src_reg(inst->dst), elk_imm_d(1));
2154    }
2155 
2156    if (instr->op == nir_texop_query_levels) {
2157       /* # levels is in .w */
2158       src_reg swizzled(dest);
2159       swizzled.swizzle = ELK_SWIZZLE4(ELK_SWIZZLE_W, ELK_SWIZZLE_W,
2160                                       ELK_SWIZZLE_W, ELK_SWIZZLE_W);
2161       emit(MOV(dest, swizzled));
2162    }
2163 }
2164 
2165 src_reg
emit_mcs_fetch(const glsl_type * coordinate_type,src_reg coordinate,src_reg surface)2166 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2167                              src_reg coordinate, src_reg surface)
2168 {
2169    vec4_instruction *inst =
2170       new(mem_ctx) vec4_instruction(ELK_SHADER_OPCODE_TXF_MCS,
2171                                     dst_reg(this, glsl_uvec4_type()));
2172    inst->base_mrf = 2;
2173    inst->src[1] = surface;
2174    inst->src[2] = elk_imm_ud(0); /* sampler */
2175    inst->mlen = 1;
2176 
2177    const int param_base = inst->base_mrf;
2178 
2179    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2180    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2181    int zero_mask = 0xf & ~coord_mask;
2182 
2183    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2184             coordinate));
2185 
2186    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2187             elk_imm_d(0)));
2188 
2189    emit(inst);
2190    return src_reg(inst->dst);
2191 }
2192 
2193 void
nir_emit_undef(nir_undef_instr * instr)2194 vec4_visitor::nir_emit_undef(nir_undef_instr *instr)
2195 {
2196    nir_ssa_values[instr->def.index] =
2197       dst_reg(VGRF, alloc.allocate(DIV_ROUND_UP(instr->def.bit_size, 32)));
2198 }
2199 
2200 /* SIMD4x2 64bit data is stored in register space like this:
2201  *
2202  * r0.0:DF  x0 y0 z0 w0
2203  * r1.0:DF  x1 y1 z1 w1
2204  *
2205  * When we need to write data such as this to memory using 32-bit write
2206  * messages we need to shuffle it in this fashion:
2207  *
2208  * r0.0:DF  x0 y0 x1 y1 (to be written at base offset)
2209  * r0.0:DF  z0 w0 z1 w1 (to be written at base offset + 16)
2210  *
2211  * We need to do the inverse operation when we read using 32-bit messages,
2212  * which we can do by applying the same exact shuffling on the 64-bit data
2213  * read, only that because the data for each vertex is positioned differently
2214  * we need to apply different channel enables.
2215  *
2216  * This function takes 64bit data and shuffles it as explained above.
2217  *
2218  * The @for_write parameter is used to specify if the shuffling is being done
2219  * for proper SIMD4x2 64-bit data that needs to be shuffled prior to a 32-bit
2220  * write message (for_write = true), or instead we are doing the inverse
2221  * operation and we have just read 64-bit data using a 32-bit messages that we
2222  * need to shuffle to create valid SIMD4x2 64-bit data (for_write = false).
2223  *
2224  * If @block and @ref are non-NULL, then the shuffling is done after @ref,
2225  * otherwise the instructions are emitted normally at the end. The function
2226  * returns the last instruction inserted.
2227  *
2228  * Notice that @src and @dst cannot be the same register.
2229  */
2230 vec4_instruction *
shuffle_64bit_data(dst_reg dst,src_reg src,bool for_write,bool for_scratch,elk_bblock_t * block,vec4_instruction * ref)2231 vec4_visitor::shuffle_64bit_data(dst_reg dst, src_reg src, bool for_write,
2232                                  bool for_scratch,
2233                                  elk_bblock_t *block, vec4_instruction *ref)
2234 {
2235    assert(type_sz(src.type) == 8);
2236    assert(type_sz(dst.type) == 8);
2237    assert(!regions_overlap(dst, 2 * REG_SIZE, src, 2 * REG_SIZE));
2238    assert(!ref == !block);
2239 
2240    elk_opcode mov_op = for_scratch ? ELK_VEC4_OPCODE_MOV_FOR_SCRATCH : ELK_OPCODE_MOV;
2241 
2242    const vec4_builder bld = !ref ? vec4_builder(this).at_end() :
2243                                    vec4_builder(this).at(block, ref->next);
2244 
2245    /* Resolve swizzle in src */
2246    if (src.swizzle != ELK_SWIZZLE_XYZW) {
2247       dst_reg data = dst_reg(this, glsl_dvec4_type());
2248       bld.emit(mov_op, data, src);
2249       src = src_reg(data);
2250    }
2251 
2252    /* dst+0.XY = src+0.XY */
2253    bld.group(4, 0).emit(mov_op, writemask(dst, WRITEMASK_XY), src);
2254 
2255    /* dst+0.ZW = src+1.XY */
2256    bld.group(4, for_write ? 1 : 0)
2257             .emit(mov_op, writemask(dst, WRITEMASK_ZW),
2258                   swizzle(byte_offset(src, REG_SIZE), ELK_SWIZZLE_XYXY));
2259 
2260    /* dst+1.XY = src+0.ZW */
2261    bld.group(4, for_write ? 0 : 1)
2262             .emit(mov_op, writemask(byte_offset(dst, REG_SIZE), WRITEMASK_XY),
2263                   swizzle(src, ELK_SWIZZLE_ZWZW));
2264 
2265    /* dst+1.ZW = src+1.ZW */
2266    return bld.group(4, 1)
2267             .emit(mov_op, writemask(byte_offset(dst, REG_SIZE), WRITEMASK_ZW),
2268                   byte_offset(src, REG_SIZE));
2269 }
2270 
2271 }
2272