1 /*
2  * Copyright (c) 2014 Scott Mansell
3  * Copyright © 2014 Broadcom
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 
25 #include <inttypes.h>
26 #include "util/format/u_format.h"
27 #include "util/crc32.h"
28 #include "util/perf/cpu_trace.h"
29 #include "util/u_helpers.h"
30 #include "util/u_math.h"
31 #include "util/u_memory.h"
32 #include "util/ralloc.h"
33 #include "util/hash_table.h"
34 #include "tgsi/tgsi_dump.h"
35 #include "compiler/glsl_types.h"
36 #include "compiler/nir/nir.h"
37 #include "compiler/nir/nir_builder.h"
38 #include "nir/tgsi_to_nir.h"
39 #include "vc4_context.h"
40 #include "vc4_qpu.h"
41 #include "vc4_qir.h"
42 
43 static struct qreg
44 ntq_get_src(struct vc4_compile *c, nir_src src, int i);
45 static void
46 ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list);
47 
48 static struct vc4_compiled_shader *
49 vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
50                         struct vc4_key *key);
51 
52 static int
53 type_size(const struct glsl_type *type, bool bindless)
54 {
55    return glsl_count_attribute_slots(type, false);
56 }
57 
58 static void
59 resize_qreg_array(struct vc4_compile *c,
60                   struct qreg **regs,
61                   uint32_t *size,
62                   uint32_t decl_size)
63 {
64         if (*size >= decl_size)
65                 return;
66 
67         uint32_t old_size = *size;
68         *size = MAX2(*size * 2, decl_size);
69         *regs = reralloc(c, *regs, struct qreg, *size);
70         if (!*regs) {
71                 fprintf(stderr, "Malloc failure\n");
72                 abort();
73         }
74 
75         for (uint32_t i = old_size; i < *size; i++)
76                 (*regs)[i] = c->undef;
77 }
78 
79 static void
80 ntq_emit_thrsw(struct vc4_compile *c)
81 {
82         if (!c->fs_threaded)
83                 return;
84 
85         /* Always thread switch after each texture operation for now.
86          *
87          * We could do better by batching a bunch of texture fetches up and
88          * then doing one thread switch and collecting all their results
89          * afterward.
90          */
91         qir_emit_nondef(c, qir_inst(QOP_THRSW, c->undef,
92                                     c->undef, c->undef));
93         c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
94 }
95 
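/**
 * Loads an indirectly-addressed uniform (UBO 0) through the TMU's direct
 * lookup path: the byte offset is clamped to the declared range, added to
 * the UBO base address, written to the texture S register, and the 32-bit
 * result is then read back from the TMU FIFO.
 */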
96 static struct qreg
97 indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
98 {
99         struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);
100 
101         /* Clamp to [0, array size).  Note that MIN/MAX are signed. */
102         uint32_t range = nir_intrinsic_range(intr);
103         indirect_offset = qir_MAX(c, indirect_offset, qir_uniform_ui(c, 0));
104         indirect_offset = qir_MIN_NOIMM(c, indirect_offset,
105                                         qir_uniform_ui(c, range - 4));
106 
107         qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
108                      indirect_offset,
109                      qir_uniform(c, QUNIFORM_UBO0_ADDR,
110                                  nir_intrinsic_base(intr)));
111 
112         c->num_texture_samples++;
113 
114         ntq_emit_thrsw(c);
115 
116         return qir_TEX_RESULT(c);
117 }
118 
119 static struct qreg
120 vc4_ubo_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
121 {
122         ASSERTED int buffer_index = nir_src_as_uint(intr->src[0]);
123         assert(buffer_index == 1);
124         assert(c->stage == QSTAGE_FRAG);
125 
126         struct qreg offset = ntq_get_src(c, intr->src[1], 0);
127 
128         /* Clamp to [0, array size).  Note that MIN/MAX are signed. */
129         offset = qir_MAX(c, offset, qir_uniform_ui(c, 0));
130         offset = qir_MIN_NOIMM(c, offset,
131                                qir_uniform_ui(c, c->fs_key->ubo_1_size - 4));
132 
133         qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
134                      offset,
135                      qir_uniform(c, QUNIFORM_UBO1_ADDR, 0));
136 
137         c->num_texture_samples++;
138 
139         ntq_emit_thrsw(c);
140 
141         return qir_TEX_RESULT(c);
142 }
143 
144 nir_def *
145 vc4_nir_get_swizzled_channel(nir_builder *b, nir_def **srcs, int swiz)
146 {
147         switch (swiz) {
148         default:
149         case PIPE_SWIZZLE_NONE:
150                 fprintf(stderr, "warning: unknown swizzle\n");
151                 FALLTHROUGH;
152         case PIPE_SWIZZLE_0:
153                 return nir_imm_float(b, 0.0);
154         case PIPE_SWIZZLE_1:
155                 return nir_imm_float(b, 1.0);
156         case PIPE_SWIZZLE_X:
157         case PIPE_SWIZZLE_Y:
158         case PIPE_SWIZZLE_Z:
159         case PIPE_SWIZZLE_W:
160                 return srcs[swiz];
161         }
162 }
163 
164 static struct qreg *
165 ntq_init_ssa_def(struct vc4_compile *c, nir_def *def)
166 {
167         struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
168                                           def->num_components);
169         _mesa_hash_table_insert(c->def_ht, def, qregs);
170         return qregs;
171 }
172 
173 /**
174  * This function is responsible for getting QIR results into the associated
175  * storage for a NIR instruction.
176  *
177  * If it's a NIR SSA def, then we just set the associated hash table entry to
178  * the new result.
179  *
180  * If it's a NIR reg, then we need to update the existing qreg assigned to the
181  * NIR destination with the incoming value.  To do that without introducing
182  * new MOVs, we require that the incoming qreg either be a uniform, or be
183  * SSA-defined by the previous QIR instruction in the block and rewritable by
184  * this function.  That lets us sneak ahead and insert the SF flag beforehand
185  * (knowing that the previous instruction doesn't depend on flags) and rewrite
186  * its destination to be the NIR reg's destination
187  */
188 static void
189 ntq_store_def(struct vc4_compile *c, nir_def *def, int chan,
190               struct qreg result)
191 {
192         struct qinst *last_inst = NULL;
193         if (!list_is_empty(&c->cur_block->instructions))
194                 last_inst = (struct qinst *)c->cur_block->instructions.prev;
195 
196         assert(result.file == QFILE_UNIF ||
197                (result.file == QFILE_TEMP &&
198                 last_inst && last_inst == c->defs[result.index]));
199 
200         nir_intrinsic_instr *store = nir_store_reg_for_def(def);
201         if (store == NULL) {
202                 assert(chan < def->num_components);
203 
204                 struct qreg *qregs;
205                 struct hash_entry *entry =
206                         _mesa_hash_table_search(c->def_ht, def);
207 
208                 if (entry)
209                         qregs = entry->data;
210                 else
211                         qregs = ntq_init_ssa_def(c, def);
212 
213                 qregs[chan] = result;
214         } else {
215                 nir_def *reg = store->src[1].ssa;
216                 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
217                 assert(nir_intrinsic_base(store) == 0);
218                 assert(nir_intrinsic_num_array_elems(decl) == 0);
219                 struct hash_entry *entry =
220                         _mesa_hash_table_search(c->def_ht, reg);
221                 struct qreg *qregs = entry->data;
222 
223                 /* Insert a MOV if the source wasn't an SSA def in the
224                  * previous instruction.
225                  */
226                 if (result.file == QFILE_UNIF) {
227                         result = qir_MOV(c, result);
228                         last_inst = c->defs[result.index];
229                 }
230 
231                 /* We know they're both temps, so just rewrite index. */
232                 c->defs[last_inst->dst.index] = NULL;
233                 last_inst->dst.index = qregs[chan].index;
234 
235                 /* If we're in control flow, then make this update of the reg
236                  * conditional on the execution mask.
237                  */
238                 if (c->execute.file != QFILE_NULL) {
239                         last_inst->dst.index = qregs[chan].index;
240 
241                         /* Set the flags to the current exec mask.  To insert
242                          * the SF, we temporarily remove our SSA instruction.
243                          */
244                         list_del(&last_inst->link);
245                         qir_SF(c, c->execute);
246                         list_addtail(&last_inst->link,
247                                      &c->cur_block->instructions);
248 
249                         last_inst->cond = QPU_COND_ZS;
250                         last_inst->cond_is_exec_mask = true;
251                 }
252         }
253 }
254 
255 static struct qreg
256 ntq_get_src(struct vc4_compile *c, nir_src src, int i)
257 {
258         struct hash_entry *entry;
259 
260         nir_intrinsic_instr *load = nir_load_reg_for_def(src.ssa);
261         if (load == NULL) {
262                 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
263                 assert(i < src.ssa->num_components);
264         } else {
265                 nir_def *reg = load->src[0].ssa;
266                 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
267                 assert(nir_intrinsic_base(load) == 0);
268                 assert(nir_intrinsic_num_array_elems(decl) == 0);
269                 entry = _mesa_hash_table_search(c->def_ht, reg);
270                 assert(i < nir_intrinsic_num_components(decl));
271         }
272 
273         struct qreg *qregs = entry->data;
274         return qregs[i];
275 }
276 
277 static struct qreg
278 ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr,
279                 unsigned src)
280 {
281         struct qreg r = ntq_get_src(c, instr->src[src].src,
282                                     instr->src[src].swizzle[0]);
283 
284         return r;
285 };
286 
287 static inline struct qreg
288 qir_SAT(struct vc4_compile *c, struct qreg val)
289 {
290         return qir_FMAX(c,
291                         qir_FMIN(c, val, qir_uniform_f(c, 1.0)),
292                         qir_uniform_f(c, 0.0));
293 }
294 
295 static struct qreg
296 ntq_rcp(struct vc4_compile *c, struct qreg x)
297 {
298         struct qreg r = qir_RCP(c, x);
299 
300         /* Apply a Newton-Raphson step to improve the accuracy. */
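        /* r' = r * (2 - x*r): one iteration roughly doubles the number of
         * correct bits in the hardware's low-precision estimate.
         */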
301         r = qir_FMUL(c, r, qir_FSUB(c,
302                                     qir_uniform_f(c, 2.0),
303                                     qir_FMUL(c, x, r)));
304 
305         return r;
306 }
307 
308 static struct qreg
309 ntq_rsq(struct vc4_compile *c, struct qreg x)
310 {
311         struct qreg r = qir_RSQ(c, x);
312 
313         /* Apply a Newton-Raphson step to improve the accuracy. */
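        /* r' = r * (1.5 - 0.5*x*r*r), the standard Newton-Raphson refinement
         * step for 1/sqrt(x).
         */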
314         r = qir_FMUL(c, r, qir_FSUB(c,
315                                     qir_uniform_f(c, 1.5),
316                                     qir_FMUL(c,
317                                              qir_uniform_f(c, 0.5),
318                                              qir_FMUL(c, x,
319                                                       qir_FMUL(c, r, r)))));
320 
321         return r;
322 }
323 
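/**
 * Implements a full 32-bit integer multiply using the QPU's 24-bit
 * multiplier:
 *
 *     src0 * src1 = lolo + ((hilo + lohi) << 24)
 *
 * where the "hi" terms use the top 8 bits of one operand (shifted down by
 * 24) against the other operand.  Partial products above bit 31 are simply
 * dropped, which matches 32-bit wraparound semantics.
 */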
324 static struct qreg
325 ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1)
326 {
327         struct qreg src0_hi = qir_SHR(c, src0,
328                                       qir_uniform_ui(c, 24));
329         struct qreg src1_hi = qir_SHR(c, src1,
330                                       qir_uniform_ui(c, 24));
331 
332         struct qreg hilo = qir_MUL24(c, src0_hi, src1);
333         struct qreg lohi = qir_MUL24(c, src0, src1_hi);
334         struct qreg lolo = qir_MUL24(c, src0, src1);
335 
336         return qir_ADD(c, lolo, qir_SHL(c,
337                                         qir_ADD(c, hilo, lohi),
338                                         qir_uniform_ui(c, 24)));
339 }
340 
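/**
 * Converts the raw depth value returned by the TMU (a 24-bit integer in the
 * top bits of the 32-bit result) to a float in [0.0, 1.0].
 */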
341 static struct qreg
342 ntq_scale_depth_texture(struct vc4_compile *c, struct qreg src)
343 {
344         struct qreg depthf = qir_ITOF(c, qir_SHR(c, src,
345                                                  qir_uniform_ui(c, 8)));
346         return qir_FMUL(c, depthf, qir_uniform_f(c, 1.0f/0xffffff));
347 }
348 
349 /**
350  * Emits a lowered TXF_MS from an MSAA texture.
351  *
352  * The addressing math has been lowered in NIR, and now we just need to read
353  * it like a UBO.
354  */
355 static void
356 ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
357 {
358         uint32_t tile_width = 32;
359         uint32_t tile_height = 32;
360         uint32_t tile_size = (tile_height * tile_width *
361                               VC4_MAX_SAMPLES * sizeof(uint32_t));
362 
363         unsigned unit = instr->texture_index;
364         uint32_t w = align(c->key->tex[unit].msaa_width, tile_width);
365         uint32_t w_tiles = w / tile_width;
366         uint32_t h = align(c->key->tex[unit].msaa_height, tile_height);
367         uint32_t h_tiles = h / tile_height;
368         uint32_t size = w_tiles * h_tiles * tile_size;
369 
370         struct qreg addr;
371         assert(instr->num_srcs == 1);
372         assert(instr->src[0].src_type == nir_tex_src_coord);
373         addr = ntq_get_src(c, instr->src[0].src, 0);
374 
375         /* Perform the clamping required by kernel validation. */
376         addr = qir_MAX(c, addr, qir_uniform_ui(c, 0));
377         addr = qir_MIN_NOIMM(c, addr, qir_uniform_ui(c, size - 4));
378 
379         qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
380                      addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit));
381 
382         ntq_emit_thrsw(c);
383 
384         struct qreg tex = qir_TEX_RESULT(c);
385         c->num_texture_samples++;
386 
387         enum pipe_format format = c->key->tex[unit].format;
388         if (util_format_is_depth_or_stencil(format)) {
389                 struct qreg scaled = ntq_scale_depth_texture(c, tex);
390                 for (int i = 0; i < 4; i++)
391                         ntq_store_def(c, &instr->def, i, qir_MOV(c, scaled));
392         } else {
393                 for (int i = 0; i < 4; i++)
394                         ntq_store_def(c, &instr->def, i,
395                                       qir_UNPACK_8_F(c, tex, i));
396         }
397 }
398 
399 static void
400 ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
401 {
402         struct qreg s, t, r, lod, compare;
403         bool is_txb = false, is_txl = false;
404         unsigned unit = instr->texture_index;
405 
406         if (instr->op == nir_texop_txf) {
407                 ntq_emit_txf(c, instr);
408                 return;
409         }
410 
411         for (unsigned i = 0; i < instr->num_srcs; i++) {
412                 switch (instr->src[i].src_type) {
413                 case nir_tex_src_coord:
414                         s = ntq_get_src(c, instr->src[i].src, 0);
415                         if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D)
416                                 t = qir_uniform_f(c, 0.5);
417                         else
418                                 t = ntq_get_src(c, instr->src[i].src, 1);
419                         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
420                                 r = ntq_get_src(c, instr->src[i].src, 2);
421                         break;
422                 case nir_tex_src_bias:
423                         lod = ntq_get_src(c, instr->src[i].src, 0);
424                         is_txb = true;
425                         break;
426                 case nir_tex_src_lod:
427                         lod = ntq_get_src(c, instr->src[i].src, 0);
428                         is_txl = true;
429                         break;
430                 case nir_tex_src_comparator:
431                         compare = ntq_get_src(c, instr->src[i].src, 0);
432                         break;
433                 default:
434                         unreachable("unknown texture source");
435                 }
436         }
437 
438         if (c->stage != QSTAGE_FRAG && !is_txl) {
439                 /* From the GLSL 1.20 spec:
440                  *
441                  *     "If it is mip-mapped and running on the vertex shader,
442                  *      then the base texture is used."
443                  */
444                 is_txl = true;
445                 lod = qir_uniform_ui(c, 0);
446         }
447 
448         if (c->key->tex[unit].force_first_level) {
449                 lod = qir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, unit);
450                 is_txl = true;
451                 is_txb = false;
452         }
453 
454         struct qreg texture_u[] = {
455                 qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0, unit),
456                 qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, unit),
457                 qir_uniform(c, QUNIFORM_CONSTANT, 0),
458                 qir_uniform(c, QUNIFORM_CONSTANT, 0),
459         };
460         uint32_t next_texture_u = 0;
461 
462         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE || is_txl) {
463                 texture_u[2] = qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P2,
464                                            unit | (is_txl << 16));
465         }
466 
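        /* Texture lookups are set up by writing the coordinate registers;
         * the S coordinate must be written last, since that write is what
         * actually fires off the TMU request.
         */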
467         struct qinst *tmu;
468         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
469                 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), r);
470                 tmu->src[qir_get_tex_uniform_src(tmu)] =
471                         texture_u[next_texture_u++];
472         } else if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
473                    c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP ||
474                    c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
475                    c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
476                 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0),
477                                    qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR,
478                                                unit));
479                 tmu->src[qir_get_tex_uniform_src(tmu)] =
480                         texture_u[next_texture_u++];
481         }
482 
483         if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP) {
484                 s = qir_SAT(c, s);
485         }
486 
487         if (c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
488                 t = qir_SAT(c, t);
489         }
490 
491         tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_T, 0), t);
492         tmu->src[qir_get_tex_uniform_src(tmu)] =
493                 texture_u[next_texture_u++];
494 
495         if (is_txl || is_txb) {
496                 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_B, 0), lod);
497                 tmu->src[qir_get_tex_uniform_src(tmu)] =
498                         texture_u[next_texture_u++];
499         }
500 
501         tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_S, 0), s);
502         tmu->src[qir_get_tex_uniform_src(tmu)] = texture_u[next_texture_u++];
503 
504         c->num_texture_samples++;
505 
506         ntq_emit_thrsw(c);
507 
508         struct qreg tex = qir_TEX_RESULT(c);
509 
510         enum pipe_format format = c->key->tex[unit].format;
511 
512         if (util_format_is_depth_or_stencil(format)) {
513                 struct qreg normalized = ntq_scale_depth_texture(c, tex);
514                 struct qreg depth_output;
515 
516                 struct qreg u0 = qir_uniform_f(c, 0.0f);
517                 struct qreg u1 = qir_uniform_f(c, 1.0f);
518                 if (c->key->tex[unit].compare_mode) {
519                         /* From the GL_ARB_shadow spec:
520                          *
521                          *     "Let Dt (D subscript t) be the depth texture
522                          *      value, in the range [0, 1].  Let R be the
523                          *      interpolated texture coordinate clamped to the
524                          *      range [0, 1]."
525                          */
526                         compare = qir_SAT(c, compare);
527 
528                         switch (c->key->tex[unit].compare_func) {
529                         case PIPE_FUNC_NEVER:
530                                 depth_output = qir_uniform_f(c, 0.0f);
531                                 break;
532                         case PIPE_FUNC_ALWAYS:
533                                 depth_output = u1;
534                                 break;
535                         case PIPE_FUNC_EQUAL:
536                                 qir_SF(c, qir_FSUB(c, compare, normalized));
537                                 depth_output = qir_SEL(c, QPU_COND_ZS, u1, u0);
538                                 break;
539                         case PIPE_FUNC_NOTEQUAL:
540                                 qir_SF(c, qir_FSUB(c, compare, normalized));
541                                 depth_output = qir_SEL(c, QPU_COND_ZC, u1, u0);
542                                 break;
543                         case PIPE_FUNC_GREATER:
544                                 qir_SF(c, qir_FSUB(c, compare, normalized));
545                                 depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
546                                 break;
547                         case PIPE_FUNC_GEQUAL:
548                                 qir_SF(c, qir_FSUB(c, normalized, compare));
549                                 depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
550                                 break;
551                         case PIPE_FUNC_LESS:
552                                 qir_SF(c, qir_FSUB(c, compare, normalized));
553                                 depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
554                                 break;
555                         case PIPE_FUNC_LEQUAL:
556                                 qir_SF(c, qir_FSUB(c, normalized, compare));
557                                 depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
558                                 break;
559                         }
560                 } else {
561                         depth_output = normalized;
562                 }
563 
564                 for (int i = 0; i < 4; i++)
565                         ntq_store_def(c, &instr->def, i,
566                                       qir_MOV(c, depth_output));
567         } else {
568                 for (int i = 0; i < 4; i++)
569                         ntq_store_def(c, &instr->def, i,
570                                       qir_UNPACK_8_F(c, tex, i));
571         }
572 }
573 
574 /**
575  * Computes x - floor(x), which is tricky because our FTOI truncates (rounds
576  * to zero).
577  */
578 static struct qreg
579 ntq_ffract(struct vc4_compile *c, struct qreg src)
580 {
581         struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
582         struct qreg diff = qir_FSUB(c, src, trunc);
583         qir_SF(c, diff);
584 
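        /* FTOI rounds toward zero, so for negative inputs diff is negative
         * (e.g. -1.25 - (-1.0) = -0.25); adding 1.0 under the negative-flag
         * condition produces the expected fract() result of 0.75.
         */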
585         qir_FADD_dest(c, diff,
586                       diff, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
587 
588         return qir_MOV(c, diff);
589 }
590 
591 /**
592  * Computes floor(x), which is tricky because our FTOI truncates (rounds to
593  * zero).
594  */
595 static struct qreg
596 ntq_ffloor(struct vc4_compile *c, struct qreg src)
597 {
598         struct qreg result = qir_ITOF(c, qir_FTOI(c, src));
599 
600         /* This will be < 0 if we truncated and the truncation was of a value
601          * that was < 0 in the first place.
602          */
603         qir_SF(c, qir_FSUB(c, src, result));
604 
605         struct qinst *sub = qir_FSUB_dest(c, result,
606                                           result, qir_uniform_f(c, 1.0));
607         sub->cond = QPU_COND_NS;
608 
609         return qir_MOV(c, result);
610 }
611 
612 /**
613  * Computes ceil(x), which is tricky because our FTOI truncates (rounds to
614  * zero).
615  */
616 static struct qreg
617 ntq_fceil(struct vc4_compile *c, struct qreg src)
618 {
619         struct qreg result = qir_ITOF(c, qir_FTOI(c, src));
620 
621         /* This will be < 0 if we truncated and the truncation was of a value
622          * that was > 0 in the first place.
623          */
624         qir_SF(c, qir_FSUB(c, result, src));
625 
626         qir_FADD_dest(c, result,
627                       result, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
628 
629         return qir_MOV(c, result);
630 }
631 
632 static struct qreg
633 ntq_shrink_sincos_input_range(struct vc4_compile *c, struct qreg x)
634 {
635         /* Since we're using a Taylor approximation, we want to have a small
636          * number of coefficients and take advantage of sin/cos repeating
637          * every 2pi.  We keep our x as close to 0 as we can, since the series
638          * will be less accurate as |x| increases.  (Also, be careful about
639          * shifting the input x value to exploit sin/cos identities, because
640          * getting accurate values for x==0 is very important for SDL
641          * rendering.)
642          */
643         struct qreg scaled_x =
644                 qir_FMUL(c, x,
645                          qir_uniform_f(c, 1.0f / (M_PI * 2.0f)));
646         /* Note: FTOI truncates toward 0. */
647         struct qreg x_frac = qir_FSUB(c, scaled_x,
648                                       qir_ITOF(c, qir_FTOI(c, scaled_x)));
649         /* Map [0.5, 1] to [-0.5, 0] */
650         qir_SF(c, qir_FSUB(c, x_frac, qir_uniform_f(c, 0.5)));
651         qir_FSUB_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NC;
652         /* Map [-1, -0.5] to [0, 0.5] */
653         qir_SF(c, qir_FADD(c, x_frac, qir_uniform_f(c, 0.5)));
654         qir_FADD_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
655 
656         return x_frac;
657 }
658 
659 static struct qreg
660 ntq_fsin(struct vc4_compile *c, struct qreg src)
661 {
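        /* Odd-order Taylor coefficients for sin(2*pi*x) around 0:
         * (-1)^n * (2*pi)^(2n+1) / (2n+1)!
         */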
662         float coeff[] = {
663                 2.0 * M_PI,
664                 -pow(2.0 * M_PI, 3) / (3 * 2 * 1),
665                 pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1),
666                 -pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1),
667                 pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
668         };
669 
670         struct qreg x = ntq_shrink_sincos_input_range(c, src);
671         struct qreg x2 = qir_FMUL(c, x, x);
672         struct qreg sum = qir_FMUL(c, x, qir_uniform_f(c, coeff[0]));
673         for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
674                 x = qir_FMUL(c, x, x2);
675                 sum = qir_FADD(c,
676                                sum,
677                                qir_FMUL(c,
678                                         x,
679                                         qir_uniform_f(c, coeff[i])));
680         }
681         return sum;
682 }
683 
684 static struct qreg
685 ntq_fcos(struct vc4_compile *c, struct qreg src)
686 {
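        /* Even-order Taylor coefficients for cos(2*pi*x) around 0:
         * (-1)^n * (2*pi)^(2n) / (2n)!
         */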
687         float coeff[] = {
688                 1.0f,
689                 -pow(2.0 * M_PI, 2) / (2 * 1),
690                 pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1),
691                 -pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1),
692                 pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
693                 -pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
694         };
695 
696         struct qreg x_frac = ntq_shrink_sincos_input_range(c, src);
697         struct qreg sum = qir_uniform_f(c, coeff[0]);
698         struct qreg x2 = qir_FMUL(c, x_frac, x_frac);
699         struct qreg x = x2; /* Current x^2, x^4, or x^6 */
700         for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
701                 if (i != 1)
702                         x = qir_FMUL(c, x, x2);
703 
704                 sum = qir_FADD(c, qir_FMUL(c,
705                                            x,
706                                            qir_uniform_f(c, coeff[i])),
707                                sum);
708         }
709         return sum;
710 }
711 
712 static struct qreg
713 ntq_fsign(struct vc4_compile *c, struct qreg src)
714 {
715         struct qreg t = qir_get_temp(c);
716 
717         qir_SF(c, src);
718         qir_MOV_dest(c, t, qir_uniform_f(c, 0.0));
719         qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC;
720         qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS;
721         return qir_MOV(c, t);
722 }
723 
724 static void
725 emit_vertex_input(struct vc4_compile *c, int attr)
726 {
727         enum pipe_format format = c->vs_key->attr_formats[attr];
728         uint32_t attr_size = util_format_get_blocksize(format);
729 
730         c->vattr_sizes[attr] = align(attr_size, 4);
731         for (int i = 0; i < align(attr_size, 4) / 4; i++) {
732                 c->inputs[attr * 4 + i] =
733                         qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i));
734                 c->num_inputs++;
735         }
736 }
737 
738 static void
739 emit_fragcoord_input(struct vc4_compile *c, int attr)
740 {
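        /* gl_FragCoord: X and Y come from the fixed-function pixel
         * coordinates, Z is the 24-bit depth payload normalized to [0, 1],
         * and W is the reciprocal of the interpolated W payload.
         */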
741         c->inputs[attr * 4 + 0] = qir_ITOF(c, qir_reg(QFILE_FRAG_X, 0));
742         c->inputs[attr * 4 + 1] = qir_ITOF(c, qir_reg(QFILE_FRAG_Y, 0));
743         c->inputs[attr * 4 + 2] =
744                 qir_FMUL(c,
745                          qir_ITOF(c, qir_FRAG_Z(c)),
746                          qir_uniform_f(c, 1.0 / 0xffffff));
747         c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c));
748 }
749 
750 static struct qreg
751 emit_fragment_varying(struct vc4_compile *c, gl_varying_slot slot,
752                       uint8_t swizzle)
753 {
754         uint32_t i = c->num_input_slots++;
755         struct qreg vary = {
756                 QFILE_VARY,
757                 i
758         };
759 
760         if (c->num_input_slots >= c->input_slots_array_size) {
761                 c->input_slots_array_size =
762                         MAX2(4, c->input_slots_array_size * 2);
763 
764                 c->input_slots = reralloc(c, c->input_slots,
765                                           struct vc4_varying_slot,
766                                           c->input_slots_array_size);
767         }
768 
769         c->input_slots[i].slot = slot;
770         c->input_slots[i].swizzle = swizzle;
771 
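        /* The varying read from the VARY file is multiplied by the
         * fragment's W and the hardware-supplied C coefficient is added
         * (qir_VARY_ADD_C), giving the perspective-correct interpolated
         * value.
         */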
772         return qir_VARY_ADD_C(c, qir_FMUL(c, vary, qir_FRAG_W(c)));
773 }
774 
775 static void
776 emit_fragment_input(struct vc4_compile *c, int attr, gl_varying_slot slot)
777 {
778         for (int i = 0; i < 4; i++) {
779                 c->inputs[attr * 4 + i] =
780                         emit_fragment_varying(c, slot, i);
781                 c->num_inputs++;
782         }
783 }
784 
785 static void
786 add_output(struct vc4_compile *c,
787            uint32_t decl_offset,
788            uint8_t slot,
789            uint8_t swizzle)
790 {
791         uint32_t old_array_size = c->outputs_array_size;
792         resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
793                           decl_offset + 1);
794 
795         if (old_array_size != c->outputs_array_size) {
796                 c->output_slots = reralloc(c,
797                                            c->output_slots,
798                                            struct vc4_varying_slot,
799                                            c->outputs_array_size);
800         }
801 
802         c->output_slots[decl_offset].slot = slot;
803         c->output_slots[decl_offset].swizzle = swizzle;
804 }
805 
806 static bool
807 ntq_src_is_only_ssa_def_user(nir_src *src)
808 {
809         return list_is_singular(&src->ssa->uses) &&
810                nir_load_reg_for_def(src->ssa) == NULL;
811 }
812 
813 /**
814  * In general, emits a nir_pack_unorm_4x8 as a series of MOVs with the pack
815  * bit set.
816  *
817  * However, as an optimization, it tries to find the instructions generating
818  * the sources to be packed and just emit the pack flag there, if possible.
819  */
820 static void
821 ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
822 {
823         struct qreg result = qir_get_temp(c);
824         struct nir_alu_instr *vec4 = NULL;
825 
826         /* If packing from a vec4 op (as expected), identify it so that we can
827          * peek back at what generated its sources.
828          */
829         if (instr->src[0].src.ssa->parent_instr->type == nir_instr_type_alu &&
830             nir_instr_as_alu(instr->src[0].src.ssa->parent_instr)->op ==
831             nir_op_vec4) {
832                 vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
833         }
834 
835         /* If the pack is replicating the same channel 4 times, use the 8888
836          * pack flag.  This is common for blending using the alpha
837          * channel.
838          */
839         if (instr->src[0].swizzle[0] == instr->src[0].swizzle[1] &&
840             instr->src[0].swizzle[0] == instr->src[0].swizzle[2] &&
841             instr->src[0].swizzle[0] == instr->src[0].swizzle[3]) {
842                 struct qreg rep = ntq_get_src(c,
843                                               instr->src[0].src,
844                                               instr->src[0].swizzle[0]);
845                 ntq_store_def(c, &instr->def, 0, qir_PACK_8888_F(c, rep));
846                 return;
847         }
848 
849         for (int i = 0; i < 4; i++) {
850                 int swiz = instr->src[0].swizzle[i];
851                 struct qreg src;
852                 if (vec4) {
853                         src = ntq_get_src(c, vec4->src[swiz].src,
854                                           vec4->src[swiz].swizzle[0]);
855                 } else {
856                         src = ntq_get_src(c, instr->src[0].src, swiz);
857                 }
858 
859                 if (vec4 &&
860                     ntq_src_is_only_ssa_def_user(&vec4->src[swiz].src) &&
861                     src.file == QFILE_TEMP &&
862                     c->defs[src.index] &&
863                     qir_is_mul(c->defs[src.index]) &&
864                     !c->defs[src.index]->dst.pack) {
865                         struct qinst *rewrite = c->defs[src.index];
866                         c->defs[src.index] = NULL;
867                         rewrite->dst = result;
868                         rewrite->dst.pack = QPU_PACK_MUL_8A + i;
869                         continue;
870                 }
871 
872                 qir_PACK_8_F(c, result, src, i);
873         }
874 
875         ntq_store_def(c, &instr->def, 0, qir_MOV(c, result));
876 }
877 
878 /** Handles sign-extended bitfield extracts for 16 bits. */
879 static struct qreg
880 ntq_emit_ibfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
881               struct qreg bits)
882 {
883         assert(bits.file == QFILE_UNIF &&
884                c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
885                c->uniform_data[bits.index] == 16);
886 
887         assert(offset.file == QFILE_UNIF &&
888                c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
889         int offset_bit = c->uniform_data[offset.index];
890         assert(offset_bit % 16 == 0);
891 
892         return qir_UNPACK_16_I(c, base, offset_bit / 16);
893 }
894 
895 /** Handles unsigned bitfield extracts for 8 bits. */
896 static struct qreg
897 ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
898               struct qreg bits)
899 {
900         assert(bits.file == QFILE_UNIF &&
901                c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
902                c->uniform_data[bits.index] == 8);
903 
904         assert(offset.file == QFILE_UNIF &&
905                c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
906         int offset_bit = c->uniform_data[offset.index];
907         assert(offset_bit % 8 == 0);
908 
909         return qir_UNPACK_8_I(c, base, offset_bit / 8);
910 }
911 
912 /**
913  * If compare_instr is a valid comparison instruction, emits the
914  * compare_instr's comparison and returns the sel_instr's return value based
915  * on the compare_instr's result.
916  */
917 static bool
918 ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
919                     nir_alu_instr *compare_instr,
920                     nir_alu_instr *sel_instr)
921 {
922         enum qpu_cond cond;
923 
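        /* Each comparison is lowered to a subtract that sets the condition
         * flags: the zero flag handles (in)equality and the negative flag
         * handles the ordered comparisons.
         */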
924         switch (compare_instr->op) {
925         case nir_op_feq32:
926         case nir_op_ieq32:
927         case nir_op_seq:
928                 cond = QPU_COND_ZS;
929                 break;
930         case nir_op_fneu32:
931         case nir_op_ine32:
932         case nir_op_sne:
933                 cond = QPU_COND_ZC;
934                 break;
935         case nir_op_fge32:
936         case nir_op_ige32:
937         case nir_op_uge32:
938         case nir_op_sge:
939                 cond = QPU_COND_NC;
940                 break;
941         case nir_op_flt32:
942         case nir_op_ilt32:
943         case nir_op_ult32:
944         case nir_op_slt:
945                 cond = QPU_COND_NS;
946                 break;
947         default:
948                 return false;
949         }
950 
951         struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
952         struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);
953 
954         unsigned unsized_type =
955                 nir_alu_type_get_base_type(nir_op_infos[compare_instr->op].input_types[0]);
956         if (unsized_type == nir_type_float)
957                 qir_SF(c, qir_FSUB(c, src0, src1));
958         else
959                 qir_SF(c, qir_SUB(c, src0, src1));
960 
961         switch (sel_instr->op) {
962         case nir_op_seq:
963         case nir_op_sne:
964         case nir_op_sge:
965         case nir_op_slt:
966                 *dest = qir_SEL(c, cond,
967                                 qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0));
968                 break;
969 
970         case nir_op_b32csel:
971                 *dest = qir_SEL(c, cond,
972                                 ntq_get_alu_src(c, sel_instr, 1),
973                                 ntq_get_alu_src(c, sel_instr, 2));
974                 break;
975 
976         default:
977                 *dest = qir_SEL(c, cond,
978                                 qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0));
979                 break;
980         }
981 
982         /* Make the temporary for ntq_store_def(). */
983         *dest = qir_MOV(c, *dest);
984 
985         return true;
986 }
987 
988 /**
989  * Attempts to fold a comparison generating a boolean result into the
990  * condition code for selecting between two values, instead of comparing the
991  * boolean result against 0 to generate the condition code.
992  */
993 static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr,
994                                   struct qreg *src)
995 {
996         if (nir_load_reg_for_def(instr->src[0].src.ssa))
997                 goto out;
998         if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
999                 goto out;
1000         nir_alu_instr *compare =
1001                 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
1002         if (!compare)
1003                 goto out;
1004 
1005         struct qreg dest;
1006         if (ntq_emit_comparison(c, &dest, compare, instr))
1007                 return dest;
1008 
1009 out:
1010         qir_SF(c, src[0]);
1011         return qir_MOV(c, qir_SEL(c, QPU_COND_NS, src[1], src[2]));
1012 }
1013 
1014 static struct qreg
1015 ntq_fddx(struct vc4_compile *c, struct qreg src)
1016 {
1017         /* Make sure that we have a bare temp to use for MUL rotation, so it
1018          * can be allocated to an accumulator.
1019          */
1020         if (src.pack || src.file != QFILE_TEMP)
1021                 src = qir_MOV(c, src);
1022 
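        /* Rotating the 16 SIMD lanes by +/-1 pulls in the value computed by
         * the horizontally neighboring pixel of the 2x2 quad; bit 0 of the
         * QPU element index says which side of the quad this pixel is on.
         */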
1023         struct qreg from_left = qir_ROT_MUL(c, src, 1);
1024         struct qreg from_right = qir_ROT_MUL(c, src, 15);
1025 
1026         /* Distinguish left/right pixels of the quad. */
1027         qir_SF(c, qir_AND(c, qir_reg(QFILE_QPU_ELEMENT, 0),
1028                           qir_uniform_ui(c, 1)));
1029 
1030         return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
1031                                   qir_FSUB(c, from_right, src),
1032                                   qir_FSUB(c, src, from_left)));
1033 }
1034 
1035 static struct qreg
1036 ntq_fddy(struct vc4_compile *c, struct qreg src)
1037 {
1038         if (src.pack || src.file != QFILE_TEMP)
1039                 src = qir_MOV(c, src);
1040 
1041         struct qreg from_bottom = qir_ROT_MUL(c, src, 2);
1042         struct qreg from_top = qir_ROT_MUL(c, src, 14);
1043 
1044         /* Distinguish top/bottom pixels of the quad. */
1045         qir_SF(c, qir_AND(c,
1046                           qir_reg(QFILE_QPU_ELEMENT, 0),
1047                           qir_uniform_ui(c, 2)));
1048 
1049         return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
1050                                   qir_FSUB(c, from_top, src),
1051                                   qir_FSUB(c, src, from_bottom)));
1052 }
1053 
1054 static struct qreg
1055 ntq_emit_cond_to_int(struct vc4_compile *c, enum qpu_cond cond)
1056 {
1057         return qir_MOV(c, qir_SEL(c, cond,
1058                                   qir_uniform_ui(c, 1),
1059                                   qir_uniform_ui(c, 0)));
1060 }
1061 
1062 static void
1063 ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
1064 {
1065         /* Vectors are special in that they have non-scalarized writemasks,
1066          * and just take the first swizzle channel for each argument in order
1067          * into each writemask channel.
1068          */
1069         if (instr->op == nir_op_vec2 ||
1070             instr->op == nir_op_vec3 ||
1071             instr->op == nir_op_vec4) {
1072                 struct qreg srcs[4];
1073                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1074                         srcs[i] = ntq_get_src(c, instr->src[i].src,
1075                                               instr->src[i].swizzle[0]);
1076                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1077                         ntq_store_def(c, &instr->def, i,
1078                                       qir_MOV(c, srcs[i]));
1079                 return;
1080         }
1081 
1082         if (instr->op == nir_op_pack_unorm_4x8) {
1083                 ntq_emit_pack_unorm_4x8(c, instr);
1084                 return;
1085         }
1086 
1087         if (instr->op == nir_op_unpack_unorm_4x8) {
1088                 struct qreg src = ntq_get_src(c, instr->src[0].src,
1089                                               instr->src[0].swizzle[0]);
1090                 unsigned count = instr->def.num_components;
1091                 for (int i = 0; i < count; i++) {
1092                         ntq_store_def(c, &instr->def, i,
1093                                       qir_UNPACK_8_F(c, src, i));
1094                 }
1095                 return;
1096         }
1097 
1098         /* General case: We can just grab the one used channel per src. */
1099         struct qreg src[nir_op_infos[instr->op].num_inputs];
1100         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1101                 src[i] = ntq_get_alu_src(c, instr, i);
1102         }
1103 
1104         struct qreg result;
1105 
1106         switch (instr->op) {
1107         case nir_op_mov:
1108                 result = qir_MOV(c, src[0]);
1109                 break;
1110         case nir_op_fmul:
1111                 result = qir_FMUL(c, src[0], src[1]);
1112                 break;
1113         case nir_op_fadd:
1114                 result = qir_FADD(c, src[0], src[1]);
1115                 break;
1116         case nir_op_fsub:
1117                 result = qir_FSUB(c, src[0], src[1]);
1118                 break;
1119         case nir_op_fmin:
1120                 result = qir_FMIN(c, src[0], src[1]);
1121                 break;
1122         case nir_op_fmax:
1123                 result = qir_FMAX(c, src[0], src[1]);
1124                 break;
1125 
1126         case nir_op_f2i32:
1127         case nir_op_f2u32:
1128                 result = qir_FTOI(c, src[0]);
1129                 break;
1130         case nir_op_i2f32:
1131         case nir_op_u2f32:
1132                 result = qir_ITOF(c, src[0]);
1133                 break;
1134         case nir_op_b2f32:
1135                 result = qir_AND(c, src[0], qir_uniform_f(c, 1.0));
1136                 break;
1137         case nir_op_b2i32:
1138                 result = qir_AND(c, src[0], qir_uniform_ui(c, 1));
1139                 break;
1140 
1141         case nir_op_iadd:
1142                 result = qir_ADD(c, src[0], src[1]);
1143                 break;
1144         case nir_op_ushr:
1145                 result = qir_SHR(c, src[0], src[1]);
1146                 break;
1147         case nir_op_isub:
1148                 result = qir_SUB(c, src[0], src[1]);
1149                 break;
1150         case nir_op_ishr:
1151                 result = qir_ASR(c, src[0], src[1]);
1152                 break;
1153         case nir_op_ishl:
1154                 result = qir_SHL(c, src[0], src[1]);
1155                 break;
1156         case nir_op_imin:
1157                 result = qir_MIN(c, src[0], src[1]);
1158                 break;
1159         case nir_op_imax:
1160                 result = qir_MAX(c, src[0], src[1]);
1161                 break;
1162         case nir_op_iand:
1163                 result = qir_AND(c, src[0], src[1]);
1164                 break;
1165         case nir_op_ior:
1166                 result = qir_OR(c, src[0], src[1]);
1167                 break;
1168         case nir_op_ixor:
1169                 result = qir_XOR(c, src[0], src[1]);
1170                 break;
1171         case nir_op_inot:
1172                 result = qir_NOT(c, src[0]);
1173                 break;
1174 
1175         case nir_op_imul:
1176                 result = ntq_umul(c, src[0], src[1]);
1177                 break;
1178 
1179         case nir_op_seq:
1180         case nir_op_sne:
1181         case nir_op_sge:
1182         case nir_op_slt:
1183         case nir_op_feq32:
1184         case nir_op_fneu32:
1185         case nir_op_fge32:
1186         case nir_op_flt32:
1187         case nir_op_ieq32:
1188         case nir_op_ine32:
1189         case nir_op_ige32:
1190         case nir_op_uge32:
1191         case nir_op_ilt32:
1192         case nir_op_ult32:
1193                 if (!ntq_emit_comparison(c, &result, instr, instr)) {
1194                         fprintf(stderr, "Bad comparison instruction\n");
1195                 }
1196                 break;
1197 
1198         case nir_op_b32csel:
1199                 result = ntq_emit_bcsel(c, instr, src);
1200                 break;
1201         case nir_op_fcsel:
1202                 qir_SF(c, src[0]);
1203                 result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, src[1], src[2]));
1204                 break;
1205 
1206         case nir_op_frcp:
1207                 result = ntq_rcp(c, src[0]);
1208                 break;
1209         case nir_op_frsq:
1210                 result = ntq_rsq(c, src[0]);
1211                 break;
1212         case nir_op_fexp2:
1213                 result = qir_EXP2(c, src[0]);
1214                 break;
1215         case nir_op_flog2:
1216                 result = qir_LOG2(c, src[0]);
1217                 break;
1218 
1219         case nir_op_ftrunc:
1220                 result = qir_ITOF(c, qir_FTOI(c, src[0]));
1221                 break;
1222         case nir_op_fceil:
1223                 result = ntq_fceil(c, src[0]);
1224                 break;
1225         case nir_op_ffract:
1226                 result = ntq_ffract(c, src[0]);
1227                 break;
1228         case nir_op_ffloor:
1229                 result = ntq_ffloor(c, src[0]);
1230                 break;
1231 
1232         case nir_op_fsin:
1233                 result = ntq_fsin(c, src[0]);
1234                 break;
1235         case nir_op_fcos:
1236                 result = ntq_fcos(c, src[0]);
1237                 break;
1238 
1239         case nir_op_fsign:
1240                 result = ntq_fsign(c, src[0]);
1241                 break;
1242 
1243         case nir_op_fabs:
1244                 result = qir_FMAXABS(c, src[0], src[0]);
1245                 break;
1246         case nir_op_iabs:
1247                 result = qir_MAX(c, src[0],
1248                                 qir_SUB(c, qir_uniform_ui(c, 0), src[0]));
1249                 break;
1250 
1251         case nir_op_ibitfield_extract:
1252                 result = ntq_emit_ibfe(c, src[0], src[1], src[2]);
1253                 break;
1254 
1255         case nir_op_ubitfield_extract:
1256                 result = ntq_emit_ubfe(c, src[0], src[1], src[2]);
1257                 break;
1258 
1259         case nir_op_usadd_4x8_vc4:
1260                 result = qir_V8ADDS(c, src[0], src[1]);
1261                 break;
1262 
1263         case nir_op_ussub_4x8_vc4:
1264                 result = qir_V8SUBS(c, src[0], src[1]);
1265                 break;
1266 
1267         case nir_op_umin_4x8_vc4:
1268                 result = qir_V8MIN(c, src[0], src[1]);
1269                 break;
1270 
1271         case nir_op_umax_4x8_vc4:
1272                 result = qir_V8MAX(c, src[0], src[1]);
1273                 break;
1274 
1275         case nir_op_umul_unorm_4x8_vc4:
1276                 result = qir_V8MULD(c, src[0], src[1]);
1277                 break;
1278 
1279         case nir_op_uadd_carry:
1280                 qir_SF(c, qir_ADD(c, src[0], src[1]));
1281                 result = ntq_emit_cond_to_int(c, QPU_COND_CS);
1282                 break;
1283 
1284         case nir_op_usub_borrow:
1285                 qir_SF(c, qir_SUB(c, src[0], src[1]));
1286                 result = ntq_emit_cond_to_int(c, QPU_COND_CS);
1287                 break;
1288 
1289         default:
1290                 fprintf(stderr, "unknown NIR ALU inst: ");
1291                 nir_print_instr(&instr->instr, stderr);
1292                 fprintf(stderr, "\n");
1293                 abort();
1294         }
1295 
1296         ntq_store_def(c, &instr->def, 0, result);
1297 }
1298 
1299 static void
1300 emit_frag_end(struct vc4_compile *c)
1301 {
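        /* The fragment's results are emitted as tile buffer (TLB) writes
         * below.  Discard is handled by predicating the depth and color
         * writes on the per-pixel discard condition rather than with
         * control flow.
         */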
1302         struct qreg color;
1303         if (c->output_color_index != -1) {
1304                 color = c->outputs[c->output_color_index];
1305         } else {
1306                 color = qir_uniform_ui(c, 0);
1307         }
1308 
1309         uint32_t discard_cond = QPU_COND_ALWAYS;
1310         if (c->s->info.fs.uses_discard) {
1311                 qir_SF(c, c->discard);
1312                 discard_cond = QPU_COND_ZS;
1313         }
1314 
1315         if (c->fs_key->stencil_enabled) {
1316                 qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
1317                              qir_uniform(c, QUNIFORM_STENCIL, 0));
1318                 if (c->fs_key->stencil_twoside) {
1319                         qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
1320                                      qir_uniform(c, QUNIFORM_STENCIL, 1));
1321                 }
1322                 if (c->fs_key->stencil_full_writemasks) {
1323                         qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
1324                                      qir_uniform(c, QUNIFORM_STENCIL, 2));
1325                 }
1326         }
1327 
1328         if (c->output_sample_mask_index != -1) {
1329                 qir_MS_MASK(c, c->outputs[c->output_sample_mask_index]);
1330         }
1331 
1332         if (c->fs_key->depth_enabled) {
1333                 if (c->output_position_index != -1) {
1334                         qir_FTOI_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),
1335                                       qir_FMUL(c,
1336                                                c->outputs[c->output_position_index],
1337                                                qir_uniform_f(c, 0xffffff)))->cond = discard_cond;
1338                 } else {
1339                         qir_MOV_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),
1340                                      qir_FRAG_Z(c))->cond = discard_cond;
1341                 }
1342         }
1343 
1344         if (!c->msaa_per_sample_output) {
1345                 qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE, 0),
1346                              color)->cond = discard_cond;
1347         } else {
1348                 for (int i = 0; i < VC4_MAX_SAMPLES; i++) {
1349                         qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE_MS, 0),
1350                                      c->sample_colors[i])->cond = discard_cond;
1351                 }
1352         }
1353 }
1354 
1355 static void
1356 emit_scaled_viewport_write(struct vc4_compile *c, struct qreg rcp_w)
1357 {
1358         struct qreg packed = qir_get_temp(c);
1359 
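        /* Clip-space X and Y are scaled by the viewport transform,
         * multiplied by 1/W, converted to integer, and packed into the two
         * 16-bit halves of a single VPM word as the screen-space
         * coordinates.
         */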
1360         for (int i = 0; i < 2; i++) {
1361                 struct qreg scale =
1362                         qir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, 0);
1363 
1364                 struct qreg packed_chan = packed;
1365                 packed_chan.pack = QPU_PACK_A_16A + i;
1366 
1367                 qir_FTOI_dest(c, packed_chan,
1368                               qir_FMUL(c,
1369                                        qir_FMUL(c,
1370                                                 c->outputs[c->output_position_index + i],
1371                                                 scale),
1372                                        rcp_w));
1373         }
1374 
1375         qir_VPM_WRITE(c, packed);
1376 }
1377 
1378 static void
1379 emit_zs_write(struct vc4_compile *c, struct qreg rcp_w)
1380 {
1381         struct qreg zscale = qir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0);
1382         struct qreg zoffset = qir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0);
1383 
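        /* Viewport depth transform: Zs = Zc * zscale * 1/W + zoffset,
         * emitted to the VPM after the perspective divide.
         */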
1384         qir_VPM_WRITE(c, qir_FADD(c, qir_FMUL(c, qir_FMUL(c,
1385                                                           c->outputs[c->output_position_index + 2],
1386                                                           zscale),
1387                                               rcp_w),
1388                                   zoffset));
1389 }
1390 
1391 static void
1392 emit_rcp_wc_write(struct vc4_compile *c, struct qreg rcp_w)
1393 {
1394         qir_VPM_WRITE(c, rcp_w);
1395 }
1396 
1397 static void
1398 emit_point_size_write(struct vc4_compile *c)
1399 {
1400         struct qreg point_size;
1401 
1402         if (c->output_point_size_index != -1)
1403                 point_size = c->outputs[c->output_point_size_index];
1404         else
1405                 point_size = qir_uniform_f(c, 1.0);
1406 
1407         qir_VPM_WRITE(c, point_size);
1408 }
1409 
1410 /**
1411  * Emits a VPM read of the stub vertex attribute set up by vc4_draw.c.
1412  *
1413  * The simulator insists that there be at least one vertex attribute, so
1414  * vc4_draw.c will emit one if it wouldn't have otherwise.  The simulator also
1415  * insists that all vertex attributes loaded get read by the VS/CS, so we have
1416  * to consume it here.
1417  */
1418 static void
1419 emit_stub_vpm_read(struct vc4_compile *c)
1420 {
1421         if (c->num_inputs)
1422                 return;
1423 
1424         c->vattr_sizes[0] = 4;
1425         (void)qir_MOV(c, qir_reg(QFILE_VPM, 0));
1426         c->num_inputs++;
1427 }
1428 
1429 static void
1430 emit_vert_end(struct vc4_compile *c,
1431               struct vc4_varying_slot *fs_inputs,
1432               uint32_t num_fs_inputs)
1433 {
1434         struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]);
1435 
1436         emit_stub_vpm_read(c);
1437 
1438         emit_scaled_viewport_write(c, rcp_w);
1439         emit_zs_write(c, rcp_w);
1440         emit_rcp_wc_write(c, rcp_w);
1441         if (c->vs_key->per_vertex_point_size)
1442                 emit_point_size_write(c);
1443 
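        /* Emit the VS outputs to the VPM in the order the FS will read its
         * inputs, matching each FS input by slot and component.
         */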
1444         for (int i = 0; i < num_fs_inputs; i++) {
1445                 struct vc4_varying_slot *input = &fs_inputs[i];
1446                 int j;
1447 
1448                 for (j = 0; j < c->num_outputs; j++) {
1449                         struct vc4_varying_slot *output =
1450                                 &c->output_slots[j];
1451 
1452                         if (input->slot == output->slot &&
1453                             input->swizzle == output->swizzle) {
1454                                 qir_VPM_WRITE(c, c->outputs[j]);
1455                                 break;
1456                         }
1457                 }
1458                 /* Emit padding if we didn't find a declared VS output for
1459                  * this FS input.
1460                  */
1461                 if (j == c->num_outputs)
1462                         qir_VPM_WRITE(c, qir_uniform_f(c, 0.0));
1463         }
1464 }
1465 
1466 static void
1467 emit_coord_end(struct vc4_compile *c)
1468 {
1469         struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]);
1470 
1471         emit_stub_vpm_read(c);
1472 
1473         for (int i = 0; i < 4; i++)
1474                 qir_VPM_WRITE(c, c->outputs[c->output_position_index + i]);
1475 
1476         emit_scaled_viewport_write(c, rcp_w);
1477         emit_zs_write(c, rcp_w);
1478         emit_rcp_wc_write(c, rcp_w);
1479         if (c->vs_key->per_vertex_point_size)
1480                 emit_point_size_write(c);
1481 }
1482 
1483 static void
1484 vc4_optimize_nir(struct nir_shader *s)
1485 {
1486         bool progress;
1487         unsigned lower_flrp =
1488                 (s->options->lower_flrp16 ? 16 : 0) |
1489                 (s->options->lower_flrp32 ? 32 : 0) |
1490                 (s->options->lower_flrp64 ? 64 : 0);
1491 
1492         do {
1493                 progress = false;
1494 
1495                 NIR_PASS_V(s, nir_lower_vars_to_ssa);
1496                 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
1497                 NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
1498                 NIR_PASS(progress, s, nir_copy_prop);
1499                 NIR_PASS(progress, s, nir_opt_remove_phis);
1500                 NIR_PASS(progress, s, nir_opt_dce);
1501                 NIR_PASS(progress, s, nir_opt_dead_cf);
1502                 NIR_PASS(progress, s, nir_opt_cse);
1503                 NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
1504                 NIR_PASS(progress, s, nir_opt_algebraic);
1505                 NIR_PASS(progress, s, nir_opt_constant_folding);
1506                 if (lower_flrp != 0) {
1507                         bool lower_flrp_progress = false;
1508 
1509                         NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
1510                                  lower_flrp,
1511                                  false /* always_precise */);
1512                         if (lower_flrp_progress) {
1513                                 NIR_PASS(progress, s, nir_opt_constant_folding);
1514                                 progress = true;
1515                         }
1516 
1517                         /* Nothing should rematerialize any flrps, so we only
1518                          * need to do this lowering once.
1519                          */
1520                         lower_flrp = 0;
1521                 }
1522 
1523                 NIR_PASS(progress, s, nir_opt_undef);
1524                 NIR_PASS(progress, s, nir_opt_loop_unroll);
1525         } while (progress);
1526 }
1527 
1528 static int
1529 driver_location_compare(const void *in_a, const void *in_b)
1530 {
1531         const nir_variable *const *a = in_a;
1532         const nir_variable *const *b = in_b;
1533 
1534         return (*a)->data.driver_location - (*b)->data.driver_location;
1535 }
1536 
1537 static void
1538 ntq_setup_inputs(struct vc4_compile *c)
1539 {
1540         unsigned num_entries = 0;
1541         nir_foreach_shader_in_variable(var, c->s)
1542                 num_entries++;
1543 
1544         if (num_entries == 0)
1545                 return;
1546 
1547         nir_variable *vars[num_entries];
1548 
1549         unsigned i = 0;
1550         nir_foreach_shader_in_variable(var, c->s)
1551                 vars[i++] = var;
1552 
1553         /* Sort the variables so that we emit the input setup in
1554          * driver_location order.  This is required for VPM reads, whose data
1555          * is fetched into the VPM in driver_location (TGSI register index)
1556          * order.
1557          */
1558         qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
1559 
1560         for (unsigned i = 0; i < num_entries; i++) {
1561                 nir_variable *var = vars[i];
1562                 assert(glsl_type_is_vector_or_scalar(var->type));
1563                 unsigned loc = var->data.driver_location;
1564 
1565                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
1566                                   (loc + 1) * 4);
1567 
1568                 if (c->stage == QSTAGE_FRAG) {
1569                         if (var->data.location == VARYING_SLOT_POS) {
1570                                 emit_fragcoord_input(c, loc);
1571                         } else if (util_varying_is_point_coord(var->data.location,
1572                                                                c->fs_key->point_sprite_mask)) {
1573                                 c->inputs[loc * 4 + 0] = c->point_x;
1574                                 c->inputs[loc * 4 + 1] = c->point_y;
1575                         } else {
1576                                 emit_fragment_input(c, loc, var->data.location);
1577                         }
1578                 } else {
1579                         emit_vertex_input(c, loc);
1580                 }
1581         }
1582 }
1583 
1584 static void
1585 ntq_setup_outputs(struct vc4_compile *c)
1586 {
1587         nir_foreach_shader_out_variable(var, c->s) {
1588                 assert(glsl_type_is_vector_or_scalar(var->type));
1589                 unsigned loc = var->data.driver_location * 4;
1590 
1591                 for (int i = 0; i < 4; i++)
1592                         add_output(c, loc + i, var->data.location, i);
1593 
1594                 if (c->stage == QSTAGE_FRAG) {
1595                         switch (var->data.location) {
1596                         case FRAG_RESULT_COLOR:
1597                         case FRAG_RESULT_DATA0:
1598                                 c->output_color_index = loc;
1599                                 break;
1600                         case FRAG_RESULT_DEPTH:
1601                                 c->output_position_index = loc;
1602                                 break;
1603                         case FRAG_RESULT_SAMPLE_MASK:
1604                                 c->output_sample_mask_index = loc;
1605                                 break;
1606                         }
1607                 } else {
1608                         switch (var->data.location) {
1609                         case VARYING_SLOT_POS:
1610                                 c->output_position_index = loc;
1611                                 break;
1612                         case VARYING_SLOT_PSIZ:
1613                                 c->output_point_size_index = loc;
1614                                 break;
1615                         }
1616                 }
1617         }
1618 }
1619 
1620 /**
1621  * Sets up the mapping from nir_register to struct qreg *.
1622  *
1623  * Each nir_register gets a struct qreg per 32-bit component being stored.
1624  */
1625 static void
1626 ntq_setup_registers(struct vc4_compile *c, nir_function_impl *impl)
1627 {
1628         nir_foreach_reg_decl(decl, impl) {
1629                 unsigned num_components = nir_intrinsic_num_components(decl);
1630                 unsigned array_len = nir_intrinsic_num_array_elems(decl);
1631                 array_len = MAX2(array_len, 1);
1632                 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
1633                                                   array_len * num_components);
1634 
1635                 nir_def *nir_reg = &decl->def;
1636                 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
1637 
1638                 for (int i = 0; i < array_len * num_components; i++)
1639                         qregs[i] = qir_get_temp(c);
1640         }
1641 }
1642 
1643 static void
1644 ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr)
1645 {
1646         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1647         for (int i = 0; i < instr->def.num_components; i++)
1648                 qregs[i] = qir_uniform_ui(c, instr->value[i].u32);
1649 
1650         _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
1651 }
1652 
1653 static void
1654 ntq_emit_ssa_undef(struct vc4_compile *c, nir_undef_instr *instr)
1655 {
1656         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1657 
1658         /* QIR needs there to be *some* value, so pick 0 (same as for
1659          * ntq_setup_registers()).
1660          */
1661         for (int i = 0; i < instr->def.num_components; i++)
1662                 qregs[i] = qir_uniform_ui(c, 0);
1663 }
1664 
1665 static void
1666 ntq_emit_color_read(struct vc4_compile *c, nir_intrinsic_instr *instr)
1667 {
1668         assert(nir_src_as_uint(instr->src[0]) == 0);
1669 
1670         /* Reads of the per-sample color need to be done in
1671          * order.
1672          */
1673         int sample_index = nir_intrinsic_base(instr);
1674         for (int i = 0; i <= sample_index; i++) {
1675                 if (c->color_reads[i].file == QFILE_NULL) {
1676                         c->color_reads[i] =
1677                                 qir_TLB_COLOR_READ(c);
1678                 }
1679         }
1680         ntq_store_def(c, &instr->def, 0,
1681                       qir_MOV(c, c->color_reads[sample_index]));
1682 }
1683 
1684 static void
1685 ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr)
1686 {
1687         assert(instr->num_components == 1);
1688         assert(nir_src_is_const(instr->src[0]) &&
1689                "vc4 doesn't support indirect inputs");
1690 
1691         uint32_t offset = nir_intrinsic_base(instr) +
1692                           nir_src_as_uint(instr->src[0]);
1693         int comp = nir_intrinsic_component(instr);
1694         ntq_store_def(c, &instr->def, 0,
1695                       qir_MOV(c, c->inputs[offset * 4 + comp]));
1696 }
1697 
1698 static void
1699 ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
1700 {
1701         unsigned offset;
1702 
1703         switch (instr->intrinsic) {
1704         case nir_intrinsic_decl_reg:
1705         case nir_intrinsic_load_reg:
1706         case nir_intrinsic_store_reg:
1707                 break; /* Ignore these */
1708 
1709         case nir_intrinsic_load_uniform:
1710                 assert(instr->num_components == 1);
1711                 if (nir_src_is_const(instr->src[0])) {
1712                         offset = nir_intrinsic_base(instr) +
1713                                  nir_src_as_uint(instr->src[0]);
1714                         assert(offset % 4 == 0);
1715                         /* We need dwords */
1716                         offset = offset / 4;
1717                         ntq_store_def(c, &instr->def, 0,
1718                                       qir_uniform(c, QUNIFORM_UNIFORM,
1719                                                    offset));
1720                 } else {
1721                         ntq_store_def(c, &instr->def, 0,
1722                                       indirect_uniform_load(c, instr));
1723                 }
1724                 break;
1725 
1726         case nir_intrinsic_load_ubo:
1727                 assert(instr->num_components == 1);
1728                 ntq_store_def(c, &instr->def, 0, vc4_ubo_load(c, instr));
1729                 break;
1730 
1731         case nir_intrinsic_load_user_clip_plane:
1732                 for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
1733                         ntq_store_def(c, &instr->def, i,
1734                                       qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
1735                                                   nir_intrinsic_ucp_id(instr) *
1736                                                   4 + i));
1737                 }
1738                 break;
1739 
1740         case nir_intrinsic_load_blend_const_color_r_float:
1741         case nir_intrinsic_load_blend_const_color_g_float:
1742         case nir_intrinsic_load_blend_const_color_b_float:
1743         case nir_intrinsic_load_blend_const_color_a_float:
1744                 ntq_store_def(c, &instr->def, 0,
1745                               qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_X +
1746                                           (instr->intrinsic -
1747                                            nir_intrinsic_load_blend_const_color_r_float),
1748                                           0));
1749                 break;
1750 
1751         case nir_intrinsic_load_blend_const_color_rgba8888_unorm:
1752                 ntq_store_def(c, &instr->def, 0,
1753                               qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_RGBA,
1754                                           0));
1755                 break;
1756 
1757         case nir_intrinsic_load_blend_const_color_aaaa8888_unorm:
1758                 ntq_store_def(c, &instr->def, 0,
1759                               qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_AAAA,
1760                                           0));
1761                 break;
1762 
1763         case nir_intrinsic_load_sample_mask_in:
1764                 ntq_store_def(c, &instr->def, 0,
1765                               qir_uniform(c, QUNIFORM_SAMPLE_MASK, 0));
1766                 break;
1767 
1768         case nir_intrinsic_load_front_face:
1769                 /* The register contains 0 (front) or 1 (back), and we need to
1770                  * turn it into a NIR bool where true means front.
1771                  */
1772                 ntq_store_def(c, &instr->def, 0,
1773                               qir_ADD(c,
1774                                       qir_uniform_ui(c, -1),
1775                                       qir_reg(QFILE_FRAG_REV_FLAG, 0)));
1776                 break;
1777 
1778         case nir_intrinsic_load_input:
1779                 ntq_emit_load_input(c, instr);
1780                 break;
1781 
1782         case nir_intrinsic_load_tlb_color_brcm:
1783                 ntq_emit_color_read(c, instr);
1784                 break;
1785 
1786         case nir_intrinsic_store_output:
1787                 assert(nir_src_is_const(instr->src[1]) &&
1788                        "vc4 doesn't support indirect outputs");
1789                 offset = nir_intrinsic_base(instr) +
1790                          nir_src_as_uint(instr->src[1]);
1791 
1792                 /* MSAA color outputs are the only case where we have an
1793                  * output that's not lowered to being a store of a single 32
1794                  * bit value.
1795                  */
1796                 if (c->stage == QSTAGE_FRAG && instr->num_components == 4) {
1797                         assert(offset == c->output_color_index);
1798                         for (int i = 0; i < 4; i++) {
1799                                 c->sample_colors[i] =
1800                                         qir_MOV(c, ntq_get_src(c, instr->src[0],
1801                                                                i));
1802                         }
1803                 } else {
1804                         offset = offset * 4 + nir_intrinsic_component(instr);
1805                         assert(instr->num_components == 1);
1806                         c->outputs[offset] =
1807                                 qir_MOV(c, ntq_get_src(c, instr->src[0], 0));
1808                         c->num_outputs = MAX2(c->num_outputs, offset + 1);
1809                 }
1810                 break;
1811 
1812         case nir_intrinsic_terminate:
1813                 if (c->execute.file != QFILE_NULL) {
1814                         qir_SF(c, c->execute);
1815                         qir_MOV_cond(c, QPU_COND_ZS, c->discard,
1816                                      qir_uniform_ui(c, ~0));
1817                 } else {
1818                         qir_MOV_dest(c, c->discard, qir_uniform_ui(c, ~0));
1819                 }
1820                 break;
1821 
1822         case nir_intrinsic_terminate_if: {
1823                 /* true (~0) if we're discarding */
1824                 struct qreg cond = ntq_get_src(c, instr->src[0], 0);
1825 
1826                 if (c->execute.file != QFILE_NULL) {
1827                         /* execute == 0 means the channel is active.  Invert
1828                          * the condition so that we can use zero as "executing
1829                          * and discarding."
1830                          */
1831                         qir_SF(c, qir_AND(c, c->execute, qir_NOT(c, cond)));
1832                         qir_MOV_cond(c, QPU_COND_ZS, c->discard, cond);
1833                 } else {
1834                         qir_OR_dest(c, c->discard, c->discard,
1835                                     ntq_get_src(c, instr->src[0], 0));
1836                 }
1837 
1838                 break;
1839         }
1840 
1841         case nir_intrinsic_load_texture_scale: {
1842                 assert(nir_src_is_const(instr->src[0]));
1843                 int sampler = nir_src_as_int(instr->src[0]);
1844 
1845                 ntq_store_def(c, &instr->def, 0,
1846                               qir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, sampler));
1847                 ntq_store_def(c, &instr->def, 1,
1848                               qir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, sampler));
1849                 break;
1850         }
1851 
1852         case nir_intrinsic_ddx:
1853         case nir_intrinsic_ddx_coarse:
1854         case nir_intrinsic_ddx_fine:
1855                 ntq_store_def(c, &instr->def, 0,
1856                               ntq_fddx(c, ntq_get_src(c, instr->src[0], 0)));
1857                 break;
1858 
1859         case nir_intrinsic_ddy:
1860         case nir_intrinsic_ddy_coarse:
1861         case nir_intrinsic_ddy_fine:
1862                 ntq_store_def(c, &instr->def, 0,
1863                               ntq_fddy(c, ntq_get_src(c, instr->src[0], 0)));
1864                 break;
1865 
1866         default:
1867                 fprintf(stderr, "Unknown intrinsic: ");
1868                 nir_print_instr(&instr->instr, stderr);
1869                 fprintf(stderr, "\n");
1870                 break;
1871         }
1872 }
1873 
1874 /* Clears (activates) the execute flags for any channels whose jump target
1875  * matches this block.
1876  */
1877 static void
1878 ntq_activate_execute_for_block(struct vc4_compile *c)
1879 {
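        /* execute holds the block index that each inactive channel is
         * waiting for, so subtracting this block's index gives zero (ZS)
         * exactly for the channels targeting this block; those are then
         * re-activated by resetting execute to 0.
         */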
1880         qir_SF(c, qir_SUB(c,
1881                           c->execute,
1882                           qir_uniform_ui(c, c->cur_block->index)));
1883         qir_MOV_cond(c, QPU_COND_ZS, c->execute, qir_uniform_ui(c, 0));
1884 }
1885 
1886 static void
1887 ntq_emit_if(struct vc4_compile *c, nir_if *if_stmt)
1888 {
1889         if (!c->vc4->screen->has_control_flow) {
1890                 fprintf(stderr,
1891                         "IF statement support requires updated kernel.\n");
1892                 return;
1893         }
1894 
1895         nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
1896         bool empty_else_block =
1897                 (nir_else_block == nir_if_last_else_block(if_stmt) &&
1898                  exec_list_is_empty(&nir_else_block->instr_list));
1899 
1900         struct qblock *then_block = qir_new_block(c);
1901         struct qblock *after_block = qir_new_block(c);
1902         struct qblock *else_block;
1903         if (empty_else_block)
1904                 else_block = after_block;
1905         else
1906                 else_block = qir_new_block(c);
1907 
1908         bool was_top_level = false;
1909         if (c->execute.file == QFILE_NULL) {
1910                 c->execute = qir_MOV(c, qir_uniform_ui(c, 0));
1911                 was_top_level = true;
1912         }
1913 
1914         /* Set ZS for executing (execute == 0) and jumping (if->condition ==
1915          * 0) channels, and then update execute flags for those to point to
1916          * the ELSE block.
1917          */
1918         qir_SF(c, qir_OR(c,
1919                          c->execute,
1920                          ntq_get_src(c, if_stmt->condition, 0)));
1921         qir_MOV_cond(c, QPU_COND_ZS, c->execute,
1922                      qir_uniform_ui(c, else_block->index));
1923 
1924         /* Jump to ELSE if nothing is active for THEN, otherwise fall
1925          * through.
1926          */
1927         qir_SF(c, c->execute);
1928         qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZC);
1929         qir_link_blocks(c->cur_block, else_block);
1930         qir_link_blocks(c->cur_block, then_block);
1931 
1932         /* Process the THEN block. */
1933         qir_set_emit_block(c, then_block);
1934         ntq_emit_cf_list(c, &if_stmt->then_list);
1935 
1936         if (!empty_else_block) {
1937                 /* Handle the end of the THEN block.  First, all currently
1938                  * active channels update their execute flags to point to
1939                  * ENDIF
1940                  */
1941                 qir_SF(c, c->execute);
1942                 qir_MOV_cond(c, QPU_COND_ZS, c->execute,
1943                              qir_uniform_ui(c, after_block->index));
1944 
1945                 /* If everything points at ENDIF, then jump there immediately. */
1946                 qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, after_block->index)));
1947                 qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS);
1948                 qir_link_blocks(c->cur_block, after_block);
1949                 qir_link_blocks(c->cur_block, else_block);
1950 
1951                 qir_set_emit_block(c, else_block);
1952                 ntq_activate_execute_for_block(c);
1953                 ntq_emit_cf_list(c, &if_stmt->else_list);
1954         }
1955 
1956         qir_link_blocks(c->cur_block, after_block);
1957 
1958         qir_set_emit_block(c, after_block);
1959         if (was_top_level) {
1960                 c->execute = c->undef;
1961                 c->last_top_block = c->cur_block;
1962         } else {
1963                 ntq_activate_execute_for_block(c);
1964         }
1965 }
1966 
1967 static void
1968 ntq_emit_jump(struct vc4_compile *c, nir_jump_instr *jump)
1969 {
1970         struct qblock *jump_block;
1971         switch (jump->type) {
1972         case nir_jump_break:
1973                 jump_block = c->loop_break_block;
1974                 break;
1975         case nir_jump_continue:
1976                 jump_block = c->loop_cont_block;
1977                 break;
1978         default:
1979                 unreachable("Unsupported jump type\n");
1980         }
1981 
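        /* Mark every currently active channel (execute == 0) as having taken
         * the jump by pointing its execute value at the target block.
         */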
1982         qir_SF(c, c->execute);
1983         qir_MOV_cond(c, QPU_COND_ZS, c->execute,
1984                      qir_uniform_ui(c, jump_block->index));
1985 
1986         /* Jump to the destination block if everyone has taken the jump. */
1987         qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, jump_block->index)));
1988         qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS);
1989         struct qblock *new_block = qir_new_block(c);
1990         qir_link_blocks(c->cur_block, jump_block);
1991         qir_link_blocks(c->cur_block, new_block);
1992         qir_set_emit_block(c, new_block);
1993 }
1994 
1995 static void
1996 ntq_emit_instr(struct vc4_compile *c, nir_instr *instr)
1997 {
1998         switch (instr->type) {
1999         case nir_instr_type_alu:
2000                 ntq_emit_alu(c, nir_instr_as_alu(instr));
2001                 break;
2002 
2003         case nir_instr_type_intrinsic:
2004                 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
2005                 break;
2006 
2007         case nir_instr_type_load_const:
2008                 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
2009                 break;
2010 
2011         case nir_instr_type_undef:
2012                 ntq_emit_ssa_undef(c, nir_instr_as_undef(instr));
2013                 break;
2014 
2015         case nir_instr_type_tex:
2016                 ntq_emit_tex(c, nir_instr_as_tex(instr));
2017                 break;
2018 
2019         case nir_instr_type_jump:
2020                 ntq_emit_jump(c, nir_instr_as_jump(instr));
2021                 break;
2022 
2023         default:
2024                 fprintf(stderr, "Unknown NIR instr type: ");
2025                 nir_print_instr(instr, stderr);
2026                 fprintf(stderr, "\n");
2027                 abort();
2028         }
2029 }
2030 
2031 static void
2032 ntq_emit_block(struct vc4_compile *c, nir_block *block)
2033 {
2034         nir_foreach_instr(instr, block) {
2035                 ntq_emit_instr(c, instr);
2036         }
2037 }
2038 
2039 static void ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list);
2040 
2041 static void
2042 ntq_emit_loop(struct vc4_compile *c, nir_loop *loop)
2043 {
2044         assert(!nir_loop_has_continue_construct(loop));
2045         if (!c->vc4->screen->has_control_flow) {
2046                 fprintf(stderr,
2047                         "loop support requires updated kernel.\n");
2048                 ntq_emit_cf_list(c, &loop->body);
2049                 return;
2050         }
2051 
2052         bool was_top_level = false;
2053         if (c->execute.file == QFILE_NULL) {
2054                 c->execute = qir_MOV(c, qir_uniform_ui(c, 0));
2055                 was_top_level = true;
2056         }
2057 
2058         struct qblock *save_loop_cont_block = c->loop_cont_block;
2059         struct qblock *save_loop_break_block = c->loop_break_block;
2060 
2061         c->loop_cont_block = qir_new_block(c);
2062         c->loop_break_block = qir_new_block(c);
2063 
2064         qir_link_blocks(c->cur_block, c->loop_cont_block);
2065         qir_set_emit_block(c, c->loop_cont_block);
2066         ntq_activate_execute_for_block(c);
2067 
2068         ntq_emit_cf_list(c, &loop->body);
2069 
2070         /* If anything had explicitly continued, or is here at the end of the
2071          * loop, then we need to loop again.  SF updates are masked by the
2072          * instruction's condition, so we can do the OR of the two conditions
2073          * within SF.
2074          */
2075         qir_SF(c, c->execute);
2076         struct qinst *cont_check =
2077                 qir_SUB_dest(c,
2078                              c->undef,
2079                              c->execute,
2080                              qir_uniform_ui(c, c->loop_cont_block->index));
2081         cont_check->cond = QPU_COND_ZC;
2082         cont_check->sf = true;
2083 
2084         qir_BRANCH(c, QPU_COND_BRANCH_ANY_ZS);
2085         qir_link_blocks(c->cur_block, c->loop_cont_block);
2086         qir_link_blocks(c->cur_block, c->loop_break_block);
2087 
2088         qir_set_emit_block(c, c->loop_break_block);
2089         if (was_top_level) {
2090                 c->execute = c->undef;
2091                 c->last_top_block = c->cur_block;
2092         } else {
2093                 ntq_activate_execute_for_block(c);
2094         }
2095 
2096         c->loop_break_block = save_loop_break_block;
2097         c->loop_cont_block = save_loop_cont_block;
2098 }
2099 
2100 static void
2101 ntq_emit_function(struct vc4_compile *c, nir_function_impl *func)
2102 {
2103         fprintf(stderr, "FUNCTIONS not handled.\n");
2104         abort();
2105 }
2106 
2107 static void
2108 ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list)
2109 {
2110         foreach_list_typed(nir_cf_node, node, node, list) {
2111                 switch (node->type) {
2112                 case nir_cf_node_block:
2113                         ntq_emit_block(c, nir_cf_node_as_block(node));
2114                         break;
2115 
2116                 case nir_cf_node_if:
2117                         ntq_emit_if(c, nir_cf_node_as_if(node));
2118                         break;
2119 
2120                 case nir_cf_node_loop:
2121                         ntq_emit_loop(c, nir_cf_node_as_loop(node));
2122                         break;
2123 
2124                 case nir_cf_node_function:
2125                         ntq_emit_function(c, nir_cf_node_as_function(node));
2126                         break;
2127 
2128                 default:
2129                         fprintf(stderr, "Unknown NIR node type\n");
2130                         abort();
2131                 }
2132         }
2133 }
2134 
2135 static void
2136 ntq_emit_impl(struct vc4_compile *c, nir_function_impl *impl)
2137 {
2138         ntq_setup_registers(c, impl);
2139         ntq_emit_cf_list(c, &impl->body);
2140 }
2141 
2142 static void
2143 nir_to_qir(struct vc4_compile *c)
2144 {
2145         if (c->stage == QSTAGE_FRAG && c->s->info.fs.uses_discard)
2146                 c->discard = qir_MOV(c, qir_uniform_ui(c, 0));
2147 
2148         ntq_setup_inputs(c);
2149         ntq_setup_outputs(c);
2150 
2151         /* Find the main function and emit the body. */
2152         nir_foreach_function(function, c->s) {
2153                 assert(strcmp(function->name, "main") == 0);
2154                 assert(function->impl);
2155                 ntq_emit_impl(c, function->impl);
2156         }
2157 }
2158 
2159 static const nir_shader_compiler_options nir_options = {
2160         .lower_all_io_to_temps = true,
2161         .lower_extract_byte = true,
2162         .lower_extract_word = true,
2163         .lower_insert_byte = true,
2164         .lower_insert_word = true,
2165         .lower_fdiv = true,
2166         .lower_ffma16 = true,
2167         .lower_ffma32 = true,
2168         .lower_ffma64 = true,
2169         .lower_flrp32 = true,
2170         .lower_fmod = true,
2171         .lower_fpow = true,
2172         .lower_fsat = true,
2173         .lower_fsqrt = true,
2174         .lower_ldexp = true,
2175         .lower_fneg = true,
2176         .lower_ineg = true,
2177         .lower_to_scalar = true,
2178         .lower_umax = true,
2179         .lower_umin = true,
2180         .lower_isign = true,
2181         .has_fsub = true,
2182         .has_isub = true,
2183         .has_texture_scaling = true,
2184         .lower_mul_high = true,
2185         .max_unroll_iterations = 32,
2186         .force_indirect_unrolling = (nir_var_shader_in | nir_var_shader_out | nir_var_function_temp),
2187         .scalarize_ddx = true,
2188 };
2189 
2190 const void *
2191 vc4_screen_get_compiler_options(struct pipe_screen *pscreen,
2192                                 enum pipe_shader_ir ir,
2193                                 enum pipe_shader_type shader)
2194 {
2195         return &nir_options;
2196 }
2197 
2198 static int
2199 count_nir_instrs(nir_shader *nir)
2200 {
2201         int count = 0;
2202         nir_foreach_function_impl(impl, nir) {
2203                 nir_foreach_block(block, impl) {
2204                         nir_foreach_instr(instr, block)
2205                                 count++;
2206                 }
2207         }
2208         return count;
2209 }
2210 
2211 static struct vc4_compile *
2212 vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
2213                struct vc4_key *key, bool fs_threaded)
2214 {
2215         struct vc4_compile *c = qir_compile_init();
2216 
2217         MESA_TRACE_FUNC();
2218 
2219         c->vc4 = vc4;
2220         c->stage = stage;
2221         c->shader_state = &key->shader_state->base;
2222         c->program_id = key->shader_state->program_id;
2223         c->variant_id =
2224                 p_atomic_inc_return(&key->shader_state->compiled_variant_count);
2225         c->fs_threaded = fs_threaded;
2226 
2227         c->key = key;
2228         switch (stage) {
2229         case QSTAGE_FRAG:
2230                 c->fs_key = (struct vc4_fs_key *)key;
2231                 if (c->fs_key->is_points) {
2232                         c->point_x = emit_fragment_varying(c, ~0, 0);
2233                         c->point_y = emit_fragment_varying(c, ~0, 0);
2234                 } else if (c->fs_key->is_lines) {
2235                         c->line_x = emit_fragment_varying(c, ~0, 0);
2236                 }
2237                 break;
2238         case QSTAGE_VERT:
2239                 c->vs_key = (struct vc4_vs_key *)key;
2240                 break;
2241         case QSTAGE_COORD:
2242                 c->vs_key = (struct vc4_vs_key *)key;
2243                 break;
2244         }
2245 
2246         c->s = nir_shader_clone(c, key->shader_state->base.ir.nir);
2247 
2248         if (stage == QSTAGE_FRAG) {
2249                 NIR_PASS_V(c->s, vc4_nir_lower_blend, c);
2250         }
2251 
2252         struct nir_lower_tex_options tex_options = {
2253                 .lower_txp = ~0,
2254 
2255                 /* Apply swizzles to all samplers. */
2256                 .swizzle_result = ~0,
2257                 .lower_invalid_implicit_lod = true,
2258         };
2259 
2260         /* Lower the format swizzle and ARB_texture_swizzle-style swizzle.
2261          * The format swizzling applies before sRGB decode, and
2262          * ARB_texture_swizzle is the last thing before returning the sample.
2263          */
2264         for (int i = 0; i < ARRAY_SIZE(key->tex); i++) {
2265                 enum pipe_format format = c->key->tex[i].format;
2266 
2267                 if (!format)
2268                         continue;
2269 
2270                 const uint8_t *format_swizzle = vc4_get_format_swizzle(format);
2271 
2272                 for (int j = 0; j < 4; j++) {
2273                         uint8_t arb_swiz = c->key->tex[i].swizzle[j];
2274 
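                        /* Swizzles selecting a component (X/Y/Z/W) are
                         * routed through the format swizzle; ZERO/ONE
                         * selections pass through unchanged.
                         */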
2275                         if (arb_swiz <= 3) {
2276                                 tex_options.swizzles[i][j] =
2277                                         format_swizzle[arb_swiz];
2278                         } else {
2279                                 tex_options.swizzles[i][j] = arb_swiz;
2280                         }
2281                 }
2282 
2283                 if (util_format_is_srgb(format))
2284                         tex_options.lower_srgb |= (1 << i);
2285         }
2286 
2287         NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
2288 
2289         if (c->key->ucp_enables) {
2290                 if (stage == QSTAGE_FRAG) {
2291                         NIR_PASS_V(c->s, nir_lower_clip_fs,
2292                                    c->key->ucp_enables, false, false);
2293                 } else {
2294                         NIR_PASS_V(c->s, nir_lower_clip_vs,
2295                                    c->key->ucp_enables, false, false, NULL);
2296                         NIR_PASS_V(c->s, nir_lower_io_to_scalar,
2297                                    nir_var_shader_out, NULL, NULL);
2298                 }
2299         }
2300 
2301         /* FS input scalarizing must happen after nir_lower_two_sided_color,
2302          * which only handles a vec4 at a time.  Similarly, VS output
2303          * scalarizing must happen after nir_lower_clip_vs.
2304          */
2305         if (c->stage == QSTAGE_FRAG)
2306                 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);
2307         else
2308                 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
2309 
2310         NIR_PASS_V(c->s, vc4_nir_lower_io, c);
2311         NIR_PASS_V(c->s, vc4_nir_lower_txf_ms, c);
2312         nir_lower_idiv_options idiv_options = {
2313                 .allow_fp16 = true,
2314         };
2315         NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options);
2316         NIR_PASS(_, c->s, nir_lower_alu);
2317 
2318         vc4_optimize_nir(c->s);
2319 
2320         /* Do late algebraic optimization to turn add(a, neg(b)) back into
2321          * subs, then the mandatory cleanup after algebraic.  Note that it may
2322          * produce fnegs, and if so then we need to keep running to squash
2323          * fneg(fneg(a)).
2324          */
2325         bool more_late_algebraic = true;
2326         while (more_late_algebraic) {
2327                 more_late_algebraic = false;
2328                 NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
2329                 NIR_PASS_V(c->s, nir_opt_constant_folding);
2330                 NIR_PASS_V(c->s, nir_copy_prop);
2331                 NIR_PASS_V(c->s, nir_opt_dce);
2332                 NIR_PASS_V(c->s, nir_opt_cse);
2333         }
2334 
2335         NIR_PASS_V(c->s, nir_lower_bool_to_int32);
2336 
2337         NIR_PASS_V(c->s, nir_convert_from_ssa, true);
2338         NIR_PASS_V(c->s, nir_trivialize_registers);
2339 
2340         if (VC4_DBG(NIR)) {
2341                 fprintf(stderr, "%s prog %d/%d NIR:\n",
2342                         qir_get_stage_name(c->stage),
2343                         c->program_id, c->variant_id);
2344                 nir_print_shader(c->s, stderr);
2345         }
2346 
2347         nir_to_qir(c);
2348 
2349         switch (stage) {
2350         case QSTAGE_FRAG:
2351                 /* FS threading requires that the thread execute
2352                  * QPU_SIG_LAST_THREAD_SWITCH exactly once before terminating
2353                  * (with no other THRSW afterwards, obviously).  If we didn't
2354                  * fetch a texture at a top level block, this wouldn't be
2355                  * true.
2356                  */
2357                 if (c->fs_threaded && !c->last_thrsw_at_top_level) {
2358                         c->failed = true;
2359                         return c;
2360                 }
2361 
2362                 emit_frag_end(c);
2363                 break;
2364         case QSTAGE_VERT:
2365                 emit_vert_end(c,
2366                               c->vs_key->fs_inputs->input_slots,
2367                               c->vs_key->fs_inputs->num_inputs);
2368                 break;
2369         case QSTAGE_COORD:
2370                 emit_coord_end(c);
2371                 break;
2372         }
2373 
2374         if (VC4_DBG(QIR)) {
2375                 fprintf(stderr, "%s prog %d/%d pre-opt QIR:\n",
2376                         qir_get_stage_name(c->stage),
2377                         c->program_id, c->variant_id);
2378                 qir_dump(c);
2379                 fprintf(stderr, "\n");
2380         }
2381 
2382         qir_optimize(c);
2383         qir_lower_uniforms(c);
2384 
2385         qir_schedule_instructions(c);
2386         qir_emit_uniform_stream_resets(c);
2387 
2388         if (VC4_DBG(QIR)) {
2389                 fprintf(stderr, "%s prog %d/%d QIR:\n",
2390                         qir_get_stage_name(c->stage),
2391                         c->program_id, c->variant_id);
2392                 qir_dump(c);
2393                 fprintf(stderr, "\n");
2394         }
2395 
2396         qir_reorder_uniforms(c);
2397         vc4_generate_code(vc4, c);
2398 
2399         ralloc_free(c->s);
2400 
2401         return c;
2402 }
2403 
2404 static void
2405 vc4_setup_shared_precompile_key(struct vc4_uncompiled_shader *uncompiled,
2406                                 struct vc4_key *key)
2407 {
2408         nir_shader *s = uncompiled->base.ir.nir;
2409 
2410         for (int i = 0; i < s->info.num_textures; i++) {
2411                 key->tex[i].format = PIPE_FORMAT_R8G8B8A8_UNORM;
2412                 key->tex[i].swizzle[0] = PIPE_SWIZZLE_X;
2413                 key->tex[i].swizzle[1] = PIPE_SWIZZLE_Y;
2414                 key->tex[i].swizzle[2] = PIPE_SWIZZLE_Z;
2415                 key->tex[i].swizzle[3] = PIPE_SWIZZLE_W;
2416         }
2417 }
2418 
2419 static inline struct vc4_varying_slot
2420 vc4_slot_from_slot_and_component(uint8_t slot, uint8_t component)
2421 {
2422         assume(slot < 255 / 4);
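        /* The slot and component are packed as (slot * 4 + component) into a
         * single small field, e.g. slot 3 with component 1 becomes 13; the
         * assume() above guards against overflowing it.
         */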
2423         return (struct vc4_varying_slot){ (slot << 2) + component };
2424 }
2425 
2426 static void
2427 precompile_all_fs_inputs(nir_shader *s,
2428                          struct vc4_fs_inputs *fs_inputs)
2429 {
2430         /* Assume all VS outputs will actually be used by the FS and output
2431          * them (the two sides have to match exactly) */
2432         nir_foreach_shader_out_variable(var, s) {
2433                 const int array_len =
2434                         glsl_type_is_vector_or_scalar(var->type) ?
2435                         1 : glsl_get_length(var->type);
2436                 for (int j = 0; j < array_len; j++) {
2437                         const int slot = var->data.location + j;
2438                         const int num_components =
2439                                 glsl_get_components(var->type);
2440                         for (int i = 0; i < num_components; i++) {
2441                                 const int swiz = var->data.location_frac + i;
2442                                 fs_inputs->input_slots[fs_inputs->num_inputs++] =
2443                                         vc4_slot_from_slot_and_component(slot,
2444                                                                          swiz);
2445                         }
2446                 }
2447         }
2448 }
2449 
2450 /**
2451  * Precompiles a shader variant at shader state creation time if
2452  * VC4_DEBUG=shaderdb is set.
2453  */
2454 static void
2455 vc4_shader_precompile(struct vc4_context *vc4,
2456                       struct vc4_uncompiled_shader *so)
2457 {
2458         nir_shader *s = so->base.ir.nir;
2459 
2460         if (s->info.stage == MESA_SHADER_FRAGMENT) {
2461                 struct vc4_fs_key key = {
2462                         .base.shader_state = so,
2463                         .depth_enabled = true,
2464                         .logicop_func = PIPE_LOGICOP_COPY,
2465                         .color_format = PIPE_FORMAT_R8G8B8A8_UNORM,
2466                         .blend = {
2467                                 .blend_enable = false,
2468                                 .colormask = PIPE_MASK_RGBA,
2469                         },
2470                 };
2471 
2472                 vc4_setup_shared_precompile_key(so, &key.base);
2473                 vc4_get_compiled_shader(vc4, QSTAGE_FRAG, &key.base);
2474         } else {
2475                 assert(s->info.stage == MESA_SHADER_VERTEX);
2476                 struct vc4_varying_slot input_slots[64] = {};
2477                 struct vc4_fs_inputs fs_inputs = {
2478                         .input_slots = input_slots,
2479                         .num_inputs = 0,
2480                 };
2481                 struct vc4_vs_key key = {
2482                         .base.shader_state = so,
2483                         .fs_inputs = &fs_inputs,
2484                 };
2485 
2486                 vc4_setup_shared_precompile_key(so, &key.base);
2487                 precompile_all_fs_inputs(s, &fs_inputs);
2488                 vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key.base);
2489 
2490                 /* Compile VS bin shader: only position (XXX: include TF) */
2491                 key.is_coord = true;
2492                 fs_inputs.num_inputs = 0;
2493                 precompile_all_fs_inputs(s, &fs_inputs);
2494                 for (int i = 0; i < 4; i++) {
2495                         fs_inputs.input_slots[fs_inputs.num_inputs++] =
2496                                 vc4_slot_from_slot_and_component(VARYING_SLOT_POS,
2497                                                                  i);
2498                 }
2499                 vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key.base);
2500         }
2501 }
2502 
2503 static void *
2504 vc4_shader_state_create(struct pipe_context *pctx,
2505                         const struct pipe_shader_state *cso)
2506 {
2507         struct vc4_context *vc4 = vc4_context(pctx);
2508         struct vc4_uncompiled_shader *so = CALLOC_STRUCT(vc4_uncompiled_shader);
2509         if (!so)
2510                 return NULL;
2511 
2512         so->program_id = vc4->next_uncompiled_program_id++;
2513 
2514         nir_shader *s;
2515 
2516         if (cso->type == PIPE_SHADER_IR_NIR) {
2517                 /* The backend takes ownership of the NIR shader on state
2518                  * creation.
2519                  */
2520                 s = cso->ir.nir;
2521         } else {
2522                 assert(cso->type == PIPE_SHADER_IR_TGSI);
2523 
2524                 if (VC4_DBG(TGSI)) {
2525                         fprintf(stderr, "prog %d TGSI:\n",
2526                                 so->program_id);
2527                         tgsi_dump(cso->tokens, 0);
2528                         fprintf(stderr, "\n");
2529                 }
2530                 s = tgsi_to_nir(cso->tokens, pctx->screen, false);
2531         }
2532 
2533         if (s->info.stage == MESA_SHADER_VERTEX)
2534                 NIR_PASS_V(s, nir_lower_point_size, 1.0f, 0.0f);
2535 
2536         NIR_PASS_V(s, nir_lower_io,
2537                    nir_var_shader_in | nir_var_shader_out | nir_var_uniform,
2538                    type_size, (nir_lower_io_options)0);
2539 
2540         NIR_PASS_V(s, nir_normalize_cubemap_coords);
2541 
2542         NIR_PASS_V(s, nir_lower_load_const_to_scalar);
2543 
2544         vc4_optimize_nir(s);
2545 
2546         NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
2547 
2548         /* Garbage collect dead instructions */
2549         nir_sweep(s);
2550 
2551         so->base.type = PIPE_SHADER_IR_NIR;
2552         so->base.ir.nir = s;
2553 
2554         if (VC4_DBG(NIR)) {
2555                 fprintf(stderr, "%s prog %d NIR:\n",
2556                         gl_shader_stage_name(s->info.stage),
2557                         so->program_id);
2558                 nir_print_shader(s, stderr);
2559                 fprintf(stderr, "\n");
2560         }
2561 
2562         if (VC4_DBG(SHADERDB)) {
2563                 vc4_shader_precompile(vc4, so);
2564         }
2565 
2566         return so;
2567 }
2568 
2569 static void
2570 copy_uniform_state_to_shader(struct vc4_compiled_shader *shader,
2571                              struct vc4_compile *c)
2572 {
2573         int count = c->num_uniforms;
2574         struct vc4_shader_uniform_info *uinfo = &shader->uniforms;
2575 
2576         uinfo->count = count;
2577         uinfo->data = ralloc_array(shader, uint32_t, count);
2578         memcpy(uinfo->data, c->uniform_data,
2579                count * sizeof(*uinfo->data));
2580         uinfo->contents = ralloc_array(shader, enum quniform_contents, count);
2581         memcpy(uinfo->contents, c->uniform_contents,
2582                count * sizeof(*uinfo->contents));
2583         uinfo->num_texture_samples = c->num_texture_samples;
2584 
2585         vc4_set_shader_uniform_dirty_flags(shader);
2586 }
2587 
2588 static void
2589 vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c,
2590                              struct vc4_compiled_shader *shader)
2591 {
2592         struct vc4_fs_inputs inputs;
2593 
2594         memset(&inputs, 0, sizeof(inputs));
2595         if (c->num_input_slots > 0) {
2596                 inputs.input_slots = ralloc_array(shader,
2597                                                   struct vc4_varying_slot,
2598                                                   c->num_input_slots);
2599 
2600                 bool input_live[c->num_input_slots];
2601 
2602                 memset(input_live, 0, sizeof(input_live));
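                /* Scan the generated QIR for reads of QFILE_VARY to find
                 * which FS input slots are actually used, so dead varyings
                 * can be dropped from the VS -> FS interface.
                 */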
2603                 qir_for_each_inst_inorder(inst, c) {
2604                         for (int i = 0; i < qir_get_nsrc(inst); i++) {
2605                                 if (inst->src[i].file == QFILE_VARY)
2606                                         input_live[inst->src[i].index] = true;
2607                         }
2608                 }
2609 
2610                 for (int i = 0; i < c->num_input_slots; i++) {
2611                         struct vc4_varying_slot *slot = &c->input_slots[i];
2612 
2613                         if (!input_live[i])
2614                                 continue;
2615 
2616                         /* Skip non-VS-output inputs. */
2617                         if (slot->slot == (uint8_t)~0)
2618                                 continue;
2619 
2620                         if (slot->slot == VARYING_SLOT_COL0 ||
2621                             slot->slot == VARYING_SLOT_COL1 ||
2622                             slot->slot == VARYING_SLOT_BFC0 ||
2623                             slot->slot == VARYING_SLOT_BFC1) {
2624                                 shader->color_inputs |= (1 << inputs.num_inputs);
2625                         }
2626 
2627                         inputs.input_slots[inputs.num_inputs] = *slot;
2628                         inputs.num_inputs++;
2629                 }
2630         }
2631         shader->num_inputs = inputs.num_inputs;
2632 
2633         /* Add our set of inputs to the set of all inputs seen.  This way, we
2634          * can have a single pointer that identifies an FS inputs set,
2635          * allowing VS to avoid recompiling when the FS is recompiled (or a
2636          * new one is bound using separate shader objects) but the inputs
2637          * don't change.
2638          */
2639         struct set_entry *entry = _mesa_set_search(vc4->fs_inputs_set, &inputs);
2640         if (entry) {
2641                 shader->fs_inputs = entry->key;
2642                 ralloc_free(inputs.input_slots);
2643         } else {
2644                 struct vc4_fs_inputs *alloc_inputs;
2645 
2646                 alloc_inputs = rzalloc(vc4->fs_inputs_set, struct vc4_fs_inputs);
2647                 memcpy(alloc_inputs, &inputs, sizeof(inputs));
2648                 ralloc_steal(alloc_inputs, inputs.input_slots);
2649                 _mesa_set_add(vc4->fs_inputs_set, alloc_inputs);
2650 
2651                 shader->fs_inputs = alloc_inputs;
2652         }
2653 }
2654 
2655 static struct vc4_compiled_shader *
2656 vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
2657                         struct vc4_key *key)
2658 {
2659         struct hash_table *ht;
2660         uint32_t key_size;
2661         bool try_threading;
2662 
2663         if (stage == QSTAGE_FRAG) {
2664                 ht = vc4->fs_cache;
2665                 key_size = sizeof(struct vc4_fs_key);
2666                 try_threading = vc4->screen->has_threaded_fs;
2667         } else {
2668                 ht = vc4->vs_cache;
2669                 key_size = sizeof(struct vc4_vs_key);
2670                 try_threading = false;
2671         }
2672 
2673         struct vc4_compiled_shader *shader;
2674         struct hash_entry *entry = _mesa_hash_table_search(ht, key);
2675         if (entry)
2676                 return entry->data;
2677 
2678         struct vc4_compile *c = vc4_shader_ntq(vc4, stage, key, try_threading);
2679         /* If the FS failed to compile threaded, fall back to single threaded. */
2680         if (try_threading && c->failed) {
2681                 qir_compile_destroy(c);
2682                 c = vc4_shader_ntq(vc4, stage, key, false);
2683         }
2684 
2685         shader = rzalloc(NULL, struct vc4_compiled_shader);
2686 
2687         shader->program_id = vc4->next_compiled_program_id++;
2688         if (stage == QSTAGE_FRAG) {
2689                 vc4_setup_compiled_fs_inputs(vc4, c, shader);
2690 
2691                 /* Note: the temporary clone in c->s has been freed. */
2692                 nir_shader *orig_shader = key->shader_state->base.ir.nir;
2693                 if (orig_shader->info.outputs_written & (1 << FRAG_RESULT_DEPTH))
2694                         shader->disable_early_z = true;
2695         } else {
2696                 shader->num_inputs = c->num_inputs;
2697 
2698                 shader->vattr_offsets[0] = 0;
2699                 for (int i = 0; i < 8; i++) {
2700                         shader->vattr_offsets[i + 1] =
2701                                 shader->vattr_offsets[i] + c->vattr_sizes[i];
2702 
2703                         if (c->vattr_sizes[i])
2704                                 shader->vattrs_live |= (1 << i);
2705                 }
2706         }
2707 
2708         shader->failed = c->failed;
2709         if (c->failed) {
2710                 shader->failed = true;
2711         } else {
2712                 copy_uniform_state_to_shader(shader, c);
2713                 shader->bo = vc4_bo_alloc_shader(vc4->screen, c->qpu_insts,
2714                                                  c->qpu_inst_count *
2715                                                  sizeof(uint64_t));
2716         }
2717 
2718         shader->fs_threaded = c->fs_threaded;
2719 
2720         qir_compile_destroy(c);
2721 
2722         struct vc4_key *dup_key;
2723         dup_key = rzalloc_size(shader, key_size); /* TODO: don't use rzalloc */
2724         memcpy(dup_key, key, key_size);
2725         _mesa_hash_table_insert(ht, dup_key, shader);
2726 
2727         return shader;
2728 }
2729 
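/* Fills in the parts of the compile key shared by the FS and VS: per-unit
 * texture format/swizzle/wrap/compare state and the user clip plane
 * enables.
 */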
2730 static void
2731 vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key,
2732                      struct vc4_texture_stateobj *texstate)
2733 {
2734         for (int i = 0; i < texstate->num_textures; i++) {
2735                 struct pipe_sampler_view *sampler = texstate->textures[i];
2736                 struct vc4_sampler_view *vc4_sampler = vc4_sampler_view(sampler);
2737                 struct pipe_sampler_state *sampler_state =
2738                         texstate->samplers[i];
2739 
2740                 if (!sampler)
2741                         continue;
2742 
2743                 key->tex[i].format = sampler->format;
2744                 key->tex[i].swizzle[0] = sampler->swizzle_r;
2745                 key->tex[i].swizzle[1] = sampler->swizzle_g;
2746                 key->tex[i].swizzle[2] = sampler->swizzle_b;
2747                 key->tex[i].swizzle[3] = sampler->swizzle_a;
2748 
2749                 if (sampler->texture->nr_samples > 1) {
2750                         key->tex[i].msaa_width = sampler->texture->width0;
2751                         key->tex[i].msaa_height = sampler->texture->height0;
2752                 } else {
2753                         key->tex[i].compare_mode = sampler_state->compare_mode;
2754                         key->tex[i].compare_func = sampler_state->compare_func;
2755                         key->tex[i].wrap_s = sampler_state->wrap_s;
2756                         key->tex[i].wrap_t = sampler_state->wrap_t;
2757                         key->tex[i].force_first_level =
2758                                 vc4_sampler->force_first_level;
2759                 }
2760         }
2761 
2762         key->ucp_enables = vc4->rasterizer->base.clip_plane_enable;
2763 }
2764 
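/* Rebuilds the FS compile key from the currently-bound state and swaps in a
 * (possibly cached) compiled variant, but only when a state group that can
 * affect FS code generation is dirty.
 */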
2765 static void
2766 vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode)
2767 {
2768         struct vc4_job *job = vc4->job;
2769         struct vc4_fs_key local_key;
2770         struct vc4_fs_key *key = &local_key;
2771 
2772         if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE |
2773                             VC4_DIRTY_BLEND |
2774                             VC4_DIRTY_FRAMEBUFFER |
2775                             VC4_DIRTY_ZSA |
2776                             VC4_DIRTY_RASTERIZER |
2777                             VC4_DIRTY_SAMPLE_MASK |
2778                             VC4_DIRTY_FRAGTEX |
2779                             VC4_DIRTY_UNCOMPILED_FS |
2780                             VC4_DIRTY_UBO_1_SIZE))) {
2781                 return;
2782         }
2783 
2784         memset(key, 0, sizeof(*key));
2785         vc4_setup_shared_key(vc4, &key->base, &vc4->fragtex);
2786         key->base.shader_state = vc4->prog.bind_fs;
2787         key->is_points = (prim_mode == MESA_PRIM_POINTS);
2788         key->is_lines = (prim_mode >= MESA_PRIM_LINES &&
2789                          prim_mode <= MESA_PRIM_LINE_STRIP);
2790         key->blend = vc4->blend->rt[0];
2791         if (vc4->blend->logicop_enable) {
2792                 key->logicop_func = vc4->blend->logicop_func;
2793         } else {
2794                 key->logicop_func = PIPE_LOGICOP_COPY;
2795         }
2796         if (job->msaa) {
2797                 key->msaa = vc4->rasterizer->base.multisample;
2798                 key->sample_coverage = (vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1);
2799                 key->sample_alpha_to_coverage = vc4->blend->alpha_to_coverage;
2800                 key->sample_alpha_to_one = vc4->blend->alpha_to_one;
2801         }
2802 
2803         if (vc4->framebuffer.cbufs[0])
2804                 key->color_format = vc4->framebuffer.cbufs[0]->format;
2805 
2806         key->stencil_enabled = vc4->zsa->stencil_uniforms[0] != 0;
2807         key->stencil_twoside = vc4->zsa->stencil_uniforms[1] != 0;
2808         key->stencil_full_writemasks = vc4->zsa->stencil_uniforms[2] != 0;
2809         key->depth_enabled = (vc4->zsa->base.depth_enabled ||
2810                               key->stencil_enabled);
2811 
2812         if (key->is_points) {
2813                 key->point_sprite_mask =
2814                         vc4->rasterizer->base.sprite_coord_enable;
2815                 key->point_coord_upper_left =
2816                         (vc4->rasterizer->base.sprite_coord_mode ==
2817                          PIPE_SPRITE_COORD_UPPER_LEFT);
2818         }
2819 
2820         key->ubo_1_size = vc4->constbuf[PIPE_SHADER_FRAGMENT].cb[1].buffer_size;
2821 
2822         struct vc4_compiled_shader *old_fs = vc4->prog.fs;
2823         vc4->prog.fs = vc4_get_compiled_shader(vc4, QSTAGE_FRAG, &key->base);
2824         if (vc4->prog.fs == old_fs)
2825                 return;
2826 
2827         vc4->dirty |= VC4_DIRTY_COMPILED_FS;
2828 
2829         if (vc4->rasterizer->base.flatshade &&
2830             (!old_fs || vc4->prog.fs->color_inputs != old_fs->color_inputs)) {
2831                 vc4->dirty |= VC4_DIRTY_FLAT_SHADE_FLAGS;
2832         }
2833 
2834         if (!old_fs || vc4->prog.fs->fs_inputs != old_fs->fs_inputs)
2835                 vc4->dirty |= VC4_DIRTY_FS_INPUTS;
2836 }
2837 
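/* Rebuilds the VS key and updates both vertex-shader variants: the full VS
 * and the coordinate shader (QSTAGE_COORD) used for the binning pass, which
 * ignores the FS input list.
 */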
2838 static void
2839 vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode)
2840 {
2841         struct vc4_vs_key local_key;
2842         struct vc4_vs_key *key = &local_key;
2843 
2844         if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE |
2845                             VC4_DIRTY_RASTERIZER |
2846                             VC4_DIRTY_VERTTEX |
2847                             VC4_DIRTY_VTXSTATE |
2848                             VC4_DIRTY_UNCOMPILED_VS |
2849                             VC4_DIRTY_FS_INPUTS))) {
2850                 return;
2851         }
2852 
2853         memset(key, 0, sizeof(*key));
2854         vc4_setup_shared_key(vc4, &key->base, &vc4->verttex);
2855         key->base.shader_state = vc4->prog.bind_vs;
2856         key->fs_inputs = vc4->prog.fs->fs_inputs;
2857 
2858         for (int i = 0; i < ARRAY_SIZE(key->attr_formats); i++)
2859                 key->attr_formats[i] = vc4->vtx->pipe[i].src_format;
2860 
2861         key->per_vertex_point_size =
2862                 (prim_mode == MESA_PRIM_POINTS &&
2863                  vc4->rasterizer->base.point_size_per_vertex);
2864 
2865         struct vc4_compiled_shader *vs =
2866                 vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base);
2867         if (vs != vc4->prog.vs) {
2868                 vc4->prog.vs = vs;
2869                 vc4->dirty |= VC4_DIRTY_COMPILED_VS;
2870         }
2871 
2872         key->is_coord = true;
2873         /* Coord shaders don't care what the FS inputs are. */
2874         key->fs_inputs = NULL;
2875         struct vc4_compiled_shader *cs =
2876                 vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base);
2877         if (cs != vc4->prog.cs) {
2878                 vc4->prog.cs = cs;
2879                 vc4->dirty |= VC4_DIRTY_COMPILED_CS;
2880         }
2881 }
2882 
2883 bool
2884 vc4_update_compiled_shaders(struct vc4_context *vc4, uint8_t prim_mode)
2885 {
2886         vc4_update_compiled_fs(vc4, prim_mode);
2887         vc4_update_compiled_vs(vc4, prim_mode);
2888 
2889         return !(vc4->prog.cs->failed ||
2890                  vc4->prog.vs->failed ||
2891                  vc4->prog.fs->failed);
2892 }
2893 
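/* The compile keys are hashed and compared as raw bytes, which is why the
 * callers memset() their key structs before filling them in: any padding
 * must be zeroed for memcmp()-based equality to work.
 */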
2894 static uint32_t
2895 fs_cache_hash(const void *key)
2896 {
2897         return _mesa_hash_data(key, sizeof(struct vc4_fs_key));
2898 }
2899 
2900 static uint32_t
2901 vs_cache_hash(const void *key)
2902 {
2903         return _mesa_hash_data(key, sizeof(struct vc4_vs_key));
2904 }
2905 
2906 static bool
2907 fs_cache_compare(const void *key1, const void *key2)
2908 {
2909         return memcmp(key1, key2, sizeof(struct vc4_fs_key)) == 0;
2910 }
2911 
2912 static bool
2913 vs_cache_compare(const void *key1, const void *key2)
2914 {
2915         return memcmp(key1, key2, sizeof(struct vc4_vs_key)) == 0;
2916 }
2917 
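/* fs_inputs_set entries hash and compare only the array of live input
 * slots, so FS variants with identical varying usage map to one canonical
 * vc4_fs_inputs pointer.
 */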
2918 static uint32_t
2919 fs_inputs_hash(const void *key)
2920 {
2921         const struct vc4_fs_inputs *inputs = key;
2922 
2923         return _mesa_hash_data(inputs->input_slots,
2924                                sizeof(*inputs->input_slots) *
2925                                inputs->num_inputs);
2926 }
2927 
2928 static bool
2929 fs_inputs_compare(const void *key1, const void *key2)
2930 {
2931         const struct vc4_fs_inputs *inputs1 = key1;
2932         const struct vc4_fs_inputs *inputs2 = key2;
2933 
2934         if (inputs1->num_inputs == inputs2->num_inputs) {
2935                 if (inputs1->num_inputs == 0) {
2936                         return true;
2937                 } else {
2938                         return memcmp(inputs1->input_slots,
2939                                       inputs2->input_slots,
2940                                       sizeof(*inputs1->input_slots) *
2941                                       inputs1->num_inputs) == 0;
2942                 }
2943         }
2944 
2945         return false;
2946 }
2947 
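/* Evicts a cached compiled variant if it was built from the uncompiled
 * shader being deleted, dropping its BO reference and clearing the
 * currently-bound pointer when it matches.
 */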
2948 static void
2949 delete_from_cache_if_matches(struct hash_table *ht,
2950                              struct vc4_compiled_shader **last_compile,
2951                              struct hash_entry *entry,
2952                              struct vc4_uncompiled_shader *so)
2953 {
2954         const struct vc4_key *key = entry->key;
2955 
2956         if (key->shader_state == so) {
2957                 struct vc4_compiled_shader *shader = entry->data;
2958                 _mesa_hash_table_remove(ht, entry);
2959                 vc4_bo_unreference(&shader->bo);
2960 
2961                 if (shader == *last_compile)
2962                         *last_compile = NULL;
2963 
2964                 ralloc_free(shader);
2965         }
2966 }
2967 
2968 static void
2969 vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso)
2970 {
2971         struct vc4_context *vc4 = vc4_context(pctx);
2972         struct vc4_uncompiled_shader *so = hwcso;
2973 
2974         hash_table_foreach(vc4->fs_cache, entry) {
2975                 delete_from_cache_if_matches(vc4->fs_cache, &vc4->prog.fs,
2976                                              entry, so);
2977         }
2978         hash_table_foreach(vc4->vs_cache, entry) {
2979                 delete_from_cache_if_matches(vc4->vs_cache, &vc4->prog.vs,
2980                                              entry, so);
2981         }
2982 
2983         ralloc_free(so->base.ir.nir);
2984         free(so);
2985 }
2986 
2987 static void
2988 vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso)
2989 {
2990         struct vc4_context *vc4 = vc4_context(pctx);
2991         vc4->prog.bind_fs = hwcso;
2992         vc4->dirty |= VC4_DIRTY_UNCOMPILED_FS;
2993 }
2994 
2995 static void
2996 vc4_vp_state_bind(struct pipe_context *pctx, void *hwcso)
2997 {
2998         struct vc4_context *vc4 = vc4_context(pctx);
2999         vc4->prog.bind_vs = hwcso;
3000         vc4->dirty |= VC4_DIRTY_UNCOMPILED_VS;
3001 }
3002 
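/* Hooks the shader CSO entry points into the pipe_context and creates the
 * per-context compile caches and the FS input deduplication set.
 */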
3003 void
3004 vc4_program_init(struct pipe_context *pctx)
3005 {
3006         struct vc4_context *vc4 = vc4_context(pctx);
3007 
3008         pctx->create_vs_state = vc4_shader_state_create;
3009         pctx->delete_vs_state = vc4_shader_state_delete;
3010 
3011         pctx->create_fs_state = vc4_shader_state_create;
3012         pctx->delete_fs_state = vc4_shader_state_delete;
3013 
3014         pctx->bind_fs_state = vc4_fp_state_bind;
3015         pctx->bind_vs_state = vc4_vp_state_bind;
3016 
3017         vc4->fs_cache = _mesa_hash_table_create(pctx, fs_cache_hash,
3018                                                 fs_cache_compare);
3019         vc4->vs_cache = _mesa_hash_table_create(pctx, vs_cache_hash,
3020                                                 vs_cache_compare);
3021         vc4->fs_inputs_set = _mesa_set_create(pctx, fs_inputs_hash,
3022                                               fs_inputs_compare);
3023 }
3024 
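/* Frees every remaining compiled shader in the FS and VS caches at context
 * teardown.
 */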
3025 void
3026 vc4_program_fini(struct pipe_context *pctx)
3027 {
3028         struct vc4_context *vc4 = vc4_context(pctx);
3029 
3030         hash_table_foreach(vc4->fs_cache, entry) {
3031                 struct vc4_compiled_shader *shader = entry->data;
3032                 vc4_bo_unreference(&shader->bo);
3033                 ralloc_free(shader);
3034                 _mesa_hash_table_remove(vc4->fs_cache, entry);
3035         }
3036 
3037         hash_table_foreach(vc4->vs_cache, entry) {
3038                 struct vc4_compiled_shader *shader = entry->data;
3039                 vc4_bo_unreference(&shader->bo);
3040                 ralloc_free(shader);
3041                 _mesa_hash_table_remove(vc4->vs_cache, entry);
3042         }
3043 }
3044