1 /*
2 * Copyright (c) 2014 Scott Mansell
3 * Copyright © 2014 Broadcom
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 #include <inttypes.h>
26 #include "util/format/u_format.h"
27 #include "util/crc32.h"
28 #include "util/u_helpers.h"
29 #include "util/u_math.h"
30 #include "util/u_memory.h"
31 #include "util/ralloc.h"
32 #include "util/hash_table.h"
33 #include "tgsi/tgsi_dump.h"
34 #include "compiler/glsl_types.h"
35 #include "compiler/nir/nir.h"
36 #include "compiler/nir/nir_builder.h"
37 #include "nir/tgsi_to_nir.h"
38 #include "vc4_context.h"
39 #include "vc4_qpu.h"
40 #include "vc4_qir.h"
41
42 static struct qreg
43 ntq_get_src(struct vc4_compile *c, nir_src src, int i);
44 static void
45 ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list);
46
47 static struct vc4_compiled_shader *
48 vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
49 struct vc4_key *key);
50
51 static int
52 type_size(const struct glsl_type *type, bool bindless)
53 {
54 return glsl_count_attribute_slots(type, false);
55 }
56
57 static void
58 resize_qreg_array(struct vc4_compile *c,
59 struct qreg **regs,
60 uint32_t *size,
61 uint32_t decl_size)
62 {
63 if (*size >= decl_size)
64 return;
65
66 uint32_t old_size = *size;
67 *size = MAX2(*size * 2, decl_size);
68 *regs = reralloc(c, *regs, struct qreg, *size);
69 if (!*regs) {
70 fprintf(stderr, "Malloc failure\n");
71 abort();
72 }
73
74 for (uint32_t i = old_size; i < *size; i++)
75 (*regs)[i] = c->undef;
76 }
77
78 static void
79 ntq_emit_thrsw(struct vc4_compile *c)
80 {
81 if (!c->fs_threaded)
82 return;
83
84 /* Always thread switch after each texture operation for now.
85 *
86 * We could do better by batching a bunch of texture fetches up and
87 * then doing one thread switch and collecting all their results
88 * afterward.
89 */
90 qir_emit_nondef(c, qir_inst(QOP_THRSW, c->undef,
91 c->undef, c->undef));
92 c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
93 }
94
95 static struct qreg
96 indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
97 {
98 struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);
99
100 /* Clamp to [0, array size). Note that MIN/MAX are signed. */
101 uint32_t range = nir_intrinsic_range(intr);
102 indirect_offset = qir_MAX(c, indirect_offset, qir_uniform_ui(c, 0));
103 indirect_offset = qir_MIN_NOIMM(c, indirect_offset,
104 qir_uniform_ui(c, range - 4));
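        /* Note: the upper bound of range - 4 keeps the whole 32-bit value
         * loaded at the clamped byte offset inside the declared uniform
         * range.
         */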
105
106 qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
107 indirect_offset,
108 qir_uniform(c, QUNIFORM_UBO0_ADDR,
109 nir_intrinsic_base(intr)));
110
111 c->num_texture_samples++;
112
113 ntq_emit_thrsw(c);
114
115 return qir_TEX_RESULT(c);
116 }
117
118 static struct qreg
119 vc4_ubo_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
120 {
121 ASSERTED int buffer_index = nir_src_as_uint(intr->src[0]);
122 assert(buffer_index == 1);
123 assert(c->stage == QSTAGE_FRAG);
124
125 struct qreg offset = ntq_get_src(c, intr->src[1], 0);
126
127 /* Clamp to [0, array size). Note that MIN/MAX are signed. */
128 offset = qir_MAX(c, offset, qir_uniform_ui(c, 0));
129 offset = qir_MIN_NOIMM(c, offset,
130 qir_uniform_ui(c, c->fs_key->ubo_1_size - 4));
131
132 qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
133 offset,
134 qir_uniform(c, QUNIFORM_UBO1_ADDR, 0));
135
136 c->num_texture_samples++;
137
138 ntq_emit_thrsw(c);
139
140 return qir_TEX_RESULT(c);
141 }
142
143 nir_def *
144 vc4_nir_get_swizzled_channel(nir_builder *b, nir_def **srcs, int swiz)
145 {
146 switch (swiz) {
147 default:
148 case PIPE_SWIZZLE_NONE:
149 fprintf(stderr, "warning: unknown swizzle\n");
150 FALLTHROUGH;
151 case PIPE_SWIZZLE_0:
152 return nir_imm_float(b, 0.0);
153 case PIPE_SWIZZLE_1:
154 return nir_imm_float(b, 1.0);
155 case PIPE_SWIZZLE_X:
156 case PIPE_SWIZZLE_Y:
157 case PIPE_SWIZZLE_Z:
158 case PIPE_SWIZZLE_W:
159 return srcs[swiz];
160 }
161 }
162
163 static struct qreg *
164 ntq_init_ssa_def(struct vc4_compile *c, nir_def *def)
165 {
166 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
167 def->num_components);
168 _mesa_hash_table_insert(c->def_ht, def, qregs);
169 return qregs;
170 }
171
172 /**
173 * This function is responsible for getting QIR results into the associated
174 * storage for a NIR instruction.
175 *
176 * If it's a NIR SSA def, then we just set the associated hash table entry to
177 * the new result.
178 *
179 * If it's a NIR reg, then we need to update the existing qreg assigned to the
180 * NIR destination with the incoming value. To do that without introducing
181 * new MOVs, we require that the incoming qreg either be a uniform, or be
182 * SSA-defined by the previous QIR instruction in the block and rewritable by
183 * this function. That lets us sneak ahead and insert the SF flag beforehand
184 * (knowing that the previous instruction doesn't depend on flags) and rewrite
185  * its destination to be the NIR reg's destination.
186 */
187 static void
188 ntq_store_def(struct vc4_compile *c, nir_def *def, int chan,
189 struct qreg result)
190 {
191 struct qinst *last_inst = NULL;
192 if (!list_is_empty(&c->cur_block->instructions))
193 last_inst = (struct qinst *)c->cur_block->instructions.prev;
194
195 assert(result.file == QFILE_UNIF ||
196 (result.file == QFILE_TEMP &&
197 last_inst && last_inst == c->defs[result.index]));
198
199 nir_intrinsic_instr *store = nir_store_reg_for_def(def);
200 if (store == NULL) {
201 assert(chan < def->num_components);
202
203 struct qreg *qregs;
204 struct hash_entry *entry =
205 _mesa_hash_table_search(c->def_ht, def);
206
207 if (entry)
208 qregs = entry->data;
209 else
210 qregs = ntq_init_ssa_def(c, def);
211
212 qregs[chan] = result;
213 } else {
214 nir_def *reg = store->src[1].ssa;
215 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
216 assert(nir_intrinsic_base(store) == 0);
217 assert(nir_intrinsic_num_array_elems(decl) == 0);
218 struct hash_entry *entry =
219 _mesa_hash_table_search(c->def_ht, reg);
220 struct qreg *qregs = entry->data;
221
222 /* Insert a MOV if the source wasn't an SSA def in the
223 * previous instruction.
224 */
225 if (result.file == QFILE_UNIF) {
226 result = qir_MOV(c, result);
227 last_inst = c->defs[result.index];
228 }
229
230 /* We know they're both temps, so just rewrite index. */
231 c->defs[last_inst->dst.index] = NULL;
232 last_inst->dst.index = qregs[chan].index;
233
234 /* If we're in control flow, then make this update of the reg
235 * conditional on the execution mask.
236 */
237 if (c->execute.file != QFILE_NULL) {
238 last_inst->dst.index = qregs[chan].index;
239
240 /* Set the flags to the current exec mask. To insert
241 * the SF, we temporarily remove our SSA instruction.
242 */
243 list_del(&last_inst->link);
244 qir_SF(c, c->execute);
245 list_addtail(&last_inst->link,
246 &c->cur_block->instructions);
247
248 last_inst->cond = QPU_COND_ZS;
249 last_inst->cond_is_exec_mask = true;
250 }
251 }
252 }
253
254 static struct qreg
255 ntq_get_src(struct vc4_compile *c, nir_src src, int i)
256 {
257 struct hash_entry *entry;
258
259 nir_intrinsic_instr *load = nir_load_reg_for_def(src.ssa);
260 if (load == NULL) {
261 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
262 assert(i < src.ssa->num_components);
263 } else {
264 nir_def *reg = load->src[0].ssa;
265 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
266 assert(nir_intrinsic_base(load) == 0);
267 assert(nir_intrinsic_num_array_elems(decl) == 0);
268 entry = _mesa_hash_table_search(c->def_ht, reg);
269 assert(i < nir_intrinsic_num_components(decl));
270 }
271
272 struct qreg *qregs = entry->data;
273 return qregs[i];
274 }
275
276 static struct qreg
277 ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr,
278 unsigned src)
279 {
280 struct qreg r = ntq_get_src(c, instr->src[src].src,
281 instr->src[src].swizzle[0]);
282
283 return r;
284 }
285
286 static inline struct qreg
287 qir_SAT(struct vc4_compile *c, struct qreg val)
288 {
289 return qir_FMAX(c,
290 qir_FMIN(c, val, qir_uniform_f(c, 1.0)),
291 qir_uniform_f(c, 0.0));
292 }
293
294 static struct qreg
295 ntq_rcp(struct vc4_compile *c, struct qreg x)
296 {
297 struct qreg r = qir_RCP(c, x);
298
299 /* Apply a Newton-Raphson step to improve the accuracy. */
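        /* For an estimate r0 of 1/x, one iteration computes
         * r1 = r0 * (2 - x * r0), roughly doubling the number of correct
         * bits in the estimate.
         */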
300 r = qir_FMUL(c, r, qir_FSUB(c,
301 qir_uniform_f(c, 2.0),
302 qir_FMUL(c, x, r)));
303
304 return r;
305 }
306
307 static struct qreg
308 ntq_rsq(struct vc4_compile *c, struct qreg x)
309 {
310 struct qreg r = qir_RSQ(c, x);
311
312 /* Apply a Newton-Raphson step to improve the accuracy. */
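        /* For an estimate r0 of 1/sqrt(x), one iteration computes
         * r1 = r0 * (1.5 - 0.5 * x * r0 * r0).
         */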
313 r = qir_FMUL(c, r, qir_FSUB(c,
314 qir_uniform_f(c, 1.5),
315 qir_FMUL(c,
316 qir_uniform_f(c, 0.5),
317 qir_FMUL(c, x,
318 qir_FMUL(c, r, r)))));
319
320 return r;
321 }
322
323 static struct qreg
324 ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1)
325 {
326 struct qreg src0_hi = qir_SHR(c, src0,
327 qir_uniform_ui(c, 24));
328 struct qreg src1_hi = qir_SHR(c, src1,
329 qir_uniform_ui(c, 24));
330
331 struct qreg hilo = qir_MUL24(c, src0_hi, src1);
332 struct qreg lohi = qir_MUL24(c, src0, src1_hi);
333 struct qreg lolo = qir_MUL24(c, src0, src1);
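        /* Writing each operand as lo + (hi << 24), the hi * hi partial
         * product is shifted left by 48 bits and can't affect the low 32
         * bits of the result, so these three partial products suffice.
         */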
334
335 return qir_ADD(c, lolo, qir_SHL(c,
336 qir_ADD(c, hilo, lohi),
337 qir_uniform_ui(c, 24)));
338 }
339
340 static struct qreg
341 ntq_scale_depth_texture(struct vc4_compile *c, struct qreg src)
342 {
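        /* The 32-bit sample carries depth in its upper 24 bits: shift it
         * down and scale by 1/0xffffff to get a float in [0, 1].
         */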
343 struct qreg depthf = qir_ITOF(c, qir_SHR(c, src,
344 qir_uniform_ui(c, 8)));
345 return qir_FMUL(c, depthf, qir_uniform_f(c, 1.0f/0xffffff));
346 }
347
348 /**
349 * Emits a lowered TXF_MS from an MSAA texture.
350 *
351 * The addressing math has been lowered in NIR, and now we just need to read
352 * it like a UBO.
353 */
354 static void
355 ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
356 {
357 uint32_t tile_width = 32;
358 uint32_t tile_height = 32;
359 uint32_t tile_size = (tile_height * tile_width *
360 VC4_MAX_SAMPLES * sizeof(uint32_t));
361
362 unsigned unit = instr->texture_index;
363 uint32_t w = align(c->key->tex[unit].msaa_width, tile_width);
364 uint32_t w_tiles = w / tile_width;
365 uint32_t h = align(c->key->tex[unit].msaa_height, tile_height);
366 uint32_t h_tiles = h / tile_height;
367 uint32_t size = w_tiles * h_tiles * tile_size;
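        /* With 4x MSAA this makes tile_size 16 KB (32 * 32 * 4 samples * 4
         * bytes); e.g. a 50x40 surface rounds up to 2x2 tiles, giving a
         * 64 KB window to clamp against.
         */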
368
369 struct qreg addr;
370 assert(instr->num_srcs == 1);
371 assert(instr->src[0].src_type == nir_tex_src_coord);
372 addr = ntq_get_src(c, instr->src[0].src, 0);
373
374 /* Perform the clamping required by kernel validation. */
375 addr = qir_MAX(c, addr, qir_uniform_ui(c, 0));
376 addr = qir_MIN_NOIMM(c, addr, qir_uniform_ui(c, size - 4));
377
378 qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
379 addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit));
380
381 ntq_emit_thrsw(c);
382
383 struct qreg tex = qir_TEX_RESULT(c);
384 c->num_texture_samples++;
385
386 enum pipe_format format = c->key->tex[unit].format;
387 if (util_format_is_depth_or_stencil(format)) {
388 struct qreg scaled = ntq_scale_depth_texture(c, tex);
389 for (int i = 0; i < 4; i++)
390 ntq_store_def(c, &instr->def, i, qir_MOV(c, scaled));
391 } else {
392 for (int i = 0; i < 4; i++)
393 ntq_store_def(c, &instr->def, i,
394 qir_UNPACK_8_F(c, tex, i));
395 }
396 }
397
398 static void
399 ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
400 {
401 struct qreg s, t, r, lod, compare;
402 bool is_txb = false, is_txl = false;
403 unsigned unit = instr->texture_index;
404
405 if (instr->op == nir_texop_txf) {
406 ntq_emit_txf(c, instr);
407 return;
408 }
409
410 for (unsigned i = 0; i < instr->num_srcs; i++) {
411 switch (instr->src[i].src_type) {
412 case nir_tex_src_coord:
413 s = ntq_get_src(c, instr->src[i].src, 0);
414 if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D)
415 t = qir_uniform_f(c, 0.5);
416 else
417 t = ntq_get_src(c, instr->src[i].src, 1);
418 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
419 r = ntq_get_src(c, instr->src[i].src, 2);
420 break;
421 case nir_tex_src_bias:
422 lod = ntq_get_src(c, instr->src[i].src, 0);
423 is_txb = true;
424 break;
425 case nir_tex_src_lod:
426 lod = ntq_get_src(c, instr->src[i].src, 0);
427 is_txl = true;
428 break;
429 case nir_tex_src_comparator:
430 compare = ntq_get_src(c, instr->src[i].src, 0);
431 break;
432 default:
433 unreachable("unknown texture source");
434 }
435 }
436
437 if (c->stage != QSTAGE_FRAG && !is_txl) {
438 /* From the GLSL 1.20 spec:
439 *
440 * "If it is mip-mapped and running on the vertex shader,
441 * then the base texture is used."
442 */
443 is_txl = true;
444 lod = qir_uniform_ui(c, 0);
445 }
446
447 if (c->key->tex[unit].force_first_level) {
448 lod = qir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, unit);
449 is_txl = true;
450 is_txb = false;
451 }
452
453 struct qreg texture_u[] = {
454 qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0, unit),
455 qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, unit),
456 qir_uniform(c, QUNIFORM_CONSTANT, 0),
457 qir_uniform(c, QUNIFORM_CONSTANT, 0),
458 };
459 uint32_t next_texture_u = 0;
460
461 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE || is_txl) {
462 texture_u[2] = qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P2,
463 unit | (is_txl << 16));
464 }
465
466 struct qinst *tmu;
467 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
468 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), r);
469 tmu->src[qir_get_tex_uniform_src(tmu)] =
470 texture_u[next_texture_u++];
471 } else if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
472 c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP ||
473 c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
474 c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
475 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0),
476 qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR,
477 unit));
478 tmu->src[qir_get_tex_uniform_src(tmu)] =
479 texture_u[next_texture_u++];
480 }
481
482 if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP) {
483 s = qir_SAT(c, s);
484 }
485
486 if (c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
487 t = qir_SAT(c, t);
488 }
489
490 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_T, 0), t);
491 tmu->src[qir_get_tex_uniform_src(tmu)] =
492 texture_u[next_texture_u++];
493
494 if (is_txl || is_txb) {
495 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_B, 0), lod);
496 tmu->src[qir_get_tex_uniform_src(tmu)] =
497 texture_u[next_texture_u++];
498 }
499
500 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_S, 0), s);
501 tmu->src[qir_get_tex_uniform_src(tmu)] = texture_u[next_texture_u++];
502
503 c->num_texture_samples++;
504
505 ntq_emit_thrsw(c);
506
507 struct qreg tex = qir_TEX_RESULT(c);
508
509 enum pipe_format format = c->key->tex[unit].format;
510
511 if (util_format_is_depth_or_stencil(format)) {
512 struct qreg normalized = ntq_scale_depth_texture(c, tex);
513 struct qreg depth_output;
514
515 struct qreg u0 = qir_uniform_f(c, 0.0f);
516 struct qreg u1 = qir_uniform_f(c, 1.0f);
517 if (c->key->tex[unit].compare_mode) {
518 /* From the GL_ARB_shadow spec:
519 *
520 * "Let Dt (D subscript t) be the depth texture
521 * value, in the range [0, 1]. Let R be the
522 * interpolated texture coordinate clamped to the
523 * range [0, 1]."
524 */
525 compare = qir_SAT(c, compare);
526
527 switch (c->key->tex[unit].compare_func) {
528 case PIPE_FUNC_NEVER:
529 depth_output = qir_uniform_f(c, 0.0f);
530 break;
531 case PIPE_FUNC_ALWAYS:
532 depth_output = u1;
533 break;
534 case PIPE_FUNC_EQUAL:
535 qir_SF(c, qir_FSUB(c, compare, normalized));
536 depth_output = qir_SEL(c, QPU_COND_ZS, u1, u0);
537 break;
538 case PIPE_FUNC_NOTEQUAL:
539 qir_SF(c, qir_FSUB(c, compare, normalized));
540 depth_output = qir_SEL(c, QPU_COND_ZC, u1, u0);
541 break;
542 case PIPE_FUNC_GREATER:
543 qir_SF(c, qir_FSUB(c, compare, normalized));
544 depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
545 break;
546 case PIPE_FUNC_GEQUAL:
547 qir_SF(c, qir_FSUB(c, normalized, compare));
548 depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
549 break;
550 case PIPE_FUNC_LESS:
551 qir_SF(c, qir_FSUB(c, compare, normalized));
552 depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
553 break;
554 case PIPE_FUNC_LEQUAL:
555 qir_SF(c, qir_FSUB(c, normalized, compare));
556 depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
557 break;
558 }
559 } else {
560 depth_output = normalized;
561 }
562
563 for (int i = 0; i < 4; i++)
564 ntq_store_def(c, &instr->def, i,
565 qir_MOV(c, depth_output));
566 } else {
567 for (int i = 0; i < 4; i++)
568 ntq_store_def(c, &instr->def, i,
569 qir_UNPACK_8_F(c, tex, i));
570 }
571 }
572
573 /**
574 * Computes x - floor(x), which is tricky because our FTOI truncates (rounds
575 * to zero).
576 */
577 static struct qreg
578 ntq_ffract(struct vc4_compile *c, struct qreg src)
579 {
580 struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
581 struct qreg diff = qir_FSUB(c, src, trunc);
582 qir_SF(c, diff);
583
584 qir_FADD_dest(c, diff,
585 diff, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
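        /* e.g. src = -0.25: trunc is 0.0 and diff is -0.25, so N is set and
         * the conditional add yields 0.75 == -0.25 - floor(-0.25).
         */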
586
587 return qir_MOV(c, diff);
588 }
589
590 /**
591 * Computes floor(x), which is tricky because our FTOI truncates (rounds to
592 * zero).
593 */
594 static struct qreg
595 ntq_ffloor(struct vc4_compile *c, struct qreg src)
596 {
597 struct qreg result = qir_ITOF(c, qir_FTOI(c, src));
598
599 /* This will be < 0 if we truncated and the truncation was of a value
600 * that was < 0 in the first place.
601 */
602 qir_SF(c, qir_FSUB(c, src, result));
603
604 struct qinst *sub = qir_FSUB_dest(c, result,
605 result, qir_uniform_f(c, 1.0));
606 sub->cond = QPU_COND_NS;
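        /* e.g. src = -1.5: trunc gives -1.0, src - result is -0.5 (N set),
         * and the conditional subtract produces -2.0 == floor(-1.5).
         */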
607
608 return qir_MOV(c, result);
609 }
610
611 /**
612 * Computes ceil(x), which is tricky because our FTOI truncates (rounds to
613 * zero).
614 */
615 static struct qreg
616 ntq_fceil(struct vc4_compile *c, struct qreg src)
617 {
618 struct qreg result = qir_ITOF(c, qir_FTOI(c, src));
619
620 /* This will be < 0 if we truncated and the truncation was of a value
621 * that was > 0 in the first place.
622 */
623 qir_SF(c, qir_FSUB(c, result, src));
624
625 qir_FADD_dest(c, result,
626 result, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
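        /* e.g. src = 1.5: trunc gives 1.0, result - src is -0.5 (N set),
         * and the conditional add produces 2.0 == ceil(1.5).
         */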
627
628 return qir_MOV(c, result);
629 }
630
631 static struct qreg
632 ntq_shrink_sincos_input_range(struct vc4_compile *c, struct qreg x)
633 {
634 /* Since we're using a Taylor approximation, we want to have a small
635 * number of coefficients and take advantage of sin/cos repeating
636 * every 2pi. We keep our x as close to 0 as we can, since the series
637 * will be less accurate as |x| increases. (Also, be careful of
638          * playing tricks that shift the input x value using sin/cos identities,
639          * because getting accurate values for x==0 is very important for SDL
640          * rendering.)
641 */
642 struct qreg scaled_x =
643 qir_FMUL(c, x,
644 qir_uniform_f(c, 1.0f / (M_PI * 2.0f)));
645 /* Note: FTOI truncates toward 0. */
646 struct qreg x_frac = qir_FSUB(c, scaled_x,
647 qir_ITOF(c, qir_FTOI(c, scaled_x)));
648 /* Map [0.5, 1] to [-0.5, 0] */
649 qir_SF(c, qir_FSUB(c, x_frac, qir_uniform_f(c, 0.5)));
650 qir_FSUB_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NC;
651 /* Map [-1, -0.5] to [0, 0.5] */
652 qir_SF(c, qir_FADD(c, x_frac, qir_uniform_f(c, 0.5)));
653 qir_FADD_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
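        /* x_frac is now the signed fraction of a turn in [-0.5, 0.5): e.g.
         * scaled_x = 3.25 ends up as 0.25 and scaled_x = 0.75 as -0.25,
         * keeping the argument small where the truncated series is accurate.
         */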
654
655 return x_frac;
656 }
657
658 static struct qreg
659 ntq_fsin(struct vc4_compile *c, struct qreg src)
660 {
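        /* Taylor coefficients for sin(2*pi*t) =
         * sum_n (-1)^n * (2*pi*t)^(2n+1) / (2n+1)!, truncated after the
         * t^9 term.
         */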
661 float coeff[] = {
662 2.0 * M_PI,
663 -pow(2.0 * M_PI, 3) / (3 * 2 * 1),
664 pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1),
665 -pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1),
666 pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
667 };
668
669 struct qreg x = ntq_shrink_sincos_input_range(c, src);
670 struct qreg x2 = qir_FMUL(c, x, x);
671 struct qreg sum = qir_FMUL(c, x, qir_uniform_f(c, coeff[0]));
672 for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
673 x = qir_FMUL(c, x, x2);
674 sum = qir_FADD(c,
675 sum,
676 qir_FMUL(c,
677 x,
678 qir_uniform_f(c, coeff[i])));
679 }
680 return sum;
681 }
682
683 static struct qreg
684 ntq_fcos(struct vc4_compile *c, struct qreg src)
685 {
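        /* Taylor coefficients for cos(2*pi*t) =
         * sum_n (-1)^n * (2*pi*t)^(2n) / (2n)!, truncated after the
         * t^10 term.
         */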
686 float coeff[] = {
687 1.0f,
688 -pow(2.0 * M_PI, 2) / (2 * 1),
689 pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1),
690 -pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1),
691 pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
692 -pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
693 };
694
695 struct qreg x_frac = ntq_shrink_sincos_input_range(c, src);
696 struct qreg sum = qir_uniform_f(c, coeff[0]);
697 struct qreg x2 = qir_FMUL(c, x_frac, x_frac);
698 struct qreg x = x2; /* Current x^2, x^4, or x^6 */
699 for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
700 if (i != 1)
701 x = qir_FMUL(c, x, x2);
702
703 sum = qir_FADD(c, qir_FMUL(c,
704 x,
705 qir_uniform_f(c, coeff[i])),
706 sum);
707 }
708 return sum;
709 }
710
711 static struct qreg
712 ntq_fsign(struct vc4_compile *c, struct qreg src)
713 {
714 struct qreg t = qir_get_temp(c);
715
716 qir_SF(c, src);
717 qir_MOV_dest(c, t, qir_uniform_f(c, 0.0));
718 qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC;
719 qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS;
720 return qir_MOV(c, t);
721 }
722
723 static void
724 emit_vertex_input(struct vc4_compile *c, int attr)
725 {
726 enum pipe_format format = c->vs_key->attr_formats[attr];
727 uint32_t attr_size = util_format_get_blocksize(format);
728
729 c->vattr_sizes[attr] = align(attr_size, 4);
730 for (int i = 0; i < align(attr_size, 4) / 4; i++) {
731 c->inputs[attr * 4 + i] =
732 qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i));
733 c->num_inputs++;
734 }
735 }
736
737 static void
738 emit_fragcoord_input(struct vc4_compile *c, int attr)
739 {
740 c->inputs[attr * 4 + 0] = qir_ITOF(c, qir_reg(QFILE_FRAG_X, 0));
741 c->inputs[attr * 4 + 1] = qir_ITOF(c, qir_reg(QFILE_FRAG_Y, 0));
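        /* FRAG_Z is the raw integer depth from the hardware; the 1/0xffffff
         * scale maps its 24-bit range onto gl_FragCoord.z's [0, 1].
         */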
742 c->inputs[attr * 4 + 2] =
743 qir_FMUL(c,
744 qir_ITOF(c, qir_FRAG_Z(c)),
745 qir_uniform_f(c, 1.0 / 0xffffff));
746 c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c));
747 }
748
749 static struct qreg
750 emit_fragment_varying(struct vc4_compile *c, gl_varying_slot slot,
751 uint8_t swizzle)
752 {
753 uint32_t i = c->num_input_slots++;
754 struct qreg vary = {
755 QFILE_VARY,
756 i
757 };
758
759 if (c->num_input_slots >= c->input_slots_array_size) {
760 c->input_slots_array_size =
761 MAX2(4, c->input_slots_array_size * 2);
762
763 c->input_slots = reralloc(c, c->input_slots,
764 struct vc4_varying_slot,
765 c->input_slots_array_size);
766 }
767
768 c->input_slots[i].slot = slot;
769 c->input_slots[i].swizzle = swizzle;
770
771 return qir_VARY_ADD_C(c, qir_FMUL(c, vary, qir_FRAG_W(c)));
772 }
773
774 static void
775 emit_fragment_input(struct vc4_compile *c, int attr, gl_varying_slot slot)
776 {
777 for (int i = 0; i < 4; i++) {
778 c->inputs[attr * 4 + i] =
779 emit_fragment_varying(c, slot, i);
780 c->num_inputs++;
781 }
782 }
783
784 static void
785 add_output(struct vc4_compile *c,
786 uint32_t decl_offset,
787 uint8_t slot,
788 uint8_t swizzle)
789 {
790 uint32_t old_array_size = c->outputs_array_size;
791 resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
792 decl_offset + 1);
793
794 if (old_array_size != c->outputs_array_size) {
795 c->output_slots = reralloc(c,
796 c->output_slots,
797 struct vc4_varying_slot,
798 c->outputs_array_size);
799 }
800
801 c->output_slots[decl_offset].slot = slot;
802 c->output_slots[decl_offset].swizzle = swizzle;
803 }
804
805 static bool
806 ntq_src_is_only_ssa_def_user(nir_src *src)
807 {
808 return list_is_singular(&src->ssa->uses) &&
809 nir_load_reg_for_def(src->ssa) == NULL;
810 }
811
812 /**
813 * In general, emits a nir_pack_unorm_4x8 as a series of MOVs with the pack
814 * bit set.
815 *
816 * However, as an optimization, it tries to find the instructions generating
817 * the sources to be packed and just emit the pack flag there, if possible.
818 */
819 static void
820 ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
821 {
822 struct qreg result = qir_get_temp(c);
823 struct nir_alu_instr *vec4 = NULL;
824
825 /* If packing from a vec4 op (as expected), identify it so that we can
826 * peek back at what generated its sources.
827 */
828 if (instr->src[0].src.ssa->parent_instr->type == nir_instr_type_alu &&
829 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr)->op ==
830 nir_op_vec4) {
831 vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
832 }
833
834 /* If the pack is replicating the same channel 4 times, use the 8888
835 * pack flag. This is common for blending using the alpha
836 * channel.
837 */
838 if (instr->src[0].swizzle[0] == instr->src[0].swizzle[1] &&
839 instr->src[0].swizzle[0] == instr->src[0].swizzle[2] &&
840 instr->src[0].swizzle[0] == instr->src[0].swizzle[3]) {
841 struct qreg rep = ntq_get_src(c,
842 instr->src[0].src,
843 instr->src[0].swizzle[0]);
844 ntq_store_def(c, &instr->def, 0, qir_PACK_8888_F(c, rep));
845 return;
846 }
847
848 for (int i = 0; i < 4; i++) {
849 int swiz = instr->src[0].swizzle[i];
850 struct qreg src;
851 if (vec4) {
852 src = ntq_get_src(c, vec4->src[swiz].src,
853 vec4->src[swiz].swizzle[0]);
854 } else {
855 src = ntq_get_src(c, instr->src[0].src, swiz);
856 }
857
858 if (vec4 &&
859 ntq_src_is_only_ssa_def_user(&vec4->src[swiz].src) &&
860 src.file == QFILE_TEMP &&
861 c->defs[src.index] &&
862 qir_is_mul(c->defs[src.index]) &&
863 !c->defs[src.index]->dst.pack) {
864 struct qinst *rewrite = c->defs[src.index];
865 c->defs[src.index] = NULL;
866 rewrite->dst = result;
867 rewrite->dst.pack = QPU_PACK_MUL_8A + i;
868 continue;
869 }
870
871 qir_PACK_8_F(c, result, src, i);
872 }
873
874 ntq_store_def(c, &instr->def, 0, qir_MOV(c, result));
875 }
876
877 /** Handles sign-extended bitfield extracts for 16 bits. */
878 static struct qreg
879 ntq_emit_ibfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
880 struct qreg bits)
881 {
882 assert(bits.file == QFILE_UNIF &&
883 c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
884 c->uniform_data[bits.index] == 16);
885
886 assert(offset.file == QFILE_UNIF &&
887 c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
888 int offset_bit = c->uniform_data[offset.index];
889 assert(offset_bit % 16 == 0);
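        /* e.g. offset = 16 with bits = 16 becomes a sign-extending unpack
         * of the high halfword of base.
         */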
890
891 return qir_UNPACK_16_I(c, base, offset_bit / 16);
892 }
893
894 /** Handles unsigned bitfield extracts for 8 bits. */
895 static struct qreg
896 ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
897 struct qreg bits)
898 {
899 assert(bits.file == QFILE_UNIF &&
900 c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
901 c->uniform_data[bits.index] == 8);
902
903 assert(offset.file == QFILE_UNIF &&
904 c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
905 int offset_bit = c->uniform_data[offset.index];
906 assert(offset_bit % 8 == 0);
907
908 return qir_UNPACK_8_I(c, base, offset_bit / 8);
909 }
910
911 /**
912  * If compare_instr is a supported comparison instruction, emits its
913  * comparison and writes the value sel_instr should produce for that result
914  * to *dest, returning true.  Returns false for unsupported comparisons.
915 */
916 static bool
917 ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
918 nir_alu_instr *compare_instr,
919 nir_alu_instr *sel_instr)
920 {
921 enum qpu_cond cond;
922
923 switch (compare_instr->op) {
924 case nir_op_feq32:
925 case nir_op_ieq32:
926 case nir_op_seq:
927 cond = QPU_COND_ZS;
928 break;
929 case nir_op_fneu32:
930 case nir_op_ine32:
931 case nir_op_sne:
932 cond = QPU_COND_ZC;
933 break;
934 case nir_op_fge32:
935 case nir_op_ige32:
936 case nir_op_uge32:
937 case nir_op_sge:
938 cond = QPU_COND_NC;
939 break;
940 case nir_op_flt32:
941 case nir_op_ilt32:
942 case nir_op_slt:
943 cond = QPU_COND_NS;
944 break;
945 default:
946 return false;
947 }
948
949 struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
950 struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);
951
952 unsigned unsized_type =
953 nir_alu_type_get_base_type(nir_op_infos[compare_instr->op].input_types[0]);
954 if (unsized_type == nir_type_float)
955 qir_SF(c, qir_FSUB(c, src0, src1));
956 else
957 qir_SF(c, qir_SUB(c, src0, src1));
958
959 switch (sel_instr->op) {
960 case nir_op_seq:
961 case nir_op_sne:
962 case nir_op_sge:
963 case nir_op_slt:
964 *dest = qir_SEL(c, cond,
965 qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0));
966 break;
967
968 case nir_op_b32csel:
969 *dest = qir_SEL(c, cond,
970 ntq_get_alu_src(c, sel_instr, 1),
971 ntq_get_alu_src(c, sel_instr, 2));
972 break;
973
974 default:
975 *dest = qir_SEL(c, cond,
976 qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0));
977 break;
978 }
979
980 /* Make the temporary for nir_store_def(). */
981 *dest = qir_MOV(c, *dest);
982
983 return true;
984 }
985
986 /**
987 * Attempts to fold a comparison generating a boolean result into the
988 * condition code for selecting between two values, instead of comparing the
989 * boolean result against 0 to generate the condition code.
990 */
991 static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr,
992 struct qreg *src)
993 {
994 if (nir_load_reg_for_def(instr->src[0].src.ssa))
995 goto out;
996 if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
997 goto out;
998 nir_alu_instr *compare =
999 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
1000 if (!compare)
1001 goto out;
1002
1003 struct qreg dest;
1004 if (ntq_emit_comparison(c, &dest, compare, instr))
1005 return dest;
1006
1007 out:
1008 qir_SF(c, src[0]);
1009 return qir_MOV(c, qir_SEL(c, QPU_COND_NS, src[1], src[2]));
1010 }
1011
1012 static struct qreg
1013 ntq_fddx(struct vc4_compile *c, struct qreg src)
1014 {
1015 /* Make sure that we have a bare temp to use for MUL rotation, so it
1016 * can be allocated to an accumulator.
1017 */
1018 if (src.pack || src.file != QFILE_TEMP)
1019 src = qir_MOV(c, src);
1020
1021 struct qreg from_left = qir_ROT_MUL(c, src, 1);
1022 struct qreg from_right = qir_ROT_MUL(c, src, 15);
1023
1024 /* Distinguish left/right pixels of the quad. */
1025 qir_SF(c, qir_AND(c, qir_reg(QFILE_QPU_ELEMENT, 0),
1026 qir_uniform_ui(c, 1)));
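        /* Both selections evaluate to (right pixel) - (left pixel): the
         * left pixel computes from_right - src while the right pixel
         * computes src - from_left, so the whole quad sees one derivative.
         */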
1027
1028 return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
1029 qir_FSUB(c, from_right, src),
1030 qir_FSUB(c, src, from_left)));
1031 }
1032
1033 static struct qreg
1034 ntq_fddy(struct vc4_compile *c, struct qreg src)
1035 {
1036 if (src.pack || src.file != QFILE_TEMP)
1037 src = qir_MOV(c, src);
1038
1039 struct qreg from_bottom = qir_ROT_MUL(c, src, 2);
1040 struct qreg from_top = qir_ROT_MUL(c, src, 14);
1041
1042 /* Distinguish top/bottom pixels of the quad. */
1043 qir_SF(c, qir_AND(c,
1044 qir_reg(QFILE_QPU_ELEMENT, 0),
1045 qir_uniform_ui(c, 2)));
1046
1047 return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
1048 qir_FSUB(c, from_top, src),
1049 qir_FSUB(c, src, from_bottom)));
1050 }
1051
1052 static struct qreg
1053 ntq_emit_cond_to_int(struct vc4_compile *c, enum qpu_cond cond)
1054 {
1055 return qir_MOV(c, qir_SEL(c, cond,
1056 qir_uniform_ui(c, 1),
1057 qir_uniform_ui(c, 0)));
1058 }
1059
1060 static void
1061 ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
1062 {
1063 /* Vectors are special in that they have non-scalarized writemasks,
1064 * and just take the first swizzle channel for each argument in order
1065 * into each writemask channel.
1066 */
1067 if (instr->op == nir_op_vec2 ||
1068 instr->op == nir_op_vec3 ||
1069 instr->op == nir_op_vec4) {
1070 struct qreg srcs[4];
1071 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1072 srcs[i] = ntq_get_src(c, instr->src[i].src,
1073 instr->src[i].swizzle[0]);
1074 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1075 ntq_store_def(c, &instr->def, i,
1076 qir_MOV(c, srcs[i]));
1077 return;
1078 }
1079
1080 if (instr->op == nir_op_pack_unorm_4x8) {
1081 ntq_emit_pack_unorm_4x8(c, instr);
1082 return;
1083 }
1084
1085 if (instr->op == nir_op_unpack_unorm_4x8) {
1086 struct qreg src = ntq_get_src(c, instr->src[0].src,
1087 instr->src[0].swizzle[0]);
1088 unsigned count = instr->def.num_components;
1089 for (int i = 0; i < count; i++) {
1090 ntq_store_def(c, &instr->def, i,
1091 qir_UNPACK_8_F(c, src, i));
1092 }
1093 return;
1094 }
1095
1096 /* General case: We can just grab the one used channel per src. */
1097 struct qreg src[nir_op_infos[instr->op].num_inputs];
1098 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1099 src[i] = ntq_get_alu_src(c, instr, i);
1100 }
1101
1102 struct qreg result;
1103
1104 switch (instr->op) {
1105 case nir_op_mov:
1106 result = qir_MOV(c, src[0]);
1107 break;
1108 case nir_op_fmul:
1109 result = qir_FMUL(c, src[0], src[1]);
1110 break;
1111 case nir_op_fadd:
1112 result = qir_FADD(c, src[0], src[1]);
1113 break;
1114 case nir_op_fsub:
1115 result = qir_FSUB(c, src[0], src[1]);
1116 break;
1117 case nir_op_fmin:
1118 result = qir_FMIN(c, src[0], src[1]);
1119 break;
1120 case nir_op_fmax:
1121 result = qir_FMAX(c, src[0], src[1]);
1122 break;
1123
1124 case nir_op_f2i32:
1125 case nir_op_f2u32:
1126 result = qir_FTOI(c, src[0]);
1127 break;
1128 case nir_op_i2f32:
1129 case nir_op_u2f32:
1130 result = qir_ITOF(c, src[0]);
1131 break;
1132 case nir_op_b2f32:
1133 result = qir_AND(c, src[0], qir_uniform_f(c, 1.0));
1134 break;
1135 case nir_op_b2i32:
1136 result = qir_AND(c, src[0], qir_uniform_ui(c, 1));
1137 break;
1138
1139 case nir_op_iadd:
1140 result = qir_ADD(c, src[0], src[1]);
1141 break;
1142 case nir_op_ushr:
1143 result = qir_SHR(c, src[0], src[1]);
1144 break;
1145 case nir_op_isub:
1146 result = qir_SUB(c, src[0], src[1]);
1147 break;
1148 case nir_op_ishr:
1149 result = qir_ASR(c, src[0], src[1]);
1150 break;
1151 case nir_op_ishl:
1152 result = qir_SHL(c, src[0], src[1]);
1153 break;
1154 case nir_op_imin:
1155 result = qir_MIN(c, src[0], src[1]);
1156 break;
1157 case nir_op_imax:
1158 result = qir_MAX(c, src[0], src[1]);
1159 break;
1160 case nir_op_iand:
1161 result = qir_AND(c, src[0], src[1]);
1162 break;
1163 case nir_op_ior:
1164 result = qir_OR(c, src[0], src[1]);
1165 break;
1166 case nir_op_ixor:
1167 result = qir_XOR(c, src[0], src[1]);
1168 break;
1169 case nir_op_inot:
1170 result = qir_NOT(c, src[0]);
1171 break;
1172
1173 case nir_op_imul:
1174 result = ntq_umul(c, src[0], src[1]);
1175 break;
1176
1177 case nir_op_seq:
1178 case nir_op_sne:
1179 case nir_op_sge:
1180 case nir_op_slt:
1181 case nir_op_feq32:
1182 case nir_op_fneu32:
1183 case nir_op_fge32:
1184 case nir_op_flt32:
1185 case nir_op_ieq32:
1186 case nir_op_ine32:
1187 case nir_op_ige32:
1188 case nir_op_uge32:
1189 case nir_op_ilt32:
1190 if (!ntq_emit_comparison(c, &result, instr, instr)) {
1191 fprintf(stderr, "Bad comparison instruction\n");
1192 }
1193 break;
1194
1195 case nir_op_b32csel:
1196 result = ntq_emit_bcsel(c, instr, src);
1197 break;
1198 case nir_op_fcsel:
1199 qir_SF(c, src[0]);
1200 result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, src[1], src[2]));
1201 break;
1202
1203 case nir_op_frcp:
1204 result = ntq_rcp(c, src[0]);
1205 break;
1206 case nir_op_frsq:
1207 result = ntq_rsq(c, src[0]);
1208 break;
1209 case nir_op_fexp2:
1210 result = qir_EXP2(c, src[0]);
1211 break;
1212 case nir_op_flog2:
1213 result = qir_LOG2(c, src[0]);
1214 break;
1215
1216 case nir_op_ftrunc:
1217 result = qir_ITOF(c, qir_FTOI(c, src[0]));
1218 break;
1219 case nir_op_fceil:
1220 result = ntq_fceil(c, src[0]);
1221 break;
1222 case nir_op_ffract:
1223 result = ntq_ffract(c, src[0]);
1224 break;
1225 case nir_op_ffloor:
1226 result = ntq_ffloor(c, src[0]);
1227 break;
1228
1229 case nir_op_fsin:
1230 result = ntq_fsin(c, src[0]);
1231 break;
1232 case nir_op_fcos:
1233 result = ntq_fcos(c, src[0]);
1234 break;
1235
1236 case nir_op_fsign:
1237 result = ntq_fsign(c, src[0]);
1238 break;
1239
1240 case nir_op_fabs:
1241 result = qir_FMAXABS(c, src[0], src[0]);
1242 break;
1243 case nir_op_iabs:
1244 result = qir_MAX(c, src[0],
1245 qir_SUB(c, qir_uniform_ui(c, 0), src[0]));
1246 break;
1247
1248 case nir_op_ibitfield_extract:
1249 result = ntq_emit_ibfe(c, src[0], src[1], src[2]);
1250 break;
1251
1252 case nir_op_ubitfield_extract:
1253 result = ntq_emit_ubfe(c, src[0], src[1], src[2]);
1254 break;
1255
1256 case nir_op_usadd_4x8_vc4:
1257 result = qir_V8ADDS(c, src[0], src[1]);
1258 break;
1259
1260 case nir_op_ussub_4x8_vc4:
1261 result = qir_V8SUBS(c, src[0], src[1]);
1262 break;
1263
1264 case nir_op_umin_4x8_vc4:
1265 result = qir_V8MIN(c, src[0], src[1]);
1266 break;
1267
1268 case nir_op_umax_4x8_vc4:
1269 result = qir_V8MAX(c, src[0], src[1]);
1270 break;
1271
1272 case nir_op_umul_unorm_4x8_vc4:
1273 result = qir_V8MULD(c, src[0], src[1]);
1274 break;
1275
1276 case nir_op_fddx:
1277 case nir_op_fddx_coarse:
1278 case nir_op_fddx_fine:
1279 result = ntq_fddx(c, src[0]);
1280 break;
1281
1282 case nir_op_fddy:
1283 case nir_op_fddy_coarse:
1284 case nir_op_fddy_fine:
1285 result = ntq_fddy(c, src[0]);
1286 break;
1287
1288 case nir_op_uadd_carry:
1289 qir_SF(c, qir_ADD(c, src[0], src[1]));
1290 result = ntq_emit_cond_to_int(c, QPU_COND_CS);
1291 break;
1292
1293 case nir_op_usub_borrow:
1294 qir_SF(c, qir_SUB(c, src[0], src[1]));
1295 result = ntq_emit_cond_to_int(c, QPU_COND_CS);
1296 break;
1297
1298 default:
1299 fprintf(stderr, "unknown NIR ALU inst: ");
1300 nir_print_instr(&instr->instr, stderr);
1301 fprintf(stderr, "\n");
1302 abort();
1303 }
1304
1305 ntq_store_def(c, &instr->def, 0, result);
1306 }
1307
1308 static void
1309 emit_frag_end(struct vc4_compile *c)
1310 {
1311 struct qreg color;
1312 if (c->output_color_index != -1) {
1313 color = c->outputs[c->output_color_index];
1314 } else {
1315 color = qir_uniform_ui(c, 0);
1316 }
1317
1318 uint32_t discard_cond = QPU_COND_ALWAYS;
1319 if (c->s->info.fs.uses_discard) {
1320 qir_SF(c, c->discard);
1321 discard_cond = QPU_COND_ZS;
1322 }
1323
1324 if (c->fs_key->stencil_enabled) {
1325 qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
1326 qir_uniform(c, QUNIFORM_STENCIL, 0));
1327 if (c->fs_key->stencil_twoside) {
1328 qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
1329 qir_uniform(c, QUNIFORM_STENCIL, 1));
1330 }
1331 if (c->fs_key->stencil_full_writemasks) {
1332 qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
1333 qir_uniform(c, QUNIFORM_STENCIL, 2));
1334 }
1335 }
1336
1337 if (c->output_sample_mask_index != -1) {
1338 qir_MS_MASK(c, c->outputs[c->output_sample_mask_index]);
1339 }
1340
1341 if (c->fs_key->depth_enabled) {
1342 if (c->output_position_index != -1) {
1343 qir_FTOI_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),
1344 qir_FMUL(c,
1345 c->outputs[c->output_position_index],
1346 qir_uniform_f(c, 0xffffff)))->cond = discard_cond;
1347 } else {
1348 qir_MOV_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),
1349 qir_FRAG_Z(c))->cond = discard_cond;
1350 }
1351 }
1352
1353 if (!c->msaa_per_sample_output) {
1354 qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE, 0),
1355 color)->cond = discard_cond;
1356 } else {
1357 for (int i = 0; i < VC4_MAX_SAMPLES; i++) {
1358 qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE_MS, 0),
1359 c->sample_colors[i])->cond = discard_cond;
1360 }
1361 }
1362 }
1363
1364 static void
1365 emit_scaled_viewport_write(struct vc4_compile *c, struct qreg rcp_w)
1366 {
1367 struct qreg packed = qir_get_temp(c);
1368
1369 for (int i = 0; i < 2; i++) {
1370 struct qreg scale =
1371 qir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, 0);
1372
1373 struct qreg packed_chan = packed;
1374 packed_chan.pack = QPU_PACK_A_16A + i;
1375
1376 qir_FTOI_dest(c, packed_chan,
1377 qir_FMUL(c,
1378 qir_FMUL(c,
1379 c->outputs[c->output_position_index + i],
1380 scale),
1381 rcp_w));
1382 }
1383
1384 qir_VPM_WRITE(c, packed);
1385 }
1386
1387 static void
1388 emit_zs_write(struct vc4_compile *c, struct qreg rcp_w)
1389 {
1390 struct qreg zscale = qir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0);
1391 struct qreg zoffset = qir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0);
1392
1393 qir_VPM_WRITE(c, qir_FADD(c, qir_FMUL(c, qir_FMUL(c,
1394 c->outputs[c->output_position_index + 2],
1395 zscale),
1396 rcp_w),
1397 zoffset));
1398 }
1399
1400 static void
1401 emit_rcp_wc_write(struct vc4_compile *c, struct qreg rcp_w)
1402 {
1403 qir_VPM_WRITE(c, rcp_w);
1404 }
1405
1406 static void
1407 emit_point_size_write(struct vc4_compile *c)
1408 {
1409 struct qreg point_size;
1410
1411 if (c->output_point_size_index != -1)
1412 point_size = c->outputs[c->output_point_size_index];
1413 else
1414 point_size = qir_uniform_f(c, 1.0);
1415
1416 qir_VPM_WRITE(c, point_size);
1417 }
1418
1419 /**
1420 * Emits a VPM read of the stub vertex attribute set up by vc4_draw.c.
1421 *
1422 * The simulator insists that there be at least one vertex attribute, so
1423 * vc4_draw.c will emit one if it wouldn't have otherwise. The simulator also
1424 * insists that all vertex attributes loaded get read by the VS/CS, so we have
1425 * to consume it here.
1426 */
1427 static void
1428 emit_stub_vpm_read(struct vc4_compile *c)
1429 {
1430 if (c->num_inputs)
1431 return;
1432
1433 c->vattr_sizes[0] = 4;
1434 (void)qir_MOV(c, qir_reg(QFILE_VPM, 0));
1435 c->num_inputs++;
1436 }
1437
1438 static void
1439 emit_vert_end(struct vc4_compile *c,
1440 struct vc4_varying_slot *fs_inputs,
1441 uint32_t num_fs_inputs)
1442 {
1443 struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]);
1444
1445 emit_stub_vpm_read(c);
1446
1447 emit_scaled_viewport_write(c, rcp_w);
1448 emit_zs_write(c, rcp_w);
1449 emit_rcp_wc_write(c, rcp_w);
1450 if (c->vs_key->per_vertex_point_size)
1451 emit_point_size_write(c);
1452
1453 for (int i = 0; i < num_fs_inputs; i++) {
1454 struct vc4_varying_slot *input = &fs_inputs[i];
1455 int j;
1456
1457 for (j = 0; j < c->num_outputs; j++) {
1458 struct vc4_varying_slot *output =
1459 &c->output_slots[j];
1460
1461 if (input->slot == output->slot &&
1462 input->swizzle == output->swizzle) {
1463 qir_VPM_WRITE(c, c->outputs[j]);
1464 break;
1465 }
1466 }
1467 /* Emit padding if we didn't find a declared VS output for
1468 * this FS input.
1469 */
1470 if (j == c->num_outputs)
1471 qir_VPM_WRITE(c, qir_uniform_f(c, 0.0));
1472 }
1473 }
1474
1475 static void
1476 emit_coord_end(struct vc4_compile *c)
1477 {
1478 struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]);
1479
1480 emit_stub_vpm_read(c);
1481
1482 for (int i = 0; i < 4; i++)
1483 qir_VPM_WRITE(c, c->outputs[c->output_position_index + i]);
1484
1485 emit_scaled_viewport_write(c, rcp_w);
1486 emit_zs_write(c, rcp_w);
1487 emit_rcp_wc_write(c, rcp_w);
1488 if (c->vs_key->per_vertex_point_size)
1489 emit_point_size_write(c);
1490 }
1491
1492 static void
1493 vc4_optimize_nir(struct nir_shader *s)
1494 {
1495 bool progress;
1496 unsigned lower_flrp =
1497 (s->options->lower_flrp16 ? 16 : 0) |
1498 (s->options->lower_flrp32 ? 32 : 0) |
1499 (s->options->lower_flrp64 ? 64 : 0);
1500
1501 do {
1502 progress = false;
1503
1504 NIR_PASS_V(s, nir_lower_vars_to_ssa);
1505 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
1506 NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
1507 NIR_PASS(progress, s, nir_copy_prop);
1508 NIR_PASS(progress, s, nir_opt_remove_phis);
1509 NIR_PASS(progress, s, nir_opt_dce);
1510 NIR_PASS(progress, s, nir_opt_dead_cf);
1511 NIR_PASS(progress, s, nir_opt_cse);
1512 NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
1513 NIR_PASS(progress, s, nir_opt_algebraic);
1514 NIR_PASS(progress, s, nir_opt_constant_folding);
1515 if (lower_flrp != 0) {
1516 bool lower_flrp_progress = false;
1517
1518 NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
1519 lower_flrp,
1520 false /* always_precise */);
1521 if (lower_flrp_progress) {
1522 NIR_PASS(progress, s, nir_opt_constant_folding);
1523 progress = true;
1524 }
1525
1526 /* Nothing should rematerialize any flrps, so we only
1527 * need to do this lowering once.
1528 */
1529 lower_flrp = 0;
1530 }
1531
1532 NIR_PASS(progress, s, nir_opt_undef);
1533 NIR_PASS(progress, s, nir_opt_loop_unroll);
1534 } while (progress);
1535 }
1536
1537 static int
1538 driver_location_compare(const void *in_a, const void *in_b)
1539 {
1540 const nir_variable *const *a = in_a;
1541 const nir_variable *const *b = in_b;
1542
1543 return (*a)->data.driver_location - (*b)->data.driver_location;
1544 }
1545
1546 static void
1547 ntq_setup_inputs(struct vc4_compile *c)
1548 {
1549 unsigned num_entries = 0;
1550 nir_foreach_shader_in_variable(var, c->s)
1551 num_entries++;
1552
1553 nir_variable *vars[num_entries];
1554
1555 unsigned i = 0;
1556 nir_foreach_shader_in_variable(var, c->s)
1557 vars[i++] = var;
1558
1559 /* Sort the variables so that we emit the input setup in
1560 * driver_location order. This is required for VPM reads, whose data
1561 * is fetched into the VPM in driver_location (TGSI register index)
1562 * order.
1563 */
1564 qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
1565
1566 for (unsigned i = 0; i < num_entries; i++) {
1567 nir_variable *var = vars[i];
1568 assert(glsl_type_is_vector_or_scalar(var->type));
1569 unsigned loc = var->data.driver_location;
1570
1571 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
1572 (loc + 1) * 4);
1573
1574 if (c->stage == QSTAGE_FRAG) {
1575 if (var->data.location == VARYING_SLOT_POS) {
1576 emit_fragcoord_input(c, loc);
1577 } else if (util_varying_is_point_coord(var->data.location,
1578 c->fs_key->point_sprite_mask)) {
1579 c->inputs[loc * 4 + 0] = c->point_x;
1580 c->inputs[loc * 4 + 1] = c->point_y;
1581 } else {
1582 emit_fragment_input(c, loc, var->data.location);
1583 }
1584 } else {
1585 emit_vertex_input(c, loc);
1586 }
1587 }
1588 }
1589
1590 static void
1591 ntq_setup_outputs(struct vc4_compile *c)
1592 {
1593 nir_foreach_shader_out_variable(var, c->s) {
1594 assert(glsl_type_is_vector_or_scalar(var->type));
1595 unsigned loc = var->data.driver_location * 4;
1596
1597 for (int i = 0; i < 4; i++)
1598 add_output(c, loc + i, var->data.location, i);
1599
1600 if (c->stage == QSTAGE_FRAG) {
1601 switch (var->data.location) {
1602 case FRAG_RESULT_COLOR:
1603 case FRAG_RESULT_DATA0:
1604 c->output_color_index = loc;
1605 break;
1606 case FRAG_RESULT_DEPTH:
1607 c->output_position_index = loc;
1608 break;
1609 case FRAG_RESULT_SAMPLE_MASK:
1610 c->output_sample_mask_index = loc;
1611 break;
1612 }
1613 } else {
1614 switch (var->data.location) {
1615 case VARYING_SLOT_POS:
1616 c->output_position_index = loc;
1617 break;
1618 case VARYING_SLOT_PSIZ:
1619 c->output_point_size_index = loc;
1620 break;
1621 }
1622 }
1623 }
1624 }
1625
1626 /**
1627 * Sets up the mapping from nir_register to struct qreg *.
1628 *
1629 * Each nir_register gets a struct qreg per 32-bit component being stored.
1630 */
1631 static void
1632 ntq_setup_registers(struct vc4_compile *c, nir_function_impl *impl)
1633 {
1634 nir_foreach_reg_decl(decl, impl) {
1635 unsigned num_components = nir_intrinsic_num_components(decl);
1636 unsigned array_len = nir_intrinsic_num_array_elems(decl);
1637 array_len = MAX2(array_len, 1);
1638 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
1639 array_len * num_components);
1640
1641 nir_def *nir_reg = &decl->def;
1642 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
1643
1644 for (int i = 0; i < array_len * num_components; i++)
1645 qregs[i] = qir_get_temp(c);
1646 }
1647 }
1648
1649 static void
1650 ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr)
1651 {
1652 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1653 for (int i = 0; i < instr->def.num_components; i++)
1654 qregs[i] = qir_uniform_ui(c, instr->value[i].u32);
1655
1656 _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
1657 }
1658
1659 static void
1660 ntq_emit_ssa_undef(struct vc4_compile *c, nir_undef_instr *instr)
1661 {
1662 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1663
1664 /* QIR needs there to be *some* value, so pick 0 (same as for
1665          * ntq_setup_registers()).
1666 */
1667 for (int i = 0; i < instr->def.num_components; i++)
1668 qregs[i] = qir_uniform_ui(c, 0);
1669 }
1670
1671 static void
1672 ntq_emit_color_read(struct vc4_compile *c, nir_intrinsic_instr *instr)
1673 {
1674 assert(nir_src_as_uint(instr->src[0]) == 0);
1675
1676 /* Reads of the per-sample color need to be done in
1677 * order.
1678 */
1679 int sample_index = (nir_intrinsic_base(instr) -
1680 VC4_NIR_TLB_COLOR_READ_INPUT);
1681 for (int i = 0; i <= sample_index; i++) {
1682 if (c->color_reads[i].file == QFILE_NULL) {
1683 c->color_reads[i] =
1684 qir_TLB_COLOR_READ(c);
1685 }
1686 }
1687 ntq_store_def(c, &instr->def, 0,
1688 qir_MOV(c, c->color_reads[sample_index]));
1689 }
1690
1691 static void
1692 ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr)
1693 {
1694 assert(instr->num_components == 1);
1695 assert(nir_src_is_const(instr->src[0]) &&
1696 "vc4 doesn't support indirect inputs");
1697
1698 if (c->stage == QSTAGE_FRAG &&
1699 nir_intrinsic_base(instr) >= VC4_NIR_TLB_COLOR_READ_INPUT) {
1700 ntq_emit_color_read(c, instr);
1701 return;
1702 }
1703
1704 uint32_t offset = nir_intrinsic_base(instr) +
1705 nir_src_as_uint(instr->src[0]);
1706 int comp = nir_intrinsic_component(instr);
1707 ntq_store_def(c, &instr->def, 0,
1708 qir_MOV(c, c->inputs[offset * 4 + comp]));
1709 }
1710
1711 static void
1712 ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
1713 {
1714 unsigned offset;
1715
1716 switch (instr->intrinsic) {
1717 case nir_intrinsic_decl_reg:
1718 case nir_intrinsic_load_reg:
1719 case nir_intrinsic_store_reg:
1720 break; /* Ignore these */
1721
1722 case nir_intrinsic_load_uniform:
1723 assert(instr->num_components == 1);
1724 if (nir_src_is_const(instr->src[0])) {
1725 offset = nir_intrinsic_base(instr) +
1726 nir_src_as_uint(instr->src[0]);
1727 assert(offset % 4 == 0);
1728 /* We need dwords */
1729 offset = offset / 4;
1730 ntq_store_def(c, &instr->def, 0,
1731 qir_uniform(c, QUNIFORM_UNIFORM,
1732 offset));
1733 } else {
1734 ntq_store_def(c, &instr->def, 0,
1735 indirect_uniform_load(c, instr));
1736 }
1737 break;
1738
1739 case nir_intrinsic_load_ubo:
1740 assert(instr->num_components == 1);
1741 ntq_store_def(c, &instr->def, 0, vc4_ubo_load(c, instr));
1742 break;
1743
1744 case nir_intrinsic_load_user_clip_plane:
1745 for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
1746 ntq_store_def(c, &instr->def, i,
1747 qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
1748 nir_intrinsic_ucp_id(instr) *
1749 4 + i));
1750 }
1751 break;
1752
1753 case nir_intrinsic_load_blend_const_color_r_float:
1754 case nir_intrinsic_load_blend_const_color_g_float:
1755 case nir_intrinsic_load_blend_const_color_b_float:
1756 case nir_intrinsic_load_blend_const_color_a_float:
1757 ntq_store_def(c, &instr->def, 0,
1758 qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_X +
1759 (instr->intrinsic -
1760 nir_intrinsic_load_blend_const_color_r_float),
1761 0));
1762 break;
1763
1764 case nir_intrinsic_load_blend_const_color_rgba8888_unorm:
1765 ntq_store_def(c, &instr->def, 0,
1766 qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_RGBA,
1767 0));
1768 break;
1769
1770 case nir_intrinsic_load_blend_const_color_aaaa8888_unorm:
1771 ntq_store_def(c, &instr->def, 0,
1772 qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_AAAA,
1773 0));
1774 break;
1775
1776 case nir_intrinsic_load_sample_mask_in:
1777 ntq_store_def(c, &instr->def, 0,
1778 qir_uniform(c, QUNIFORM_SAMPLE_MASK, 0));
1779 break;
1780
1781 case nir_intrinsic_load_front_face:
1782 /* The register contains 0 (front) or 1 (back), and we need to
1783 * turn it into a NIR bool where true means front.
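                 * Adding -1 does that: 0 (front) wraps around to ~0 (true)
                 * and 1 (back) becomes 0 (false).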
1784 */
1785 ntq_store_def(c, &instr->def, 0,
1786 qir_ADD(c,
1787 qir_uniform_ui(c, -1),
1788 qir_reg(QFILE_FRAG_REV_FLAG, 0)));
1789 break;
1790
1791 case nir_intrinsic_load_input:
1792 ntq_emit_load_input(c, instr);
1793 break;
1794
1795 case nir_intrinsic_store_output:
1796 assert(nir_src_is_const(instr->src[1]) &&
1797 "vc4 doesn't support indirect outputs");
1798 offset = nir_intrinsic_base(instr) +
1799 nir_src_as_uint(instr->src[1]);
1800
1801 /* MSAA color outputs are the only case where we have an
1802 * output that's not lowered to being a store of a single 32
1803 * bit value.
1804 */
1805 if (c->stage == QSTAGE_FRAG && instr->num_components == 4) {
1806 assert(offset == c->output_color_index);
1807 for (int i = 0; i < 4; i++) {
1808 c->sample_colors[i] =
1809 qir_MOV(c, ntq_get_src(c, instr->src[0],
1810 i));
1811 }
1812 } else {
1813 offset = offset * 4 + nir_intrinsic_component(instr);
1814 assert(instr->num_components == 1);
1815 c->outputs[offset] =
1816 qir_MOV(c, ntq_get_src(c, instr->src[0], 0));
1817 c->num_outputs = MAX2(c->num_outputs, offset + 1);
1818 }
1819 break;
1820
1821 case nir_intrinsic_discard:
1822 if (c->execute.file != QFILE_NULL) {
1823 qir_SF(c, c->execute);
1824 qir_MOV_cond(c, QPU_COND_ZS, c->discard,
1825 qir_uniform_ui(c, ~0));
1826 } else {
1827 qir_MOV_dest(c, c->discard, qir_uniform_ui(c, ~0));
1828 }
1829 break;
1830
1831 case nir_intrinsic_discard_if: {
1832 /* true (~0) if we're discarding */
1833 struct qreg cond = ntq_get_src(c, instr->src[0], 0);
1834
1835 if (c->execute.file != QFILE_NULL) {
1836 /* execute == 0 means the channel is active. Invert
1837 * the condition so that we can use zero as "executing
1838 * and discarding."
1839 */
1840 qir_SF(c, qir_AND(c, c->execute, qir_NOT(c, cond)));
1841 qir_MOV_cond(c, QPU_COND_ZS, c->discard, cond);
1842 } else {
1843 qir_OR_dest(c, c->discard, c->discard,
1844 ntq_get_src(c, instr->src[0], 0));
1845 }
1846
1847 break;
1848 }
1849
1850 case nir_intrinsic_load_texture_scale: {
1851 assert(nir_src_is_const(instr->src[0]));
1852 int sampler = nir_src_as_int(instr->src[0]);
1853
1854 ntq_store_def(c, &instr->def, 0,
1855 qir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, sampler));
1856 ntq_store_def(c, &instr->def, 1,
1857 qir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, sampler));
1858 break;
1859 }
1860
1861 default:
1862 fprintf(stderr, "Unknown intrinsic: ");
1863 nir_print_instr(&instr->instr, stderr);
1864 fprintf(stderr, "\n");
1865 break;
1866 }
1867 }
1868
1869 /* Clears (activates) the execute flags for any channels whose jump target
1870 * matches this block.
1871 */
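/* Reminder of the convention used throughout this pass: execute == 0 means
 * the channel is currently running, and any nonzero value is the index of
 * the block that channel is waiting to resume at.  Subtracting the current
 * block's index and testing for zero therefore wakes exactly the channels
 * parked on this block.
 */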
1872 static void
1873 ntq_activate_execute_for_block(struct vc4_compile *c)
1874 {
1875 qir_SF(c, qir_SUB(c,
1876 c->execute,
1877 qir_uniform_ui(c, c->cur_block->index)));
1878 qir_MOV_cond(c, QPU_COND_ZS, c->execute, qir_uniform_ui(c, 0));
1879 }
1880
1881 static void
1882 ntq_emit_if(struct vc4_compile *c, nir_if *if_stmt)
1883 {
1884 if (!c->vc4->screen->has_control_flow) {
1885 fprintf(stderr,
1886 "IF statement support requires updated kernel.\n");
1887 return;
1888 }
1889
1890 nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
1891 bool empty_else_block =
1892 (nir_else_block == nir_if_last_else_block(if_stmt) &&
1893 exec_list_is_empty(&nir_else_block->instr_list));
1894
1895 struct qblock *then_block = qir_new_block(c);
1896 struct qblock *after_block = qir_new_block(c);
1897 struct qblock *else_block;
1898 if (empty_else_block)
1899 else_block = after_block;
1900 else
1901 else_block = qir_new_block(c);
1902
1903 bool was_top_level = false;
1904 if (c->execute.file == QFILE_NULL) {
1905 c->execute = qir_MOV(c, qir_uniform_ui(c, 0));
1906 was_top_level = true;
1907 }
1908
1909 /* Set ZS for executing (execute == 0) and jumping (if->condition ==
1910 * 0) channels, and then update execute flags for those to point to
1911 * the ELSE block.
1912 */
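/* (OR(execute, condition) is zero only when both are zero, i.e. the channel
 * is active and its condition is false; those are exactly the channels that
 * get parked on the ELSE block here.)
 */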
1913 qir_SF(c, qir_OR(c,
1914 c->execute,
1915 ntq_get_src(c, if_stmt->condition, 0)));
1916 qir_MOV_cond(c, QPU_COND_ZS, c->execute,
1917 qir_uniform_ui(c, else_block->index));
1918
1919 /* Jump to ELSE if nothing is active for THEN, otherwise fall
1920 * through.
1921 */
1922 qir_SF(c, c->execute);
1923 qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZC);
1924 qir_link_blocks(c->cur_block, else_block);
1925 qir_link_blocks(c->cur_block, then_block);
1926
1927 /* Process the THEN block. */
1928 qir_set_emit_block(c, then_block);
1929 ntq_emit_cf_list(c, &if_stmt->then_list);
1930
1931 if (!empty_else_block) {
1932 /* Handle the end of the THEN block. First, all currently
1933 * active channels update their execute flags to point to
1934 * ENDIF
1935 */
1936 qir_SF(c, c->execute);
1937 qir_MOV_cond(c, QPU_COND_ZS, c->execute,
1938 qir_uniform_ui(c, after_block->index));
1939
1940 /* If everything points at ENDIF, then jump there immediately. */
1941 qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, after_block->index)));
1942 qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS);
1943 qir_link_blocks(c->cur_block, after_block);
1944 qir_link_blocks(c->cur_block, else_block);
1945
1946 qir_set_emit_block(c, else_block);
1947 ntq_activate_execute_for_block(c);
1948 ntq_emit_cf_list(c, &if_stmt->else_list);
1949 }
1950
1951 qir_link_blocks(c->cur_block, after_block);
1952
1953 qir_set_emit_block(c, after_block);
1954 if (was_top_level) {
1955 c->execute = c->undef;
1956 c->last_top_block = c->cur_block;
1957 } else {
1958 ntq_activate_execute_for_block(c);
1959 }
1960 }
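/* A rough sketch of what ntq_emit_if() produces (illustrative, not literal
 * QIR):
 *
 *       SF  execute | condition
 *       execute = else_block->index     (ZS channels: active + false cond)
 *       SF  execute
 *       branch-all-ZC -----------------> else_block (nobody runs THEN)
 *   then_block:
 *       ...THEN body...
 *       [non-empty else: park active channels on after_block, and
 *        branch-all-ZS to after_block once everyone is parked there]
 *   else_block:
 *       activate channels parked on this block
 *       ...ELSE body...
 *   after_block:
 *       reactivate parked channels (unless we were at top level)
 */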
1961
1962 static void
1963 ntq_emit_jump(struct vc4_compile *c, nir_jump_instr *jump)
1964 {
1965 struct qblock *jump_block;
1966 switch (jump->type) {
1967 case nir_jump_break:
1968 jump_block = c->loop_break_block;
1969 break;
1970 case nir_jump_continue:
1971 jump_block = c->loop_cont_block;
1972 break;
1973 default:
1974 unreachable("Unsupported jump type\n");
1975 }
1976
1977 qir_SF(c, c->execute);
1978 qir_MOV_cond(c, QPU_COND_ZS, c->execute,
1979 qir_uniform_ui(c, jump_block->index));
1980
1981 /* Jump to the destination block if everyone has taken the jump. */
1982 qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, jump_block->index)));
1983 qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS);
1984 struct qblock *new_block = qir_new_block(c);
1985 qir_link_blocks(c->cur_block, jump_block);
1986 qir_link_blocks(c->cur_block, new_block);
1987 qir_set_emit_block(c, new_block);
1988 }
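/* In other words, a break/continue just parks each channel on the loop's
 * break or continue block through its execute value; the branch above is
 * only taken once every channel has jumped, and a fresh block is started
 * for whatever falls through.
 */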
1989
1990 static void
1991 ntq_emit_instr(struct vc4_compile *c, nir_instr *instr)
1992 {
1993 switch (instr->type) {
1994 case nir_instr_type_alu:
1995 ntq_emit_alu(c, nir_instr_as_alu(instr));
1996 break;
1997
1998 case nir_instr_type_intrinsic:
1999 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
2000 break;
2001
2002 case nir_instr_type_load_const:
2003 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
2004 break;
2005
2006 case nir_instr_type_undef:
2007 ntq_emit_ssa_undef(c, nir_instr_as_undef(instr));
2008 break;
2009
2010 case nir_instr_type_tex:
2011 ntq_emit_tex(c, nir_instr_as_tex(instr));
2012 break;
2013
2014 case nir_instr_type_jump:
2015 ntq_emit_jump(c, nir_instr_as_jump(instr));
2016 break;
2017
2018 default:
2019 fprintf(stderr, "Unknown NIR instr type: ");
2020 nir_print_instr(instr, stderr);
2021 fprintf(stderr, "\n");
2022 abort();
2023 }
2024 }
2025
2026 static void
2027 ntq_emit_block(struct vc4_compile *c, nir_block *block)
2028 {
2029 nir_foreach_instr(instr, block) {
2030 ntq_emit_instr(c, instr);
2031 }
2032 }
2033
2034 static void ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list);
2035
2036 static void
2037 ntq_emit_loop(struct vc4_compile *c, nir_loop *loop)
2038 {
2039 assert(!nir_loop_has_continue_construct(loop));
2040 if (!c->vc4->screen->has_control_flow) {
2041 fprintf(stderr,
2042 "loop support requires updated kernel.\n");
2043 ntq_emit_cf_list(c, &loop->body);
2044 return;
2045 }
2046
2047 bool was_top_level = false;
2048 if (c->execute.file == QFILE_NULL) {
2049 c->execute = qir_MOV(c, qir_uniform_ui(c, 0));
2050 was_top_level = true;
2051 }
2052
2053 struct qblock *save_loop_cont_block = c->loop_cont_block;
2054 struct qblock *save_loop_break_block = c->loop_break_block;
2055
2056 c->loop_cont_block = qir_new_block(c);
2057 c->loop_break_block = qir_new_block(c);
2058
2059 qir_link_blocks(c->cur_block, c->loop_cont_block);
2060 qir_set_emit_block(c, c->loop_cont_block);
2061 ntq_activate_execute_for_block(c);
2062
2063 ntq_emit_cf_list(c, &loop->body);
2064
2065 /* If anything had explicitly continued, or is here at the end of the
2066 * loop, then we need to loop again. SF updates are masked by the
2067 * instruction's condition, so we can do the OR of the two conditions
2068 * within SF.
2069 */
2070 qir_SF(c, c->execute);
2071 struct qinst *cont_check =
2072 qir_SUB_dest(c,
2073 c->undef,
2074 c->execute,
2075 qir_uniform_ui(c, c->loop_cont_block->index));
2076 cont_check->cond = QPU_COND_ZC;
2077 cont_check->sf = true;
2078
2079 qir_BRANCH(c, QPU_COND_BRANCH_ANY_ZS);
2080 qir_link_blocks(c->cur_block, c->loop_cont_block);
2081 qir_link_blocks(c->cur_block, c->loop_break_block);
2082
2083 qir_set_emit_block(c, c->loop_break_block);
2084 if (was_top_level) {
2085 c->execute = c->undef;
2086 c->last_top_block = c->cur_block;
2087 } else {
2088 ntq_activate_execute_for_block(c);
2089 }
2090
2091 c->loop_break_block = save_loop_break_block;
2092 c->loop_cont_block = save_loop_cont_block;
2093 }
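/* A rough sketch of what the code above produces (illustrative only):
 *
 *   loop_cont_block:
 *       activate channels parked on this block
 *       ...loop body...
 *       SF  execute                      (Z set for still-active channels)
 *       SF  execute - cont_block->index  (ZC-masked, so it only adds the
 *                                         channels that continued)
 *       branch-any-ZS -----------------> loop_cont_block
 *   loop_break_block:
 *       reactivate channels parked here (unless we were at top level)
 */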
2094
2095 static void
2096 ntq_emit_function(struct vc4_compile *c, nir_function_impl *func)
2097 {
2098 fprintf(stderr, "FUNCTIONS not handled.\n");
2099 abort();
2100 }
2101
2102 static void
2103 ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list)
2104 {
2105 foreach_list_typed(nir_cf_node, node, node, list) {
2106 switch (node->type) {
2107 case nir_cf_node_block:
2108 ntq_emit_block(c, nir_cf_node_as_block(node));
2109 break;
2110
2111 case nir_cf_node_if:
2112 ntq_emit_if(c, nir_cf_node_as_if(node));
2113 break;
2114
2115 case nir_cf_node_loop:
2116 ntq_emit_loop(c, nir_cf_node_as_loop(node));
2117 break;
2118
2119 case nir_cf_node_function:
2120 ntq_emit_function(c, nir_cf_node_as_function(node));
2121 break;
2122
2123 default:
2124 fprintf(stderr, "Unknown NIR node type\n");
2125 abort();
2126 }
2127 }
2128 }
2129
2130 static void
2131 ntq_emit_impl(struct vc4_compile *c, nir_function_impl *impl)
2132 {
2133 ntq_setup_registers(c, impl);
2134 ntq_emit_cf_list(c, &impl->body);
2135 }
2136
2137 static void
2138 nir_to_qir(struct vc4_compile *c)
2139 {
2140 if (c->stage == QSTAGE_FRAG && c->s->info.fs.uses_discard)
2141 c->discard = qir_MOV(c, qir_uniform_ui(c, 0));
2142
2143 ntq_setup_inputs(c);
2144 ntq_setup_outputs(c);
2145
2146 /* Find the main function and emit the body. */
2147 nir_foreach_function(function, c->s) {
2148 assert(strcmp(function->name, "main") == 0);
2149 assert(function->impl);
2150 ntq_emit_impl(c, function->impl);
2151 }
2152 }
2153
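/* Compiler options reported to the state tracker.  Broadly speaking, most
 * of these request NIR lowerings for operations the QPU has no direct
 * instruction for, so nir_to_qir() only ever sees ops it can emit; this is
 * a general characterization rather than a per-flag guarantee.
 */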
2154 static const nir_shader_compiler_options nir_options = {
2155 .lower_all_io_to_temps = true,
2156 .lower_extract_byte = true,
2157 .lower_extract_word = true,
2158 .lower_insert_byte = true,
2159 .lower_insert_word = true,
2160 .lower_fdiv = true,
2161 .lower_ffma16 = true,
2162 .lower_ffma32 = true,
2163 .lower_ffma64 = true,
2164 .lower_flrp32 = true,
2165 .lower_fmod = true,
2166 .lower_fpow = true,
2167 .lower_fsat = true,
2168 .lower_fsqrt = true,
2169 .lower_ldexp = true,
2170 .lower_fneg = true,
2171 .lower_ineg = true,
2172 .lower_to_scalar = true,
2173 .lower_umax = true,
2174 .lower_umin = true,
2175 .lower_isign = true,
2176 .has_fsub = true,
2177 .has_isub = true,
2178 .has_texture_scaling = true,
2179 .lower_mul_high = true,
2180 .max_unroll_iterations = 32,
2181 .force_indirect_unrolling = (nir_var_shader_in | nir_var_shader_out | nir_var_function_temp),
2182 };
2183
2184 const void *
2185 vc4_screen_get_compiler_options(struct pipe_screen *pscreen,
2186 enum pipe_shader_ir ir,
2187 enum pipe_shader_type shader)
2188 {
2189 return &nir_options;
2190 }
2191
2192 static int
2193 count_nir_instrs(nir_shader *nir)
2194 {
2195 int count = 0;
2196 nir_foreach_function_impl(impl, nir) {
2197 nir_foreach_block(block, impl) {
2198 nir_foreach_instr(instr, block)
2199 count++;
2200 }
2201 }
2202 return count;
2203 }
2204
2205 static struct vc4_compile *
2206 vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
2207 struct vc4_key *key, bool fs_threaded)
2208 {
2209 struct vc4_compile *c = qir_compile_init();
2210
2211 c->vc4 = vc4;
2212 c->stage = stage;
2213 c->shader_state = &key->shader_state->base;
2214 c->program_id = key->shader_state->program_id;
2215 c->variant_id =
2216 p_atomic_inc_return(&key->shader_state->compiled_variant_count);
2217 c->fs_threaded = fs_threaded;
2218
2219 c->key = key;
2220 switch (stage) {
2221 case QSTAGE_FRAG:
2222 c->fs_key = (struct vc4_fs_key *)key;
2223 if (c->fs_key->is_points) {
2224 c->point_x = emit_fragment_varying(c, ~0, 0);
2225 c->point_y = emit_fragment_varying(c, ~0, 0);
2226 } else if (c->fs_key->is_lines) {
2227 c->line_x = emit_fragment_varying(c, ~0, 0);
2228 }
2229 break;
2230 case QSTAGE_VERT:
2231 c->vs_key = (struct vc4_vs_key *)key;
2232 break;
2233 case QSTAGE_COORD:
2234 c->vs_key = (struct vc4_vs_key *)key;
2235 break;
2236 }
2237
2238 c->s = nir_shader_clone(c, key->shader_state->base.ir.nir);
2239
2240 if (stage == QSTAGE_FRAG) {
2241 NIR_PASS_V(c->s, vc4_nir_lower_blend, c);
2242 }
2243
2244 struct nir_lower_tex_options tex_options = {
2245 .lower_txp = ~0,
2246
2247 /* Apply swizzles to all samplers. */
2248 .swizzle_result = ~0,
2249 .lower_invalid_implicit_lod = true,
2250 };
2251
2252 /* Lower the format swizzle and ARB_texture_swizzle-style swizzle.
2253 * The format swizzling applies before sRGB decode, and
2254 * ARB_texture_swizzle is the last thing before returning the sample.
2255 */
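/* For example: if the format swizzle maps component X to Z, and the
 * ARB-style swizzle requests X for some channel, the composed entry below
 * becomes Z.  ARB swizzle values above 3 (the constant 0/1 swizzles) are
 * passed through unchanged, which is what the <= 3 check is for.
 */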
2256 for (int i = 0; i < ARRAY_SIZE(key->tex); i++) {
2257 enum pipe_format format = c->key->tex[i].format;
2258
2259 if (!format)
2260 continue;
2261
2262 const uint8_t *format_swizzle = vc4_get_format_swizzle(format);
2263
2264 for (int j = 0; j < 4; j++) {
2265 uint8_t arb_swiz = c->key->tex[i].swizzle[j];
2266
2267 if (arb_swiz <= 3) {
2268 tex_options.swizzles[i][j] =
2269 format_swizzle[arb_swiz];
2270 } else {
2271 tex_options.swizzles[i][j] = arb_swiz;
2272 }
2273 }
2274
2275 if (util_format_is_srgb(format))
2276 tex_options.lower_srgb |= (1 << i);
2277 }
2278
2279 NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
2280
2281 if (c->key->ucp_enables) {
2282 if (stage == QSTAGE_FRAG) {
2283 NIR_PASS_V(c->s, nir_lower_clip_fs,
2284 c->key->ucp_enables, false);
2285 } else {
2286 NIR_PASS_V(c->s, nir_lower_clip_vs,
2287 c->key->ucp_enables, false, false, NULL);
2288 NIR_PASS_V(c->s, nir_lower_io_to_scalar,
2289 nir_var_shader_out, NULL, NULL);
2290 }
2291 }
2292
2293 /* FS input scalarizing must happen after nir_lower_two_sided_color,
2294 * which only handles a vec4 at a time. Similarly, VS output
2295 * scalarizing must happen after nir_lower_clip_vs.
2296 */
2297 if (c->stage == QSTAGE_FRAG)
2298 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);
2299 else
2300 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
2301
2302 NIR_PASS_V(c->s, vc4_nir_lower_io, c);
2303 NIR_PASS_V(c->s, vc4_nir_lower_txf_ms, c);
2304 nir_lower_idiv_options idiv_options = {
2305 .allow_fp16 = true,
2306 };
2307 NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options);
2308 NIR_PASS(_, c->s, nir_lower_alu);
2309
2310 vc4_optimize_nir(c->s);
2311
2312 /* Do late algebraic optimization to turn add(a, neg(b)) back into
2313 * subs, then the mandatory cleanup after algebraic. Note that it may
2314 * produce fnegs, and if so then we need to keep running to squash
2315 * fneg(fneg(a)).
2316 */
2317 bool more_late_algebraic = true;
2318 while (more_late_algebraic) {
2319 more_late_algebraic = false;
2320 NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
2321 NIR_PASS_V(c->s, nir_opt_constant_folding);
2322 NIR_PASS_V(c->s, nir_copy_prop);
2323 NIR_PASS_V(c->s, nir_opt_dce);
2324 NIR_PASS_V(c->s, nir_opt_cse);
2325 }
2326
2327 NIR_PASS_V(c->s, nir_lower_bool_to_int32);
2328
2329 NIR_PASS_V(c->s, nir_convert_from_ssa, true);
2330 NIR_PASS_V(c->s, nir_trivialize_registers);
2331
2332 if (VC4_DBG(NIR)) {
2333 fprintf(stderr, "%s prog %d/%d NIR:\n",
2334 qir_get_stage_name(c->stage),
2335 c->program_id, c->variant_id);
2336 nir_print_shader(c->s, stderr);
2337 }
2338
2339 nir_to_qir(c);
2340
2341 switch (stage) {
2342 case QSTAGE_FRAG:
2343 /* FS threading requires that the thread execute
2344 * QPU_SIG_LAST_THREAD_SWITCH exactly once before terminating
2345 * (with no other THRSW afterwards, obviously). If we didn't
2346 * fetch a texture at a top level block, this wouldn't be
2347 * true.
2348 */
2349 if (c->fs_threaded && !c->last_thrsw_at_top_level) {
2350 c->failed = true;
2351 return c;
2352 }
2353
2354 emit_frag_end(c);
2355 break;
2356 case QSTAGE_VERT:
2357 emit_vert_end(c,
2358 c->vs_key->fs_inputs->input_slots,
2359 c->vs_key->fs_inputs->num_inputs);
2360 break;
2361 case QSTAGE_COORD:
2362 emit_coord_end(c);
2363 break;
2364 }
2365
2366 if (VC4_DBG(QIR)) {
2367 fprintf(stderr, "%s prog %d/%d pre-opt QIR:\n",
2368 qir_get_stage_name(c->stage),
2369 c->program_id, c->variant_id);
2370 qir_dump(c);
2371 fprintf(stderr, "\n");
2372 }
2373
2374 qir_optimize(c);
2375 qir_lower_uniforms(c);
2376
2377 qir_schedule_instructions(c);
2378 qir_emit_uniform_stream_resets(c);
2379
2380 if (VC4_DBG(QIR)) {
2381 fprintf(stderr, "%s prog %d/%d QIR:\n",
2382 qir_get_stage_name(c->stage),
2383 c->program_id, c->variant_id);
2384 qir_dump(c);
2385 fprintf(stderr, "\n");
2386 }
2387
2388 qir_reorder_uniforms(c);
2389 vc4_generate_code(vc4, c);
2390
2391 ralloc_free(c->s);
2392
2393 return c;
2394 }
2395
2396 static void
2397 vc4_setup_shared_precompile_key(struct vc4_uncompiled_shader *uncompiled,
2398 struct vc4_key *key)
2399 {
2400 nir_shader *s = uncompiled->base.ir.nir;
2401
2402 for (int i = 0; i < s->info.num_textures; i++) {
2403 key->tex[i].format = PIPE_FORMAT_R8G8B8A8_UNORM;
2404 key->tex[i].swizzle[0] = PIPE_SWIZZLE_X;
2405 key->tex[i].swizzle[1] = PIPE_SWIZZLE_Y;
2406 key->tex[i].swizzle[2] = PIPE_SWIZZLE_Z;
2407 key->tex[i].swizzle[3] = PIPE_SWIZZLE_W;
2408 }
2409 }
2410
2411 static inline struct vc4_varying_slot
2412 vc4_slot_from_slot_and_component(uint8_t slot, uint8_t component)
2413 {
2414 assume(slot < 255 / 4);
2415 return (struct vc4_varying_slot){ (slot << 2) + component };
2416 }
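/* e.g. slot 3, component 2 packs to (3 << 2) + 2 == 14; the
 * assume(slot < 255 / 4) above keeps the packed value within a byte.
 */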
2417
2418 static void
2419 precompile_all_fs_inputs(nir_shader *s,
2420 struct vc4_fs_inputs *fs_inputs)
2421 {
2422 /* Assume all VS outputs will actually be used by the FS and output
2423 * them (the two sides have to match exactly). */
2424 nir_foreach_shader_out_variable(var, s) {
2425 const int array_len =
2426 glsl_type_is_vector_or_scalar(var->type) ?
2427 1 : glsl_get_length(var->type);
2428 for (int j = 0; j < array_len; j++) {
2429 const int slot = var->data.location + j;
2430 const int num_components =
2431 glsl_get_components(var->type);
2432 for (int i = 0; i < num_components; i++) {
2433 const int swiz = var->data.location_frac + i;
2434 fs_inputs->input_slots[fs_inputs->num_inputs++] =
2435 vc4_slot_from_slot_and_component(slot,
2436 swiz);
2437 }
2438 }
2439 }
2440 }
2441
2442 /**
2443 * Precompiles a shader variant at shader state creation time if
2444 * VC4_DEBUG=shaderdb is set.
2445 */
2446 static void
2447 vc4_shader_precompile(struct vc4_context *vc4,
2448 struct vc4_uncompiled_shader *so)
2449 {
2450 nir_shader *s = so->base.ir.nir;
2451
2452 if (s->info.stage == MESA_SHADER_FRAGMENT) {
2453 struct vc4_fs_key key = {
2454 .base.shader_state = so,
2455 .depth_enabled = true,
2456 .logicop_func = PIPE_LOGICOP_COPY,
2457 .color_format = PIPE_FORMAT_R8G8B8A8_UNORM,
2458 .blend = {
2459 .blend_enable = false,
2460 .colormask = PIPE_MASK_RGBA,
2461 },
2462 };
2463
2464 vc4_setup_shared_precompile_key(so, &key.base);
2465 vc4_get_compiled_shader(vc4, QSTAGE_FRAG, &key.base);
2466 } else {
2467 assert(s->info.stage == MESA_SHADER_VERTEX);
2468 struct vc4_varying_slot input_slots[64] = {};
2469 struct vc4_fs_inputs fs_inputs = {
2470 .input_slots = input_slots,
2471 .num_inputs = 0,
2472 };
2473 struct vc4_vs_key key = {
2474 .base.shader_state = so,
2475 .fs_inputs = &fs_inputs,
2476 };
2477
2478 vc4_setup_shared_precompile_key(so, &key.base);
2479 precompile_all_fs_inputs(s, &fs_inputs);
2480 vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key.base);
2481
2482 /* Compile VS bin shader: only position (XXX: include TF) */
2483 key.is_coord = true;
2484 fs_inputs.num_inputs = 0;
2485 precompile_all_fs_inputs(s, &fs_inputs);
2486 for (int i = 0; i < 4; i++) {
2487 fs_inputs.input_slots[fs_inputs.num_inputs++] =
2488 vc4_slot_from_slot_and_component(VARYING_SLOT_POS,
2489 i);
2490 }
2491 vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key.base);
2492 }
2493 }
2494
2495 static void *
2496 vc4_shader_state_create(struct pipe_context *pctx,
2497 const struct pipe_shader_state *cso)
2498 {
2499 struct vc4_context *vc4 = vc4_context(pctx);
2500 struct vc4_uncompiled_shader *so = CALLOC_STRUCT(vc4_uncompiled_shader);
2501 if (!so)
2502 return NULL;
2503
2504 so->program_id = vc4->next_uncompiled_program_id++;
2505
2506 nir_shader *s;
2507
2508 if (cso->type == PIPE_SHADER_IR_NIR) {
2509 /* The backend takes ownership of the NIR shader on state
2510 * creation.
2511 */
2512 s = cso->ir.nir;
2513 } else {
2514 assert(cso->type == PIPE_SHADER_IR_TGSI);
2515
2516 if (VC4_DBG(TGSI)) {
2517 fprintf(stderr, "prog %d TGSI:\n",
2518 so->program_id);
2519 tgsi_dump(cso->tokens, 0);
2520 fprintf(stderr, "\n");
2521 }
2522 s = tgsi_to_nir(cso->tokens, pctx->screen, false);
2523 }
2524
2525 if (s->info.stage == MESA_SHADER_VERTEX)
2526 NIR_PASS_V(s, nir_lower_point_size, 1.0f, 0.0f);
2527
2528 NIR_PASS_V(s, nir_lower_io,
2529 nir_var_shader_in | nir_var_shader_out | nir_var_uniform,
2530 type_size, (nir_lower_io_options)0);
2531
2532 NIR_PASS_V(s, nir_normalize_cubemap_coords);
2533
2534 NIR_PASS_V(s, nir_lower_load_const_to_scalar);
2535
2536 vc4_optimize_nir(s);
2537
2538 NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
2539
2540 /* Garbage collect dead instructions */
2541 nir_sweep(s);
2542
2543 so->base.type = PIPE_SHADER_IR_NIR;
2544 so->base.ir.nir = s;
2545
2546 if (VC4_DBG(NIR)) {
2547 fprintf(stderr, "%s prog %d NIR:\n",
2548 gl_shader_stage_name(s->info.stage),
2549 so->program_id);
2550 nir_print_shader(s, stderr);
2551 fprintf(stderr, "\n");
2552 }
2553
2554 if (VC4_DBG(SHADERDB)) {
2555 vc4_shader_precompile(vc4, so);
2556 }
2557
2558 return so;
2559 }
2560
2561 static void
2562 copy_uniform_state_to_shader(struct vc4_compiled_shader *shader,
2563 struct vc4_compile *c)
2564 {
2565 int count = c->num_uniforms;
2566 struct vc4_shader_uniform_info *uinfo = &shader->uniforms;
2567
2568 uinfo->count = count;
2569 uinfo->data = ralloc_array(shader, uint32_t, count);
2570 memcpy(uinfo->data, c->uniform_data,
2571 count * sizeof(*uinfo->data));
2572 uinfo->contents = ralloc_array(shader, enum quniform_contents, count);
2573 memcpy(uinfo->contents, c->uniform_contents,
2574 count * sizeof(*uinfo->contents));
2575 uinfo->num_texture_samples = c->num_texture_samples;
2576
2577 vc4_set_shader_uniform_dirty_flags(shader);
2578 }
2579
2580 static void
2581 vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c,
2582 struct vc4_compiled_shader *shader)
2583 {
2584 struct vc4_fs_inputs inputs;
2585
2586 memset(&inputs, 0, sizeof(inputs));
2587 inputs.input_slots = ralloc_array(shader,
2588 struct vc4_varying_slot,
2589 c->num_input_slots);
2590
2591 bool input_live[c->num_input_slots];
2592
2593 memset(input_live, 0, sizeof(input_live));
2594 qir_for_each_inst_inorder(inst, c) {
2595 for (int i = 0; i < qir_get_nsrc(inst); i++) {
2596 if (inst->src[i].file == QFILE_VARY)
2597 input_live[inst->src[i].index] = true;
2598 }
2599 }
2600
2601 for (int i = 0; i < c->num_input_slots; i++) {
2602 struct vc4_varying_slot *slot = &c->input_slots[i];
2603
2604 if (!input_live[i])
2605 continue;
2606
2607 /* Skip non-VS-output inputs. */
2608 if (slot->slot == (uint8_t)~0)
2609 continue;
2610
2611 if (slot->slot == VARYING_SLOT_COL0 ||
2612 slot->slot == VARYING_SLOT_COL1 ||
2613 slot->slot == VARYING_SLOT_BFC0 ||
2614 slot->slot == VARYING_SLOT_BFC1) {
2615 shader->color_inputs |= (1 << inputs.num_inputs);
2616 }
2617
2618 inputs.input_slots[inputs.num_inputs] = *slot;
2619 inputs.num_inputs++;
2620 }
2621 shader->num_inputs = inputs.num_inputs;
2622
2623 /* Add our set of inputs to the set of all inputs seen. This way, we
2624 * can have a single pointer that identifies an FS inputs set,
2625 * allowing VS to avoid recompiling when the FS is recompiled (or a
2626 * new one is bound using separate shader objects) but the inputs
2627 * don't change.
2628 */
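/* (This also means an FS inputs set can be compared by pointer, which
 * vc4_update_compiled_fs() below relies on when deciding whether to set
 * VC4_DIRTY_FS_INPUTS.)
 */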
2629 struct set_entry *entry = _mesa_set_search(vc4->fs_inputs_set, &inputs);
2630 if (entry) {
2631 shader->fs_inputs = entry->key;
2632 ralloc_free(inputs.input_slots);
2633 } else {
2634 struct vc4_fs_inputs *alloc_inputs;
2635
2636 alloc_inputs = rzalloc(vc4->fs_inputs_set, struct vc4_fs_inputs);
2637 memcpy(alloc_inputs, &inputs, sizeof(inputs));
2638 ralloc_steal(alloc_inputs, inputs.input_slots);
2639 _mesa_set_add(vc4->fs_inputs_set, alloc_inputs);
2640
2641 shader->fs_inputs = alloc_inputs;
2642 }
2643 }
2644
2645 static struct vc4_compiled_shader *
2646 vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
2647 struct vc4_key *key)
2648 {
2649 struct hash_table *ht;
2650 uint32_t key_size;
2651 bool try_threading;
2652
2653 if (stage == QSTAGE_FRAG) {
2654 ht = vc4->fs_cache;
2655 key_size = sizeof(struct vc4_fs_key);
2656 try_threading = vc4->screen->has_threaded_fs;
2657 } else {
2658 ht = vc4->vs_cache;
2659 key_size = sizeof(struct vc4_vs_key);
2660 try_threading = false;
2661 }
2662
2663 struct vc4_compiled_shader *shader;
2664 struct hash_entry *entry = _mesa_hash_table_search(ht, key);
2665 if (entry)
2666 return entry->data;
2667
2668 struct vc4_compile *c = vc4_shader_ntq(vc4, stage, key, try_threading);
2669 /* If the FS failed to compile threaded, fall back to single threaded. */
2670 if (try_threading && c->failed) {
2671 qir_compile_destroy(c);
2672 c = vc4_shader_ntq(vc4, stage, key, false);
2673 }
2674
2675 shader = rzalloc(NULL, struct vc4_compiled_shader);
2676
2677 shader->program_id = vc4->next_compiled_program_id++;
2678 if (stage == QSTAGE_FRAG) {
2679 vc4_setup_compiled_fs_inputs(vc4, c, shader);
2680
2681 /* Note: the temporary clone in c->s has been freed. */
2682 nir_shader *orig_shader = key->shader_state->base.ir.nir;
2683 if (orig_shader->info.outputs_written & (1 << FRAG_RESULT_DEPTH))
2684 shader->disable_early_z = true;
2685 } else {
2686 shader->num_inputs = c->num_inputs;
2687
2688 shader->vattr_offsets[0] = 0;
2689 for (int i = 0; i < 8; i++) {
2690 shader->vattr_offsets[i + 1] =
2691 shader->vattr_offsets[i] + c->vattr_sizes[i];
2692
2693 if (c->vattr_sizes[i])
2694 shader->vattrs_live |= (1 << i);
2695 }
2696 }
2697
2698 shader->failed = c->failed;
2699 if (c->failed) {
2700 shader->failed = true;
2701 } else {
2702 copy_uniform_state_to_shader(shader, c);
2703 shader->bo = vc4_bo_alloc_shader(vc4->screen, c->qpu_insts,
2704 c->qpu_inst_count *
2705 sizeof(uint64_t));
2706 }
2707
2708 shader->fs_threaded = c->fs_threaded;
2709
2710 qir_compile_destroy(c);
2711
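/* Duplicate the key into the shader's allocation: the caller's key is
 * typically a stack local, and the hash table entry needs a copy that
 * stays alive with the compiled shader.
 */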
2712 struct vc4_key *dup_key;
2713 dup_key = rzalloc_size(shader, key_size); /* TODO: don't use rzalloc */
2714 memcpy(dup_key, key, key_size);
2715 _mesa_hash_table_insert(ht, dup_key, shader);
2716
2717 return shader;
2718 }
2719
2720 static void
2721 vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key,
2722 struct vc4_texture_stateobj *texstate)
2723 {
2724 for (int i = 0; i < texstate->num_textures; i++) {
2725 struct pipe_sampler_view *sampler = texstate->textures[i];
2726 struct vc4_sampler_view *vc4_sampler = vc4_sampler_view(sampler);
2727 struct pipe_sampler_state *sampler_state =
2728 texstate->samplers[i];
2729
2730 if (!sampler)
2731 continue;
2732
2733 key->tex[i].format = sampler->format;
2734 key->tex[i].swizzle[0] = sampler->swizzle_r;
2735 key->tex[i].swizzle[1] = sampler->swizzle_g;
2736 key->tex[i].swizzle[2] = sampler->swizzle_b;
2737 key->tex[i].swizzle[3] = sampler->swizzle_a;
2738
2739 if (sampler->texture->nr_samples > 1) {
2740 key->tex[i].msaa_width = sampler->texture->width0;
2741 key->tex[i].msaa_height = sampler->texture->height0;
2742 } else if (sampler) {
2743 key->tex[i].compare_mode = sampler_state->compare_mode;
2744 key->tex[i].compare_func = sampler_state->compare_func;
2745 key->tex[i].wrap_s = sampler_state->wrap_s;
2746 key->tex[i].wrap_t = sampler_state->wrap_t;
2747 key->tex[i].force_first_level =
2748 vc4_sampler->force_first_level;
2749 }
2750 }
2751
2752 key->ucp_enables = vc4->rasterizer->base.clip_plane_enable;
2753 }
2754
2755 static void
2756 vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode)
2757 {
2758 struct vc4_job *job = vc4->job;
2759 struct vc4_fs_key local_key;
2760 struct vc4_fs_key *key = &local_key;
2761
2762 if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE |
2763 VC4_DIRTY_BLEND |
2764 VC4_DIRTY_FRAMEBUFFER |
2765 VC4_DIRTY_ZSA |
2766 VC4_DIRTY_RASTERIZER |
2767 VC4_DIRTY_SAMPLE_MASK |
2768 VC4_DIRTY_FRAGTEX |
2769 VC4_DIRTY_UNCOMPILED_FS |
2770 VC4_DIRTY_UBO_1_SIZE))) {
2771 return;
2772 }
2773
2774 memset(key, 0, sizeof(*key));
2775 vc4_setup_shared_key(vc4, &key->base, &vc4->fragtex);
2776 key->base.shader_state = vc4->prog.bind_fs;
2777 key->is_points = (prim_mode == MESA_PRIM_POINTS);
2778 key->is_lines = (prim_mode >= MESA_PRIM_LINES &&
2779 prim_mode <= MESA_PRIM_LINE_STRIP);
2780 key->blend = vc4->blend->rt[0];
2781 if (vc4->blend->logicop_enable) {
2782 key->logicop_func = vc4->blend->logicop_func;
2783 } else {
2784 key->logicop_func = PIPE_LOGICOP_COPY;
2785 }
2786 if (job->msaa) {
2787 key->msaa = vc4->rasterizer->base.multisample;
2788 key->sample_coverage = (vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1);
2789 key->sample_alpha_to_coverage = vc4->blend->alpha_to_coverage;
2790 key->sample_alpha_to_one = vc4->blend->alpha_to_one;
2791 }
2792
2793 if (vc4->framebuffer.cbufs[0])
2794 key->color_format = vc4->framebuffer.cbufs[0]->format;
2795
2796 key->stencil_enabled = vc4->zsa->stencil_uniforms[0] != 0;
2797 key->stencil_twoside = vc4->zsa->stencil_uniforms[1] != 0;
2798 key->stencil_full_writemasks = vc4->zsa->stencil_uniforms[2] != 0;
2799 key->depth_enabled = (vc4->zsa->base.depth_enabled ||
2800 key->stencil_enabled);
2801
2802 if (key->is_points) {
2803 key->point_sprite_mask =
2804 vc4->rasterizer->base.sprite_coord_enable;
2805 key->point_coord_upper_left =
2806 (vc4->rasterizer->base.sprite_coord_mode ==
2807 PIPE_SPRITE_COORD_UPPER_LEFT);
2808 }
2809
2810 key->ubo_1_size = vc4->constbuf[PIPE_SHADER_FRAGMENT].cb[1].buffer_size;
2811
2812 struct vc4_compiled_shader *old_fs = vc4->prog.fs;
2813 vc4->prog.fs = vc4_get_compiled_shader(vc4, QSTAGE_FRAG, &key->base);
2814 if (vc4->prog.fs == old_fs)
2815 return;
2816
2817 vc4->dirty |= VC4_DIRTY_COMPILED_FS;
2818
2819 if (vc4->rasterizer->base.flatshade &&
2820 (!old_fs || vc4->prog.fs->color_inputs != old_fs->color_inputs)) {
2821 vc4->dirty |= VC4_DIRTY_FLAT_SHADE_FLAGS;
2822 }
2823
2824 if (!old_fs || vc4->prog.fs->fs_inputs != old_fs->fs_inputs)
2825 vc4->dirty |= VC4_DIRTY_FS_INPUTS;
2826 }
2827
2828 static void
2829 vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode)
2830 {
2831 struct vc4_vs_key local_key;
2832 struct vc4_vs_key *key = &local_key;
2833
2834 if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE |
2835 VC4_DIRTY_RASTERIZER |
2836 VC4_DIRTY_VERTTEX |
2837 VC4_DIRTY_VTXSTATE |
2838 VC4_DIRTY_UNCOMPILED_VS |
2839 VC4_DIRTY_FS_INPUTS))) {
2840 return;
2841 }
2842
2843 memset(key, 0, sizeof(*key));
2844 vc4_setup_shared_key(vc4, &key->base, &vc4->verttex);
2845 key->base.shader_state = vc4->prog.bind_vs;
2846 key->fs_inputs = vc4->prog.fs->fs_inputs;
2847
2848 for (int i = 0; i < ARRAY_SIZE(key->attr_formats); i++)
2849 key->attr_formats[i] = vc4->vtx->pipe[i].src_format;
2850
2851 key->per_vertex_point_size =
2852 (prim_mode == MESA_PRIM_POINTS &&
2853 vc4->rasterizer->base.point_size_per_vertex);
2854
2855 struct vc4_compiled_shader *vs =
2856 vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base);
2857 if (vs != vc4->prog.vs) {
2858 vc4->prog.vs = vs;
2859 vc4->dirty |= VC4_DIRTY_COMPILED_VS;
2860 }
2861
2862 key->is_coord = true;
2863 /* Coord shaders don't care what the FS inputs are. */
2864 key->fs_inputs = NULL;
2865 struct vc4_compiled_shader *cs =
2866 vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base);
2867 if (cs != vc4->prog.cs) {
2868 vc4->prog.cs = cs;
2869 vc4->dirty |= VC4_DIRTY_COMPILED_CS;
2870 }
2871 }
2872
2873 bool
2874 vc4_update_compiled_shaders(struct vc4_context *vc4, uint8_t prim_mode)
2875 {
2876 vc4_update_compiled_fs(vc4, prim_mode);
2877 vc4_update_compiled_vs(vc4, prim_mode);
2878
2879 return !(vc4->prog.cs->failed ||
2880 vc4->prog.vs->failed ||
2881 vc4->prog.fs->failed);
2882 }
2883
2884 static uint32_t
2885 fs_cache_hash(const void *key)
2886 {
2887 return _mesa_hash_data(key, sizeof(struct vc4_fs_key));
2888 }
2889
2890 static uint32_t
2891 vs_cache_hash(const void *key)
2892 {
2893 return _mesa_hash_data(key, sizeof(struct vc4_vs_key));
2894 }
2895
2896 static bool
2897 fs_cache_compare(const void *key1, const void *key2)
2898 {
2899 return memcmp(key1, key2, sizeof(struct vc4_fs_key)) == 0;
2900 }
2901
2902 static bool
2903 vs_cache_compare(const void *key1, const void *key2)
2904 {
2905 return memcmp(key1, key2, sizeof(struct vc4_vs_key)) == 0;
2906 }
2907
2908 static uint32_t
2909 fs_inputs_hash(const void *key)
2910 {
2911 const struct vc4_fs_inputs *inputs = key;
2912
2913 return _mesa_hash_data(inputs->input_slots,
2914 sizeof(*inputs->input_slots) *
2915 inputs->num_inputs);
2916 }
2917
2918 static bool
2919 fs_inputs_compare(const void *key1, const void *key2)
2920 {
2921 const struct vc4_fs_inputs *inputs1 = key1;
2922 const struct vc4_fs_inputs *inputs2 = key2;
2923
2924 return (inputs1->num_inputs == inputs2->num_inputs &&
2925 memcmp(inputs1->input_slots,
2926 inputs2->input_slots,
2927 sizeof(*inputs1->input_slots) *
2928 inputs1->num_inputs) == 0);
2929 }
2930
2931 static void
2932 delete_from_cache_if_matches(struct hash_table *ht,
2933 struct vc4_compiled_shader **last_compile,
2934 struct hash_entry *entry,
2935 struct vc4_uncompiled_shader *so)
2936 {
2937 const struct vc4_key *key = entry->key;
2938
2939 if (key->shader_state == so) {
2940 struct vc4_compiled_shader *shader = entry->data;
2941 _mesa_hash_table_remove(ht, entry);
2942 vc4_bo_unreference(&shader->bo);
2943
2944 if (shader == *last_compile)
2945 *last_compile = NULL;
2946
2947 ralloc_free(shader);
2948 }
2949 }
2950
2951 static void
2952 vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso)
2953 {
2954 struct vc4_context *vc4 = vc4_context(pctx);
2955 struct vc4_uncompiled_shader *so = hwcso;
2956
2957 hash_table_foreach(vc4->fs_cache, entry) {
2958 delete_from_cache_if_matches(vc4->fs_cache, &vc4->prog.fs,
2959 entry, so);
2960 }
2961 hash_table_foreach(vc4->vs_cache, entry) {
2962 delete_from_cache_if_matches(vc4->vs_cache, &vc4->prog.vs,
2963 entry, so);
2964 }
2965
2966 ralloc_free(so->base.ir.nir);
2967 free(so);
2968 }
2969
2970 static void
2971 vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso)
2972 {
2973 struct vc4_context *vc4 = vc4_context(pctx);
2974 vc4->prog.bind_fs = hwcso;
2975 vc4->dirty |= VC4_DIRTY_UNCOMPILED_FS;
2976 }
2977
2978 static void
2979 vc4_vp_state_bind(struct pipe_context *pctx, void *hwcso)
2980 {
2981 struct vc4_context *vc4 = vc4_context(pctx);
2982 vc4->prog.bind_vs = hwcso;
2983 vc4->dirty |= VC4_DIRTY_UNCOMPILED_VS;
2984 }
2985
2986 void
2987 vc4_program_init(struct pipe_context *pctx)
2988 {
2989 struct vc4_context *vc4 = vc4_context(pctx);
2990
2991 pctx->create_vs_state = vc4_shader_state_create;
2992 pctx->delete_vs_state = vc4_shader_state_delete;
2993
2994 pctx->create_fs_state = vc4_shader_state_create;
2995 pctx->delete_fs_state = vc4_shader_state_delete;
2996
2997 pctx->bind_fs_state = vc4_fp_state_bind;
2998 pctx->bind_vs_state = vc4_vp_state_bind;
2999
3000 vc4->fs_cache = _mesa_hash_table_create(pctx, fs_cache_hash,
3001 fs_cache_compare);
3002 vc4->vs_cache = _mesa_hash_table_create(pctx, vs_cache_hash,
3003 vs_cache_compare);
3004 vc4->fs_inputs_set = _mesa_set_create(pctx, fs_inputs_hash,
3005 fs_inputs_compare);
3006 }
3007
3008 void
3009 vc4_program_fini(struct pipe_context *pctx)
3010 {
3011 struct vc4_context *vc4 = vc4_context(pctx);
3012
3013 hash_table_foreach(vc4->fs_cache, entry) {
3014 struct vc4_compiled_shader *shader = entry->data;
3015 vc4_bo_unreference(&shader->bo);
3016 ralloc_free(shader);
3017 _mesa_hash_table_remove(vc4->fs_cache, entry);
3018 }
3019
3020 hash_table_foreach(vc4->vs_cache, entry) {
3021 struct vc4_compiled_shader *shader = entry->data;
3022 vc4_bo_unreference(&shader->bo);
3023 ralloc_free(shader);
3024 _mesa_hash_table_remove(vc4->vs_cache, entry);
3025 }
3026 }
3027