1 /*
2 * Copyright (C) 2016 Rob Clark <robclark@freedesktop.org>
3 * Copyright © 2018 Google, Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 *
24 * Authors:
25 * Rob Clark <robclark@freedesktop.org>
26 */
27
28 #include "pipe/p_state.h"
29 #include "util/format/u_format.h"
30 #include "util/u_helpers.h"
31 #include "util/u_memory.h"
32 #include "util/u_string.h"
33 #include "util/u_viewport.h"
34
35 #include "common/freedreno_guardband.h"
36 #include "freedreno_query_hw.h"
37 #include "freedreno_resource.h"
38 #include "freedreno_state.h"
39 #include "freedreno_tracepoints.h"
40
41 #include "fd6_blend.h"
42 #include "fd6_const.h"
43 #include "fd6_context.h"
44 #include "fd6_emit.h"
45 #include "fd6_image.h"
46 #include "fd6_pack.h"
47 #include "fd6_program.h"
48 #include "fd6_rasterizer.h"
49 #include "fd6_texture.h"
50 #include "fd6_zsa.h"
51
52 /* Border color layout is diff from a4xx/a5xx.. if a later gen turns out
53 * to use the same layout then move this somewhere common ;-)
54 *
55 * Entry layout looks like (total size: 0x80 bytes):
56 */
57
58 struct PACKED bcolor_entry {
59 uint32_t fp32[4];
60 uint16_t ui16[4];
61 int16_t si16[4];
62 uint16_t fp16[4];
63 uint16_t rgb565;
64 uint16_t rgb5a1;
65 uint16_t rgba4;
66 uint8_t __pad0[2];
67 uint8_t ui8[4];
68 int8_t si8[4];
69 uint32_t rgb10a2;
70 uint32_t z24;
71 uint16_t
72 srgb[4]; /* appears to duplicate fp16[], but clamped, used for srgb */
73 uint8_t __pad1[56];
74 };
75
76 #define FD6_BORDER_COLOR_SIZE sizeof(struct bcolor_entry)
77 #define FD6_BORDER_COLOR_UPLOAD_SIZE \
78 (2 * PIPE_MAX_SAMPLERS * FD6_BORDER_COLOR_SIZE)
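/* Note: the upload covers two consecutive runs of bcolor_entry structs, one
 * for the vertex-stage samplers and one for the fragment-stage samplers,
 * hence the factor of 2 (see emit_border_color() below, which fills the FS
 * entries starting right after the VS sampler count).
 */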
79
80 static void
81 setup_border_colors(struct fd_texture_stateobj *tex,
82 struct bcolor_entry *entries,
83 struct fd_screen *screen)
84 {
85 unsigned i, j;
86 STATIC_ASSERT(sizeof(struct bcolor_entry) == FD6_BORDER_COLOR_SIZE);
87 const bool has_z24uint_s8uint = screen->info->a6xx.has_z24uint_s8uint;
88
89 for (i = 0; i < tex->num_samplers; i++) {
90 struct bcolor_entry *e = &entries[i];
91 struct pipe_sampler_state *sampler = tex->samplers[i];
92 union pipe_color_union *bc;
93
94 if (!sampler)
95 continue;
96
97 bc = &sampler->border_color;
98
99 /*
100 * XXX HACK ALERT XXX
101 *
102 * The border colors need to be swizzled in a particular
103 * format-dependent order. Even though samplers don't know about
104 * formats, we can assume that with a GL state tracker, there's a
105 * 1:1 correspondence between sampler and texture. Take advantage
106 * of that knowledge.
107 */
108 if ((i >= tex->num_textures) || !tex->textures[i])
109 continue;
110
111 struct pipe_sampler_view *view = tex->textures[i];
112 enum pipe_format format = view->format;
113 const struct util_format_description *desc =
114 util_format_description(format);
115
116 e->rgb565 = 0;
117 e->rgb5a1 = 0;
118 e->rgba4 = 0;
119 e->rgb10a2 = 0;
120 e->z24 = 0;
121
122 unsigned char swiz[4];
123
124 fdl6_format_swiz(format, false, swiz);
125
126 for (j = 0; j < 4; j++) {
127 int c = swiz[j];
128 int cd = c;
129
130 /*
131 * HACK: for PIPE_FORMAT_X24S8_UINT we end up w/ the
132 * stencil border color value in bc->ui[0] but according
133 * to desc->swizzle and desc->channel, the .x/.w component
134 * is NONE and the stencil value is in the y component.
135 * Meanwhile the hardware wants this in the .x component
136 * for x24s8 and x32_s8x24, or the .y component for x24s8 with the
137 * special Z24UINT_S8UINT format.
138 */
139 if ((format == PIPE_FORMAT_X24S8_UINT) ||
140 (format == PIPE_FORMAT_X32_S8X24_UINT)) {
141 if (j == 0) {
142 c = 1;
143 cd = (format == PIPE_FORMAT_X24S8_UINT && has_z24uint_s8uint) ? 1 : 0;
144 } else {
145 continue;
146 }
147 }
148
149 if (c >= 4)
150 continue;
151
152 if (desc->channel[c].pure_integer) {
153 uint16_t clamped;
154 switch (desc->channel[c].size) {
155 case 2:
156 assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED);
157 clamped = CLAMP(bc->ui[j], 0, 0x3);
158 break;
159 case 8:
160 if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED)
161 clamped = CLAMP(bc->i[j], -128, 127);
162 else
163 clamped = CLAMP(bc->ui[j], 0, 255);
164 break;
165 case 10:
166 assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED);
167 clamped = CLAMP(bc->ui[j], 0, 0x3ff);
168 break;
169 case 16:
170 if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED)
171 clamped = CLAMP(bc->i[j], -32768, 32767);
172 else
173 clamped = CLAMP(bc->ui[j], 0, 65535);
174 break;
175 default:
176 assert(!"Unexpected bit size");
177 case 32:
178 clamped = 0;
179 break;
180 }
181 e->fp32[cd] = bc->ui[j];
182 e->fp16[cd] = clamped;
183 } else {
184 float f = bc->f[j];
185 float f_u = CLAMP(f, 0, 1);
186 float f_s = CLAMP(f, -1, 1);
187
188 e->fp32[c] = fui(f);
189 e->fp16[c] = _mesa_float_to_half(f);
190 e->srgb[c] = _mesa_float_to_half(f_u);
191 e->ui16[c] = f_u * 0xffff;
192 e->si16[c] = f_s * 0x7fff;
193 e->ui8[c] = f_u * 0xff;
194 e->si8[c] = f_s * 0x7f;
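/* Also pack the unorm value into the small packed-format border color
 * words.  'c' is the swizzled destination slot within the entry, e.g. for
 * rgb565 the green slot (c == 1) gets 6 bits at bit 5 while the other two
 * color slots get 5 bits, and the z24 value is taken from slot 0.
 */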
195 if (c == 1)
196 e->rgb565 |= (int)(f_u * 0x3f) << 5;
197 else if (c < 3)
198 e->rgb565 |= (int)(f_u * 0x1f) << (c ? 11 : 0);
199 if (c == 3)
200 e->rgb5a1 |= (f_u > 0.5f) ? 0x8000 : 0;
201 else
202 e->rgb5a1 |= (int)(f_u * 0x1f) << (c * 5);
203 if (c == 3)
204 e->rgb10a2 |= (int)(f_u * 0x3) << 30;
205 else
206 e->rgb10a2 |= (int)(f_u * 0x3ff) << (c * 10);
207 e->rgba4 |= (int)(f_u * 0xf) << (c * 4);
208 if (c == 0)
209 e->z24 = f_u * 0xffffff;
210 }
211 }
212
213 #ifdef DEBUG
214 memset(&e->__pad0, 0, sizeof(e->__pad0));
215 memset(&e->__pad1, 0, sizeof(e->__pad1));
216 #endif
217 }
218 }
219
220 static void
221 emit_border_color(struct fd_context *ctx, struct fd_ringbuffer *ring) assert_dt
222 {
223 struct fd6_context *fd6_ctx = fd6_context(ctx);
224 struct bcolor_entry *entries;
225 unsigned off;
226 void *ptr;
227
228 STATIC_ASSERT(sizeof(struct bcolor_entry) == FD6_BORDER_COLOR_SIZE);
229
230 u_upload_alloc(fd6_ctx->border_color_uploader, 0,
231 FD6_BORDER_COLOR_UPLOAD_SIZE, FD6_BORDER_COLOR_UPLOAD_SIZE,
232 &off, &fd6_ctx->border_color_buf, &ptr);
233
234 entries = ptr;
235
236 setup_border_colors(&ctx->tex[PIPE_SHADER_VERTEX], &entries[0], ctx->screen);
237 setup_border_colors(&ctx->tex[PIPE_SHADER_FRAGMENT],
238 &entries[ctx->tex[PIPE_SHADER_VERTEX].num_samplers],
239 ctx->screen);
240
241 OUT_PKT4(ring, REG_A6XX_SP_TP_BORDER_COLOR_BASE_ADDR, 2);
242 OUT_RELOC(ring, fd_resource(fd6_ctx->border_color_buf)->bo, off, 0, 0);
243
244 u_upload_unmap(fd6_ctx->border_color_uploader);
245 }
246
247 static void
248 fd6_emit_fb_tex(struct fd_ringbuffer *state, struct fd_context *ctx) assert_dt
249 {
250 struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
251 struct pipe_surface *psurf = pfb->cbufs[0];
252 struct fd_resource *rsc = fd_resource(psurf->texture);
253
254 OUT_RINGP(state, 0, &ctx->batch->fb_read_patches); /* texconst0, patched in gmem emit */
255 OUT_RING(state, A6XX_TEX_CONST_1_WIDTH(pfb->width) |
256 A6XX_TEX_CONST_1_HEIGHT(pfb->height));
257 OUT_RING(state, 0); /* texconst2, patched in gmem emit */
258 OUT_RING(state, A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size));
259 OUT_RING(state, 0); /* BASE_LO, patched in gmem emit */
260 OUT_RING(state, 0); /* BASE_HI, patched in gmem emit */
261 OUT_RING(state, 0); /* texconst6 */
262 OUT_RING(state, 0); /* texconst7 */
263 OUT_RING(state, 0); /* texconst8 */
264 OUT_RING(state, 0); /* texconst9 */
265 OUT_RING(state, 0); /* texconst10 */
266 OUT_RING(state, 0); /* texconst11 */
267 OUT_RING(state, 0);
268 OUT_RING(state, 0);
269 OUT_RING(state, 0);
270 OUT_RING(state, 0);
271 }
272
273 bool
274 fd6_emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
275 enum pipe_shader_type type, struct fd_texture_stateobj *tex,
276 unsigned bcolor_offset,
277 /* can be NULL if no image/SSBO/fb state to merge in: */
278 const struct ir3_shader_variant *v)
279 {
280 bool needs_border = false;
281 unsigned opcode, tex_samp_reg, tex_const_reg, tex_count_reg;
282 enum a6xx_state_block sb;
283
284 switch (type) {
285 case PIPE_SHADER_VERTEX:
286 sb = SB6_VS_TEX;
287 opcode = CP_LOAD_STATE6_GEOM;
288 tex_samp_reg = REG_A6XX_SP_VS_TEX_SAMP;
289 tex_const_reg = REG_A6XX_SP_VS_TEX_CONST;
290 tex_count_reg = REG_A6XX_SP_VS_TEX_COUNT;
291 break;
292 case PIPE_SHADER_TESS_CTRL:
293 sb = SB6_HS_TEX;
294 opcode = CP_LOAD_STATE6_GEOM;
295 tex_samp_reg = REG_A6XX_SP_HS_TEX_SAMP;
296 tex_const_reg = REG_A6XX_SP_HS_TEX_CONST;
297 tex_count_reg = REG_A6XX_SP_HS_TEX_COUNT;
298 break;
299 case PIPE_SHADER_TESS_EVAL:
300 sb = SB6_DS_TEX;
301 opcode = CP_LOAD_STATE6_GEOM;
302 tex_samp_reg = REG_A6XX_SP_DS_TEX_SAMP;
303 tex_const_reg = REG_A6XX_SP_DS_TEX_CONST;
304 tex_count_reg = REG_A6XX_SP_DS_TEX_COUNT;
305 break;
306 case PIPE_SHADER_GEOMETRY:
307 sb = SB6_GS_TEX;
308 opcode = CP_LOAD_STATE6_GEOM;
309 tex_samp_reg = REG_A6XX_SP_GS_TEX_SAMP;
310 tex_const_reg = REG_A6XX_SP_GS_TEX_CONST;
311 tex_count_reg = REG_A6XX_SP_GS_TEX_COUNT;
312 break;
313 case PIPE_SHADER_FRAGMENT:
314 sb = SB6_FS_TEX;
315 opcode = CP_LOAD_STATE6_FRAG;
316 tex_samp_reg = REG_A6XX_SP_FS_TEX_SAMP;
317 tex_const_reg = REG_A6XX_SP_FS_TEX_CONST;
318 tex_count_reg = REG_A6XX_SP_FS_TEX_COUNT;
319 break;
320 case PIPE_SHADER_COMPUTE:
321 sb = SB6_CS_TEX;
322 opcode = CP_LOAD_STATE6_FRAG;
323 tex_samp_reg = REG_A6XX_SP_CS_TEX_SAMP;
324 tex_const_reg = REG_A6XX_SP_CS_TEX_CONST;
325 tex_count_reg = REG_A6XX_SP_CS_TEX_COUNT;
326 break;
327 default:
328 unreachable("bad state block");
329 }
330
331 if (tex->num_samplers > 0) {
332 struct fd_ringbuffer *state =
333 fd_ringbuffer_new_object(ctx->pipe, tex->num_samplers * 4 * 4);
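/* 4 dwords of sampler state per sampler (texsamp0..3), matching the four
 * OUT_RING()s in the loop below, hence the num_samplers * 4 * 4 sizing.
 */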
334 for (unsigned i = 0; i < tex->num_samplers; i++) {
335 static const struct fd6_sampler_stateobj dummy_sampler = {};
336 const struct fd6_sampler_stateobj *sampler =
337 tex->samplers[i] ? fd6_sampler_stateobj(tex->samplers[i])
338 : &dummy_sampler;
339 OUT_RING(state, sampler->texsamp0);
340 OUT_RING(state, sampler->texsamp1);
341 OUT_RING(state, sampler->texsamp2 |
342 A6XX_TEX_SAMP_2_BCOLOR(i + bcolor_offset));
343 OUT_RING(state, sampler->texsamp3);
344 needs_border |= sampler->needs_border;
345 }
346
347 /* output sampler state: */
348 OUT_PKT7(ring, opcode, 3);
349 OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
350 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
351 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
352 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
353 CP_LOAD_STATE6_0_NUM_UNIT(tex->num_samplers));
354 OUT_RB(ring, state); /* SRC_ADDR_LO/HI */
355
356 OUT_PKT4(ring, tex_samp_reg, 2);
357 OUT_RB(ring, state); /* SRC_ADDR_LO/HI */
358
359 fd_ringbuffer_del(state);
360 }
361
362 unsigned num_merged_textures = tex->num_textures;
363 unsigned num_textures = tex->num_textures;
364 if (v) {
365 num_merged_textures += v->image_mapping.num_tex;
366
367 if (v->fb_read)
368 num_merged_textures++;
369
370 /* There could be more bound textures than what the shader uses,
371 * which isn't known at shader compile time. So in the case we
372 * are merging tex state, only emit the textures that the shader
373 * uses (since the image/SSBO related tex state comes immediately
374 * after)
375 */
376 num_textures = v->image_mapping.tex_base;
377 }
378
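/* A sketch of the descriptor layout that the loops below produce in the
 * merged state object:
 *
 *   [0 .. num_textures-1]                regular sampler views
 *   [tex_base .. tex_base + num_tex - 1] SSBO/image tex descriptors
 *   [last]                               fb_read descriptor, if used
 *
 * where tex_base/num_tex come from v->image_mapping.
 */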
379 if (num_merged_textures > 0) {
380 struct fd_ringbuffer *state =
381 fd_ringbuffer_new_object(ctx->pipe, num_merged_textures * 16 * 4);
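/* Each texture descriptor is 16 dwords (descriptor[0..15]), hence the
 * num_merged_textures * 16 * 4 sizing above.
 */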
382 for (unsigned i = 0; i < num_textures; i++) {
383 const struct fd6_pipe_sampler_view *view;
384
385 if (tex->textures[i]) {
386 view = fd6_pipe_sampler_view(tex->textures[i]);
387 if (unlikely(view->rsc_seqno !=
388 fd_resource(view->base.texture)->seqno)) {
389 fd6_sampler_view_update(ctx,
390 fd6_pipe_sampler_view(tex->textures[i]));
391 }
392 } else {
393 static const struct fd6_pipe_sampler_view dummy_view = {};
394 view = &dummy_view;
395 }
396
397 OUT_RING(state, view->descriptor[0]);
398 OUT_RING(state, view->descriptor[1]);
399 OUT_RING(state, view->descriptor[2]);
400 OUT_RING(state, view->descriptor[3]);
401
402 if (view->ptr1) {
403 OUT_RELOC(state, view->ptr1->bo, view->descriptor[4],
404 (uint64_t)view->descriptor[5] << 32, 0);
405 } else {
406 OUT_RING(state, view->descriptor[4]);
407 OUT_RING(state, view->descriptor[5]);
408 }
409
410 OUT_RING(state, view->descriptor[6]);
411
412 if (view->ptr2) {
413 OUT_RELOC(state, view->ptr2->bo, view->descriptor[7], 0, 0);
414 } else {
415 OUT_RING(state, view->descriptor[7]);
416 OUT_RING(state, view->descriptor[8]);
417 }
418
419 OUT_RING(state, view->descriptor[9]);
420 OUT_RING(state, view->descriptor[10]);
421 OUT_RING(state, view->descriptor[11]);
422 OUT_RING(state, view->descriptor[12]);
423 OUT_RING(state, view->descriptor[13]);
424 OUT_RING(state, view->descriptor[14]);
425 OUT_RING(state, view->descriptor[15]);
426 }
427
428 if (v) {
429 const struct ir3_ibo_mapping *mapping = &v->image_mapping;
430 struct fd_shaderbuf_stateobj *buf = &ctx->shaderbuf[type];
431 struct fd_shaderimg_stateobj *img = &ctx->shaderimg[type];
432
433 for (unsigned i = 0; i < mapping->num_tex; i++) {
434 unsigned idx = mapping->tex_to_image[i];
435 if (idx & IBO_SSBO) {
436 fd6_emit_ssbo_tex(ctx, state, &buf->sb[idx & ~IBO_SSBO]);
437 } else {
438 fd6_emit_image_tex(ctx, state, &img->si[idx]);
439 }
440 }
441
442 if (v->fb_read) {
443 fd6_emit_fb_tex(state, ctx);
444 }
445 }
446
447 /* emit texture state: */
448 OUT_PKT7(ring, opcode, 3);
449 OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
450 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
451 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
452 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
453 CP_LOAD_STATE6_0_NUM_UNIT(num_merged_textures));
454 OUT_RB(ring, state); /* SRC_ADDR_LO/HI */
455
456 OUT_PKT4(ring, tex_const_reg, 2);
457 OUT_RB(ring, state); /* SRC_ADDR_LO/HI */
458
459 fd_ringbuffer_del(state);
460 }
461
462 OUT_PKT4(ring, tex_count_reg, 1);
463 OUT_RING(ring, num_merged_textures);
464
465 return needs_border;
466 }
467
468 /* Emits combined texture state, which also includes any Image/SSBO
469 * related texture state merged in (because we must have all texture
470 * state for a given stage in a single buffer). In the fast-path, if
471 * we don't need to merge in any image/ssbo related texture state, we
472 * just use cached texture stateobj. Otherwise we generate a single-
473 * use stateobj.
474 *
475 * TODO Is there some sane way we can still use cached texture stateobj
476 * with image/ssbo in use?
477 *
478 * returns whether border_color is required:
479 */
480 static bool
481 fd6_emit_combined_textures(struct fd_ringbuffer *ring, struct fd6_emit *emit,
482 enum pipe_shader_type type,
483 const struct ir3_shader_variant *v) assert_dt
484 {
485 struct fd_context *ctx = emit->ctx;
486 bool needs_border = false;
487
488 static const struct {
489 enum fd6_state_id state_id;
490 unsigned enable_mask;
491 } s[PIPE_SHADER_TYPES] = {
492 [PIPE_SHADER_VERTEX] = {FD6_GROUP_VS_TEX, ENABLE_ALL},
493 [PIPE_SHADER_TESS_CTRL] = {FD6_GROUP_HS_TEX, ENABLE_ALL},
494 [PIPE_SHADER_TESS_EVAL] = {FD6_GROUP_DS_TEX, ENABLE_ALL},
495 [PIPE_SHADER_GEOMETRY] = {FD6_GROUP_GS_TEX, ENABLE_ALL},
496 [PIPE_SHADER_FRAGMENT] = {FD6_GROUP_FS_TEX, ENABLE_DRAW},
497 };
498
499 assert(s[type].state_id);
500
501 if (!v->image_mapping.num_tex && !v->fb_read) {
502 /* in the fast-path, when we don't have to mix in any image/SSBO
503 * related texture state, we can just lookup the stateobj and
504 * re-emit that:
505 *
506 * Also, framebuffer-read is a slow-path because an extra
507 * texture needs to be inserted.
508 *
509 * TODO we can probably simplify things if we also treated
510 * border_color as a slow-path.. this way the tex state key
511 * wouldn't depend on bcolor_offset.. but fb_read might rather
512 * be *somehow* a fast-path if we eventually used it for PLS.
513 * I suppose there would be no harm in just *always* inserting
514 * an fb_read texture?
515 */
516 if ((ctx->dirty_shader[type] & FD_DIRTY_SHADER_TEX) &&
517 ctx->tex[type].num_textures > 0) {
518 struct fd6_texture_state *tex =
519 fd6_texture_state(ctx, type, &ctx->tex[type]);
520
521 needs_border |= tex->needs_border;
522
523 fd6_emit_add_group(emit, tex->stateobj, s[type].state_id,
524 s[type].enable_mask);
525
526 fd6_texture_state_reference(&tex, NULL);
527 }
528 } else {
529 /* In the slow-path, create a one-shot texture state object
530 * if either TEX|PROG|SSBO|IMAGE state is dirty:
531 */
532 if ((ctx->dirty_shader[type] &
533 (FD_DIRTY_SHADER_TEX | FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_IMAGE |
534 FD_DIRTY_SHADER_SSBO)) ||
535 v->fb_read) {
536 struct fd_texture_stateobj *tex = &ctx->tex[type];
537 struct fd_ringbuffer *stateobj = fd_submit_new_ringbuffer(
538 ctx->batch->submit, 0x1000, FD_RINGBUFFER_STREAMING);
539 unsigned bcolor_offset = fd6_border_color_offset(ctx, type, tex);
540
541 needs_border |=
542 fd6_emit_textures(ctx, stateobj, type, tex, bcolor_offset, v);
543
544 fd6_emit_take_group(emit, stateobj, s[type].state_id,
545 s[type].enable_mask);
546 }
547 }
548
549 return needs_border;
550 }
551
552 static struct fd_ringbuffer *
553 build_vbo_state(struct fd6_emit *emit) assert_dt
554 {
555 const struct fd_vertex_state *vtx = emit->vtx;
556
557 /* Limit PKT4 size, because at max count (32) we would overflow the
558 * size of the PKT4 size field:
559 */
560 const unsigned maxcnt = 16;
561 const unsigned cnt = vtx->vertexbuf.count;
562 const unsigned dwords = (cnt * 4) /* per vbo: reg64 + two reg32 */
563 + (1 + cnt / maxcnt); /* PKT4 hdr every 16 vbo's */
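/* e.g. cnt == 20 -> 80 payload dwords plus 2 PKT4 headers (one at j == 0
 * and one at j == 16).  The "1 + cnt / maxcnt" term may over-allocate one
 * header when cnt is an exact multiple of maxcnt, which is harmless.
 */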
564
565 struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
566 emit->ctx->batch->submit, 4 * dwords, FD_RINGBUFFER_STREAMING);
567
568 for (int32_t j = 0; j < cnt; j++) {
569 if ((j % maxcnt) == 0) {
570 unsigned sz = MIN2(maxcnt, cnt - j);
571 OUT_PKT4(ring, REG_A6XX_VFD_FETCH(j), 4 * sz);
572 }
573 const struct pipe_vertex_buffer *vb = &vtx->vertexbuf.vb[j];
574 struct fd_resource *rsc = fd_resource(vb->buffer.resource);
575 if (rsc == NULL) {
576 OUT_RING(ring, 0);
577 OUT_RING(ring, 0);
578 OUT_RING(ring, 0);
579 OUT_RING(ring, 0);
580 } else {
581 uint32_t off = vb->buffer_offset;
582 uint32_t size = vb->buffer.resource->width0 - off;
583
584 OUT_RELOC(ring, rsc->bo, off, 0, 0);
585 OUT_RING(ring, size); /* VFD_FETCH[j].SIZE */
586 OUT_RING(ring, vb->stride); /* VFD_FETCH[j].STRIDE */
587 }
588 }
589
590 return ring;
591 }
592
593 static enum a6xx_ztest_mode
594 compute_ztest_mode(struct fd6_emit *emit, bool lrz_valid) assert_dt
595 {
596 struct fd_context *ctx = emit->ctx;
597 struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
598 struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa);
599 const struct ir3_shader_variant *fs = emit->fs;
600
601 if (fs->fs.early_fragment_tests)
602 return A6XX_EARLY_Z;
603
604 if (fs->no_earlyz || fs->writes_pos || !zsa->base.depth_enabled ||
605 fs->writes_stencilref) {
606 return A6XX_LATE_Z;
607 } else if ((fs->has_kill || zsa->alpha_test) &&
608 (zsa->writes_zs || !pfb->zsbuf)) {
609 /* Slightly odd, but seems like the hw wants us to select
610 * LATE_Z mode if there is no depth buffer + discard. Either
611 * that, or when occlusion query is enabled. See:
612 *
613 * dEQP-GLES31.functional.fbo.no_attachments.*
614 */
615 return lrz_valid ? A6XX_EARLY_LRZ_LATE_Z : A6XX_LATE_Z;
616 } else {
617 return A6XX_EARLY_Z;
618 }
619 }
620
621 /**
622 * Calculate normalized LRZ state based on zsa/prog/blend state, updating
623 * the zsbuf's lrz state as necessary to detect the cases where we need
624 * to invalidate lrz.
625 */
626 static struct fd6_lrz_state
627 compute_lrz_state(struct fd6_emit *emit, bool binning_pass) assert_dt
628 {
629 struct fd_context *ctx = emit->ctx;
630 struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
631 const struct ir3_shader_variant *fs = emit->fs;
632 struct fd6_lrz_state lrz;
633
634 if (!pfb->zsbuf) {
635 memset(&lrz, 0, sizeof(lrz));
636 if (!binning_pass) {
637 lrz.z_mode = compute_ztest_mode(emit, false);
638 }
639 return lrz;
640 }
641
642 struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend);
643 struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa);
644 struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
645
646 lrz = zsa->lrz;
647
648 /* normalize lrz state: */
649 if (blend->reads_dest || fs->writes_pos || fs->no_earlyz || fs->has_kill ||
650 blend->base.alpha_to_coverage) {
651 lrz.write = false;
652 if (binning_pass)
653 lrz.enable = false;
654 }
655
656 /* if we change depthfunc direction, bail out on using LRZ. The
657 * LRZ buffer encodes a min/max depth value per block, but if
658 * we switch from GT/GE <-> LT/LE, those values cannot be
659 * interpreted properly.
660 */
661 if (zsa->base.depth_enabled && (rsc->lrz_direction != FD_LRZ_UNKNOWN) &&
662 (rsc->lrz_direction != lrz.direction)) {
663 rsc->lrz_valid = false;
664 }
665
666 if (zsa->invalidate_lrz || !rsc->lrz_valid) {
667 rsc->lrz_valid = false;
668 memset(&lrz, 0, sizeof(lrz));
669 }
670
671 if (fs->no_earlyz || fs->writes_pos) {
672 lrz.enable = false;
673 lrz.write = false;
674 lrz.test = false;
675 }
676
677 if (!binning_pass) {
678 lrz.z_mode = compute_ztest_mode(emit, rsc->lrz_valid);
679 }
680
681 /* Once we start writing to the real depth buffer, we lock in the
682 * direction for LRZ.. if we have to skip a LRZ write for any
683 * reason, it is still safe to have LRZ until there is a direction
684 * reversal. Prior to the reversal, since we disabled LRZ writes
685 * in the "unsafe" cases, this just means that the LRZ test may
686 * not early-discard some things that end up not passing a later
687 * test (ie. be overly conservative). But once you have a reversal
688 * of direction, it is possible to increase/decrease the z value
689 * to the point where the overly-conservative test is incorrect.
690 */
691 if (zsa->base.depth_writemask) {
692 rsc->lrz_direction = lrz.direction;
693 }
694
695 return lrz;
696 }
697
698 static struct fd_ringbuffer *
699 build_lrz(struct fd6_emit *emit, bool binning_pass) assert_dt
700 {
701 struct fd_context *ctx = emit->ctx;
702 struct fd6_context *fd6_ctx = fd6_context(ctx);
703 struct fd6_lrz_state lrz = compute_lrz_state(emit, binning_pass);
704
705 /* If the LRZ state has not changed, we can skip the emit: */
706 if (!ctx->last.dirty &&
707 !memcmp(&fd6_ctx->last.lrz[binning_pass], &lrz, sizeof(lrz)))
708 return NULL;
709
710 fd6_ctx->last.lrz[binning_pass] = lrz;
711
712 struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
713 ctx->batch->submit, 8 * 4, FD_RINGBUFFER_STREAMING);
714
715 OUT_REG(ring,
716 A6XX_GRAS_LRZ_CNTL(.enable = lrz.enable, .lrz_write = lrz.write,
717 .greater = lrz.direction == FD_LRZ_GREATER,
718 .z_test_enable = lrz.test, ));
719 OUT_REG(ring, A6XX_RB_LRZ_CNTL(.enable = lrz.enable, ));
720
721 OUT_REG(ring, A6XX_RB_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, ));
722
723 OUT_REG(ring, A6XX_GRAS_SU_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, ));
724
725 return ring;
726 }
727
728 static struct fd_ringbuffer *
729 build_scissor(struct fd6_emit *emit) assert_dt
730 {
731 struct fd_context *ctx = emit->ctx;
732 struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx);
733
734 struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
735 emit->ctx->batch->submit, 3 * 4, FD_RINGBUFFER_STREAMING);
736
737 OUT_REG(
738 ring,
739 A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = scissor->minx, .y = scissor->miny),
740 A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = MAX2(scissor->maxx, 1) - 1,
741 .y = MAX2(scissor->maxy, 1) - 1));
742
743 ctx->batch->max_scissor.minx =
744 MIN2(ctx->batch->max_scissor.minx, scissor->minx);
745 ctx->batch->max_scissor.miny =
746 MIN2(ctx->batch->max_scissor.miny, scissor->miny);
747 ctx->batch->max_scissor.maxx =
748 MAX2(ctx->batch->max_scissor.maxx, scissor->maxx);
749 ctx->batch->max_scissor.maxy =
750 MAX2(ctx->batch->max_scissor.maxy, scissor->maxy);
751
752 return ring;
753 }
754
755 /* Combination of FD_DIRTY_FRAMEBUFFER | FD_DIRTY_RASTERIZER_DISCARD |
756 * FD_DIRTY_PROG | FD_DIRTY_DUAL_BLEND
757 */
758 static struct fd_ringbuffer *
759 build_prog_fb_rast(struct fd6_emit *emit) assert_dt
760 {
761 struct fd_context *ctx = emit->ctx;
762 struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
763 const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
764 const struct ir3_shader_variant *fs = emit->fs;
765
766 struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
767 ctx->batch->submit, 9 * 4, FD_RINGBUFFER_STREAMING);
768
769 unsigned nr = pfb->nr_cbufs;
770
771 if (ctx->rasterizer->rasterizer_discard)
772 nr = 0;
773
774 struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend);
775
776 if (blend->use_dual_src_blend)
777 nr++;
778
779 OUT_PKT4(ring, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2);
780 OUT_RING(ring, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
781 COND(fs->writes_smask && pfb->samples > 1,
782 A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
783 COND(fs->writes_stencilref,
784 A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
785 COND(blend->use_dual_src_blend,
786 A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
787 OUT_RING(ring, A6XX_RB_FS_OUTPUT_CNTL1_MRT(nr));
788
789 OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL1, 1);
790 OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL1_MRT(nr));
791
792 unsigned mrt_components = 0;
793 for (unsigned i = 0; i < pfb->nr_cbufs; i++) {
794 if (!pfb->cbufs[i])
795 continue;
796 mrt_components |= 0xf << (i * 4);
797 }
798
799 /* dual source blending has an extra fs output in the 2nd slot */
800 if (blend->use_dual_src_blend)
801 mrt_components |= 0xf << 4;
802
803 mrt_components &= prog->mrt_components;
804
805 OUT_REG(ring, A6XX_SP_FS_RENDER_COMPONENTS(.dword = mrt_components));
806 OUT_REG(ring, A6XX_RB_RENDER_COMPONENTS(.dword = mrt_components));
807
808 return ring;
809 }
810
811 static struct fd_ringbuffer *
812 build_blend_color(struct fd6_emit *emit) assert_dt
813 {
814 struct fd_context *ctx = emit->ctx;
815 struct pipe_blend_color *bcolor = &ctx->blend_color;
816 struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
817 ctx->batch->submit, 5 * 4, FD_RINGBUFFER_STREAMING);
818
819 OUT_REG(ring, A6XX_RB_BLEND_RED_F32(bcolor->color[0]),
820 A6XX_RB_BLEND_GREEN_F32(bcolor->color[1]),
821 A6XX_RB_BLEND_BLUE_F32(bcolor->color[2]),
822 A6XX_RB_BLEND_ALPHA_F32(bcolor->color[3]));
823
824 return ring;
825 }
826
827 static struct fd_ringbuffer *
828 build_ibo(struct fd6_emit *emit) assert_dt
829 {
830 struct fd_context *ctx = emit->ctx;
831
832 if (emit->hs) {
833 assert(ir3_shader_nibo(emit->hs) == 0);
834 assert(ir3_shader_nibo(emit->ds) == 0);
835 }
836 if (emit->gs) {
837 assert(ir3_shader_nibo(emit->gs) == 0);
838 }
839
840 struct fd_ringbuffer *ibo_state =
841 fd6_build_ibo_state(ctx, emit->fs, PIPE_SHADER_FRAGMENT);
842 struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
843 ctx->batch->submit, 0x100, FD_RINGBUFFER_STREAMING);
844
845 OUT_PKT7(ring, CP_LOAD_STATE6, 3);
846 OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
847 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
848 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
849 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_IBO) |
850 CP_LOAD_STATE6_0_NUM_UNIT(ir3_shader_nibo(emit->fs)));
851 OUT_RB(ring, ibo_state);
852
853 OUT_PKT4(ring, REG_A6XX_SP_IBO, 2);
854 OUT_RB(ring, ibo_state);
855
856 /* TODO if we used CP_SET_DRAW_STATE for compute shaders, we could
857 * de-duplicate this from program->config_stateobj
858 */
859 OUT_PKT4(ring, REG_A6XX_SP_IBO_COUNT, 1);
860 OUT_RING(ring, ir3_shader_nibo(emit->fs));
861
862 fd_ringbuffer_del(ibo_state);
863
864 return ring;
865 }
866
867 static void
868 fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt
869 {
870 struct fd_context *ctx = emit->ctx;
871 const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
872 const struct ir3_stream_output_info *info = prog->stream_output;
873 struct fd_streamout_stateobj *so = &ctx->streamout;
874
875 emit->streamout_mask = 0;
876
877 if (!info)
878 return;
879
880 for (unsigned i = 0; i < so->num_targets; i++) {
881 struct fd_stream_output_target *target =
882 fd_stream_output_target(so->targets[i]);
883
884 if (!target)
885 continue;
886
887 target->stride = info->stride[i];
888
889 OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_BASE(i), 3);
890 /* VPC_SO[i].BUFFER_BASE_LO: */
891 OUT_RELOC(ring, fd_resource(target->base.buffer)->bo, 0, 0, 0);
892 OUT_RING(ring, target->base.buffer_size + target->base.buffer_offset);
893
894 struct fd_bo *offset_bo = fd_resource(target->offset_buf)->bo;
895
896 if (so->reset & (1 << i)) {
897 assert(so->offsets[i] == 0);
898
899 OUT_PKT7(ring, CP_MEM_WRITE, 3);
900 OUT_RELOC(ring, offset_bo, 0, 0, 0);
901 OUT_RING(ring, target->base.buffer_offset);
902
903 OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_OFFSET(i), 1);
904 OUT_RING(ring, target->base.buffer_offset);
905 } else {
906 OUT_PKT7(ring, CP_MEM_TO_REG, 3);
907 OUT_RING(ring, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) |
908 CP_MEM_TO_REG_0_SHIFT_BY_2 | CP_MEM_TO_REG_0_UNK31 |
909 CP_MEM_TO_REG_0_CNT(0));
910 OUT_RELOC(ring, offset_bo, 0, 0, 0);
911 }
912
913 // After a draw, the HW writes the new offset back to offset_bo
914 OUT_PKT4(ring, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);
915 OUT_RELOC(ring, offset_bo, 0, 0, 0);
916
917 so->reset &= ~(1 << i);
918
919 emit->streamout_mask |= (1 << i);
920 }
921
922 if (emit->streamout_mask) {
923 fd6_emit_add_group(emit, prog->streamout_stateobj, FD6_GROUP_SO,
924 ENABLE_ALL);
925 } else if (ctx->last.streamout_mask != 0) {
926 /* If we transition from a draw with streamout to one without, turn
927 * off streamout.
928 */
929 fd6_emit_add_group(emit, fd6_context(ctx)->streamout_disable_stateobj,
930 FD6_GROUP_SO, ENABLE_ALL);
931 }
932
933 /* Make sure that any use of our TFB outputs (indirect draw source or shader
934 * UBO reads) comes after the TFB output is written. From the GL 4.6 core
935 * spec:
936 *
937 * "Buffers should not be bound or in use for both transform feedback and
938 * other purposes in the GL. Specifically, if a buffer object is
939 * simultaneously bound to a transform feedback buffer binding point
940 * and elsewhere in the GL, any writes to or reads from the buffer
941 * generate undefined values."
942 *
943 * So we idle whenever SO buffers change. Note that this function is called
944 * on every draw with TFB enabled, so check the dirty flag for the buffers
945 * themselves.
946 */
947 if (ctx->dirty & FD_DIRTY_STREAMOUT)
948 fd_wfi(ctx->batch, ring);
949
950 ctx->last.streamout_mask = emit->streamout_mask;
951 }
952
953 /**
954 * Stuff that changes less frequently and isn't (yet) moved into stategroups
955 */
956 static void
957 fd6_emit_non_ring(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt
958 {
959 struct fd_context *ctx = emit->ctx;
960 const enum fd_dirty_3d_state dirty = emit->dirty;
961
962 if (dirty & FD_DIRTY_STENCIL_REF) {
963 struct pipe_stencil_ref *sr = &ctx->stencil_ref;
964
965 OUT_PKT4(ring, REG_A6XX_RB_STENCILREF, 1);
966 OUT_RING(ring, A6XX_RB_STENCILREF_REF(sr->ref_value[0]) |
967 A6XX_RB_STENCILREF_BFREF(sr->ref_value[1]));
968 }
969
970 if (dirty & FD_DIRTY_VIEWPORT) {
971 struct pipe_scissor_state *scissor = &ctx->viewport_scissor;
972
973 OUT_REG(ring, A6XX_GRAS_CL_VPORT_XOFFSET(0, ctx->viewport.translate[0]),
974 A6XX_GRAS_CL_VPORT_XSCALE(0, ctx->viewport.scale[0]),
975 A6XX_GRAS_CL_VPORT_YOFFSET(0, ctx->viewport.translate[1]),
976 A6XX_GRAS_CL_VPORT_YSCALE(0, ctx->viewport.scale[1]),
977 A6XX_GRAS_CL_VPORT_ZOFFSET(0, ctx->viewport.translate[2]),
978 A6XX_GRAS_CL_VPORT_ZSCALE(0, ctx->viewport.scale[2]));
979
980 OUT_REG(
981 ring,
982 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = scissor->minx,
983 .y = scissor->miny),
984 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = MAX2(scissor->maxx, 1) - 1,
985 .y = MAX2(scissor->maxy, 1) - 1));
986
987 unsigned guardband_x = fd_calc_guardband(ctx->viewport.translate[0],
988 ctx->viewport.scale[0], false);
989 unsigned guardband_y = fd_calc_guardband(ctx->viewport.translate[1],
990 ctx->viewport.scale[1], false);
991
992 OUT_REG(ring, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ(.horz = guardband_x,
993 .vert = guardband_y));
994 }
995
996 /* The clamp ranges are only used when the rasterizer disables
997 * depth clip.
998 */
999 if ((dirty & (FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER)) &&
1000 fd_depth_clip_disabled(ctx)) {
1001 float zmin, zmax;
1002 util_viewport_zmin_zmax(&ctx->viewport, ctx->rasterizer->clip_halfz,
1003 &zmin, &zmax);
1004
1005 OUT_REG(ring, A6XX_GRAS_CL_Z_CLAMP_MIN(0, zmin),
1006 A6XX_GRAS_CL_Z_CLAMP_MAX(0, zmax));
1007
1008 OUT_REG(ring, A6XX_RB_Z_CLAMP_MIN(zmin), A6XX_RB_Z_CLAMP_MAX(zmax));
1009 }
1010 }
1011
1012 void
1013 fd6_emit_state(struct fd_ringbuffer *ring, struct fd6_emit *emit)
1014 {
1015 struct fd_context *ctx = emit->ctx;
1016 struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
1017 const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
1018 const struct ir3_shader_variant *vs = emit->vs;
1019 const struct ir3_shader_variant *hs = emit->hs;
1020 const struct ir3_shader_variant *ds = emit->ds;
1021 const struct ir3_shader_variant *gs = emit->gs;
1022 const struct ir3_shader_variant *fs = emit->fs;
1023 bool needs_border = false;
1024
1025 emit_marker6(ring, 5);
1026
1027 /* NOTE: we track fb_read differently than _BLEND_ENABLED since we
1028 * might decide to do sysmem in some cases when blend is enabled:
1029 */
1030 if (fs->fb_read)
1031 ctx->batch->gmem_reason |= FD_GMEM_FB_READ;
1032
1033 u_foreach_bit (b, emit->dirty_groups) {
1034 enum fd6_state_id group = b;
1035 struct fd_ringbuffer *state = NULL;
1036 uint32_t enable_mask = ENABLE_ALL;
1037
1038 switch (group) {
1039 case FD6_GROUP_VTXSTATE:
1040 state = fd6_vertex_stateobj(ctx->vtx.vtx)->stateobj;
1041 fd_ringbuffer_ref(state);
1042 break;
1043 case FD6_GROUP_VBO:
1044 state = build_vbo_state(emit);
1045 break;
1046 case FD6_GROUP_ZSA:
1047 state = fd6_zsa_state(
1048 ctx,
1049 util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0])),
1050 fd_depth_clip_disabled(ctx));
1051 fd_ringbuffer_ref(state);
1052 break;
1053 case FD6_GROUP_LRZ:
1054 state = build_lrz(emit, false);
1055 if (!state)
1056 continue;
1057 enable_mask = ENABLE_DRAW;
1058 break;
1059 case FD6_GROUP_LRZ_BINNING:
1060 state = build_lrz(emit, true);
1061 if (!state)
1062 continue;
1063 enable_mask = CP_SET_DRAW_STATE__0_BINNING;
1064 break;
1065 case FD6_GROUP_SCISSOR:
1066 state = build_scissor(emit);
1067 break;
1068 case FD6_GROUP_PROG:
1069 fd6_emit_add_group(emit, prog->config_stateobj, FD6_GROUP_PROG_CONFIG,
1070 ENABLE_ALL);
1071 fd6_emit_add_group(emit, prog->stateobj, FD6_GROUP_PROG, ENABLE_DRAW);
1072 fd6_emit_add_group(emit, prog->binning_stateobj,
1073 FD6_GROUP_PROG_BINNING,
1074 CP_SET_DRAW_STATE__0_BINNING);
1075
1076 /* emit remaining streaming program state, ie. what depends on
1077 * other emit state, so cannot be pre-baked.
1078 */
1079 fd6_emit_take_group(emit, fd6_program_interp_state(emit),
1080 FD6_GROUP_PROG_INTERP, ENABLE_DRAW);
1081 continue;
1082 case FD6_GROUP_RASTERIZER:
1083 state = fd6_rasterizer_state(ctx, emit->primitive_restart);
1084 fd_ringbuffer_ref(state);
1085 break;
1086 case FD6_GROUP_PROG_FB_RAST:
1087 state = build_prog_fb_rast(emit);
1088 break;
1089 case FD6_GROUP_BLEND:
1090 state = fd6_blend_variant(ctx->blend, pfb->samples, ctx->sample_mask)
1091 ->stateobj;
1092 fd_ringbuffer_ref(state);
1093 break;
1094 case FD6_GROUP_BLEND_COLOR:
1095 state = build_blend_color(emit);
1096 break;
1097 case FD6_GROUP_IBO:
1098 state = build_ibo(emit);
1099 break;
1100 case FD6_GROUP_CONST:
1101 state = fd6_build_user_consts(emit);
1102 break;
1103 case FD6_GROUP_DRIVER_PARAMS:
1104 state = fd6_build_driver_params(emit);
1105 break;
1106 case FD6_GROUP_PRIMITIVE_PARAMS:
1107 state = fd6_build_tess_consts(emit);
1108 break;
1109 case FD6_GROUP_VS_TEX:
1110 needs_border |=
1111 fd6_emit_combined_textures(ring, emit, PIPE_SHADER_VERTEX, vs);
1112 continue;
1113 case FD6_GROUP_HS_TEX:
1114 if (hs) {
1115 needs_border |= fd6_emit_combined_textures(
1116 ring, emit, PIPE_SHADER_TESS_CTRL, hs);
1117 }
1118 continue;
1119 case FD6_GROUP_DS_TEX:
1120 if (ds) {
1121 needs_border |= fd6_emit_combined_textures(
1122 ring, emit, PIPE_SHADER_TESS_EVAL, ds);
1123 }
1124 continue;
1125 case FD6_GROUP_GS_TEX:
1126 if (gs) {
1127 needs_border |=
1128 fd6_emit_combined_textures(ring, emit, PIPE_SHADER_GEOMETRY, gs);
1129 }
1130 continue;
1131 case FD6_GROUP_FS_TEX:
1132 needs_border |=
1133 fd6_emit_combined_textures(ring, emit, PIPE_SHADER_FRAGMENT, fs);
1134 continue;
1135 case FD6_GROUP_SO:
1136 fd6_emit_streamout(ring, emit);
1137 continue;
1138 case FD6_GROUP_NON_GROUP:
1139 fd6_emit_non_ring(ring, emit);
1140 continue;
1141 default:
1142 unreachable("bad state group");
1143 }
1144
1145 fd6_emit_take_group(emit, state, group, enable_mask);
1146 }
1147
1148 if (needs_border)
1149 emit_border_color(ctx, ring);
1150
1151 if (emit->num_groups > 0) {
1152 OUT_PKT7(ring, CP_SET_DRAW_STATE, 3 * emit->num_groups);
1153 for (unsigned i = 0; i < emit->num_groups; i++) {
1154 struct fd6_state_group *g = &emit->groups[i];
1155 unsigned n = g->stateobj ? fd_ringbuffer_size(g->stateobj) / 4 : 0;
1156
1157 assert((g->enable_mask & ~ENABLE_ALL) == 0);
1158
1159 if (n == 0) {
1160 OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
1161 CP_SET_DRAW_STATE__0_DISABLE | g->enable_mask |
1162 CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id));
1163 OUT_RING(ring, 0x00000000);
1164 OUT_RING(ring, 0x00000000);
1165 } else {
1166 OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(n) | g->enable_mask |
1167 CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id));
1168 OUT_RB(ring, g->stateobj);
1169 }
1170
1171 if (g->stateobj)
1172 fd_ringbuffer_del(g->stateobj);
1173 }
1174 emit->num_groups = 0;
1175 }
1176 }
1177
1178 void
1179 fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
1180 struct ir3_shader_variant *cp)
1181 {
1182 enum fd_dirty_shader_state dirty = ctx->dirty_shader[PIPE_SHADER_COMPUTE];
1183
1184 if (dirty & (FD_DIRTY_SHADER_TEX | FD_DIRTY_SHADER_PROG |
1185 FD_DIRTY_SHADER_IMAGE | FD_DIRTY_SHADER_SSBO)) {
1186 struct fd_texture_stateobj *tex = &ctx->tex[PIPE_SHADER_COMPUTE];
1187 unsigned bcolor_offset =
1188 fd6_border_color_offset(ctx, PIPE_SHADER_COMPUTE, tex);
1189
1190 bool needs_border = fd6_emit_textures(ctx, ring, PIPE_SHADER_COMPUTE, tex,
1191 bcolor_offset, cp);
1192
1193 if (needs_border)
1194 emit_border_color(ctx, ring);
1195
1196 OUT_PKT4(ring, REG_A6XX_SP_VS_TEX_COUNT, 1);
1197 OUT_RING(ring, 0);
1198
1199 OUT_PKT4(ring, REG_A6XX_SP_HS_TEX_COUNT, 1);
1200 OUT_RING(ring, 0);
1201
1202 OUT_PKT4(ring, REG_A6XX_SP_DS_TEX_COUNT, 1);
1203 OUT_RING(ring, 0);
1204
1205 OUT_PKT4(ring, REG_A6XX_SP_GS_TEX_COUNT, 1);
1206 OUT_RING(ring, 0);
1207
1208 OUT_PKT4(ring, REG_A6XX_SP_FS_TEX_COUNT, 1);
1209 OUT_RING(ring, 0);
1210 }
1211
1212 if (dirty & (FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE)) {
1213 struct fd_ringbuffer *state =
1214 fd6_build_ibo_state(ctx, cp, PIPE_SHADER_COMPUTE);
1215
1216 OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3);
1217 OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
1218 CP_LOAD_STATE6_0_STATE_TYPE(ST6_IBO) |
1219 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1220 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
1221 CP_LOAD_STATE6_0_NUM_UNIT(ir3_shader_nibo(cp)));
1222 OUT_RB(ring, state);
1223
1224 OUT_PKT4(ring, REG_A6XX_SP_CS_IBO, 2);
1225 OUT_RB(ring, state);
1226
1227 OUT_PKT4(ring, REG_A6XX_SP_CS_IBO_COUNT, 1);
1228 OUT_RING(ring, ir3_shader_nibo(cp));
1229
1230 fd_ringbuffer_del(state);
1231 }
1232 }
1233
1234 /* emit setup at the start of a new cmdstream buffer (don't rely on previous
1235 * state; there could have been a context switch between ioctls):
1236 */
1237 void
1238 fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
1239 {
1240 struct fd_screen *screen = batch->ctx->screen;
1241
1242 if (!batch->nondraw) {
1243 trace_start_state_restore(&batch->trace, ring);
1244 }
1245
1246 fd6_cache_inv(batch, ring);
1247
1248 OUT_REG(ring,
1249 A6XX_HLSQ_INVALIDATE_CMD(.vs_state = true, .hs_state = true,
1250 .ds_state = true, .gs_state = true,
1251 .fs_state = true, .cs_state = true,
1252 .gfx_ibo = true, .cs_ibo = true,
1253 .gfx_shared_const = true,
1254 .cs_shared_const = true,
1255 .gfx_bindless = 0x1f, .cs_bindless = 0x1f));
1256
1257 OUT_WFI5(ring);
1258
1259 WRITE(REG_A6XX_RB_UNKNOWN_8E04, 0x0);
1260 WRITE(REG_A6XX_SP_FLOAT_CNTL, A6XX_SP_FLOAT_CNTL_F16_NO_INF);
1261 WRITE(REG_A6XX_SP_UNKNOWN_AE00, 0);
1262 WRITE(REG_A6XX_SP_PERFCTR_ENABLE, 0x3f);
1263 WRITE(REG_A6XX_TPL1_UNKNOWN_B605, 0x44);
1264 WRITE(REG_A6XX_TPL1_DBG_ECO_CNTL, screen->info->a6xx.magic.TPL1_DBG_ECO_CNTL);
1265 WRITE(REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
1266 WRITE(REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
1267
1268 WRITE(REG_A6XX_VPC_UNKNOWN_9600, 0);
1269 WRITE(REG_A6XX_GRAS_DBG_ECO_CNTL, 0x880);
1270 WRITE(REG_A6XX_HLSQ_UNKNOWN_BE04, 0x80000);
1271 WRITE(REG_A6XX_SP_CHICKEN_BITS, 0x1430);
1272 WRITE(REG_A6XX_SP_IBO_COUNT, 0);
1273 WRITE(REG_A6XX_SP_UNKNOWN_B182, 0);
1274 WRITE(REG_A6XX_HLSQ_SHARED_CONSTS, 0);
1275 WRITE(REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
1276 WRITE(REG_A6XX_UCHE_CLIENT_PF, 4);
1277 WRITE(REG_A6XX_RB_UNKNOWN_8E01, 0x1);
1278 WRITE(REG_A6XX_SP_MODE_CONTROL,
1279 A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);
1280 WRITE(REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
1281 WRITE(REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
1282 WRITE(REG_A6XX_PC_MODE_CNTL, 0x1f);
1283
1284 WRITE(REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 0);
1285 WRITE(REG_A6XX_GRAS_SAMPLE_CNTL, 0);
1286 WRITE(REG_A6XX_GRAS_UNKNOWN_8110, 0x2);
1287
1288 WRITE(REG_A6XX_RB_UNKNOWN_8818, 0);
1289 WRITE(REG_A6XX_RB_UNKNOWN_8819, 0);
1290 WRITE(REG_A6XX_RB_UNKNOWN_881A, 0);
1291 WRITE(REG_A6XX_RB_UNKNOWN_881B, 0);
1292 WRITE(REG_A6XX_RB_UNKNOWN_881C, 0);
1293 WRITE(REG_A6XX_RB_UNKNOWN_881D, 0);
1294 WRITE(REG_A6XX_RB_UNKNOWN_881E, 0);
1295 WRITE(REG_A6XX_RB_UNKNOWN_88F0, 0);
1296
1297 WRITE(REG_A6XX_VPC_POINT_COORD_INVERT, A6XX_VPC_POINT_COORD_INVERT(0).value);
1298 WRITE(REG_A6XX_VPC_UNKNOWN_9300, 0);
1299
1300 WRITE(REG_A6XX_VPC_SO_DISABLE, A6XX_VPC_SO_DISABLE(true).value);
1301
1302 WRITE(REG_A6XX_PC_RASTER_CNTL, 0);
1303
1304 WRITE(REG_A6XX_PC_MULTIVIEW_CNTL, 0);
1305
1306 WRITE(REG_A6XX_SP_UNKNOWN_B183, 0);
1307
1308 WRITE(REG_A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 0);
1309 WRITE(REG_A6XX_GRAS_VS_LAYER_CNTL, 0);
1310 WRITE(REG_A6XX_GRAS_SC_CNTL, A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
1311 WRITE(REG_A6XX_GRAS_UNKNOWN_80AF, 0);
1312 WRITE(REG_A6XX_VPC_UNKNOWN_9210, 0);
1313 WRITE(REG_A6XX_VPC_UNKNOWN_9211, 0);
1314 WRITE(REG_A6XX_VPC_UNKNOWN_9602, 0);
1315 WRITE(REG_A6XX_PC_UNKNOWN_9E72, 0);
1316 WRITE(REG_A6XX_SP_TP_SAMPLE_CONFIG, 0);
1317 /* NOTE blob seems to (mostly?) use 0xb2 for SP_TP_MODE_CNTL
1318 * but this seems to kill texture gather offsets.
1319 */
1320 WRITE(REG_A6XX_SP_TP_MODE_CNTL, 0xa0 |
1321 A6XX_SP_TP_MODE_CNTL_ISAMMODE(ISAMMODE_GL));
1322 WRITE(REG_A6XX_RB_SAMPLE_CONFIG, 0);
1323 WRITE(REG_A6XX_GRAS_SAMPLE_CONFIG, 0);
1324 WRITE(REG_A6XX_RB_Z_BOUNDS_MIN, 0);
1325 WRITE(REG_A6XX_RB_Z_BOUNDS_MAX, 0);
1326 WRITE(REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);
1327
1328 emit_marker6(ring, 7);
1329
1330 OUT_PKT4(ring, REG_A6XX_VFD_MODE_CNTL, 1);
1331 OUT_RING(ring, 0x00000000); /* VFD_MODE_CNTL */
1332
1333 WRITE(REG_A6XX_VFD_MULTIVIEW_CNTL, 0);
1334
1335 OUT_PKT4(ring, REG_A6XX_PC_MODE_CNTL, 1);
1336 OUT_RING(ring, 0x0000001f); /* PC_MODE_CNTL */
1337
1338 /* Clear any potential pending state groups to be safe: */
1339 OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
1340 OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
1341 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
1342 CP_SET_DRAW_STATE__0_GROUP_ID(0));
1343 OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1344 OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1345
1346 OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_CNTL, 1);
1347 OUT_RING(ring, 0x00000000); /* VPC_SO_STREAM_CNTL */
1348
1349 OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1);
1350 OUT_RING(ring, 0x00000000);
1351
1352 OUT_PKT4(ring, REG_A6XX_RB_LRZ_CNTL, 1);
1353 OUT_RING(ring, 0x00000000);
1354
1355 /* Initialize VFD_FETCH[n].SIZE to zero to avoid iova faults trying
1356 * to fetch from a VFD_FETCH[n].BASE which we've potentially inherited
1357 * from another process:
1358 */
1359 for (int32_t i = 0; i < 32; i++) {
1360 OUT_PKT4(ring, REG_A6XX_VFD_FETCH_SIZE(i), 1);
1361 OUT_RING(ring, 0);
1362 }
1363
1364 /* This happens after all drawing has been emitted to the draw CS, so we know
1365 * whether we need the tess BO pointers.
1366 */
1367 if (batch->tessellation) {
1368 assert(screen->tess_bo);
1369 OUT_PKT4(ring, REG_A6XX_PC_TESSFACTOR_ADDR, 2);
1370 OUT_RELOC(ring, screen->tess_bo, 0, 0, 0);
1371 /* Updating PC_TESSFACTOR_ADDR could race with the next draw which uses it. */
1372 OUT_WFI5(ring);
1373 }
1374
1375 if (!batch->nondraw) {
1376 trace_end_state_restore(&batch->trace, ring);
1377 }
1378 }
1379
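/* Simple GPU-side copy used for screen->mem_to_mem: copies 'sizedwords'
 * 32-bit words from src to dst, one CP_MEM_TO_MEM packet per dword.
 */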
1380 static void
1381 fd6_mem_to_mem(struct fd_ringbuffer *ring, struct pipe_resource *dst,
1382 unsigned dst_off, struct pipe_resource *src, unsigned src_off,
1383 unsigned sizedwords)
1384 {
1385 struct fd_bo *src_bo = fd_resource(src)->bo;
1386 struct fd_bo *dst_bo = fd_resource(dst)->bo;
1387 unsigned i;
1388
1389 for (i = 0; i < sizedwords; i++) {
1390 OUT_PKT7(ring, CP_MEM_TO_MEM, 5);
1391 OUT_RING(ring, 0x00000000);
1392 OUT_RELOC(ring, dst_bo, dst_off, 0, 0);
1393 OUT_RELOC(ring, src_bo, src_off, 0, 0);
1394
1395 dst_off += 4;
1396 src_off += 4;
1397 }
1398 }
1399
1400 /* this is *almost* the same as fd6_cache_flush().. which I guess
1401 * could be re-worked to be something a bit more generic w/ param
1402 * indicating what needs to be flushed.. although that would mean
1403 * figuring out which events trigger what state to flush..
1404 */
1405 static void
1406 fd6_framebuffer_barrier(struct fd_context *ctx) assert_dt
1407 {
1408 struct fd6_context *fd6_ctx = fd6_context(ctx);
1409 struct fd_batch *batch = fd_context_batch_locked(ctx);
1410 struct fd_ringbuffer *ring = batch->draw;
1411 unsigned seqno;
1412
1413 fd_batch_needs_flush(batch);
1414
1415 seqno = fd6_event_write(batch, ring, RB_DONE_TS, true);
1416
1417 OUT_PKT7(ring, CP_WAIT_REG_MEM, 6);
1418 OUT_RING(ring, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
1419 CP_WAIT_REG_MEM_0_POLL_MEMORY);
1420 OUT_RELOC(ring, control_ptr(fd6_ctx, seqno));
1421 OUT_RING(ring, CP_WAIT_REG_MEM_3_REF(seqno));
1422 OUT_RING(ring, CP_WAIT_REG_MEM_4_MASK(~0));
1423 OUT_RING(ring, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
1424
1425 fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true);
1426 fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true);
1427
1428 seqno = fd6_event_write(batch, ring, CACHE_FLUSH_TS, true);
1429 fd_wfi(batch, ring);
1430
1431 fd6_event_write(batch, ring, CACHE_INVALIDATE, false);
1432
1433 OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4);
1434 OUT_RING(ring, CP_WAIT_MEM_GTE_0_RESERVED(0));
1435 OUT_RELOC(ring, control_ptr(fd6_ctx, seqno));
1436 OUT_RING(ring, CP_WAIT_MEM_GTE_3_REF(seqno));
1437
1438 fd_batch_unlock_submit(batch);
1439 fd_batch_reference(&batch, NULL);
1440 }
1441
1442 void
1443 fd6_emit_init_screen(struct pipe_screen *pscreen)
1444 {
1445 struct fd_screen *screen = fd_screen(pscreen);
1446 screen->emit_ib = fd6_emit_ib;
1447 screen->mem_to_mem = fd6_mem_to_mem;
1448 }
1449
1450 void
1451 fd6_emit_init(struct pipe_context *pctx) disable_thread_safety_analysis
1452 {
1453 struct fd_context *ctx = fd_context(pctx);
1454 ctx->framebuffer_barrier = fd6_framebuffer_barrier;
1455 }
1456