/*
 * Copyright (C) 2016 Rob Clark <robclark@freedesktop.org>
 * Copyright © 2018 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_state.h"
#include "util/format/u_format.h"
#include "util/u_helpers.h"
#include "util/u_memory.h"
#include "util/u_string.h"
#include "util/u_viewport.h"

#include "common/freedreno_guardband.h"
#include "freedreno_query_hw.h"
#include "freedreno_resource.h"
#include "freedreno_state.h"
#include "freedreno_tracepoints.h"

#include "fd6_blend.h"
#include "fd6_const.h"
#include "fd6_context.h"
#include "fd6_emit.h"
#include "fd6_image.h"
#include "fd6_pack.h"
#include "fd6_program.h"
#include "fd6_rasterizer.h"
#include "fd6_texture.h"
#include "fd6_zsa.h"

/* Border color layout is diff from a4xx/a5xx.. if it turns out to be
 * the same on later gens then move this somewhere common ;-)
 *
 * Entry layout looks like (total size, 0x80 bytes):
 */

struct PACKED bcolor_entry {
   uint32_t fp32[4];
   uint16_t ui16[4];
   int16_t si16[4];
   uint16_t fp16[4];
   uint16_t rgb565;
   uint16_t rgb5a1;
   uint16_t rgba4;
   uint8_t __pad0[2];
   uint8_t ui8[4];
   int8_t si8[4];
   uint32_t rgb10a2;
   uint32_t z24;
   uint16_t srgb[4]; /* appears to duplicate fp16[], but clamped, used for srgb */
   uint8_t __pad1[56];
};

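/* Two sets of border color entries (VS then FS) are uploaded back to
 * back, hence the factor of two below (see emit_border_color()):
 */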
#define FD6_BORDER_COLOR_SIZE sizeof(struct bcolor_entry)
#define FD6_BORDER_COLOR_UPLOAD_SIZE                                           \
   (2 * PIPE_MAX_SAMPLERS * FD6_BORDER_COLOR_SIZE)

static void
setup_border_colors(struct fd_texture_stateobj *tex,
                    struct bcolor_entry *entries,
                    struct fd_screen *screen)
{
   unsigned i, j;
   STATIC_ASSERT(sizeof(struct bcolor_entry) == FD6_BORDER_COLOR_SIZE);
   const bool has_z24uint_s8uint = screen->info->a6xx.has_z24uint_s8uint;

   for (i = 0; i < tex->num_samplers; i++) {
      struct bcolor_entry *e = &entries[i];
      struct pipe_sampler_state *sampler = tex->samplers[i];
      union pipe_color_union *bc;

      if (!sampler)
         continue;

      bc = &sampler->border_color;

      /*
       * XXX HACK ALERT XXX
       *
       * The border colors need to be swizzled in a particular
       * format-dependent order. Even though samplers don't know about
       * formats, we can assume that with a GL state tracker, there's a
       * 1:1 correspondence between sampler and texture. Take advantage
       * of that knowledge.
       */
      if ((i >= tex->num_textures) || !tex->textures[i])
         continue;

      struct pipe_sampler_view *view = tex->textures[i];
      enum pipe_format format = view->format;
      const struct util_format_description *desc =
         util_format_description(format);

      e->rgb565 = 0;
      e->rgb5a1 = 0;
      e->rgba4 = 0;
      e->rgb10a2 = 0;
      e->z24 = 0;

      unsigned char swiz[4];

      fdl6_format_swiz(format, false, swiz);

      for (j = 0; j < 4; j++) {
         int c = swiz[j];
         int cd = c;

         /*
          * HACK: for PIPE_FORMAT_X24S8_UINT we end up w/ the
          * stencil border color value in bc->ui[0] but according
          * to desc->swizzle and desc->channel, the .x/.w component
          * is NONE and the stencil value is in the y component.
          * Meanwhile the hardware wants this in the .x component
          * for x24s8 and x32_s8x24, or the .y component for x24s8 with the
          * special Z24UINT_S8UINT format.
          */
         if ((format == PIPE_FORMAT_X24S8_UINT) ||
             (format == PIPE_FORMAT_X32_S8X24_UINT)) {
            if (j == 0) {
               c = 1;
               cd = (format == PIPE_FORMAT_X24S8_UINT && has_z24uint_s8uint) ? 1 : 0;
            } else {
               continue;
            }
         }

         if (c >= 4)
            continue;

         if (desc->channel[c].pure_integer) {
            uint16_t clamped;
            switch (desc->channel[c].size) {
            case 2:
               assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED);
               clamped = CLAMP(bc->ui[j], 0, 0x3);
               break;
            case 8:
               if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED)
                  clamped = CLAMP(bc->i[j], -128, 127);
               else
                  clamped = CLAMP(bc->ui[j], 0, 255);
               break;
            case 10:
               assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED);
               clamped = CLAMP(bc->ui[j], 0, 0x3ff);
               break;
            case 16:
               if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED)
                  clamped = CLAMP(bc->i[j], -32768, 32767);
               else
                  clamped = CLAMP(bc->ui[j], 0, 65535);
               break;
            default:
               assert(!"Unexpected bit size");
            case 32:
               clamped = 0;
               break;
            }
            e->fp32[cd] = bc->ui[j];
            e->fp16[cd] = clamped;
         } else {
            float f = bc->f[j];
            float f_u = CLAMP(f, 0, 1);
            float f_s = CLAMP(f, -1, 1);

            e->fp32[c] = fui(f);
            e->fp16[c] = _mesa_float_to_half(f);
            e->srgb[c] = _mesa_float_to_half(f_u);
            e->ui16[c] = f_u * 0xffff;
            e->si16[c] = f_s * 0x7fff;
            e->ui8[c] = f_u * 0xff;
            e->si8[c] = f_s * 0x7f;
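            /* pack into the small color formats as well.. note green
             * gets 6 bits in rgb565 (shifted past the 5b red component),
             * while red/blue get 5 bits at bit 0 and 11 respectively:
             */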
            if (c == 1)
               e->rgb565 |= (int)(f_u * 0x3f) << 5;
            else if (c < 3)
               e->rgb565 |= (int)(f_u * 0x1f) << (c ? 11 : 0);
            if (c == 3)
               e->rgb5a1 |= (f_u > 0.5f) ? 0x8000 : 0;
            else
               e->rgb5a1 |= (int)(f_u * 0x1f) << (c * 5);
            if (c == 3)
               e->rgb10a2 |= (int)(f_u * 0x3) << 30;
            else
               e->rgb10a2 |= (int)(f_u * 0x3ff) << (c * 10);
            e->rgba4 |= (int)(f_u * 0xf) << (c * 4);
            if (c == 0)
               e->z24 = f_u * 0xffffff;
         }
      }

#ifdef DEBUG
      memset(&e->__pad0, 0, sizeof(e->__pad0));
      memset(&e->__pad1, 0, sizeof(e->__pad1));
#endif
   }
}

static void
emit_border_color(struct fd_context *ctx, struct fd_ringbuffer *ring) assert_dt
{
   struct fd6_context *fd6_ctx = fd6_context(ctx);
   struct bcolor_entry *entries;
   unsigned off;
   void *ptr;

   STATIC_ASSERT(sizeof(struct bcolor_entry) == FD6_BORDER_COLOR_SIZE);

   u_upload_alloc(fd6_ctx->border_color_uploader, 0,
                  FD6_BORDER_COLOR_UPLOAD_SIZE, FD6_BORDER_COLOR_UPLOAD_SIZE,
                  &off, &fd6_ctx->border_color_buf, &ptr);

   entries = ptr;

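   /* write the VS bcolor entries first, with FS entries immediately
    * after, matching the bcolor offsets programmed in the sampler state:
    */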
   setup_border_colors(&ctx->tex[PIPE_SHADER_VERTEX], &entries[0], ctx->screen);
   setup_border_colors(&ctx->tex[PIPE_SHADER_FRAGMENT],
                       &entries[ctx->tex[PIPE_SHADER_VERTEX].num_samplers],
                       ctx->screen);

   OUT_PKT4(ring, REG_A6XX_SP_TP_BORDER_COLOR_BASE_ADDR, 2);
   OUT_RELOC(ring, fd_resource(fd6_ctx->border_color_buf)->bo, off, 0, 0);

   u_upload_unmap(fd6_ctx->border_color_uploader);
}

static void
fd6_emit_fb_tex(struct fd_ringbuffer *state, struct fd_context *ctx) assert_dt
{
   struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
   struct pipe_surface *psurf = pfb->cbufs[0];
   struct fd_resource *rsc = fd_resource(psurf->texture);

   OUT_RINGP(state, 0, &ctx->batch->fb_read_patches); /* texconst0, patched in gmem emit */
   OUT_RING(state, A6XX_TEX_CONST_1_WIDTH(pfb->width) |
                      A6XX_TEX_CONST_1_HEIGHT(pfb->height));
   OUT_RING(state, 0); /* texconst2, patched in gmem emit */
   OUT_RING(state, A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size));
   OUT_RING(state, 0); /* BASE_LO, patched in gmem emit */
   OUT_RING(state, 0); /* BASE_HI, patched in gmem emit */
   OUT_RING(state, 0); /* texconst6 */
   OUT_RING(state, 0); /* texconst7 */
   OUT_RING(state, 0); /* texconst8 */
   OUT_RING(state, 0); /* texconst9 */
   OUT_RING(state, 0); /* texconst10 */
   OUT_RING(state, 0); /* texconst11 */
   OUT_RING(state, 0);
   OUT_RING(state, 0);
   OUT_RING(state, 0);
   OUT_RING(state, 0);
}

bool
fd6_emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
                  enum pipe_shader_type type, struct fd_texture_stateobj *tex,
                  unsigned bcolor_offset,
                  /* can be NULL if no image/SSBO/fb state to merge in: */
                  const struct ir3_shader_variant *v)
{
   bool needs_border = false;
   unsigned opcode, tex_samp_reg, tex_const_reg, tex_count_reg;
   enum a6xx_state_block sb;

   switch (type) {
   case PIPE_SHADER_VERTEX:
      sb = SB6_VS_TEX;
      opcode = CP_LOAD_STATE6_GEOM;
      tex_samp_reg = REG_A6XX_SP_VS_TEX_SAMP;
      tex_const_reg = REG_A6XX_SP_VS_TEX_CONST;
      tex_count_reg = REG_A6XX_SP_VS_TEX_COUNT;
      break;
   case PIPE_SHADER_TESS_CTRL:
      sb = SB6_HS_TEX;
      opcode = CP_LOAD_STATE6_GEOM;
      tex_samp_reg = REG_A6XX_SP_HS_TEX_SAMP;
      tex_const_reg = REG_A6XX_SP_HS_TEX_CONST;
      tex_count_reg = REG_A6XX_SP_HS_TEX_COUNT;
      break;
   case PIPE_SHADER_TESS_EVAL:
      sb = SB6_DS_TEX;
      opcode = CP_LOAD_STATE6_GEOM;
      tex_samp_reg = REG_A6XX_SP_DS_TEX_SAMP;
      tex_const_reg = REG_A6XX_SP_DS_TEX_CONST;
      tex_count_reg = REG_A6XX_SP_DS_TEX_COUNT;
      break;
   case PIPE_SHADER_GEOMETRY:
      sb = SB6_GS_TEX;
      opcode = CP_LOAD_STATE6_GEOM;
      tex_samp_reg = REG_A6XX_SP_GS_TEX_SAMP;
      tex_const_reg = REG_A6XX_SP_GS_TEX_CONST;
      tex_count_reg = REG_A6XX_SP_GS_TEX_COUNT;
      break;
   case PIPE_SHADER_FRAGMENT:
      sb = SB6_FS_TEX;
      opcode = CP_LOAD_STATE6_FRAG;
      tex_samp_reg = REG_A6XX_SP_FS_TEX_SAMP;
      tex_const_reg = REG_A6XX_SP_FS_TEX_CONST;
      tex_count_reg = REG_A6XX_SP_FS_TEX_COUNT;
      break;
   case PIPE_SHADER_COMPUTE:
      sb = SB6_CS_TEX;
      opcode = CP_LOAD_STATE6_FRAG;
      tex_samp_reg = REG_A6XX_SP_CS_TEX_SAMP;
      tex_const_reg = REG_A6XX_SP_CS_TEX_CONST;
      tex_count_reg = REG_A6XX_SP_CS_TEX_COUNT;
      break;
   default:
      unreachable("bad state block");
   }

   if (tex->num_samplers > 0) {
      struct fd_ringbuffer *state =
         fd_ringbuffer_new_object(ctx->pipe, tex->num_samplers * 4 * 4);
      for (unsigned i = 0; i < tex->num_samplers; i++) {
         static const struct fd6_sampler_stateobj dummy_sampler = {};
         const struct fd6_sampler_stateobj *sampler =
            tex->samplers[i] ? fd6_sampler_stateobj(tex->samplers[i])
                             : &dummy_sampler;
         OUT_RING(state, sampler->texsamp0);
         OUT_RING(state, sampler->texsamp1);
         OUT_RING(state, sampler->texsamp2 |
                            A6XX_TEX_SAMP_2_BCOLOR(i + bcolor_offset));
         OUT_RING(state, sampler->texsamp3);
         needs_border |= sampler->needs_border;
      }

      /* output sampler state: */
      OUT_PKT7(ring, opcode, 3);
      OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
                        CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                        CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                        CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
                        CP_LOAD_STATE6_0_NUM_UNIT(tex->num_samplers));
      OUT_RB(ring, state); /* SRC_ADDR_LO/HI */

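      /* also point the per-stage SP_xS_TEX_SAMP address registers at
       * the same sampler state object:
       */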
      OUT_PKT4(ring, tex_samp_reg, 2);
      OUT_RB(ring, state); /* SRC_ADDR_LO/HI */

      fd_ringbuffer_del(state);
   }

   unsigned num_merged_textures = tex->num_textures;
   unsigned num_textures = tex->num_textures;
   if (v) {
      num_merged_textures += v->image_mapping.num_tex;

      if (v->fb_read)
         num_merged_textures++;

      /* There could be more bound textures than what the shader uses,
       * and that isn't known at shader compile time.  So in the case
       * that we are merging tex state, only emit the textures that the
       * shader uses (since the image/SSBO related tex state comes
       * immediately after):
       */
      num_textures = v->image_mapping.tex_base;
   }

   if (num_merged_textures > 0) {
      struct fd_ringbuffer *state =
         fd_ringbuffer_new_object(ctx->pipe, num_merged_textures * 16 * 4);
      for (unsigned i = 0; i < num_textures; i++) {
         const struct fd6_pipe_sampler_view *view;

         if (tex->textures[i]) {
            view = fd6_pipe_sampler_view(tex->textures[i]);
            if (unlikely(view->rsc_seqno !=
                         fd_resource(view->base.texture)->seqno)) {
               fd6_sampler_view_update(ctx,
                                       fd6_pipe_sampler_view(tex->textures[i]));
            }
         } else {
            static const struct fd6_pipe_sampler_view dummy_view = {};
            view = &dummy_view;
         }

         OUT_RING(state, view->descriptor[0]);
         OUT_RING(state, view->descriptor[1]);
         OUT_RING(state, view->descriptor[2]);
         OUT_RING(state, view->descriptor[3]);

         if (view->ptr1) {
            OUT_RELOC(state, view->ptr1->bo, view->descriptor[4],
                      (uint64_t)view->descriptor[5] << 32, 0);
         } else {
            OUT_RING(state, view->descriptor[4]);
            OUT_RING(state, view->descriptor[5]);
         }

         OUT_RING(state, view->descriptor[6]);

         if (view->ptr2) {
            OUT_RELOC(state, view->ptr2->bo, view->descriptor[7], 0, 0);
         } else {
            OUT_RING(state, view->descriptor[7]);
            OUT_RING(state, view->descriptor[8]);
         }

         OUT_RING(state, view->descriptor[9]);
         OUT_RING(state, view->descriptor[10]);
         OUT_RING(state, view->descriptor[11]);
         OUT_RING(state, view->descriptor[12]);
         OUT_RING(state, view->descriptor[13]);
         OUT_RING(state, view->descriptor[14]);
         OUT_RING(state, view->descriptor[15]);
      }

      if (v) {
         const struct ir3_ibo_mapping *mapping = &v->image_mapping;
         struct fd_shaderbuf_stateobj *buf = &ctx->shaderbuf[type];
         struct fd_shaderimg_stateobj *img = &ctx->shaderimg[type];

         for (unsigned i = 0; i < mapping->num_tex; i++) {
            unsigned idx = mapping->tex_to_image[i];
            if (idx & IBO_SSBO) {
               fd6_emit_ssbo_tex(ctx, state, &buf->sb[idx & ~IBO_SSBO]);
            } else {
               fd6_emit_image_tex(ctx, state, &img->si[idx]);
            }
         }

         if (v->fb_read) {
            fd6_emit_fb_tex(state, ctx);
         }
      }

      /* emit texture state: */
      OUT_PKT7(ring, opcode, 3);
      OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
                        CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                        CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                        CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
                        CP_LOAD_STATE6_0_NUM_UNIT(num_merged_textures));
      OUT_RB(ring, state); /* SRC_ADDR_LO/HI */

      OUT_PKT4(ring, tex_const_reg, 2);
      OUT_RB(ring, state); /* SRC_ADDR_LO/HI */

      fd_ringbuffer_del(state);
   }

   OUT_PKT4(ring, tex_count_reg, 1);
   OUT_RING(ring, num_merged_textures);

   return needs_border;
}

/* Emits combined texture state, which also includes any Image/SSBO
 * related texture state merged in (because we must have all texture
 * state for a given stage in a single buffer).  In the fast-path, if
 * we don't need to merge in any image/ssbo related texture state, we
 * just use the cached texture stateobj.  Otherwise we generate a
 * single-use stateobj.
 *
 * TODO Is there some sane way we can still use cached texture stateobj
 * with image/ssbo in use?
 *
 * returns whether border_color is required:
 */
static bool
fd6_emit_combined_textures(struct fd_ringbuffer *ring, struct fd6_emit *emit,
                           enum pipe_shader_type type,
                           const struct ir3_shader_variant *v) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   bool needs_border = false;

   static const struct {
      enum fd6_state_id state_id;
      unsigned enable_mask;
   } s[PIPE_SHADER_TYPES] = {
      [PIPE_SHADER_VERTEX] = {FD6_GROUP_VS_TEX, ENABLE_ALL},
      [PIPE_SHADER_TESS_CTRL] = {FD6_GROUP_HS_TEX, ENABLE_ALL},
      [PIPE_SHADER_TESS_EVAL] = {FD6_GROUP_DS_TEX, ENABLE_ALL},
      [PIPE_SHADER_GEOMETRY] = {FD6_GROUP_GS_TEX, ENABLE_ALL},
      [PIPE_SHADER_FRAGMENT] = {FD6_GROUP_FS_TEX, ENABLE_DRAW},
   };

   assert(s[type].state_id);

   if (!v->image_mapping.num_tex && !v->fb_read) {
      /* in the fast-path, when we don't have to mix in any image/SSBO
       * related texture state, we can just lookup the stateobj and
       * re-emit that:
       *
       * Also, framebuffer-read is a slow-path because an extra
       * texture needs to be inserted.
       *
       * TODO we can probably simplify things if we also treated
       * border_color as a slow-path.. this way the tex state key
       * wouldn't depend on bcolor_offset.. but fb_read might rather
       * be *somehow* a fast-path if we eventually used it for PLS.
       * I suppose there would be no harm in just *always* inserting
       * an fb_read texture?
       */
      if ((ctx->dirty_shader[type] & FD_DIRTY_SHADER_TEX) &&
          ctx->tex[type].num_textures > 0) {
         struct fd6_texture_state *tex =
            fd6_texture_state(ctx, type, &ctx->tex[type]);

         needs_border |= tex->needs_border;

         fd6_emit_add_group(emit, tex->stateobj, s[type].state_id,
                            s[type].enable_mask);

         fd6_texture_state_reference(&tex, NULL);
      }
   } else {
      /* In the slow-path, create a one-shot texture state object
       * if either TEX|PROG|SSBO|IMAGE state is dirty:
       */
      if ((ctx->dirty_shader[type] &
           (FD_DIRTY_SHADER_TEX | FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_IMAGE |
            FD_DIRTY_SHADER_SSBO)) ||
          v->fb_read) {
         struct fd_texture_stateobj *tex = &ctx->tex[type];
         struct fd_ringbuffer *stateobj = fd_submit_new_ringbuffer(
            ctx->batch->submit, 0x1000, FD_RINGBUFFER_STREAMING);
         unsigned bcolor_offset = fd6_border_color_offset(ctx, type, tex);

         needs_border |=
            fd6_emit_textures(ctx, stateobj, type, tex, bcolor_offset, v);

         fd6_emit_take_group(emit, stateobj, s[type].state_id,
                             s[type].enable_mask);
      }
   }

   return needs_border;
}

static struct fd_ringbuffer *
build_vbo_state(struct fd6_emit *emit) assert_dt
{
   const struct fd_vertex_state *vtx = emit->vtx;

   /* Limit PKT4 size, because at max count (32) we would overflow the
    * size of the PKT4 size field:
    */
   const unsigned maxcnt = 16;
   const unsigned cnt = vtx->vertexbuf.count;
   const unsigned dwords = (cnt * 4) /* per vbo: reg64 + two reg32 */
               + (1 + cnt / maxcnt); /* PKT4 hdr every 16 vbo's */

   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      emit->ctx->batch->submit, 4 * dwords, FD_RINGBUFFER_STREAMING);

   for (int32_t j = 0; j < cnt; j++) {
      if ((j % maxcnt) == 0) {
         unsigned sz = MIN2(maxcnt, cnt - j);
         OUT_PKT4(ring, REG_A6XX_VFD_FETCH(j), 4 * sz);
      }
      const struct pipe_vertex_buffer *vb = &vtx->vertexbuf.vb[j];
      struct fd_resource *rsc = fd_resource(vb->buffer.resource);
      if (rsc == NULL) {
         OUT_RING(ring, 0);
         OUT_RING(ring, 0);
         OUT_RING(ring, 0);
         OUT_RING(ring, 0);
      } else {
         uint32_t off = vb->buffer_offset;
         uint32_t size = vb->buffer.resource->width0 - off;

         OUT_RELOC(ring, rsc->bo, off, 0, 0);
         OUT_RING(ring, size);       /* VFD_FETCH[j].SIZE */
         OUT_RING(ring, vb->stride); /* VFD_FETCH[j].STRIDE */
      }
   }

   return ring;
}

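/* Decide whether the hw can do the depth test before the fragment
 * shader runs (EARLY_Z), has to defer it until after (LATE_Z), or can
 * use the hybrid mode where LRZ rejects early but the full z test
 * still happens late (EARLY_LRZ_LATE_Z):
 */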
static enum a6xx_ztest_mode
compute_ztest_mode(struct fd6_emit *emit, bool lrz_valid) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
   struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa);
   const struct ir3_shader_variant *fs = emit->fs;

   if (fs->fs.early_fragment_tests)
      return A6XX_EARLY_Z;

   if (fs->no_earlyz || fs->writes_pos || !zsa->base.depth_enabled ||
       fs->writes_stencilref) {
      return A6XX_LATE_Z;
   } else if ((fs->has_kill || zsa->alpha_test) &&
              (zsa->writes_zs || !pfb->zsbuf)) {
      /* Slightly odd, but seems like the hw wants us to select
       * LATE_Z mode if there is no depth buffer + discard.  Either
       * that, or when occlusion query is enabled.  See:
       *
       * dEQP-GLES31.functional.fbo.no_attachments.*
       */
      return lrz_valid ? A6XX_EARLY_LRZ_LATE_Z : A6XX_LATE_Z;
   } else {
      return A6XX_EARLY_Z;
   }
}

/**
 * Calculate normalized LRZ state based on zsa/prog/blend state, updating
 * the zsbuf's lrz state as necessary to detect the cases where we need
 * to invalidate lrz.
 */
static struct fd6_lrz_state
compute_lrz_state(struct fd6_emit *emit, bool binning_pass) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
   const struct ir3_shader_variant *fs = emit->fs;
   struct fd6_lrz_state lrz;

   if (!pfb->zsbuf) {
      memset(&lrz, 0, sizeof(lrz));
      if (!binning_pass) {
         lrz.z_mode = compute_ztest_mode(emit, false);
      }
      return lrz;
   }

   struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend);
   struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa);
   struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);

   lrz = zsa->lrz;

   /* normalize lrz state: */
   if (blend->reads_dest || fs->writes_pos || fs->no_earlyz || fs->has_kill ||
       blend->base.alpha_to_coverage) {
      lrz.write = false;
      if (binning_pass)
         lrz.enable = false;
   }

   /* if we change depthfunc direction, bail out on using LRZ.  The
    * LRZ buffer encodes a min/max depth value per block, but if
    * we switch from GT/GE <-> LT/LE, those values cannot be
    * interpreted properly.
    */
   if (zsa->base.depth_enabled && (rsc->lrz_direction != FD_LRZ_UNKNOWN) &&
       (rsc->lrz_direction != lrz.direction)) {
      rsc->lrz_valid = false;
   }

   if (zsa->invalidate_lrz || !rsc->lrz_valid) {
      rsc->lrz_valid = false;
      memset(&lrz, 0, sizeof(lrz));
   }

   if (fs->no_earlyz || fs->writes_pos) {
      lrz.enable = false;
      lrz.write = false;
      lrz.test = false;
   }

   if (!binning_pass) {
      lrz.z_mode = compute_ztest_mode(emit, rsc->lrz_valid);
   }

   /* Once we start writing to the real depth buffer, we lock in the
    * direction for LRZ.. if we have to skip a LRZ write for any
    * reason, it is still safe to have LRZ until there is a direction
    * reversal.  Prior to the reversal, since we disabled LRZ writes
    * in the "unsafe" cases, this just means that the LRZ test may
    * not early-discard some things that end up not passing a later
    * test (ie. be overly conservative).  But once you have a reversal
    * of direction, it is possible to increase/decrease the z value
    * to the point where the overly-conservative test is incorrect.
    */
   if (zsa->base.depth_writemask) {
      rsc->lrz_direction = lrz.direction;
   }

   return lrz;
}

static struct fd_ringbuffer *
build_lrz(struct fd6_emit *emit, bool binning_pass) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct fd6_context *fd6_ctx = fd6_context(ctx);
   struct fd6_lrz_state lrz = compute_lrz_state(emit, binning_pass);

   /* If the LRZ state has not changed, we can skip the emit: */
   if (!ctx->last.dirty &&
       !memcmp(&fd6_ctx->last.lrz[binning_pass], &lrz, sizeof(lrz)))
      return NULL;

   fd6_ctx->last.lrz[binning_pass] = lrz;

   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      ctx->batch->submit, 8 * 4, FD_RINGBUFFER_STREAMING);

   OUT_REG(ring,
           A6XX_GRAS_LRZ_CNTL(.enable = lrz.enable, .lrz_write = lrz.write,
                              .greater = lrz.direction == FD_LRZ_GREATER,
                              .z_test_enable = lrz.test, ));
   OUT_REG(ring, A6XX_RB_LRZ_CNTL(.enable = lrz.enable, ));

   OUT_REG(ring, A6XX_RB_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, ));

   OUT_REG(ring, A6XX_GRAS_SU_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, ));

   return ring;
}

static struct fd_ringbuffer *
build_scissor(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx);

   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      emit->ctx->batch->submit, 3 * 4, FD_RINGBUFFER_STREAMING);

   OUT_REG(
      ring,
      A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = scissor->minx, .y = scissor->miny),
      A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = MAX2(scissor->maxx, 1) - 1,
                                     .y = MAX2(scissor->maxy, 1) - 1));

   ctx->batch->max_scissor.minx =
      MIN2(ctx->batch->max_scissor.minx, scissor->minx);
   ctx->batch->max_scissor.miny =
      MIN2(ctx->batch->max_scissor.miny, scissor->miny);
   ctx->batch->max_scissor.maxx =
      MAX2(ctx->batch->max_scissor.maxx, scissor->maxx);
   ctx->batch->max_scissor.maxy =
      MAX2(ctx->batch->max_scissor.maxy, scissor->maxy);

   return ring;
}

/* Combination of FD_DIRTY_FRAMEBUFFER | FD_DIRTY_RASTERIZER_DISCARD |
 * FD_DIRTY_PROG | FD_DIRTY_DUAL_BLEND
 */
static struct fd_ringbuffer *
build_prog_fb_rast(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
   const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
   const struct ir3_shader_variant *fs = emit->fs;

   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      ctx->batch->submit, 9 * 4, FD_RINGBUFFER_STREAMING);

   unsigned nr = pfb->nr_cbufs;

   if (ctx->rasterizer->rasterizer_discard)
      nr = 0;

   struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend);

   if (blend->use_dual_src_blend)
      nr++;

   OUT_PKT4(ring, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2);
   OUT_RING(ring, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
                     COND(fs->writes_smask && pfb->samples > 1,
                          A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
                     COND(fs->writes_stencilref,
                          A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
                     COND(blend->use_dual_src_blend,
                          A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
   OUT_RING(ring, A6XX_RB_FS_OUTPUT_CNTL1_MRT(nr));

   OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL1, 1);
   OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL1_MRT(nr));

   unsigned mrt_components = 0;
   for (unsigned i = 0; i < pfb->nr_cbufs; i++) {
      if (!pfb->cbufs[i])
         continue;
      mrt_components |= 0xf << (i * 4);
   }

   /* dual source blending has an extra fs output in the 2nd slot */
   if (blend->use_dual_src_blend)
      mrt_components |= 0xf << 4;

   mrt_components &= prog->mrt_components;

   OUT_REG(ring, A6XX_SP_FS_RENDER_COMPONENTS(.dword = mrt_components));
   OUT_REG(ring, A6XX_RB_RENDER_COMPONENTS(.dword = mrt_components));

   return ring;
}

static struct fd_ringbuffer *
build_blend_color(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_blend_color *bcolor = &ctx->blend_color;
   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      ctx->batch->submit, 5 * 4, FD_RINGBUFFER_STREAMING);

   OUT_REG(ring, A6XX_RB_BLEND_RED_F32(bcolor->color[0]),
           A6XX_RB_BLEND_GREEN_F32(bcolor->color[1]),
           A6XX_RB_BLEND_BLUE_F32(bcolor->color[2]),
           A6XX_RB_BLEND_ALPHA_F32(bcolor->color[3]));

   return ring;
}

static struct fd_ringbuffer *
build_ibo(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;

   if (emit->hs) {
      assert(ir3_shader_nibo(emit->hs) == 0);
      assert(ir3_shader_nibo(emit->ds) == 0);
   }
   if (emit->gs) {
      assert(ir3_shader_nibo(emit->gs) == 0);
   }

   struct fd_ringbuffer *ibo_state =
      fd6_build_ibo_state(ctx, emit->fs, PIPE_SHADER_FRAGMENT);
   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      ctx->batch->submit, 0x100, FD_RINGBUFFER_STREAMING);

   OUT_PKT7(ring, CP_LOAD_STATE6, 3);
   OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
                     CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                     CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                     CP_LOAD_STATE6_0_STATE_BLOCK(SB6_IBO) |
                     CP_LOAD_STATE6_0_NUM_UNIT(ir3_shader_nibo(emit->fs)));
   OUT_RB(ring, ibo_state);

   OUT_PKT4(ring, REG_A6XX_SP_IBO, 2);
   OUT_RB(ring, ibo_state);

   /* TODO if we used CP_SET_DRAW_STATE for compute shaders, we could
    * de-duplicate this from program->config_stateobj
    */
   OUT_PKT4(ring, REG_A6XX_SP_IBO_COUNT, 1);
   OUT_RING(ring, ir3_shader_nibo(emit->fs));

   fd_ringbuffer_del(ibo_state);

   return ring;
}

static void
fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
   const struct ir3_stream_output_info *info = prog->stream_output;
   struct fd_streamout_stateobj *so = &ctx->streamout;

   emit->streamout_mask = 0;

   if (!info)
      return;

   for (unsigned i = 0; i < so->num_targets; i++) {
      struct fd_stream_output_target *target =
         fd_stream_output_target(so->targets[i]);

      if (!target)
         continue;

      target->stride = info->stride[i];

      OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_BASE(i), 3);
      /* VPC_SO[i].BUFFER_BASE_LO: */
      OUT_RELOC(ring, fd_resource(target->base.buffer)->bo, 0, 0, 0);
      OUT_RING(ring, target->base.buffer_size + target->base.buffer_offset);

      struct fd_bo *offset_bo = fd_resource(target->offset_buf)->bo;

      if (so->reset & (1 << i)) {
         assert(so->offsets[i] == 0);

         OUT_PKT7(ring, CP_MEM_WRITE, 3);
         OUT_RELOC(ring, offset_bo, 0, 0, 0);
         OUT_RING(ring, target->base.buffer_offset);

         OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_OFFSET(i), 1);
         OUT_RING(ring, target->base.buffer_offset);
      } else {
         OUT_PKT7(ring, CP_MEM_TO_REG, 3);
         OUT_RING(ring, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) |
                           CP_MEM_TO_REG_0_SHIFT_BY_2 | CP_MEM_TO_REG_0_UNK31 |
                           CP_MEM_TO_REG_0_CNT(0));
         OUT_RELOC(ring, offset_bo, 0, 0, 0);
      }

      /* After a draw, the HW writes the new offset back to offset_bo: */
      OUT_PKT4(ring, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);
      OUT_RELOC(ring, offset_bo, 0, 0, 0);

      so->reset &= ~(1 << i);

      emit->streamout_mask |= (1 << i);
   }

   if (emit->streamout_mask) {
      fd6_emit_add_group(emit, prog->streamout_stateobj, FD6_GROUP_SO,
                         ENABLE_ALL);
   } else if (ctx->last.streamout_mask != 0) {
      /* If we transition from a draw with streamout to one without, turn
       * off streamout.
       */
      fd6_emit_add_group(emit, fd6_context(ctx)->streamout_disable_stateobj,
                         FD6_GROUP_SO, ENABLE_ALL);
   }

   /* Make sure that any use of our TFB outputs (indirect draw source or shader
    * UBO reads) comes after the TFB output is written.  From the GL 4.6 core
    * spec:
    *
    *     "Buffers should not be bound or in use for both transform feedback and
    *      other purposes in the GL.  Specifically, if a buffer object is
    *      simultaneously bound to a transform feedback buffer binding point
    *      and elsewhere in the GL, any writes to or reads from the buffer
    *      generate undefined values."
    *
    * So we idle whenever SO buffers change.  Note that this function is called
    * on every draw with TFB enabled, so check the dirty flag for the buffers
    * themselves.
    */
   if (ctx->dirty & FD_DIRTY_STREAMOUT)
      fd_wfi(ctx->batch, ring);

   ctx->last.streamout_mask = emit->streamout_mask;
}

/**
 * Stuff that changes less frequently and isn't (yet) moved into stategroups
 */
static void
fd6_emit_non_ring(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   const enum fd_dirty_3d_state dirty = emit->dirty;

   if (dirty & FD_DIRTY_STENCIL_REF) {
      struct pipe_stencil_ref *sr = &ctx->stencil_ref;

      OUT_PKT4(ring, REG_A6XX_RB_STENCILREF, 1);
      OUT_RING(ring, A6XX_RB_STENCILREF_REF(sr->ref_value[0]) |
                        A6XX_RB_STENCILREF_BFREF(sr->ref_value[1]));
   }

   if (dirty & FD_DIRTY_VIEWPORT) {
      struct pipe_scissor_state *scissor = &ctx->viewport_scissor;

      OUT_REG(ring, A6XX_GRAS_CL_VPORT_XOFFSET(0, ctx->viewport.translate[0]),
              A6XX_GRAS_CL_VPORT_XSCALE(0, ctx->viewport.scale[0]),
              A6XX_GRAS_CL_VPORT_YOFFSET(0, ctx->viewport.translate[1]),
              A6XX_GRAS_CL_VPORT_YSCALE(0, ctx->viewport.scale[1]),
              A6XX_GRAS_CL_VPORT_ZOFFSET(0, ctx->viewport.translate[2]),
              A6XX_GRAS_CL_VPORT_ZSCALE(0, ctx->viewport.scale[2]));

      OUT_REG(
         ring,
         A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = scissor->minx,
                                          .y = scissor->miny),
         A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = MAX2(scissor->maxx, 1) - 1,
                                          .y = MAX2(scissor->maxy, 1) - 1));

      unsigned guardband_x = fd_calc_guardband(ctx->viewport.translate[0],
                                               ctx->viewport.scale[0], false);
      unsigned guardband_y = fd_calc_guardband(ctx->viewport.translate[1],
                                               ctx->viewport.scale[1], false);

      OUT_REG(ring, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ(.horz = guardband_x,
                                                    .vert = guardband_y));
   }

   /* The clamp ranges are only used when the rasterizer disables
    * depth clip.
    */
   if ((dirty & (FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER)) &&
       fd_depth_clip_disabled(ctx)) {
      float zmin, zmax;
      util_viewport_zmin_zmax(&ctx->viewport, ctx->rasterizer->clip_halfz,
                              &zmin, &zmax);

      OUT_REG(ring, A6XX_GRAS_CL_Z_CLAMP_MIN(0, zmin),
              A6XX_GRAS_CL_Z_CLAMP_MAX(0, zmax));

      OUT_REG(ring, A6XX_RB_Z_CLAMP_MIN(zmin), A6XX_RB_Z_CLAMP_MAX(zmax));
   }
}

void
fd6_emit_state(struct fd_ringbuffer *ring, struct fd6_emit *emit)
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
   const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
   const struct ir3_shader_variant *vs = emit->vs;
   const struct ir3_shader_variant *hs = emit->hs;
   const struct ir3_shader_variant *ds = emit->ds;
   const struct ir3_shader_variant *gs = emit->gs;
   const struct ir3_shader_variant *fs = emit->fs;
   bool needs_border = false;

   emit_marker6(ring, 5);

   /* NOTE: we track fb_read differently than _BLEND_ENABLED since we
    * might decide to do sysmem in some cases when blend is enabled:
    */
   if (fs->fb_read)
      ctx->batch->gmem_reason |= FD_GMEM_FB_READ;

   u_foreach_bit (b, emit->dirty_groups) {
      enum fd6_state_id group = b;
      struct fd_ringbuffer *state = NULL;
      uint32_t enable_mask = ENABLE_ALL;

      switch (group) {
      case FD6_GROUP_VTXSTATE:
         state = fd6_vertex_stateobj(ctx->vtx.vtx)->stateobj;
         fd_ringbuffer_ref(state);
         break;
      case FD6_GROUP_VBO:
         state = build_vbo_state(emit);
         break;
      case FD6_GROUP_ZSA:
         state = fd6_zsa_state(
            ctx,
            util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0])),
            fd_depth_clip_disabled(ctx));
         fd_ringbuffer_ref(state);
         break;
      case FD6_GROUP_LRZ:
         state = build_lrz(emit, false);
         if (!state)
            continue;
         enable_mask = ENABLE_DRAW;
         break;
      case FD6_GROUP_LRZ_BINNING:
         state = build_lrz(emit, true);
         if (!state)
            continue;
         enable_mask = CP_SET_DRAW_STATE__0_BINNING;
         break;
      case FD6_GROUP_SCISSOR:
         state = build_scissor(emit);
         break;
      case FD6_GROUP_PROG:
         fd6_emit_add_group(emit, prog->config_stateobj, FD6_GROUP_PROG_CONFIG,
                            ENABLE_ALL);
         fd6_emit_add_group(emit, prog->stateobj, FD6_GROUP_PROG, ENABLE_DRAW);
         fd6_emit_add_group(emit, prog->binning_stateobj,
                            FD6_GROUP_PROG_BINNING,
                            CP_SET_DRAW_STATE__0_BINNING);

         /* emit remaining streaming program state, ie. what depends on
          * other emit state, so cannot be pre-baked.
          */
         fd6_emit_take_group(emit, fd6_program_interp_state(emit),
                             FD6_GROUP_PROG_INTERP, ENABLE_DRAW);
         continue;
      case FD6_GROUP_RASTERIZER:
         state = fd6_rasterizer_state(ctx, emit->primitive_restart);
         fd_ringbuffer_ref(state);
         break;
      case FD6_GROUP_PROG_FB_RAST:
         state = build_prog_fb_rast(emit);
         break;
      case FD6_GROUP_BLEND:
         state = fd6_blend_variant(ctx->blend, pfb->samples, ctx->sample_mask)
                    ->stateobj;
         fd_ringbuffer_ref(state);
         break;
      case FD6_GROUP_BLEND_COLOR:
         state = build_blend_color(emit);
         break;
      case FD6_GROUP_IBO:
         state = build_ibo(emit);
         break;
      case FD6_GROUP_CONST:
         state = fd6_build_user_consts(emit);
         break;
      case FD6_GROUP_DRIVER_PARAMS:
         state = fd6_build_driver_params(emit);
         break;
      case FD6_GROUP_PRIMITIVE_PARAMS:
         state = fd6_build_tess_consts(emit);
         break;
      case FD6_GROUP_VS_TEX:
         needs_border |=
            fd6_emit_combined_textures(ring, emit, PIPE_SHADER_VERTEX, vs);
         continue;
      case FD6_GROUP_HS_TEX:
         if (hs) {
            needs_border |= fd6_emit_combined_textures(
               ring, emit, PIPE_SHADER_TESS_CTRL, hs);
         }
         continue;
      case FD6_GROUP_DS_TEX:
         if (ds) {
            needs_border |= fd6_emit_combined_textures(
               ring, emit, PIPE_SHADER_TESS_EVAL, ds);
         }
         continue;
      case FD6_GROUP_GS_TEX:
         if (gs) {
            needs_border |=
               fd6_emit_combined_textures(ring, emit, PIPE_SHADER_GEOMETRY, gs);
         }
         continue;
      case FD6_GROUP_FS_TEX:
         needs_border |=
            fd6_emit_combined_textures(ring, emit, PIPE_SHADER_FRAGMENT, fs);
         continue;
      case FD6_GROUP_SO:
         fd6_emit_streamout(ring, emit);
         continue;
      case FD6_GROUP_NON_GROUP:
         fd6_emit_non_ring(ring, emit);
         continue;
      default:
         unreachable("bad state group");
      }

      fd6_emit_take_group(emit, state, group, enable_mask);
   }

   if (needs_border)
      emit_border_color(ctx, ring);

   if (emit->num_groups > 0) {
      OUT_PKT7(ring, CP_SET_DRAW_STATE, 3 * emit->num_groups);
      for (unsigned i = 0; i < emit->num_groups; i++) {
         struct fd6_state_group *g = &emit->groups[i];
         unsigned n = g->stateobj ? fd_ringbuffer_size(g->stateobj) / 4 : 0;

         assert((g->enable_mask & ~ENABLE_ALL) == 0);

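         /* a zero-size group emits a disable for the group id, which
          * should drop any previously set state for that group:
          */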
         if (n == 0) {
            OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
                              CP_SET_DRAW_STATE__0_DISABLE | g->enable_mask |
                              CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id));
            OUT_RING(ring, 0x00000000);
            OUT_RING(ring, 0x00000000);
         } else {
            OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(n) | g->enable_mask |
                              CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id));
            OUT_RB(ring, g->stateobj);
         }

         if (g->stateobj)
            fd_ringbuffer_del(g->stateobj);
      }
      emit->num_groups = 0;
   }
}

void
fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
                  struct ir3_shader_variant *cp)
{
   enum fd_dirty_shader_state dirty = ctx->dirty_shader[PIPE_SHADER_COMPUTE];

   if (dirty & (FD_DIRTY_SHADER_TEX | FD_DIRTY_SHADER_PROG |
                FD_DIRTY_SHADER_IMAGE | FD_DIRTY_SHADER_SSBO)) {
      struct fd_texture_stateobj *tex = &ctx->tex[PIPE_SHADER_COMPUTE];
      unsigned bcolor_offset =
         fd6_border_color_offset(ctx, PIPE_SHADER_COMPUTE, tex);

      bool needs_border = fd6_emit_textures(ctx, ring, PIPE_SHADER_COMPUTE, tex,
                                            bcolor_offset, cp);

      if (needs_border)
         emit_border_color(ctx, ring);

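      /* zero out the texture counts for the gfx stages, presumably so
       * stale graphics texture state isn't considered by the compute
       * dispatch:
       */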
      OUT_PKT4(ring, REG_A6XX_SP_VS_TEX_COUNT, 1);
      OUT_RING(ring, 0);

      OUT_PKT4(ring, REG_A6XX_SP_HS_TEX_COUNT, 1);
      OUT_RING(ring, 0);

      OUT_PKT4(ring, REG_A6XX_SP_DS_TEX_COUNT, 1);
      OUT_RING(ring, 0);

      OUT_PKT4(ring, REG_A6XX_SP_GS_TEX_COUNT, 1);
      OUT_RING(ring, 0);

      OUT_PKT4(ring, REG_A6XX_SP_FS_TEX_COUNT, 1);
      OUT_RING(ring, 0);
   }

   if (dirty & (FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE)) {
      struct fd_ringbuffer *state =
         fd6_build_ibo_state(ctx, cp, PIPE_SHADER_COMPUTE);

      OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3);
      OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
                        CP_LOAD_STATE6_0_STATE_TYPE(ST6_IBO) |
                        CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                        CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
                        CP_LOAD_STATE6_0_NUM_UNIT(ir3_shader_nibo(cp)));
      OUT_RB(ring, state);

      OUT_PKT4(ring, REG_A6XX_SP_CS_IBO, 2);
      OUT_RB(ring, state);

      OUT_PKT4(ring, REG_A6XX_SP_CS_IBO_COUNT, 1);
      OUT_RING(ring, ir3_shader_nibo(cp));

      fd_ringbuffer_del(state);
   }
}

/* Emit setup at the beginning of a new cmdstream buffer (don't rely on
 * previous state; there could have been a context switch between ioctls):
 */
void
fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
   struct fd_screen *screen = batch->ctx->screen;

   if (!batch->nondraw) {
      trace_start_state_restore(&batch->trace, ring);
   }

   fd6_cache_inv(batch, ring);

   OUT_REG(ring,
           A6XX_HLSQ_INVALIDATE_CMD(.vs_state = true, .hs_state = true,
                                    .ds_state = true, .gs_state = true,
                                    .fs_state = true, .cs_state = true,
                                    .gfx_ibo = true, .cs_ibo = true,
                                    .gfx_shared_const = true,
                                    .cs_shared_const = true,
                                    .gfx_bindless = 0x1f, .cs_bindless = 0x1f));

   OUT_WFI5(ring);

   WRITE(REG_A6XX_RB_UNKNOWN_8E04, 0x0);
   WRITE(REG_A6XX_SP_FLOAT_CNTL, A6XX_SP_FLOAT_CNTL_F16_NO_INF);
   WRITE(REG_A6XX_SP_UNKNOWN_AE00, 0);
   WRITE(REG_A6XX_SP_PERFCTR_ENABLE, 0x3f);
   WRITE(REG_A6XX_TPL1_UNKNOWN_B605, 0x44);
   WRITE(REG_A6XX_TPL1_DBG_ECO_CNTL, screen->info->a6xx.magic.TPL1_DBG_ECO_CNTL);
   WRITE(REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
   WRITE(REG_A6XX_HLSQ_UNKNOWN_BE01, 0);

   WRITE(REG_A6XX_VPC_UNKNOWN_9600, 0);
   WRITE(REG_A6XX_GRAS_DBG_ECO_CNTL, 0x880);
   WRITE(REG_A6XX_HLSQ_UNKNOWN_BE04, 0x80000);
   WRITE(REG_A6XX_SP_CHICKEN_BITS, 0x1430);
   WRITE(REG_A6XX_SP_IBO_COUNT, 0);
   WRITE(REG_A6XX_SP_UNKNOWN_B182, 0);
   WRITE(REG_A6XX_HLSQ_SHARED_CONSTS, 0);
   WRITE(REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
   WRITE(REG_A6XX_UCHE_CLIENT_PF, 4);
   WRITE(REG_A6XX_RB_UNKNOWN_8E01, 0x1);
   WRITE(REG_A6XX_SP_MODE_CONTROL,
         A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);
   WRITE(REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
   WRITE(REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
   WRITE(REG_A6XX_PC_MODE_CNTL, 0x1f);

   WRITE(REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 0);
   WRITE(REG_A6XX_GRAS_SAMPLE_CNTL, 0);
   WRITE(REG_A6XX_GRAS_UNKNOWN_8110, 0x2);

   WRITE(REG_A6XX_RB_UNKNOWN_8818, 0);
   WRITE(REG_A6XX_RB_UNKNOWN_8819, 0);
   WRITE(REG_A6XX_RB_UNKNOWN_881A, 0);
   WRITE(REG_A6XX_RB_UNKNOWN_881B, 0);
   WRITE(REG_A6XX_RB_UNKNOWN_881C, 0);
   WRITE(REG_A6XX_RB_UNKNOWN_881D, 0);
   WRITE(REG_A6XX_RB_UNKNOWN_881E, 0);
   WRITE(REG_A6XX_RB_UNKNOWN_88F0, 0);

   WRITE(REG_A6XX_VPC_POINT_COORD_INVERT, A6XX_VPC_POINT_COORD_INVERT(0).value);
   WRITE(REG_A6XX_VPC_UNKNOWN_9300, 0);

   WRITE(REG_A6XX_VPC_SO_DISABLE, A6XX_VPC_SO_DISABLE(true).value);

   WRITE(REG_A6XX_PC_RASTER_CNTL, 0);

   WRITE(REG_A6XX_PC_MULTIVIEW_CNTL, 0);

   WRITE(REG_A6XX_SP_UNKNOWN_B183, 0);

   WRITE(REG_A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 0);
   WRITE(REG_A6XX_GRAS_VS_LAYER_CNTL, 0);
   WRITE(REG_A6XX_GRAS_SC_CNTL, A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
   WRITE(REG_A6XX_GRAS_UNKNOWN_80AF, 0);
   WRITE(REG_A6XX_VPC_UNKNOWN_9210, 0);
   WRITE(REG_A6XX_VPC_UNKNOWN_9211, 0);
   WRITE(REG_A6XX_VPC_UNKNOWN_9602, 0);
   WRITE(REG_A6XX_PC_UNKNOWN_9E72, 0);
   WRITE(REG_A6XX_SP_TP_SAMPLE_CONFIG, 0);
   /* NOTE blob seems to (mostly?) use 0xb2 for SP_TP_MODE_CNTL
    * but this seems to kill texture gather offsets.
    */
   WRITE(REG_A6XX_SP_TP_MODE_CNTL, 0xa0 |
         A6XX_SP_TP_MODE_CNTL_ISAMMODE(ISAMMODE_GL));
   WRITE(REG_A6XX_RB_SAMPLE_CONFIG, 0);
   WRITE(REG_A6XX_GRAS_SAMPLE_CONFIG, 0);
   WRITE(REG_A6XX_RB_Z_BOUNDS_MIN, 0);
   WRITE(REG_A6XX_RB_Z_BOUNDS_MAX, 0);
   WRITE(REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);

   emit_marker6(ring, 7);

   OUT_PKT4(ring, REG_A6XX_VFD_MODE_CNTL, 1);
   OUT_RING(ring, 0x00000000); /* VFD_MODE_CNTL */

   WRITE(REG_A6XX_VFD_MULTIVIEW_CNTL, 0);

   OUT_PKT4(ring, REG_A6XX_PC_MODE_CNTL, 1);
   OUT_RING(ring, 0x0000001f); /* PC_MODE_CNTL */

   /* Clear any potential pending state groups to be safe: */
   OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
   OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
                     CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
                     CP_SET_DRAW_STATE__0_GROUP_ID(0));
   OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
   OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));

   OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_CNTL, 1);
   OUT_RING(ring, 0x00000000); /* VPC_SO_STREAM_CNTL */

   OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1);
   OUT_RING(ring, 0x00000000);

   OUT_PKT4(ring, REG_A6XX_RB_LRZ_CNTL, 1);
   OUT_RING(ring, 0x00000000);

   /* Initialize VFD_FETCH[n].SIZE to zero to avoid iova faults trying
    * to fetch from a VFD_FETCH[n].BASE which we've potentially inherited
    * from another process:
    */
   for (int32_t i = 0; i < 32; i++) {
      OUT_PKT4(ring, REG_A6XX_VFD_FETCH_SIZE(i), 1);
      OUT_RING(ring, 0);
   }

   /* This happens after all drawing has been emitted to the draw CS, so we know
    * whether we need the tess BO pointers.
    */
   if (batch->tessellation) {
      assert(screen->tess_bo);
      OUT_PKT4(ring, REG_A6XX_PC_TESSFACTOR_ADDR, 2);
      OUT_RELOC(ring, screen->tess_bo, 0, 0, 0);
      /* Updating PC_TESSFACTOR_ADDR could race with the next draw which uses it. */
      OUT_WFI5(ring);
   }

   if (!batch->nondraw) {
      trace_end_state_restore(&batch->trace, ring);
   }
}

static void
fd6_mem_to_mem(struct fd_ringbuffer *ring, struct pipe_resource *dst,
               unsigned dst_off, struct pipe_resource *src, unsigned src_off,
               unsigned sizedwords)
{
   struct fd_bo *src_bo = fd_resource(src)->bo;
   struct fd_bo *dst_bo = fd_resource(dst)->bo;
   unsigned i;

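   /* each CP_MEM_TO_MEM packet here copies a single dword, so advance
    * the src/dst offsets by 4 bytes per iteration:
    */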
   for (i = 0; i < sizedwords; i++) {
      OUT_PKT7(ring, CP_MEM_TO_MEM, 5);
      OUT_RING(ring, 0x00000000);
      OUT_RELOC(ring, dst_bo, dst_off, 0, 0);
      OUT_RELOC(ring, src_bo, src_off, 0, 0);

      dst_off += 4;
      src_off += 4;
   }
}

/* this is *almost* the same as fd6_cache_flush().. which I guess
 * could be re-worked to be something a bit more generic w/ param
 * indicating what needs to be flushed..  although that would mean
 * figuring out which events trigger what state to flush..
 */
static void
fd6_framebuffer_barrier(struct fd_context *ctx) assert_dt
{
   struct fd6_context *fd6_ctx = fd6_context(ctx);
   struct fd_batch *batch = fd_context_batch_locked(ctx);
   struct fd_ringbuffer *ring = batch->draw;
   unsigned seqno;

   fd_batch_needs_flush(batch);

   seqno = fd6_event_write(batch, ring, RB_DONE_TS, true);

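   /* wait for the RB_DONE_TS timestamp to land in the control buffer
    * before kicking off the flushes below:
    */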
   OUT_PKT7(ring, CP_WAIT_REG_MEM, 6);
   OUT_RING(ring, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                     CP_WAIT_REG_MEM_0_POLL_MEMORY);
   OUT_RELOC(ring, control_ptr(fd6_ctx, seqno));
   OUT_RING(ring, CP_WAIT_REG_MEM_3_REF(seqno));
   OUT_RING(ring, CP_WAIT_REG_MEM_4_MASK(~0));
   OUT_RING(ring, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true);
   fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true);

   seqno = fd6_event_write(batch, ring, CACHE_FLUSH_TS, true);
   fd_wfi(batch, ring);

   fd6_event_write(batch, ring, CACHE_INVALIDATE, false);

   OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4);
   OUT_RING(ring, CP_WAIT_MEM_GTE_0_RESERVED(0));
   OUT_RELOC(ring, control_ptr(fd6_ctx, seqno));
   OUT_RING(ring, CP_WAIT_MEM_GTE_3_REF(seqno));

   fd_batch_unlock_submit(batch);
   fd_batch_reference(&batch, NULL);
}

void
fd6_emit_init_screen(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);
   screen->emit_ib = fd6_emit_ib;
   screen->mem_to_mem = fd6_mem_to_mem;
}

void
fd6_emit_init(struct pipe_context *pctx) disable_thread_safety_analysis
{
   struct fd_context *ctx = fd_context(pctx);
   ctx->framebuffer_barrier = fd6_framebuffer_barrier;
}