• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2014 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "si_build_pm4.h"
8 
/* For MSAA sample positions.
 *
 * FILL_SREG packs four (x, y) sample positions into one 32-bit value. Every coordinate is
 * reduced to its low 4 bits (so -8..7 is stored in two's complement) and placed in its own
 * nibble. Counting nibbles from bit 0 upwards the order is:
 *    S0.x, S0.y, S1.x, S1.y, S2.x, S2.y, S3.x, S3.y
 *
 * The result is an integer constant expression when the arguments are, so it can be used in
 * static initializers.
 */
#define FILL_SREG_NIBBLE(coord, nibble) (((unsigned)(coord)&0xf) << ((nibble)*4))
#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)                                          \
   (FILL_SREG_NIBBLE(s0x, 0) | FILL_SREG_NIBBLE(s0y, 1) | /* sample 0 */                           \
    FILL_SREG_NIBBLE(s1x, 2) | FILL_SREG_NIBBLE(s1y, 3) | /* sample 1 */                           \
    FILL_SREG_NIBBLE(s2x, 4) | FILL_SREG_NIBBLE(s2y, 5) | /* sample 2 */                           \
    FILL_SREG_NIBBLE(s3x, 6) | FILL_SREG_NIBBLE(s3y, 7))  /* sample 3 */
14 
/* For obtaining location coordinates from registers.
 *
 * SEXT4(x)               Sign-extend the 4-bit two's-complement number held in the low nibble
 *                        of x to an int in the range -8..7. Bits above the low nibble are
 *                        ignored. The argument is evaluated exactly once and the conversion to
 *                        int happens while the value is still in 0..15, so no out-of-range
 *                        unsigned-to-signed conversion is involved.
 * GET_SFIELD(reg, index) Signed value of nibble number `index` (0 = bits 3:0) of a 32-bit
 *                        register value.
 * GET_SX(reg, index)     Signed X coordinate of sample `index`; `reg` is an array of register
 *                        values holding 4 samples each.
 * GET_SY(reg, index)     Same as GET_SX, for the Y coordinate.
 */
#define SEXT4(x)               ((int)(((x)&0xf) ^ 0x8) - 0x8)
#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index)*4)) & 0xf)
#define GET_SX(reg, index)     GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2)
#define GET_SY(reg, index)     GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1)
20 
21 /* The following sample ordering is required by EQAA.
22  *
23  * Sample 0 is approx. in the top-left quadrant.
24  * Sample 1 is approx. in the bottom-right quadrant.
25  *
26  * Sample 2 is approx. in the bottom-left quadrant.
27  * Sample 3 is approx. in the top-right quadrant.
28  * (sample I={2,3} adds more detail to the vicinity of sample I-2)
29  *
30  * Sample 4 is approx. in the same quadrant as sample 0. (top-left)
31  * Sample 5 is approx. in the same quadrant as sample 1. (bottom-right)
32  * Sample 6 is approx. in the same quadrant as sample 2. (bottom-left)
33  * Sample 7 is approx. in the same quadrant as sample 3. (top-right)
34  * (sample I={4,5,6,7} adds more detail to the vicinity of sample I-4)
35  *
36  * The next 8 samples add more detail to the vicinity of the previous samples.
37  * (sample I (I >= 8) adds more detail to the vicinity of sample I-8)
38  *
39  * The ordering is specified such that:
40  *   If we take the first 2 samples, we should get good 2x MSAA.
41  *   If we add 2 more samples, we should get good 4x MSAA with the same sample locations.
42  *   If we add 4 more samples, we should get good 8x MSAA with the same sample locations.
43  *   If we add 8 more samples, we should get perfect 16x MSAA with the same sample locations.
44  *
45  * The ordering also allows finding samples in the same vicinity.
46  *
47  * Group N of 2 samples in the same vicinity in 16x MSAA: {N,N+8}
48  * Group N of 2 samples in the same vicinity in 8x MSAA: {N,N+4}
49  * Group N of 2 samples in the same vicinity in 4x MSAA: {N,N+2}
50  *
51  * Groups of 4 samples in the same vicinity in 16x MSAA:
52  *   Top left:     {0,4,8,12}
53  *   Bottom right: {1,5,9,13}
54  *   Bottom left:  {2,6,10,14}
55  *   Top right:    {3,7,11,15}
56  *
57  * Groups of 4 samples in the same vicinity in 8x MSAA:
58  *   Left half:  {0,2,4,6}
59  *   Right half: {1,3,5,7}
60  *
61  * Groups of 8 samples in the same vicinity in 16x MSAA:
62  *   Left half:  {0,2,4,6,8,10,12,14}
63  *   Right half: {1,3,5,7,9,11,13,15}
64  */
65 
66 /* Important note: We have to use the standard DX positions because shader-based culling
67  * relies on them.
68  */
69 
70 /* 1x MSAA */
71 static const uint32_t sample_locs_1x =
72    FILL_SREG(0, 0, 0, 0, 0, 0, 0, 0); /* S1, S2, S3 fields are not used by 1x */
73 static const uint64_t centroid_priority_1x = 0x0000000000000000ull;
74 
75 /* 2x MSAA (the positions are sorted for EQAA) */
76 static const uint32_t sample_locs_2x =
77    FILL_SREG(-4, -4, 4, 4, 0, 0, 0, 0); /* S2 & S3 fields are not used by 2x MSAA */
78 static const uint64_t centroid_priority_2x = 0x1010101010101010ull;
79 
80 /* 4x MSAA (the positions are sorted for EQAA) */
81 static const uint32_t sample_locs_4x = FILL_SREG(-2, -6, 2, 6, -6, 2, 6, -2);
82 static const uint64_t centroid_priority_4x = 0x3210321032103210ull;
83 
84 /* 8x MSAA (the positions are sorted for EQAA) */
85 static const uint32_t sample_locs_8x[] = {
86    FILL_SREG(-3, -5, 5, 1, -1, 3, 7, -7),
87    FILL_SREG(-7, -1, 3, 7, -5, 5, 1, -3),
88    /* The following are unused by hardware, but we emit them to IBs
89     * instead of multiple SET_CONTEXT_REG packets. */
90    0,
91    0,
92 };
93 static const uint64_t centroid_priority_8x = 0x3546012735460127ull;
94 
95 /* 16x MSAA (the positions are sorted for EQAA) */
96 static const uint32_t sample_locs_16x[] = {
97    FILL_SREG(-5, -2, 5, 3, -2, 6, 3, -5),
98    FILL_SREG(-4, -6, 1, 1, -6, 4, 7, -4),
99    FILL_SREG(-1, -3, 6, 7, -3, 2, 0, -7),
100    /* We use -7 where DX sample locations want -8, which allows us to make
101     * the PA_SU_PRIM_FILTER_CNTL register immutable. That's a quality compromise
102     * for underused 16x EQAA.
103     */
104    FILL_SREG(-7, -7 /* DX uses -8 */, 2, 5, -7 /* DX uses -8 */, 0, 4, -1),
105 };
106 static const uint64_t centroid_priority_16x = 0xc97e64b231d0fa85ull;
107 
/* Maximum distance of any sample from the pixel center, expressed in the same units as the
 * coordinates in the sample_locs_* tables. The array index is log2(nr_samples).
 */
unsigned si_msaa_max_distance[5] = {
   [0] = 0, /* no AA */
   [1] = 4, /* 2x MSAA */
   [2] = 6, /* 4x MSAA */
   [3] = 7, /* 8x MSAA */
   [4] = 7, /* 16x MSAA */
};
116 
si_get_sample_position(struct pipe_context * ctx,unsigned sample_count,unsigned sample_index,float * out_value)117 static void si_get_sample_position(struct pipe_context *ctx, unsigned sample_count,
118                                    unsigned sample_index, float *out_value)
119 {
120    const uint32_t *sample_locs;
121 
122    switch (sample_count) {
123    case 1:
124    default:
125       sample_locs = &sample_locs_1x;
126       break;
127    case 2:
128       sample_locs = &sample_locs_2x;
129       break;
130    case 4:
131       sample_locs = &sample_locs_4x;
132       break;
133    case 8:
134       sample_locs = sample_locs_8x;
135       break;
136    case 16:
137       sample_locs = sample_locs_16x;
138       break;
139    }
140 
141    out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f;
142    out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f;
143 }
144 
/**
 * Re-bias all eight 4-bit fields of a packed sample-location value from signed to unsigned.
 *
 * \param locs  packed sample locations; each nibble is a signed coordinate in -8..7
 * \return      the same layout with every nibble increased by 8, i.e. in 0..15
 */
static uint32_t convert_locs_to_unsigned(uint32_t locs)
{
   uint32_t biased = 0;
   unsigned field = 8;

   while (field--) {
      /* GET_SFIELD yields the nibble as a signed int; +8 moves -8..7 to 0..15. */
      const int coord = GET_SFIELD(locs, field);
      const uint32_t nibble = (uint32_t)((coord + 8) & 0xf);

      biased |= nibble << (field * 4);
   }

   return biased;
}
155 
si_emit_max_4_sample_locs(struct si_context * sctx,uint64_t centroid_priority,uint32_t sample_locs,uint32_t max_sample_dist)156 static void si_emit_max_4_sample_locs(struct si_context *sctx, uint64_t centroid_priority,
157                                       uint32_t sample_locs, uint32_t max_sample_dist)
158 {
159    if (sctx->gfx_level >= GFX12) {
160       radeon_begin(&sctx->gfx_cs);
161       gfx12_begin_context_regs();
162       gfx12_set_context_reg(R_028BF0_PA_SC_CENTROID_PRIORITY_0, centroid_priority);
163       gfx12_set_context_reg(R_028BF4_PA_SC_CENTROID_PRIORITY_1, centroid_priority >> 32);
164       gfx12_set_context_reg(R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs);
165       gfx12_set_context_reg(R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs);
166       gfx12_set_context_reg(R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs);
167       gfx12_set_context_reg(R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs);
168       gfx12_set_context_reg(R_028C5C_PA_SC_SAMPLE_PROPERTIES,
169                             S_028C5C_MAX_SAMPLE_DIST(max_sample_dist));
170       gfx12_end_context_regs();
171       radeon_end();
172    } else if (sctx->screen->info.has_set_context_pairs_packed) {
173       radeon_begin(&sctx->gfx_cs);
174       gfx11_begin_packed_context_regs();
175       gfx11_set_context_reg(R_028BD4_PA_SC_CENTROID_PRIORITY_0, centroid_priority);
176       gfx11_set_context_reg(R_028BD8_PA_SC_CENTROID_PRIORITY_1, centroid_priority >> 32);
177       gfx11_set_context_reg(R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs);
178       gfx11_set_context_reg(R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs);
179       gfx11_set_context_reg(R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs);
180       gfx11_set_context_reg(R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs);
181       gfx11_end_packed_context_regs();
182       radeon_end();
183    } else {
184       radeon_begin(&sctx->gfx_cs);
185       radeon_set_context_reg_seq(R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
186       radeon_emit(centroid_priority);
187       radeon_emit(centroid_priority >> 32);
188       radeon_set_context_reg(R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs);
189       radeon_set_context_reg(R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs);
190       radeon_set_context_reg(R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs);
191       radeon_set_context_reg(R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs);
192       radeon_end();
193    }
194 
195    if (sctx->gfx_level >= GFX12) {
196       gfx12_push_gfx_sh_reg(R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_SAMPLE_LOCS0 * 4,
197                             convert_locs_to_unsigned(sample_locs));
198    } else if (sctx->screen->info.has_set_sh_pairs_packed) {
199       gfx11_push_gfx_sh_reg(R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_SAMPLE_LOCS0 * 4,
200                             convert_locs_to_unsigned(sample_locs));
201    } else {
202       radeon_begin(&sctx->gfx_cs);
203       radeon_set_sh_reg(R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_SAMPLE_LOCS0 * 4,
204                         convert_locs_to_unsigned(sample_locs));
205       radeon_end();
206    }
207 }
208 
si_emit_max_16_sample_locs(struct si_context * sctx,uint64_t centroid_priority,const uint32_t * sample_locs,unsigned num_samples,uint32_t max_sample_dist)209 static void si_emit_max_16_sample_locs(struct si_context *sctx, uint64_t centroid_priority,
210                                        const uint32_t *sample_locs, unsigned num_samples,
211                                        uint32_t max_sample_dist)
212 {
213    radeon_begin(&sctx->gfx_cs);
214 
215    if (sctx->gfx_level >= GFX12) {
216       gfx12_begin_context_regs();
217       gfx12_set_context_reg(R_028BF0_PA_SC_CENTROID_PRIORITY_0, centroid_priority);
218       gfx12_set_context_reg(R_028BF4_PA_SC_CENTROID_PRIORITY_1, centroid_priority >> 32);
219       gfx12_set_context_reg(R_028C5C_PA_SC_SAMPLE_PROPERTIES,
220                             S_028C5C_MAX_SAMPLE_DIST(max_sample_dist));
221       gfx12_end_context_regs();
222    } else {
223       radeon_set_context_reg_seq(R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
224       radeon_emit(centroid_priority);
225       radeon_emit(centroid_priority >> 32);
226    }
227 
228    radeon_set_context_reg_seq(R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
229                               num_samples == 8 ? 14 : 16);
230    radeon_emit_array(sample_locs, 4);
231    radeon_emit_array(sample_locs, 4);
232    radeon_emit_array(sample_locs, 4);
233    radeon_emit_array(sample_locs, num_samples == 8 ? 2 : 4);
234 
235    if (sctx->gfx_level >= GFX12) {
236       gfx12_push_gfx_sh_reg(R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_SAMPLE_LOCS0 * 4,
237                             convert_locs_to_unsigned(sample_locs[0]));
238       gfx12_push_gfx_sh_reg(R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_SAMPLE_LOCS1 * 4,
239                             convert_locs_to_unsigned(sample_locs[1]));
240    } else if (sctx->screen->info.has_set_sh_pairs_packed) {
241       gfx11_push_gfx_sh_reg(R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_SAMPLE_LOCS0 * 4,
242                             convert_locs_to_unsigned(sample_locs[0]));
243       gfx11_push_gfx_sh_reg(R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_SAMPLE_LOCS1 * 4,
244                             convert_locs_to_unsigned(sample_locs[1]));
245    } else {
246       radeon_set_sh_reg_seq(R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_SAMPLE_LOCS0 * 4, 2);
247       radeon_emit(convert_locs_to_unsigned(sample_locs[0]));
248       radeon_emit(convert_locs_to_unsigned(sample_locs[1]));
249    }
250    radeon_end();
251 }
252 
si_emit_sample_locations(struct si_context * sctx,unsigned index)253 static void si_emit_sample_locations(struct si_context *sctx, unsigned index)
254 {
255    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
256    struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
257    unsigned nr_samples = sctx->framebuffer.nr_samples;
258 
259    /* Smoothing (only possible with nr_samples == 1) uses the same
260     * sample locations as the MSAA it simulates.
261     */
262    if (nr_samples <= 1 && sctx->smoothing_enabled)
263       nr_samples = SI_NUM_SMOOTH_AA_SAMPLES;
264 
265    /* Always set MSAA sample locations even with 1x MSAA for simplicity.
266     *
267     * The only chips that don't need to set them for 1x MSAA are GFX6-8 except Polaris,
268     * but there is no benefit in not resetting them to 0 when changing framebuffers from MSAA
269     * to non-MSAA.
270     */
271    if (nr_samples != sctx->sample_locs_num_samples) {
272       unsigned max_sample_dist = si_msaa_max_distance[util_logbase2(nr_samples)];
273 
274       switch (nr_samples) {
275       default:
276       case 1:
277          si_emit_max_4_sample_locs(sctx, centroid_priority_1x, sample_locs_1x, max_sample_dist);
278          break;
279       case 2:
280          si_emit_max_4_sample_locs(sctx, centroid_priority_2x, sample_locs_2x, max_sample_dist);
281          break;
282       case 4:
283          si_emit_max_4_sample_locs(sctx, centroid_priority_4x, sample_locs_4x, max_sample_dist);
284          break;
285       case 8:
286          si_emit_max_16_sample_locs(sctx, centroid_priority_8x, sample_locs_8x, 8, max_sample_dist);
287          break;
288       case 16:
289          si_emit_max_16_sample_locs(sctx, centroid_priority_16x, sample_locs_16x, 16, max_sample_dist);
290          break;
291       }
292 
293       sctx->sample_locs_num_samples = nr_samples;
294    }
295 
296    if (sctx->screen->info.has_small_prim_filter_sample_loc_bug) {
297       /* For hardware with the sample location bug, the problem is that in order to use the small
298        * primitive filter, we need to explicitly set the sample locations to 0. But the DB doesn't
299        * properly process the change of sample locations without a flush, and so we can end up
300        * with incorrect Z values.
301        *
302        * Instead of doing a flush, just disable the small primitive filter when MSAA is
303        * force-disabled.
304        *
305        * The alternative of setting sample locations to 0 would require a DB flush to avoid
306        * Z errors, see https://bugs.freedesktop.org/show_bug.cgi?id=96908
307        */
308       bool small_prim_filter_enable = sctx->framebuffer.nr_samples <= 1 || rs->multisample_enable;
309       assert(sctx->family >= CHIP_POLARIS10);
310 
311       radeon_begin(cs);
312       radeon_opt_set_context_reg(R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
313                                  SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL,
314                                  S_028830_SMALL_PRIM_FILTER_ENABLE(small_prim_filter_enable) |
315                                  /* Small line culling doesn't work on Polaris10-12. */
316                                  S_028830_LINE_FILTER_DISABLE(sctx->family <= CHIP_POLARIS12));
317       radeon_end();
318    }
319 }
320 
si_init_msaa_functions(struct si_context * sctx)321 void si_init_msaa_functions(struct si_context *sctx)
322 {
323    sctx->atoms.s.sample_locations.emit = si_emit_sample_locations;
324    sctx->b.get_sample_position = si_get_sample_position;
325 }
326