1 /*
2 * Copyright 2014 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "si_build_pm4.h"
8
/* Helpers for building MSAA sample-position registers.
 *
 * One register dword holds the positions of 4 samples as 8 consecutive 4-bit
 * two's-complement fields, lowest bits first: S0.x, S0.y, S1.x, S1.y, S2.x, S2.y, S3.x, S3.y.
 */

/* Truncate a coordinate to 4 bits and move it to bit position "shift". */
#define FILL_SREG_FIELD(coord, shift) (((unsigned)(coord) & 0xf) << (shift))

/* Pack the (x, y) coordinates of 4 samples into one register dword. */
#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)                                          \
   (FILL_SREG_FIELD(s0x, 0) | FILL_SREG_FIELD(s0y, 4) | FILL_SREG_FIELD(s1x, 8) |                  \
    FILL_SREG_FIELD(s1y, 12) | FILL_SREG_FIELD(s2x, 16) | FILL_SREG_FIELD(s2y, 20) |               \
    FILL_SREG_FIELD(s3x, 24) | FILL_SREG_FIELD(s3y, 28))

/* Helpers for reading coordinates back out of the registers. */

/* Sign-extend the 4-bit value in the low bits of x to an int (bit 3 is the sign bit). */
#define SEXT4(x) ((int)(((x) & 0x8) ? ((x) | 0xfffffff0) : (x)))

/* Extract 4-bit field number "index" (0 = lowest nibble) of a dword as a signed value. */
#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index) * 4)) & 0xf)

/* Extract component "comp" (0 = x, 1 = y) of sample "index" from an array of register dwords.
 * Each dword holds 4 samples, so the dword is index / 4 and the sample slot is index % 4.
 */
#define GET_SCOORD(reg, index, comp) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + (comp))
#define GET_SX(reg, index) GET_SCOORD(reg, index, 0)
#define GET_SY(reg, index) GET_SCOORD(reg, index, 1)
20
21 /* The following sample ordering is required by EQAA.
22 *
23 * Sample 0 is approx. in the top-left quadrant.
24 * Sample 1 is approx. in the bottom-right quadrant.
25 *
26 * Sample 2 is approx. in the bottom-left quadrant.
27 * Sample 3 is approx. in the top-right quadrant.
28 * (sample I={2,3} adds more detail to the vicinity of sample I-2)
29 *
30 * Sample 4 is approx. in the same quadrant as sample 0. (top-left)
31 * Sample 5 is approx. in the same quadrant as sample 1. (bottom-right)
32 * Sample 6 is approx. in the same quadrant as sample 2. (bottom-left)
33 * Sample 7 is approx. in the same quadrant as sample 3. (top-right)
34 * (sample I={4,5,6,7} adds more detail to the vicinity of sample I-4)
35 *
36 * The next 8 samples add more detail to the vicinity of the previous samples.
37 * (sample I (I >= 8) adds more detail to the vicinity of sample I-8)
38 *
39 * The ordering is specified such that:
40 * If we take the first 2 samples, we should get good 2x MSAA.
41 * If we add 2 more samples, we should get good 4x MSAA with the same sample locations.
42 * If we add 4 more samples, we should get good 8x MSAA with the same sample locations.
43 * If we add 8 more samples, we should get perfect 16x MSAA with the same sample locations.
44 *
45 * The ordering also allows finding samples in the same vicinity.
46 *
47 * Group N of 2 samples in the same vicinity in 16x MSAA: {N,N+8}
48 * Group N of 2 samples in the same vicinity in 8x MSAA: {N,N+4}
49 * Group N of 2 samples in the same vicinity in 4x MSAA: {N,N+2}
50 *
51 * Groups of 4 samples in the same vicinity in 16x MSAA:
52 * Top left: {0,4,8,12}
53 * Bottom right: {1,5,9,13}
54 * Bottom left: {2,6,10,14}
55 * Top right: {3,7,11,15}
56 *
57 * Groups of 4 samples in the same vicinity in 8x MSAA:
58 * Left half: {0,2,4,6}
59 * Right half: {1,3,5,7}
60 *
61 * Groups of 8 samples in the same vicinity in 16x MSAA:
62 * Left half: {0,2,4,6,8,10,12,14}
63 * Right half: {1,3,5,7,9,11,13,15}
64 */
65
66 /* Important note: We have to use the standard DX positions because shader-based culling
67 * relies on them.
68 */
69
70 /* 1x MSAA */
71 static const uint32_t sample_locs_1x =
72 FILL_SREG(0, 0, 0, 0, 0, 0, 0, 0); /* S1, S2, S3 fields are not used by 1x */
73 static const uint64_t centroid_priority_1x = 0x0000000000000000ull;
74
75 /* 2x MSAA (the positions are sorted for EQAA) */
76 static const uint32_t sample_locs_2x =
77 FILL_SREG(-4, -4, 4, 4, 0, 0, 0, 0); /* S2 & S3 fields are not used by 2x MSAA */
78 static const uint64_t centroid_priority_2x = 0x1010101010101010ull;
79
80 /* 4x MSAA (the positions are sorted for EQAA) */
81 static const uint32_t sample_locs_4x = FILL_SREG(-2, -6, 2, 6, -6, 2, 6, -2);
82 static const uint64_t centroid_priority_4x = 0x3210321032103210ull;
83
84 /* 8x MSAA (the positions are sorted for EQAA) */
85 static const uint32_t sample_locs_8x[] = {
86 FILL_SREG(-3, -5, 5, 1, -1, 3, 7, -7),
87 FILL_SREG(-7, -1, 3, 7, -5, 5, 1, -3),
88 /* The following are unused by hardware, but we emit them to IBs
89 * instead of multiple SET_CONTEXT_REG packets. */
90 0,
91 0,
92 };
93 static const uint64_t centroid_priority_8x = 0x3546012735460127ull;
94
95 /* 16x MSAA (the positions are sorted for EQAA) */
96 static const uint32_t sample_locs_16x[] = {
97 FILL_SREG(-5, -2, 5, 3, -2, 6, 3, -5),
98 FILL_SREG(-4, -6, 1, 1, -6, 4, 7, -4),
99 FILL_SREG(-1, -3, 6, 7, -3, 2, 0, -7),
100 /* We use -7 where DX sample locations want -8, which allows us to make
101 * the PA_SU_PRIM_FILTER_CNTL register immutable. That's a quality compromise
102 * for underused 16x EQAA.
103 */
104 FILL_SREG(-7, -7 /* DX uses -8 */, 2, 5, -7 /* DX uses -8 */, 0, 4, -1),
105 };
106 static const uint64_t centroid_priority_16x = 0xc97e64b231d0fa85ull;
107
108 /* distance from the pixel center, indexed by log2(nr_samples) */
109 unsigned si_msaa_max_distance[5] = {
110 0, /* no AA */
111 4, /* 2x MSAA */
112 6, /* 4x MSAA */
113 7, /* 8x MSAA */
114 7, /* 16x MSAA */
115 };
116
si_get_sample_position(struct pipe_context * ctx,unsigned sample_count,unsigned sample_index,float * out_value)117 static void si_get_sample_position(struct pipe_context *ctx, unsigned sample_count,
118 unsigned sample_index, float *out_value)
119 {
120 const uint32_t *sample_locs;
121
122 switch (sample_count) {
123 case 1:
124 default:
125 sample_locs = &sample_locs_1x;
126 break;
127 case 2:
128 sample_locs = &sample_locs_2x;
129 break;
130 case 4:
131 sample_locs = &sample_locs_4x;
132 break;
133 case 8:
134 sample_locs = sample_locs_8x;
135 break;
136 case 16:
137 sample_locs = sample_locs_16x;
138 break;
139 }
140
141 out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f;
142 out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f;
143 }
144
si_emit_max_4_sample_locs(struct si_context * sctx,uint64_t centroid_priority,uint32_t sample_locs)145 static void si_emit_max_4_sample_locs(struct si_context *sctx, uint64_t centroid_priority,
146 uint32_t sample_locs)
147 {
148 if (sctx->screen->info.has_set_context_pairs_packed) {
149 radeon_begin(&sctx->gfx_cs);
150 gfx11_begin_packed_context_regs();
151 gfx11_set_context_reg(R_028BD4_PA_SC_CENTROID_PRIORITY_0, centroid_priority);
152 gfx11_set_context_reg(R_028BD8_PA_SC_CENTROID_PRIORITY_1, centroid_priority >> 32);
153 gfx11_set_context_reg(R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs);
154 gfx11_set_context_reg(R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs);
155 gfx11_set_context_reg(R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs);
156 gfx11_set_context_reg(R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs);
157 gfx11_end_packed_context_regs();
158 radeon_end();
159 } else {
160 radeon_begin(&sctx->gfx_cs);
161 radeon_set_context_reg_seq(R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
162 radeon_emit(centroid_priority);
163 radeon_emit(centroid_priority >> 32);
164 radeon_set_context_reg(R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs);
165 radeon_set_context_reg(R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs);
166 radeon_set_context_reg(R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs);
167 radeon_set_context_reg(R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs);
168 radeon_end();
169 }
170 }
171
si_emit_max_16_sample_locs(struct si_context * sctx,uint64_t centroid_priority,const uint32_t * sample_locs,unsigned num_samples)172 static void si_emit_max_16_sample_locs(struct si_context *sctx, uint64_t centroid_priority,
173 const uint32_t *sample_locs, unsigned num_samples)
174 {
175 radeon_begin(&sctx->gfx_cs);
176 radeon_set_context_reg_seq(R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
177 radeon_emit(centroid_priority);
178 radeon_emit(centroid_priority >> 32);
179 radeon_set_context_reg_seq(R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
180 num_samples == 8 ? 14 : 16);
181 radeon_emit_array(sample_locs, 4);
182 radeon_emit_array(sample_locs, 4);
183 radeon_emit_array(sample_locs, 4);
184 radeon_emit_array(sample_locs, num_samples == 8 ? 2 : 4);
185 radeon_end();
186 }
187
si_emit_sample_locations(struct si_context * sctx,unsigned index)188 static void si_emit_sample_locations(struct si_context *sctx, unsigned index)
189 {
190 struct radeon_cmdbuf *cs = &sctx->gfx_cs;
191 struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
192 unsigned nr_samples = sctx->framebuffer.nr_samples;
193
194 /* Smoothing (only possible with nr_samples == 1) uses the same
195 * sample locations as the MSAA it simulates.
196 */
197 if (nr_samples <= 1 && sctx->smoothing_enabled)
198 nr_samples = SI_NUM_SMOOTH_AA_SAMPLES;
199
200 /* Always set MSAA sample locations even with 1x MSAA for simplicity.
201 *
202 * The only chips that don't need to set them for 1x MSAA are GFX6-8 except Polaris,
203 * but there is no benefit in not resetting them to 0 when changing framebuffers from MSAA
204 * to non-MSAA.
205 */
206 if (nr_samples != sctx->sample_locs_num_samples) {
207 switch (nr_samples) {
208 default:
209 case 1:
210 si_emit_max_4_sample_locs(sctx, centroid_priority_1x, sample_locs_1x);
211 break;
212 case 2:
213 si_emit_max_4_sample_locs(sctx, centroid_priority_2x, sample_locs_2x);
214 break;
215 case 4:
216 si_emit_max_4_sample_locs(sctx, centroid_priority_4x, sample_locs_4x);
217 break;
218 case 8:
219 si_emit_max_16_sample_locs(sctx, centroid_priority_8x, sample_locs_8x, 8);
220 break;
221 case 16:
222 si_emit_max_16_sample_locs(sctx, centroid_priority_16x, sample_locs_16x, 16);
223 break;
224 }
225 sctx->sample_locs_num_samples = nr_samples;
226 }
227
228 if (sctx->screen->info.has_small_prim_filter_sample_loc_bug) {
229 /* For hardware with the sample location bug, the problem is that in order to use the small
230 * primitive filter, we need to explicitly set the sample locations to 0. But the DB doesn't
231 * properly process the change of sample locations without a flush, and so we can end up
232 * with incorrect Z values.
233 *
234 * Instead of doing a flush, just disable the small primitive filter when MSAA is
235 * force-disabled.
236 *
237 * The alternative of setting sample locations to 0 would require a DB flush to avoid
238 * Z errors, see https://bugs.freedesktop.org/show_bug.cgi?id=96908
239 */
240 bool small_prim_filter_enable = sctx->framebuffer.nr_samples <= 1 || rs->multisample_enable;
241 assert(sctx->family >= CHIP_POLARIS10);
242
243 radeon_begin(cs);
244 radeon_opt_set_context_reg(sctx, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
245 SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL,
246 S_028830_SMALL_PRIM_FILTER_ENABLE(small_prim_filter_enable) |
247 /* Small line culling doesn't work on Polaris10-12. */
248 S_028830_LINE_FILTER_DISABLE(sctx->family <= CHIP_POLARIS12));
249 radeon_end();
250 }
251 }
252
si_init_msaa_functions(struct si_context * sctx)253 void si_init_msaa_functions(struct si_context *sctx)
254 {
255 int i;
256
257 sctx->atoms.s.sample_locations.emit = si_emit_sample_locations;
258 sctx->b.get_sample_position = si_get_sample_position;
259
260 si_get_sample_position(&sctx->b, 1, 0, sctx->sample_positions.x1[0]);
261
262 for (i = 0; i < 2; i++)
263 si_get_sample_position(&sctx->b, 2, i, sctx->sample_positions.x2[i]);
264 for (i = 0; i < 4; i++)
265 si_get_sample_position(&sctx->b, 4, i, sctx->sample_positions.x4[i]);
266 for (i = 0; i < 8; i++)
267 si_get_sample_position(&sctx->b, 8, i, sctx->sample_positions.x8[i]);
268 for (i = 0; i < 16; i++)
269 si_get_sample_position(&sctx->b, 16, i, sctx->sample_positions.x16[i]);
270 }
271