• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2019 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sub license, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
15  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
16  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
17  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
18  * USE OR OTHER DEALINGS IN THE SOFTWARE.
19  *
20  * The above copyright notice and this permission notice (including the
21  * next paragraph) shall be included in all copies or substantial portions
22  * of the Software.
23  *
24  */
25 
26 #include "ac_llvm_cull.h"
27 
28 #include <llvm-c/Core.h>
29 
30 struct ac_position_w_info {
31    /* If a primitive intersects the W=0 plane, it causes a reflection
32     * of the determinant used for face culling. Every vertex behind
33     * the W=0 plane negates the determinant, so having 2 vertices behind
34     * the plane has no effect. This is i1 true if the determinant should be
35     * negated.
36     */
37    LLVMValueRef w_reflection;
38 
39    /* If we simplify the "-w <= p <= w" view culling equation, we get
40     * "-w <= w", which can't be satisfied when w is negative.
41     * In perspective projection, a negative W means that the primitive
42     * is behind the viewer, but the equation is independent of the type
43     * of projection.
44     *
45     * w_accepted is false when all W are negative and therefore
46     * the primitive is invisible.
47     */
48    LLVMValueRef w_accepted;
49 
50    /* The bounding box culling doesn't work and should be skipped when this is true. */
51    LLVMValueRef any_w_negative;
52 };
53 
/* Inspect the sign of W of the first num_vertices vertices in pos and
 * fill in all members of *w (see struct ac_position_w_info).
 */
static void ac_analyze_position_w(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
                                  struct ac_position_w_info *w, unsigned num_vertices)
{
   LLVMBuilderRef b = ctx->builder;

   /* Running results: parity (XOR), "any" (OR) and "all" (AND) of "W < 0". */
   LLVMValueRef reflection = ctx->i1false;
   LLVMValueRef any_negative = ctx->i1false;
   LLVMValueRef all_negative = ctx->i1true;

   for (unsigned vtx = 0; vtx < num_vertices; vtx++) {
      LLVMValueRef w_is_negative =
         LLVMBuildFCmp(b, LLVMRealOLT, pos[vtx][3], ctx->f32_0, "");

      /* Every negative W toggles the reflection flag. */
      reflection = LLVMBuildXor(b, reflection, w_is_negative, "");
      any_negative = LLVMBuildOr(b, any_negative, w_is_negative, "");
      all_negative = LLVMBuildAnd(b, all_negative, w_is_negative, "");
   }

   w->w_reflection = reflection;
   w->any_w_negative = any_negative;
   /* The primitive is rejected only if all vertices have a negative W. */
   w->w_accepted = LLVMBuildNot(b, all_negative, "");
}
74 
75 /* Perform front/back face culling and return true if the primitive is accepted. */
ac_cull_face(struct ac_llvm_context * ctx,LLVMValueRef pos[3][4],struct ac_position_w_info * w,bool cull_front,bool cull_back,bool cull_zero_area)76 static LLVMValueRef ac_cull_face(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
77                                  struct ac_position_w_info *w, bool cull_front, bool cull_back,
78                                  bool cull_zero_area)
79 {
80    LLVMBuilderRef builder = ctx->builder;
81 
82    if (cull_front && cull_back)
83       return ctx->i1false;
84 
85    if (!cull_front && !cull_back && !cull_zero_area)
86       return ctx->i1true;
87 
88    /* Front/back face culling. Also if the determinant == 0, the triangle
89     * area is 0.
90     */
91    LLVMValueRef det_t0 = LLVMBuildFSub(builder, pos[2][0], pos[0][0], "");
92    LLVMValueRef det_t1 = LLVMBuildFSub(builder, pos[1][1], pos[0][1], "");
93    LLVMValueRef det_t2 = LLVMBuildFSub(builder, pos[0][0], pos[1][0], "");
94    LLVMValueRef det_t3 = LLVMBuildFSub(builder, pos[0][1], pos[2][1], "");
95    /* t0 * t1 - t2 * t3  =  t2 * -t3 + t0 * t1  =  fma(t2, -t3, t0 * t1) */
96    LLVMValueRef det = ac_build_fmad(ctx, det_t2, LLVMBuildFNeg(builder, det_t3, ""),
97                                     LLVMBuildFMul(builder, det_t0, det_t1, ""));
98 
99    /* Negative W negates the determinant. */
100    det = LLVMBuildSelect(builder, w->w_reflection, LLVMBuildFNeg(builder, det, ""), det, "");
101 
102    LLVMValueRef accepted = NULL;
103    if (cull_front) {
104       LLVMRealPredicate cond = cull_zero_area ? LLVMRealOGT : LLVMRealOGE;
105       accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, "");
106    } else if (cull_back) {
107       LLVMRealPredicate cond = cull_zero_area ? LLVMRealOLT : LLVMRealOLE;
108       accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, "");
109    } else if (cull_zero_area) {
110       accepted = LLVMBuildFCmp(builder, LLVMRealONE, det, ctx->f32_0, "");
111    }
112 
113    if (accepted) {
114       /* Don't reject NaN and +/-infinity, these are tricky.
115        * Just trust fixed-function HW to handle these cases correctly.
116        */
117       accepted = LLVMBuildOr(builder, accepted, ac_build_is_inf_or_nan(ctx, det), "");
118    }
119 
120    return accepted;
121 }
122 
/* Rotate the 2D vector v by 45 degrees in place:
 *
 *    x2  =  x*cos45 - y*sin45
 *    y2  =  x*sin45 + y*cos45
 *
 * Because sin(45) == cos(45), a single constant is used for both.
 */
static void rotate_45degrees(struct ac_llvm_context *ctx, LLVMValueRef v[2])
{
   LLVMValueRef sincos45 = LLVMConstReal(ctx->f32, 0.707106781);

   /* x*sincos45 is shared by both results. */
   LLVMValueRef x_term = LLVMBuildFMul(ctx->builder, v[0], sincos45, "");

   /* Doing 2x ffma while duplicating the multiplication is 33% faster than fmul+fadd+fadd. */
   LLVMValueRef neg_y = LLVMBuildFNeg(ctx->builder, v[1], "");
   LLVMValueRef rotated_x = ac_build_fmad(ctx, neg_y, sincos45, x_term);
   LLVMValueRef rotated_y = ac_build_fmad(ctx, v[1], sincos45, x_term);

   v[0] = rotated_x;
   v[1] = rotated_y;
}
141 
142 /* Perform view culling and small primitive elimination and return true
143  * if the primitive is accepted and initially_accepted == true. */
cull_bbox(struct ac_llvm_context * ctx,LLVMValueRef pos[3][4],LLVMValueRef initially_accepted,struct ac_position_w_info * w,LLVMValueRef vp_scale[2],LLVMValueRef vp_translate[2],LLVMValueRef small_prim_precision,LLVMValueRef clip_half_line_width[2],struct ac_cull_options * options,ac_cull_accept_func accept_func,void * userdata)144 static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
145                       LLVMValueRef initially_accepted, struct ac_position_w_info *w,
146                       LLVMValueRef vp_scale[2], LLVMValueRef vp_translate[2],
147                       LLVMValueRef small_prim_precision,
148                       LLVMValueRef clip_half_line_width[2],
149                       struct ac_cull_options *options,
150                       ac_cull_accept_func accept_func, void *userdata)
151 {
152    LLVMBuilderRef builder = ctx->builder;
153 
154    if (!options->cull_view_xy && !options->cull_view_near_z && !options->cull_view_far_z &&
155        !options->cull_small_prims) {
156       if (accept_func)
157          accept_func(ctx, initially_accepted, userdata);
158       return;
159    }
160 
161    ac_build_ifcc(ctx, initially_accepted, 10000000);
162    {
163       LLVMValueRef bbox_min[3], bbox_max[3];
164       LLVMValueRef accepted = ctx->i1true;
165 
166       /* Compute the primitive bounding box for easy culling. */
167       for (unsigned chan = 0; chan < (options->cull_view_near_z ||
168                                       options->cull_view_far_z ? 3 : 2); chan++) {
169          assert(options->num_vertices >= 2);
170          bbox_min[chan] = ac_build_fmin(ctx, pos[0][chan], pos[1][chan]);
171          bbox_max[chan] = ac_build_fmax(ctx, pos[0][chan], pos[1][chan]);
172 
173          if (options->num_vertices == 3) {
174             bbox_min[chan] = ac_build_fmin(ctx, bbox_min[chan], pos[2][chan]);
175             bbox_max[chan] = ac_build_fmax(ctx, bbox_max[chan], pos[2][chan]);
176          }
177 
178          if (clip_half_line_width[chan]) {
179             bbox_min[chan] = LLVMBuildFSub(builder, bbox_min[chan], clip_half_line_width[chan], "");
180             bbox_max[chan] = LLVMBuildFAdd(builder, bbox_max[chan], clip_half_line_width[chan], "");
181          }
182       }
183 
184       /* View culling. */
185       if (options->cull_view_xy || options->cull_view_near_z || options->cull_view_far_z) {
186          for (unsigned chan = 0; chan < 3; chan++) {
187             LLVMValueRef visible;
188 
189             if ((options->cull_view_xy && chan <= 1) || (options->cull_view_near_z && chan == 2)) {
190                float t = chan == 2 && options->use_halfz_clip_space ? 0 : -1;
191                visible = LLVMBuildFCmp(builder, LLVMRealOGE, bbox_max[chan],
192                                        LLVMConstReal(ctx->f32, t), "");
193                accepted = LLVMBuildAnd(builder, accepted, visible, "");
194             }
195 
196             if ((options->cull_view_xy && chan <= 1) || (options->cull_view_far_z && chan == 2)) {
197                visible = LLVMBuildFCmp(builder, LLVMRealOLE, bbox_min[chan], ctx->f32_1, "");
198                accepted = LLVMBuildAnd(builder, accepted, visible, "");
199             }
200          }
201       }
202 
203       /* Small primitive culling - triangles. */
204       if (options->cull_small_prims && options->num_vertices == 3) {
205          /* Assuming a sample position at (0.5, 0.5), if we round
206           * the bounding box min/max extents and the results of
207           * the rounding are equal in either the X or Y direction,
208           * the bounding box does not intersect the sample.
209           *
210           * See these GDC slides for pictures:
211           * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
212           */
213          LLVMValueRef min, max, not_equal[2], visible;
214 
215          for (unsigned chan = 0; chan < 2; chan++) {
216             /* Convert the position to screen-space coordinates. */
217             min = ac_build_fmad(ctx, bbox_min[chan], vp_scale[chan], vp_translate[chan]);
218             max = ac_build_fmad(ctx, bbox_max[chan], vp_scale[chan], vp_translate[chan]);
219             /* Scale the bounding box according to the precision of
220              * the rasterizer and the number of MSAA samples. */
221             min = LLVMBuildFSub(builder, min, small_prim_precision, "");
222             max = LLVMBuildFAdd(builder, max, small_prim_precision, "");
223 
224             /* Determine if the bbox intersects the sample point.
225              * It also works for MSAA, but vp_scale, vp_translate,
226              * and small_prim_precision are computed differently.
227              */
228             min = ac_build_round(ctx, min);
229             max = ac_build_round(ctx, max);
230             not_equal[chan] = LLVMBuildFCmp(builder, LLVMRealONE, min, max, "");
231          }
232          visible = LLVMBuildAnd(builder, not_equal[0], not_equal[1], "");
233          accepted = LLVMBuildAnd(builder, accepted, visible, "");
234       }
235 
236       /* Small primitive culling - lines. */
237       if (options->cull_small_prims && options->num_vertices == 2) {
238          /* This only works with lines without perpendicular end caps (lines with perpendicular
239           * end caps are rasterized as quads and thus can't be culled as small prims in 99% of
240           * cases because line_width >= 1).
241           *
242           * This takes advantage of the diamont exit rule, which says that every pixel
243           * has a diamond inside it touching the pixel boundary and only if a line exits
244           * the diamond, that pixel is filled. If a line enters the diamond or stays
245           * outside the diamond, the pixel isn't filled.
246           *
247           * This algorithm is a little simpler than that. The space outside all diamonds also
248           * has the same diamond shape, which we'll call corner diamonds.
249           *
250           * The idea is to cull all lines that are entirely inside a diamond, including
251           * corner diamonds. If a line is entirely inside a diamond, it can be culled because
252           * it doesn't exit it. If a line is entirely inside a corner diamond, it can be culled
253           * because it doesn't enter any diamond and thus can't exit any diamond.
254           *
255           * The viewport is rotated by 45 degress to turn diamonds into squares, and a bounding
256           * box test is used to determine whether a line is entirely inside any square (diamond).
257           *
258           * The line width doesn't matter. Wide lines only duplicate filled pixels in either X or
259           * Y direction from the filled pixels. MSAA also doesn't matter. MSAA should ideally use
260           * perpendicular end caps that enable quad rasterization for lines. Thus, this should
261           * always use non-MSAA viewport transformation and non-MSAA small prim precision.
262           *
263           * A good test is piglit/lineloop because it draws 10k subpixel lines in a circle.
264           * It should contain no holes if this matches hw behavior.
265           */
266          LLVMValueRef v0[2], v1[2];
267 
268          /* Get vertex positions in pixels. */
269          for (unsigned chan = 0; chan < 2; chan++) {
270             v0[chan] = ac_build_fmad(ctx, pos[0][chan], vp_scale[chan], vp_translate[chan]);
271             v1[chan] = ac_build_fmad(ctx, pos[1][chan], vp_scale[chan], vp_translate[chan]);
272          }
273 
274          /* Rotate the viewport by 45 degress, so that diamonds become squares. */
275          rotate_45degrees(ctx, v0);
276          rotate_45degrees(ctx, v1);
277 
278          LLVMValueRef not_equal[2];
279 
280          for (unsigned chan = 0; chan < 2; chan++) {
281             /* The width of each square is sqrt(0.5), so scale it to 1 because we want
282              * round() to give us the position of the closest center of a square (diamond).
283              */
284             v0[chan] = LLVMBuildFMul(builder, v0[chan], LLVMConstReal(ctx->f32, 1.414213562), "");
285             v1[chan] = LLVMBuildFMul(builder, v1[chan], LLVMConstReal(ctx->f32, 1.414213562), "");
286 
287             /* Compute the bounding box around both vertices. We do this because we must
288              * enlarge the line area by the precision of the rasterizer.
289              */
290             LLVMValueRef min = ac_build_fmin(ctx, v0[chan], v1[chan]);
291             LLVMValueRef max = ac_build_fmax(ctx, v0[chan], v1[chan]);
292 
293             /* Enlarge the bounding box by the precision of the rasterizer. */
294             min = LLVMBuildFSub(builder, min, small_prim_precision, "");
295             max = LLVMBuildFAdd(builder, max, small_prim_precision, "");
296 
297             /* Round the bounding box corners. If both rounded corners are equal,
298              * the bounding box is entirely inside a square (diamond).
299              */
300             min = ac_build_round(ctx, min);
301             max = ac_build_round(ctx, max);
302             not_equal[chan] = LLVMBuildFCmp(builder, LLVMRealONE, min, max, "");
303          }
304 
305          accepted = LLVMBuildAnd(builder, accepted,
306                                  LLVMBuildOr(builder, not_equal[0], not_equal[1], ""), "");
307       }
308 
309       /* Disregard the bounding box culling if any W is negative because the code
310        * doesn't work with that.
311        */
312       accepted = LLVMBuildOr(builder, accepted, w->any_w_negative, "");
313 
314       if (accept_func)
315          accept_func(ctx, accepted, userdata);
316    }
317    ac_build_endif(ctx, 10000000);
318 }
319 
320 /**
321  * Return i1 true if the primitive is accepted (not culled).
322  *
323  * \param pos                   Vertex positions 3x vec4
324  * \param initially_accepted    AND'ed with the result. Some computations can be
325  *                              skipped if this is false.
326  * \param vp_scale              Viewport scale XY.
327  *                              For MSAA, multiply them by the number of samples.
328  * \param vp_translate          Viewport translation XY.
329  *                              For MSAA, multiply them by the number of samples.
330  * \param small_prim_precision  Precision of small primitive culling. This should
331  *                              be the same as or greater than the precision of
332  *                              the rasterizer. Set to num_samples / 2^subpixel_bits.
333  *                              subpixel_bits are defined by the quantization mode.
334  * \param options               See ac_cull_options.
335  * \param accept_func           Callback invoked in the inner-most branch where the primitive is accepted.
336  */
ac_cull_primitive(struct ac_llvm_context * ctx,LLVMValueRef pos[3][4],LLVMValueRef initially_accepted,LLVMValueRef vp_scale[2],LLVMValueRef vp_translate[2],LLVMValueRef small_prim_precision,LLVMValueRef clip_half_line_width[2],struct ac_cull_options * options,ac_cull_accept_func accept_func,void * userdata)337 void ac_cull_primitive(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
338                        LLVMValueRef initially_accepted, LLVMValueRef vp_scale[2],
339                        LLVMValueRef vp_translate[2], LLVMValueRef small_prim_precision,
340                        LLVMValueRef clip_half_line_width[2], struct ac_cull_options *options,
341                        ac_cull_accept_func accept_func, void *userdata)
342 {
343    struct ac_position_w_info w;
344    ac_analyze_position_w(ctx, pos, &w, options->num_vertices);
345 
346    /* W culling. */
347    LLVMValueRef accepted = options->cull_w ? w.w_accepted : ctx->i1true;
348    accepted = LLVMBuildAnd(ctx->builder, accepted, initially_accepted, "");
349 
350    /* Face culling. */
351    accepted = LLVMBuildAnd(
352       ctx->builder, accepted,
353       ac_cull_face(ctx, pos, &w, options->cull_front, options->cull_back, options->cull_zero_area),
354       "");
355 
356    /* View culling and small primitive elimination. */
357    cull_bbox(ctx, pos, accepted, &w, vp_scale, vp_translate, small_prim_precision,
358              clip_half_line_width, options, accept_func, userdata);
359 }
360