/**************************************************************************
 *
 * Copyright 2007-2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * Rasterization for binned triangles within a tile
 */

#include <limits.h>
#include "util/u_math.h"
#include "lp_debug.h"
#include "lp_perf.h"
#include "lp_rast_priv.h"

/**
 * Shade all pixels in a 4x4 block.
 */
static void
block_full_4(struct lp_rasterizer_task *task,
             const struct lp_rast_triangle *tri,
             int x, int y)
{
   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
}


/**
 * Shade all pixels in a 16x16 block.
 */
static void
block_full_16(struct lp_rasterizer_task *task,
              const struct lp_rast_triangle *tri,
              int x, int y)
{
   unsigned ix, iy;
   assert(x % 16 == 0);
   assert(y % 16 == 0);
   for (iy = 0; iy < 16; iy += 4)
      for (ix = 0; ix < 16; ix += 4)
         block_full_4(task, tri, x + ix, y + iy);
}

static inline unsigned
build_mask_linear(int32_t c, int32_t dcdx, int32_t dcdy)
{
   unsigned mask = 0;

   int32_t c0 = c;
   int32_t c1 = c0 + dcdy;
   int32_t c2 = c1 + dcdy;
   int32_t c3 = c2 + dcdy;

   mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
   mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
   mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
   mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
   mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
   mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
   mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
   mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
   mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
   mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
   mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
   mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
   mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
   mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
   mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
   mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);

   return mask;
}
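
/*
 * A loop-form sketch equivalent to the unrolled code above, for
 * reference (illustrative only):
 *
 *    unsigned mask = 0, bit = 0;
 *    for (int iy = 0; iy < 4; iy++, c += dcdy)
 *       for (int ix = 0; ix < 4; ix++, bit++)
 *          mask |= ((c + ix * dcdx) >> 31) & (1u << bit);
 *
 * Bit 4*iy+ix of the result is set when the plane value for that pixel
 * of the 4x4 stamp is negative, i.e. the pixel fails the plane test.
 */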


static inline void
build_masks(int32_t c,
            int32_t cdiff,
            int32_t dcdx,
            int32_t dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   *outmask |= build_mask_linear(c, dcdx, dcdy);
   *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
}
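
/*
 * Two masks accumulate per call: *outmask collects 4x4 stamps whose
 * value c is already negative, *partmask those that only go negative
 * once cdiff is added.  The caller derives c and cdiff from a plane's
 * trivial-reject and trivial-accept offsets, so the two masks separate
 * fully-outside stamps from ones that need per-pixel tests.
 */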

void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_3(task, arg2);
}
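
/*
 * The fixed-size variants here simply enable all n edge planes
 * (plane_mask = (1<<n)-1) and hand off to the general n-plane
 * rasterizers instantiated from lp_rast_tri_tmp.h at the end of this
 * file.
 */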

void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_3_16(task, arg);
}

void
lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_4(task, arg2);
}

void
lp_rast_triangle_ms_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_ms_3(task, arg2);
}

void
lp_rast_triangle_ms_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_ms_3_16(task, arg);
}

void
lp_rast_triangle_ms_4_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_ms_4(task, arg2);
}
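
/*
 * The _ms_ entrypoints are the multisample flavours of the same
 * pattern; their general rasterizers are generated with MULTISAMPLE
 * defined (see the template includes at the end of this file).
 */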

#if defined(PIPE_ARCH_SSE)

#include <emmintrin.h>
#include "util/u_sse.h"


static inline void
build_masks_sse(int c,
                int cdiff,
                int dcdx,
                int dcdy,
                unsigned *outmask,
                unsigned *partmask)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *outmask |= _mm_movemask_epi8(result);
   }


   {
      __m128i cio4 = _mm_set1_epi32(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = _mm_add_epi32(cstep0, cio4);
      cstep1 = _mm_add_epi32(cstep1, cio4);
      cstep2 = _mm_add_epi32(cstep2, cio4);
      cstep3 = _mm_add_epi32(cstep3, cio4);

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *partmask |= _mm_movemask_epi8(result);
   }
}


static inline unsigned
build_mask_linear_sse(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return _mm_movemask_epi8(result);
}
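
/*
 * Note that _mm_packs_epi32 / _mm_packs_epi16 use signed saturation,
 * so a negative 32-bit lane remains negative through both narrowing
 * steps.  Every lane's sign bit therefore survives into the packed
 * epi8 vector, and _mm_movemask_epi8 yields the same 16-bit mask as
 * the scalar build_mask_linear above.
 */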

static inline unsigned
sign_bits4(const __m128i *cstep, int cdiff)
{
   /* Adjust the step values
    */
   __m128i cio4 = _mm_set1_epi32(cdiff);
   __m128i cstep0 = _mm_add_epi32(cstep[0], cio4);
   __m128i cstep1 = _mm_add_epi32(cstep[1], cio4);
   __m128i cstep2 = _mm_add_epi32(cstep[2], cio4);
   __m128i cstep3 = _mm_add_epi32(cstep[3], cio4);

   /* Pack down to epi8
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* Extract the sign bits
    */
   return _mm_movemask_epi8(result);
}

#define COLUMN0 ((1<<0)|(1<<4)|(1<<8) |(1<<12))
#define COLUMN1 ((1<<1)|(1<<5)|(1<<9) |(1<<13))
#define COLUMN2 ((1<<2)|(1<<6)|(1<<10)|(1<<14))
#define COLUMN3 ((1<<3)|(1<<7)|(1<<11)|(1<<15))

#define ROW0 ((1<<0) |(1<<1) |(1<<2) |(1<<3))
#define ROW1 ((1<<4) |(1<<5) |(1<<6) |(1<<7))
#define ROW2 ((1<<8) |(1<<9) |(1<<10)|(1<<11))
#define ROW3 ((1<<12)|(1<<13)|(1<<14)|(1<<15))

#define STAMP_SIZE 4
static unsigned bottom_mask_tab[STAMP_SIZE] = {
   ROW3,
   ROW3 | ROW2,
   ROW3 | ROW2 | ROW1,
   ROW3 | ROW2 | ROW1 | ROW0,
};

static unsigned right_mask_tab[STAMP_SIZE] = {
   COLUMN3,
   COLUMN3 | COLUMN2,
   COLUMN3 | COLUMN2 | COLUMN1,
   COLUMN3 | COLUMN2 | COLUMN1 | COLUMN0,
};
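
/*
 * bottom_mask_tab[n] selects the bottom n+1 rows of a 4x4 stamp and
 * right_mask_tab[n] the rightmost n+1 columns, in the same bit layout
 * build_mask_linear produces (bit 4*y+x per pixel).
 */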


#define NR_PLANES 3

void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   /* plane_mask doubles as the block position here: x offset of the
    * 16x16 block in the low byte, y offset in the next byte.
    */
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
   __m128i zero = _mm_setzero_si128();

   __m128i c, dcdx, dcdy, rej4;
   __m128i dcdx_neg_mask, dcdy_neg_mask;
   __m128i dcdx2, dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &unused, &dcdx, &dcdy);

   /* recalc eo - easier than trying to load as scalars / shuffle... */
   dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
   dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
   rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
                        _mm_and_si128(dcdx_neg_mask, dcdx));

   /* Adjust dcdx.
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
   rej4 = _mm_slli_epi32(rej4, 2);

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison.
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            unsigned mask = _mm_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}
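
/*
 * The function above walks the 16x16 block as sixteen 4x4 stamps:
 * rej4 holds each plane's trivial-reject offset scaled to a 4x4 stamp,
 * so a whole stamp is skipped without per-pixel work whenever any
 * plane rejects all of it.  Partially covered stamps are queued in
 * out[] and shaded in a second pass, keeping the inner loop free of
 * shader calls.
 */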

void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   /* plane_mask doubles as the block position here: x offset of the
    * 4x4 block in the low byte, y offset in the next byte.
    */
   unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
   unsigned y = (arg.triangle.plane_mask >> 8) + task->y;

   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
   __m128i zero = _mm_setzero_si128();

   __m128i c, dcdx, dcdy;
   __m128i dcdx2, dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &unused, &dcdx, &dcdy);

   /* Adjust dcdx.
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison.
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);


   {
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);

      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      unsigned mask = _mm_movemask_epi8(c_0123);

      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task,
                                  &tri->inputs,
                                  x,
                                  y,
                                  0xffff & ~mask);
   }
}

#undef NR_PLANES

#else

#if defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN

#include <altivec.h>
#include "util/u_pwr8.h"
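
/*
 * util/u_pwr8.h provides SSE-style wrappers (vec_add_epi32,
 * vec_packs_epi32, vec_movemask_epi8, ...) over the POWER8
 * AltiVec/VSX intrinsics, so the code below mirrors the SSE path
 * almost line for line.
 */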

static inline void
build_masks_ppc(int c,
                int cdiff,
                int dcdx,
                int dcdy,
                unsigned *outmask,
                unsigned *partmask)
{
   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = (__m128i) vec_splats(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = vec_packs_epi32(cstep0, cstep1);
      cstep23 = vec_packs_epi32(cstep2, cstep3);
      result = vec_packs_epi16(cstep01, cstep23);

      *outmask |= vec_movemask_epi8(result);
   }


   {
      __m128i cio4 = (__m128i) vec_splats(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = vec_add_epi32(cstep0, cio4);
      cstep1 = vec_add_epi32(cstep1, cio4);
      cstep2 = vec_add_epi32(cstep2, cio4);
      cstep3 = vec_add_epi32(cstep3, cio4);

      cstep01 = vec_packs_epi32(cstep0, cstep1);
      cstep23 = vec_packs_epi32(cstep2, cstep3);
      result = vec_packs_epi16(cstep01, cstep23);

      *partmask |= vec_movemask_epi8(result);
   }
}

static inline unsigned
build_mask_linear_ppc(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = (__m128i) vec_splats(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = vec_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = vec_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = vec_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return vec_movemask_epi8(result);
}

static inline __m128i
lp_plane_to_m128i(const struct lp_rast_plane *plane)
{
   return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
                         (int32_t)plane->dcdy, (int32_t)plane->eo);
}

#define NR_PLANES 3

void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   /* plane_mask doubles as the block position here: x offset of the
    * 16x16 block in the low byte, y offset in the next byte.
    */
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = vec_splats((unsigned char) 0);

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   __m128i vshuf_mask0;
   __m128i vshuf_mask1;
   __m128i vshuf_mask2;

#if UTIL_ARCH_LITTLE_ENDIAN
   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100);
   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504);
   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908);
#else
   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F);
   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B);
   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607);
#endif

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Adjust dcdx.
    */
   dcdx = vec_sub_epi32(zero, dcdx);

   c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x)));
   c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y)));
   rej4 = vec_slli_epi32(rej4, 2);

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison.
    */
   c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1));
   rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1));

   dcdx2 = vec_add_epi32(dcdx, dcdx);
   dcdx3 = vec_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = vec_add_epi32(cx, rej4);
         __m128i rej_masks = vec_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (vec_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0);
            __m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1);
            __m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2);

            __m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0);

            __m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1);
            __m128i c_01 = vec_packs_epi32(c_0, c_1);

            __m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2);

            __m128i c0_3 = vec_add_epi32(c0_2, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3);
            __m128i c_23 = vec_packs_epi32(c_2, c_3);
            __m128i c_0123 = vec_packs_epi16(c_01, c_23);

            unsigned mask = vec_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2));
      }

      c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}

#undef NR_PLANES

#else

void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_32_3(task, arg2);
}

#endif /* _ARCH_PWR8 && UTIL_ARCH_LITTLE_ENDIAN */

void
lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_32_4(task, arg2);
}

void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_32_3_16(task, arg);
}

#endif

#if defined(PIPE_ARCH_SSE)
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) \
   build_masks_sse((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) \
   build_mask_linear_sse((int)c, dcdx, dcdy)
#elif (defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN)
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) \
   build_masks_ppc((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) \
   build_mask_linear_ppc((int)c, dcdx, dcdy)
#else
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) \
   build_masks(c, cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) \
   build_mask_linear(c, dcdx, dcdy)
#endif

#define RASTER_64 1

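/*
 * Instantiate the generic triangle rasterizer template for each plane
 * count.  Every inclusion of lp_rast_tri_tmp.h expands TAG(name) to
 * name_<suffix>, emitting a family of entrypoints specialized for the
 * given NR_PLANES (e.g. lp_rast_triangle_3 for three edge planes).
 */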
#define TAG(x) x##_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_4
#define NR_PLANES 4
/*#define TRI_16 lp_rast_triangle_4_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

#undef RASTER_64

#define TAG(x) x##_32_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_4
#define NR_PLANES 4
#ifdef PIPE_ARCH_SSE
#define TRI_16 lp_rast_triangle_32_4_16
#endif
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

#define MULTISAMPLE 1
#define RASTER_64 1

#define TAG(x) x##_ms_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_4
#define NR_PLANES 4
/*#define TRI_16 lp_rast_triangle_4_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

#undef RASTER_64
864