/**************************************************************************
 *
 * Copyright 2007-2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * Rasterization for binned triangles within a tile
 */

#include <limits.h>
#include "util/u_math.h"
#include "lp_debug.h"
#include "lp_perf.h"
#include "lp_rast_priv.h"
/**
 * Shade all pixels in a 4x4 block.
 */
static void
block_full_4(struct lp_rasterizer_task *task,
             const struct lp_rast_triangle *tri,
             int x, int y)
{
   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
}


/**
 * Shade all pixels in a 16x16 block.
 */
static void
block_full_16(struct lp_rasterizer_task *task,
              const struct lp_rast_triangle *tri,
              int x, int y)
{
   assert(x % 16 == 0);
   assert(y % 16 == 0);
   for (unsigned iy = 0; iy < 16; iy += 4)
      for (unsigned ix = 0; ix < 16; ix += 4)
         block_full_4(task, tri, x + ix, y + iy);
}

static inline unsigned
build_mask_linear(int32_t c, int32_t dcdx, int32_t dcdy)
{
   unsigned mask = 0;

   int32_t c0 = c;
   int32_t c1 = c0 + dcdy;
   int32_t c2 = c1 + dcdy;
   int32_t c3 = c2 + dcdy;

   mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
   mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
   mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
   mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
   mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
   mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
   mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
   mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
   mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
   mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
   mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
   mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
   mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
   mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
   mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
   mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);

   return mask;
}
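/*
 * Worked example for build_mask_linear(): bit (iy*4 + ix) of the result
 * is set when c + ix*dcdx + iy*dcdy is negative, i.e. when sample
 * (ix, iy) of the 4x4 grid lies on the negative side of the plane.
 * The >> 31 smears the sign bit across the word and the & picks out a
 * single bit.  E.g. with c = -1, dcdx = 1, dcdy = 4, only the sample at
 * (0, 0) is negative, so the returned mask is 0x0001.
 */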
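/*
 * Helper for the template code in lp_rast_tri_tmp.h: evaluates the same
 * plane at two starting values, c and c + cdiff.  The first mask is
 * OR-ed into *outmask (blocks outside this plane's trivial-reject test)
 * and the second into *partmask (blocks that fail the trivial-accept
 * test and so need per-pixel evaluation).
 */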
96 static inline void
build_masks(int32_t c,int32_t cdiff,int32_t dcdx,int32_t dcdy,unsigned * outmask,unsigned * partmask)97 build_masks(int32_t c,
98             int32_t cdiff,
99             int32_t dcdx,
100             int32_t dcdy,
101             unsigned *outmask,
102             unsigned *partmask)
103 {
104    *outmask |= build_mask_linear(c, dcdx, dcdy);
105    *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
106 }
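/*
 * Fixed-size entry points for 3- and 4-plane triangles: these simply
 * widen the command to the general case by filling in a full plane mask
 * ((1 << NR_PLANES) - 1) and forwarding to the corresponding rasterizer
 * generated from lp_rast_tri_tmp.h.  The 4x4 variants reuse the 16x16
 * paths.
 */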
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_3(task, arg2);
}

void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_3_16(task, arg);
}

void
lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_4(task, arg2);
}

void
lp_rast_triangle_ms_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_ms_3(task, arg2);
}

void
lp_rast_triangle_ms_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_ms_3_16(task, arg);
}

void
lp_rast_triangle_ms_4_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_ms_4(task, arg2);
}

#if defined(PIPE_ARCH_SSE)

#include <emmintrin.h>
#include "util/u_sse.h"
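/*
 * SSE2 counterpart of build_masks().  The 16 plane values are computed
 * in four __m128i rows, then narrowed with saturating packs
 * (epi32 -> epi16 -> epi8); saturation preserves the sign, so a single
 * _mm_movemask_epi8 on the packed bytes yields the 16-bit mask.
 */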
static inline void
build_masks_sse(int c,
                int cdiff,
                int dcdx,
                int dcdy,
                unsigned *outmask,
                unsigned *partmask)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *outmask |= _mm_movemask_epi8(result);
   }

   {
      __m128i cio4 = _mm_set1_epi32(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = _mm_add_epi32(cstep0, cio4);
      cstep1 = _mm_add_epi32(cstep1, cio4);
      cstep2 = _mm_add_epi32(cstep2, cio4);
      cstep3 = _mm_add_epi32(cstep3, cio4);

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *partmask |= _mm_movemask_epi8(result);
   }
}
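/*
 * SSE2 counterpart of build_mask_linear(); same pack-and-movemask trick
 * as build_masks_sse() above, for a single starting value.
 */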
static inline unsigned
build_mask_linear_sse(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return _mm_movemask_epi8(result);
}
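/*
 * Add cdiff to four precomputed row vectors of plane values and return
 * the 16 sign bits.  Presumably consumed by the TRI_16 specialization
 * pulled in from lp_rast_tri_tmp.h below.
 */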
static inline unsigned
sign_bits4(const __m128i *cstep, int cdiff)
{
   /* Adjust the step values
    */
   __m128i cio4 = _mm_set1_epi32(cdiff);
   __m128i cstep0 = _mm_add_epi32(cstep[0], cio4);
   __m128i cstep1 = _mm_add_epi32(cstep[1], cio4);
   __m128i cstep2 = _mm_add_epi32(cstep[2], cio4);
   __m128i cstep3 = _mm_add_epi32(cstep[3], cio4);

   /* Pack down to epi8
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* Extract the sign bits
    */
   return _mm_movemask_epi8(result);
}
#define COLUMN0 ((1<<0)|(1<<4)|(1<<8) |(1<<12))
#define COLUMN1 ((1<<1)|(1<<5)|(1<<9) |(1<<13))
#define COLUMN2 ((1<<2)|(1<<6)|(1<<10)|(1<<14))
#define COLUMN3 ((1<<3)|(1<<7)|(1<<11)|(1<<15))

#define ROW0 ((1<<0) |(1<<1) |(1<<2) |(1<<3))
#define ROW1 ((1<<4) |(1<<5) |(1<<6) |(1<<7))
#define ROW2 ((1<<8) |(1<<9) |(1<<10)|(1<<11))
#define ROW3 ((1<<12)|(1<<13)|(1<<14)|(1<<15))
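/*
 * Lookup tables over a 4x4 stamp's 16-bit mask: entry i covers the last
 * i+1 rows (bottom_mask_tab) or columns (right_mask_tab).  Apparently
 * used by the template code included below to trim stamps at the
 * bottom/right edges of the rasterized region.
 */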
#define STAMP_SIZE 4
static unsigned bottom_mask_tab[STAMP_SIZE] = {
   ROW3,
   ROW3 | ROW2,
   ROW3 | ROW2 | ROW1,
   ROW3 | ROW2 | ROW1 | ROW0,
};

static unsigned right_mask_tab[STAMP_SIZE] = {
   COLUMN3,
   COLUMN3 | COLUMN2,
   COLUMN3 | COLUMN2 | COLUMN1,
   COLUMN3 | COLUMN2 | COLUMN1 | COLUMN0,
};


#define NR_PLANES 3
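/*
 * Rasterize a 16x16 block for a triangle with three 32-bit planes.
 * Note the command encoding: for this opcode arg.triangle.plane_mask
 * carries the block position within the tile (low byte = x offset,
 * next byte = y offset) rather than an actual plane mask.  Each 4x4
 * stamp is first tested against the trivial-reject values (rej4); for
 * stamps that pass, a full 16-bit mask is built and any stamp with
 * coverage is queued in out[] and shaded afterwards.
 */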
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   const int x = (arg.triangle.plane_mask & 0xff) + task->x;
   const int y = (arg.triangle.plane_mask >> 8) + task->y;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
   __m128i zero = _mm_setzero_si128();

   __m128i c, dcdx, dcdy, rej4;
   __m128i dcdx_neg_mask, dcdy_neg_mask;
   __m128i dcdx2, dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &unused, &dcdx, &dcdy);

   /* recalc eo - easier than trying to load as scalars / shuffle... */
   dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
   dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
   rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
                        _mm_and_si128(dcdx_neg_mask, dcdx));

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
   rej4 = _mm_slli_epi32(rej4, 2);

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (unsigned i = 0; i < 4; i++) {
      __m128i cx = c;

      for (unsigned j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            unsigned mask = _mm_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

   for (unsigned i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}
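/*
 * Single-stamp variant of the above: same plane setup and position
 * encoding, but evaluates exactly one 4x4 stamp, with no trivial-reject
 * pretest.
 */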
void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   const unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
   const unsigned y = (arg.triangle.plane_mask >> 8) + task->y;

   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
   __m128i zero = _mm_setzero_si128();

   __m128i c, dcdx, dcdy;
   __m128i dcdx2, dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &unused, &dcdx, &dcdy);

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   {
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);

      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      unsigned mask = _mm_movemask_epi8(c_0123);

      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task,
                                  &tri->inputs,
                                  x,
                                  y,
                                  0xffff & ~mask);
   }
}
#undef NR_PLANES

#else

#if defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN

#include <altivec.h>
#include "util/u_pwr8.h"
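/*
 * POWER8 (VMX) implementations mirroring the SSE path above, written in
 * terms of the SSE-style wrappers from util/u_pwr8.h.
 */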
static inline void
build_masks_ppc(int c,
                int cdiff,
                int dcdx,
                int dcdy,
                unsigned *outmask,
                unsigned *partmask)
{
   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = (__m128i) vec_splats(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = vec_packs_epi32(cstep0, cstep1);
      cstep23 = vec_packs_epi32(cstep2, cstep3);
      result = vec_packs_epi16(cstep01, cstep23);

      *outmask |= vec_movemask_epi8(result);
   }

   {
      __m128i cio4 = (__m128i) vec_splats(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = vec_add_epi32(cstep0, cio4);
      cstep1 = vec_add_epi32(cstep1, cio4);
      cstep2 = vec_add_epi32(cstep2, cio4);
      cstep3 = vec_add_epi32(cstep3, cio4);

      cstep01 = vec_packs_epi32(cstep0, cstep1);
      cstep23 = vec_packs_epi32(cstep2, cstep3);
      result = vec_packs_epi16(cstep01, cstep23);

      *partmask |= vec_movemask_epi8(result);
   }
}
static inline unsigned
build_mask_linear_ppc(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = (__m128i) vec_splats(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = vec_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = vec_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = vec_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return vec_movemask_epi8(result);
}
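/*
 * Gather one plane's values into a vector.  The casts truncate the
 * 64-bit c to its low 32 bits, which appears to match what the SSE path
 * reads as "clo" from the raw plane layout.
 */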
static inline __m128i
lp_plane_to_m128i(const struct lp_rast_plane *plane)
{
   return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
                         (int32_t)plane->dcdy, (int32_t)plane->eo);
}

#define NR_PLANES 3
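/*
 * VMX analogue of the SSE lp_rast_triangle_32_3_16 above; vec_perm with
 * splatted byte indices stands in for the SCALAR_EPI32 broadcasts, with
 * the shuffle masks chosen per endianness.
 */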
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   const int x = (arg.triangle.plane_mask & 0xff) + task->x;
   const int y = (arg.triangle.plane_mask >> 8) + task->y;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = vec_splats((unsigned char) 0);

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   __m128i vshuf_mask0;
   __m128i vshuf_mask1;
   __m128i vshuf_mask2;

#if UTIL_ARCH_LITTLE_ENDIAN
   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100);
   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504);
   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908);
#else
   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F);
   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B);
   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607);
#endif

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Adjust dcdx;
    */
   dcdx = vec_sub_epi32(zero, dcdx);

   c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x)));
   c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y)));
   rej4 = vec_slli_epi32(rej4, 2);

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison
    */
   c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1));
   rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1));

   dcdx2 = vec_add_epi32(dcdx, dcdx);
   dcdx3 = vec_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (unsigned i = 0; i < 4; i++) {
      __m128i cx = c;

      for (unsigned j = 0; j < 4; j++) {
         __m128i c4rej = vec_add_epi32(cx, rej4);
         __m128i rej_masks = vec_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (vec_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0);
            __m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1);
            __m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2);

            __m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0);

            __m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1);
            __m128i c_01 = vec_packs_epi32(c_0, c_1);

            __m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2);

            __m128i c0_3 = vec_add_epi32(c0_2, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3);
            __m128i c_23 = vec_packs_epi32(c_2, c_3);
            __m128i c_0123 = vec_packs_epi16(c_01, c_23);

            unsigned mask = vec_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2));
      }

      c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2));
   }

   for (unsigned i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}
#undef NR_PLANES

#else

void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_32_3(task, arg2);
}

#endif /* _ARCH_PWR8 && UTIL_ARCH_LITTLE_ENDIAN */

void
lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_32_4(task, arg2);
}

void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_32_3_16(task, arg);
}

#endif
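/*
 * Pick the fastest available mask builders for the template code below.
 */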
#if defined PIPE_ARCH_SSE
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_sse((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_sse((int)c, dcdx, dcdy)
#elif (defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN)
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_ppc((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_ppc((int)c, dcdx, dcdy)
#else
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy)
#endif
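/*
 * Stamp out the generic rasterizers by repeatedly including
 * lp_rast_tri_tmp.h: TAG names the generated functions and NR_PLANES
 * sets the number of edge planes.  RASTER_64 selects the 64-bit plane
 * arithmetic used by the lp_rast_triangle_N variants, the _32_N block
 * uses 32-bit planes, and MULTISAMPLE generates the _ms_N variants.
 */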
#define RASTER_64 1

#define TAG(x) x##_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_4
#define NR_PLANES 4
/*#define TRI_16 lp_rast_triangle_4_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

#undef RASTER_64

#define TAG(x) x##_32_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_4
#define NR_PLANES 4
#ifdef PIPE_ARCH_SSE
#define TRI_16 lp_rast_triangle_32_4_16
#endif
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

#define MULTISAMPLE 1
#define RASTER_64 1

#define TAG(x) x##_ms_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_4
#define NR_PLANES 4
/*#define TRI_16 lp_rast_triangle_4_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

#undef RASTER_64