• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**************************************************************************
2  *
3  * Copyright 2010-2021 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20  * USE OR OTHER DEALINGS IN THE SOFTWARE.
21  *
22  * The above copyright notice and this permission notice (including the
23  * next paragraph) shall be included in all copies or substantial portions
24  * of the Software.
25  *
26  **************************************************************************/
27 
28 
29 #include "pipe/p_config.h"
30 
31 #include "util/u_math.h"
32 #include "util/u_cpu_detect.h"
33 #include "util/u_pack_color.h"
34 #include "util/u_rect.h"
35 #include "util/u_sse.h"
36 
37 #include "lp_jit.h"
38 #include "lp_debug.h"
39 #include "lp_state_fs.h"
40 #include "lp_linear_priv.h"
41 
42 #if defined(PIPE_ARCH_SSE)
43 
44 #define FIXED16_SHIFT  16
45 #define FIXED16_ONE    (1<<16)
46 #define FIXED16_HALF   (1<<15)
47 
48 /*
49  * Color tolerance.  Allow 1 bit of error in 8 bit unorm colors.
50  */
51 #define FIXED16_TOL (FIXED16_ONE >> 7)
52 
53 /*
54  * Tolerance for texture coordinate derivatives when doing linear filtering.
55  *
56  * (Note that extra care needs to be taken when doing linear filtering as
57  * coordinates may snap up to neighbour texels inside the tile).
58  */
59 #define FIXED16_TOL_DERIV (FIXED16_TOL / TILE_SIZE)
60 
/**
 * Convert a float to 16.16 fixed point.
 *
 * The float -> int conversion truncates toward zero, matching C's
 * implicit conversion rules.
 */
static inline int
float_to_fixed16(float f)
{
   return (int)(f * 65536.0f);   /* 65536 == FIXED16_ONE */
}
66 
/**
 * Extract the fractional part of a 16.16 fixed-point value
 * (the low 16 bits).
 */
static inline int
fixed16_frac(int x)
{
   return x & 0xffff;   /* 0xffff == FIXED16_ONE - 1 */
}
72 
/**
 * Return non-zero when x lies within +/- tol of y.
 */
static inline int
fixed16_approx(int x, int y, int tol)
{
   return (x >= y - tol) && (x <= y + tol);
}
78 
79 
80 /*
81  * Unstretched blit of a bgra texture.
82  */
83 static const uint32_t *
fetch_bgra_memcpy(struct lp_linear_elem * elem)84 fetch_bgra_memcpy(struct lp_linear_elem *elem)
85 {
86    struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
87    const struct lp_jit_texture *texture = samp->texture;
88    const uint32_t *src_row =
89       (const uint32_t *)((const uint8_t *)texture->base +
90                          (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
91    const int s     = samp->s;
92    const int width = samp->width;
93    const uint32_t *row;
94 
95    src_row = &src_row[s >> FIXED16_SHIFT];
96 
97    if (((uintptr_t)src_row & 0xf) == 0) {
98       /* The source texels are already aligned. Return them */
99       row = src_row;
100    } else {
101       memcpy(samp->row, src_row, width * sizeof *row);
102       row = samp->row;
103    }
104 
105    samp->t += samp->dtdy;
106    return row;
107 }
108 
109 
110 /*
111  * Unstretched blit of a bgrx texture.
112  */
113 static const uint32_t *
fetch_bgrx_memcpy(struct lp_linear_elem * elem)114 fetch_bgrx_memcpy(struct lp_linear_elem *elem)
115 {
116    struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
117    const struct lp_jit_texture *texture = samp->texture;
118    const uint32_t *src_row =
119       (const uint32_t *)((const uint8_t *)texture->base +
120                          (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
121    const int s     = samp->s;
122    const int width = samp->width;
123    uint32_t *row   = samp->row;
124    int i;
125 
126    src_row = &src_row[s >> FIXED16_SHIFT];
127 
128    for (i = 0; i < width; i++) {
129       row[i] = src_row[i] | 0xff000000;
130    }
131 
132    samp->t += samp->dtdy;
133    return row;
134 }
135 
136 
137 /*
138  * Perform nearest filtered lookup of a row of texels.  Texture lookup
139  * is assumed to be axis aligned but with arbitrary scaling.
140  *
141  * Texture coordinate interpolation is performed in 16.16 fixed point,
142  * not to be confused with the 1.15 format used by the interpolants.
143  *
144  * After 64 pixels (ie. in the next tile), the starting point will be
145  * recalculated with floating point arithmetic.
146  */
147 static const uint32_t *
fetch_bgra_axis_aligned(struct lp_linear_elem * elem)148 fetch_bgra_axis_aligned(struct lp_linear_elem *elem)
149 {
150    struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
151    const struct lp_jit_texture *texture = samp->texture;
152    const uint32_t *src_row =
153       (const uint32_t *)((const uint8_t *)texture->base +
154                          (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
155    const int dsdx  = samp->dsdx;
156    const int width = samp->width;
157    uint32_t *row   = samp->row;
158    int s = samp->s;
159    int i;
160 
161    for (i = 0; i < width; i++) {
162       row[i] = src_row[s>>FIXED16_SHIFT];
163       s += dsdx;
164    }
165 
166    samp->t += samp->dtdy;
167    return row;
168 }
169 
170 static const uint32_t *
fetch_bgrx_axis_aligned(struct lp_linear_elem * elem)171 fetch_bgrx_axis_aligned(struct lp_linear_elem *elem)
172 {
173    struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
174    const struct lp_jit_texture *texture = samp->texture;
175    const uint32_t *src_row =
176       (const uint32_t *)((const uint8_t *)texture->base +
177                          (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
178    const int dsdx  = samp->dsdx;
179    const int width = samp->width;
180    uint32_t *row   = samp->row;
181    int s = samp->s;
182    int i;
183 
184    for (i = 0; i < width; i++) {
185       row[i] = src_row[s>>FIXED16_SHIFT] | 0xff000000;
186       s += dsdx;
187    }
188 
189    samp->t += samp->dtdy;
190    return row;
191 }
192 
193 /* Non-axis aligned, but no clamping or wrapping required
194  */
195 static const uint32_t *
fetch_bgra(struct lp_linear_elem * elem)196 fetch_bgra(struct lp_linear_elem *elem)
197 {
198    struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
199    const struct lp_jit_texture *texture = samp->texture;
200    const uint8_t *src = texture->base;
201    const int stride = texture->row_stride[0];
202    const int dsdx  = samp->dsdx;
203    const int dtdx  = samp->dtdx;
204    const int width = samp->width;
205    uint32_t *row   = samp->row;
206    int s = samp->s;
207    int t = samp->t;
208    int i;
209 
210    for (i = 0; i < width; i++) {
211       const uint8_t *texel = (src +
212                               (t>>FIXED16_SHIFT) * stride +
213                               (s>>FIXED16_SHIFT) * 4);
214 
215       row[i] = *(const uint32_t *)texel;
216 
217       s += dsdx;
218       t += dtdx;
219    }
220 
221    samp->s += samp->dsdy;
222    samp->t += samp->dtdy;
223    return row;
224 }
225 
226 
227 static const uint32_t *
fetch_bgrx(struct lp_linear_elem * elem)228 fetch_bgrx(struct lp_linear_elem *elem)
229 {
230    struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
231    const struct lp_jit_texture *texture = samp->texture;
232    const uint8_t *src = texture->base;
233    const int stride = texture->row_stride[0];
234    const int dsdx  = samp->dsdx;
235    const int dtdx  = samp->dtdx;
236    const int width = samp->width;
237    uint32_t *row   = samp->row;
238    int s = samp->s;
239    int t = samp->t;
240    int i;
241 
242    for (i = 0; i < width; i++) {
243       const uint8_t *texel = (src +
244                               (t>>FIXED16_SHIFT) * stride +
245                               (s>>FIXED16_SHIFT) * 4);
246 
247       row[i] = (*(const uint32_t *)texel) | 0xff000000;
248 
249       s += dsdx;
250       t += dtdx;
251    }
252 
253    samp->s += samp->dsdy;
254    samp->t += samp->dtdy;
255    return row;
256 }
257 
258 /* Non-axis aligned, clamped.
259  */
260 static const uint32_t *
fetch_bgra_clamp(struct lp_linear_elem * elem)261 fetch_bgra_clamp(struct lp_linear_elem *elem)
262 {
263    struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
264    const struct lp_jit_texture *texture = samp->texture;
265    const uint8_t *src   = texture->base;
266    const int stride     = texture->row_stride[0];
267    const int tex_height = texture->height - 1;
268    const int tex_width  = texture->width - 1;
269    const int dsdx  = samp->dsdx;
270    const int dtdx  = samp->dtdx;
271    const int width = samp->width;
272    uint32_t *row   = samp->row;
273    int s = samp->s;
274    int t = samp->t;
275    int i;
276 
277    for (i = 0; i < width; i++) {
278       int ct = CLAMP(t>>FIXED16_SHIFT, 0, tex_height);
279       int cs = CLAMP(s>>FIXED16_SHIFT, 0, tex_width);
280 
281       const uint8_t *texel = (src +
282                               ct * stride +
283                               cs * 4);
284 
285       row[i] = *(const uint32_t *)texel;
286 
287       s += dsdx;
288       t += dtdx;
289    }
290 
291    samp->s += samp->dsdy;
292    samp->t += samp->dtdy;
293    return row;
294 }
295 
296 static const uint32_t *
fetch_bgrx_clamp(struct lp_linear_elem * elem)297 fetch_bgrx_clamp(struct lp_linear_elem *elem)
298 {
299    struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
300    const struct lp_jit_texture *texture = samp->texture;
301    const uint8_t *src   = texture->base;
302    const int stride     = texture->row_stride[0];
303    const int tex_height = texture->height - 1;
304    const int tex_width  = texture->width - 1;
305    const int dsdx  = samp->dsdx;
306    const int dtdx  = samp->dtdx;
307    const int width = samp->width;
308    uint32_t *row   = samp->row;
309    int s = samp->s;
310    int t = samp->t;
311    int i;
312 
313    for (i = 0; i < width; i++) {
314       int ct = CLAMP(t>>FIXED16_SHIFT, 0, tex_height);
315       int cs = CLAMP(s>>FIXED16_SHIFT, 0, tex_width);
316 
317       const uint8_t *texel = (src +
318                               ct * stride +
319                               cs * 4);
320 
321       row[i] = (*(const uint32_t *)texel) | 0xff000000;
322 
323       s += dsdx;
324       t += dtdx;
325    }
326 
327    samp->s += samp->dsdy;
328    samp->t += samp->dtdy;
329    return row;
330 }
331 
/**
 * Fetch and stretch one row.
 *
 * Returns an horizontally stretched (or 1:1 copied) row of texels for
 * texture row \p y, using a two-entry cache so the row shared between
 * two successive bilinear fetches is only stretched once.
 */
static inline const uint32_t *
fetch_and_stretch_bgra_row(struct lp_linear_sampler *samp,
                           int y)
{
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *data = (const uint32_t *)texture->base;
   const int stride = texture->row_stride[0] / sizeof(uint32_t);
   const uint32_t * restrict src_row;
   uint32_t * restrict dst_row;
   const int width = samp->width;

   /*
    * Search the stretched row cache first.
    */

   if (y == samp->stretched_row_y[0]) {
      /* Hit slot 0: make slot 1 the next eviction candidate. */
      samp->stretched_row_index = 1;
      return samp->stretched_row[0];
   }

   if (y == samp->stretched_row_y[1]) {
      /* Hit slot 1: make slot 0 the next eviction candidate. */
      samp->stretched_row_index = 0;
      return samp->stretched_row[1];
   }

   /*
    * Replace one entry.
    */

   src_row = data + y * stride;

   dst_row = samp->stretched_row[samp->stretched_row_index];

   if (fixed16_frac(samp->s) == 0 &&
       samp->dsdx == FIXED16_ONE) { // TODO: could be relaxed
      /*
       * 1:1 blit on the x direction.
       */

      /* NOTE(review): unsigned i is compared against int width below;
       * harmless as long as width is non-negative — confirm callers. */
      unsigned i;

      src_row += samp->s >> FIXED16_SHIFT;

      if (((uintptr_t)src_row & 0xf) == 0) {
         /* The source texture is already aligned. Return it */
         /* (Bypasses the cache: nothing was written to dst_row.) */
         return src_row;
      }

      /* Copy the source texture */
      for (i = 0; i < width; i += 4) {
         __m128i src = _mm_loadu_si128((const __m128i *)&src_row[i]);
         *(__m128i *)&dst_row[i] = src;
      }
   }
   else {
      /* General case: horizontal stretch with 8-bit lerp, rounded up
       * to a multiple of four texels for the SSE helper. */
      util_sse2_stretch_row_8unorm((__m128i *)dst_row,
                                   align(width, 4),
                                   src_row, samp->s, samp->dsdx);
   }

   /* Record the newly cached row and flip the eviction slot. */
   samp->stretched_row_y[samp->stretched_row_index] = y;
   samp->stretched_row_index ^= 1;

   return dst_row;
}
400 
/* Maximise only as we fetch unscaled pixels linearly into a size-64
 * temporary.  For minimise, we will want to either have a bigger
 * temporary or fetch sparsely.
 */
static const uint32_t *
fetch_bgra_axis_aligned_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const int width = samp->width;
   const uint32_t * restrict src_row0;
   const uint32_t * restrict src_row1;
   uint32_t * restrict row = samp->row;
   int y = samp->t >> FIXED16_SHIFT;    /* integer texture row */
   int w = (samp->t >> 8) & 0xff;       /* top 8 fraction bits: vertical weight */
   int i;
   __m128i wt;

   /* Advance t for the next invocation before any early return. */
   samp->t += samp->dtdy;

   src_row0 = fetch_and_stretch_bgra_row(samp, y);

   if (w == 0) {
      /* Landed exactly on a texel row: no vertical blend needed. */
      return src_row0;
   }

   src_row1 = fetch_and_stretch_bgra_row(samp, y + 1);

   wt = _mm_set1_epi16(w);

   /* Combine the two rows using a constant weight.
    */
   for (i = 0; i < width; i += 4) {
      /* NOTE(review): aligned loads assume both row pointers are
       * 16-byte aligned; fetch_and_stretch_bgra_row only returns raw
       * texture pointers when that alignment holds. */
      __m128i srca = _mm_load_si128((const __m128i *)&src_row0[i]);
      __m128i srcb = _mm_load_si128((const __m128i *)&src_row1[i]);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed88(srca, srcb, &wt, &wt);
   }

   return row;
}
441 
/* Non-axis-aligned version.  Don't try to take advantage of
 * maximize.
 *
 * Bilinear fetch: for each group of four output pixels, gather the
 * four 2x2 texel footprints and blend them with per-pixel 8-bit
 * weights taken from the coordinate fraction bits.
 */
static const uint32_t *
fetch_bgra_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const int stride     = texture->row_stride[0] / sizeof(uint32_t);
   const uint32_t *data  = (const uint32_t *)texture->base;
   const int dsdx  = samp->dsdx;
   const int dtdx  = samp->dtdx;
   const int width = samp->width;
   uint32_t *row   = samp->row;
   int s = samp->s;
   int t = samp->t;
   int i, j;

   for (i = 0; i < width; i += 4) {
      union m128i si0, si1, si2, si3, ws, wt;
      __m128i si02, si13;

      /* Scalar gather: si0..si3 hold the 2x2 neighbourhood of each of
       * the four pixels (top-left, top-right, bottom-left, bottom-right).
       * No clamping — the caller guarantees in-bounds coordinates. */
      for (j = 0; j < 4; j++) {
         const uint32_t *src = data + (t >> 16) * stride + (s>>16);

         si0.ui[j] = src[0];
         si1.ui[j] = src[1];
         si2.ui[j] = src[stride + 0];
         si3.ui[j] = src[stride + 1];

         /* Top 8 fraction bits of s/t become the blend weights. */
         ws.ui[j] = (s>>8) & 0xff;
         wt.ui[j] = (t>>8) & 0xff;

         s += dsdx;
         t += dtdx;
      }

      /* Replicate each 8-bit weight across all four channel lanes. */
      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 16));
      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 8));

      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 16));
      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 8));

      /* Lerp vertically (t weight), then horizontally (s weight). */
      si02 = util_sse2_lerp_epi8_fixed08(si0.m, si2.m, wt.m);
      si13 = util_sse2_lerp_epi8_fixed08(si1.m, si3.m, wt.m);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed08(si02, si13, ws.m);
   }

   /* Step the start coordinates down to the next pixel row. */
   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}
495 

/* Clamped, non-axis-aligned version.  Don't try to take advantage of
 * maximize.
 *
 * Fully SIMD bilinear fetch with edge clamping: texel addresses are
 * computed four pixels at a time with 16-bit arithmetic, which is why
 * width, height and stride must stay below 32768 (see below).
 */
static const uint32_t *
fetch_bgra_clamp_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *data  = (const uint32_t *)texture->base;
   const int stride     = texture->row_stride[0] / sizeof(uint32_t);
   const int tex_height = texture->height - 1;
   const int tex_width  = texture->width - 1;
   const int dsdx  = samp->dsdx;
   const int dtdx  = samp->dtdx;
   const int width = samp->width;
   uint32_t *row   = samp->row;
   int s = samp->s;
   int t = samp->t;
   int i, j;
   /* width, height, stride (in pixels) must be smaller than 32768 */
   __m128i dsdx4, dtdx4, s4, t4, stride4, w4, h4, zero, one;
   /* Seed the four per-pixel coordinates: base + {0,1,2,3} steps. */
   s4 = _mm_set1_epi32(s);
   t4 = _mm_set1_epi32(t);
   s4 = _mm_add_epi32(s4, _mm_set_epi32(3*dsdx, 2*dsdx, dsdx, 0));
   t4 =  _mm_add_epi32(t4, _mm_set_epi32(3*dtdx, 2*dtdx, dtdx, 0));
   dsdx4 = _mm_set1_epi32(4*dsdx);
   dtdx4 = _mm_set1_epi32(4*dtdx);
   stride4 = _mm_set1_epi32(stride);
   w4 = _mm_set1_epi32(tex_width);
   h4 = _mm_set1_epi32(tex_height);
   zero = _mm_setzero_si128();
   one = _mm_set1_epi32(1);

   for (i = 0; i < width; i += 4) {
      union m128i addr[4];
      __m128i ws, wt, wsl, wsh, wtl, wth;
      __m128i s4s, t4s, cs0, cs1, ct0, ct1, tmp, si[4];

      /* Integer texel coordinates, then clamp cs0/cs1 (left/right
       * column) and ct0/ct1 (top/bottom row) to the texture edges.
       * 16-bit min/max is sufficient given the <32768 limits above. */
      s4s = _mm_srli_epi32(s4, 16);
      t4s = _mm_srli_epi32(t4, 16);
      cs0 = _mm_min_epi16(_mm_max_epi16(s4s, zero), w4);
      cs1 = _mm_add_epi16(s4s, one);
      cs1 = _mm_min_epi16(_mm_max_epi16(cs1, zero), w4);
      ct0 = _mm_min_epi16(_mm_max_epi16(t4s, zero), h4);
      ct1 = _mm_add_epi16(t4s, one);
      ct1 = _mm_min_epi16(_mm_max_epi16(ct1, zero), h4);
      /* Linear texel indices: row * stride + column, for the four
       * corners of each footprint (madd performs the 16-bit multiply). */
      tmp = _mm_madd_epi16(ct0, stride4);
      addr[0].m = _mm_add_epi32(tmp, cs0);
      addr[1].m = _mm_add_epi32(tmp, cs1);
      tmp = _mm_madd_epi16(ct1, stride4);
      addr[2].m = _mm_add_epi32(tmp, cs0);
      addr[3].m = _mm_add_epi32(tmp, cs1);

      /* Gather the four texels of each corner into one register. */
      for (j = 0; j < 4; j++) {
         __m128i ld1, ld2, ld3;
         si[j] = _mm_cvtsi32_si128(data[addr[j].ui[0]]);
         ld1 = _mm_cvtsi32_si128(data[addr[j].ui[1]]);
         si[j] = _mm_unpacklo_epi32(si[j], ld1);
         ld2 = _mm_cvtsi32_si128(data[addr[j].ui[2]]);
         ld3 = _mm_cvtsi32_si128(data[addr[j].ui[3]]);
         ld2 = _mm_unpacklo_epi32(ld2, ld3);
         si[j] =  _mm_unpacklo_epi64(si[j], ld2);
      }

      /* Blend weights: top 8 fraction bits of s and t. */
      ws = _mm_srli_epi32(s4, 8);
      ws = _mm_and_si128(ws, _mm_set1_epi32(0xFF));
      wt = _mm_srli_epi32(t4, 8);
      wt = _mm_and_si128(wt, _mm_set1_epi32(0xFF));

      s4 = _mm_add_epi32(s4, dsdx4);
      t4 = _mm_add_epi32(t4, dtdx4);

#if 0
/* scalar code for reference */
      for (j = 0; j < 4; j++) {
         int s0 = s >> FIXED16_SHIFT;
         int t0 = t >> FIXED16_SHIFT;
         int cs0 = CLAMP(s0    , 0, tex_width);
         int cs1 = CLAMP(s0 + 1, 0, tex_width);
         int ct0 = CLAMP(t0    , 0, tex_height);
         int ct1 = CLAMP(t0 + 1, 0, tex_height);

         si0.ui[j] = data[ct0 * stride + cs0];
         si1.ui[j] = data[ct0 * stride + cs1];
         si2.ui[j] = data[ct1 * stride + cs0];
         si3.ui[j] = data[ct1 * stride + cs1];

         ws.ui[j] = (s>>8) & 0xff;
         wt.ui[j] = (t>>8) & 0xff;

         s += dsdx;
         t += dtdx;
      }
#endif

      /* Duplicate each weight into 16-bit lane pairs, split low/high
       * pixel pairs, then do the 2D lerp in one helper call. */
      ws = _mm_or_si128(ws, _mm_slli_epi32(ws, 16));
      wsl = _mm_shuffle_epi32(ws, _MM_SHUFFLE(1,1,0,0));
      wsh = _mm_shuffle_epi32(ws, _MM_SHUFFLE(3,3,2,2));

      wt = _mm_or_si128(wt, _mm_slli_epi32(wt, 16));
      wtl = _mm_shuffle_epi32(wt, _MM_SHUFFLE(1,1,0,0));
      wth = _mm_shuffle_epi32(wt, _MM_SHUFFLE(3,3,2,2));

      *(__m128i *)&row[i] = util_sse2_lerp_2d_epi8_fixed88(si[0], si[2],
                                                           &si[1], &si[3],
                                                           &wtl, &wth,
                                                           &wsl, &wsh);
   }

   /* Step the start coordinates down to the next pixel row. */
   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}
610 
611 static const uint32_t *
fetch_bgrx_axis_aligned_linear(struct lp_linear_elem * elem)612 fetch_bgrx_axis_aligned_linear(struct lp_linear_elem *elem)
613 {
614    struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
615    const __m128i mask = _mm_set1_epi32(0xff000000);
616    uint32_t *dst_row = samp->row;
617    const uint32_t *src_row;
618    int width = samp->width;
619    int i;
620 
621    src_row = fetch_bgra_axis_aligned_linear(&samp->base);
622 
623    for (i = 0; i < width; i += 4) {
624       __m128i bgra = *(__m128i *)&src_row[i];
625       __m128i bgrx = _mm_or_si128(bgra, mask);
626       *(__m128i *)&dst_row[i] = bgrx;
627    }
628 
629    return dst_row;
630 }
631 
632 
633 static const uint32_t *
fetch_bgrx_clamp_linear(struct lp_linear_elem * elem)634 fetch_bgrx_clamp_linear(struct lp_linear_elem *elem)
635 {
636    struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
637    const __m128i mask = _mm_set1_epi32(0xff000000);
638    uint32_t *row   = samp->row;
639    int width = samp->width;
640    int i;
641 
642    fetch_bgra_clamp_linear(&samp->base);
643 
644    for (i = 0; i < width; i += 4) {
645       __m128i bgra = *(__m128i *)&row[i];
646       __m128i bgrx = _mm_or_si128(bgra, mask);
647       *(__m128i *)&row[i] = bgrx;
648    }
649 
650    return row;
651 }
652 
653 
654 static const uint32_t *
fetch_bgrx_linear(struct lp_linear_elem * elem)655 fetch_bgrx_linear(struct lp_linear_elem *elem)
656 {
657    struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
658    const __m128i mask = _mm_set1_epi32(0xff000000);
659    uint32_t *row   = samp->row;
660    int width = samp->width;
661    int i;
662 
663    fetch_bgra_linear(&samp->base);
664 
665    for (i = 0; i < width; i += 4) {
666       __m128i bgra = *(__m128i *)&row[i];
667       __m128i bgrx = _mm_or_si128(bgra, mask);
668       *(__m128i *)&row[i] = bgrx;
669    }
670 
671    return row;
672 }
673 

/**
 * Decide whether this sampler can be run through the (cheaper) nearest
 * fetch routines: either it is explicitly PIPE_TEX_FILTER_NEAREST, or
 * it is a linear sampler whose coordinates land (within tolerance) on
 * pixel centers with a 1:1 step, so linear filtering would be a no-op.
 */
static boolean
sampler_is_nearest(const struct lp_linear_sampler *samp,
                   const struct lp_sampler_static_state *sampler_state,
                   boolean minify)
{
   unsigned img_filter;

   /* Minification and magnification may use different filters. */
   if (minify)
      img_filter = sampler_state->sampler_state.min_img_filter;
   else
      img_filter = sampler_state->sampler_state.mag_img_filter;

   /* Is it obviously nearest?
    */
   if (img_filter == PIPE_TEX_FILTER_NEAREST)
      return TRUE;

   /* Otherwise look for linear samplers which devolve to nearest.
    */

   /* Needs to be axis aligned.
    */
   if (!samp->axis_aligned)
      return FALSE;

   if (0) {
      /* Disabled heuristics.
       *
       * NOTE(review): both tests below look wrong as written.  The
       * first (`dsdx < -FIXED16_HALF && dsdx < FIXED16_HALF`) is
       * redundant — presumably `> -FIXED16_HALF` was intended for a
       * |dsdx| < 1/2 magnification check.  The second
       * (`< 2*FIXED16_ONE || > 2*FIXED16_ONE`) is true for any value
       * except exactly 2.0 — presumably `< -2*FIXED16_ONE` was
       * intended.  Harmless while guarded by if (0), but fix before
       * ever enabling.
       */

      /* For maximizing shaders, revert to nearest
       */
      if (samp->dsdx < -FIXED16_HALF && samp->dsdx < FIXED16_HALF &&
          samp->dtdy < -FIXED16_HALF && samp->dtdy < FIXED16_HALF)
         return TRUE;

      /* For severely minimising shaders, revert to nearest:
       */
      if ((samp->dsdx < 2 * FIXED16_ONE || samp->dsdx > 2 * FIXED16_ONE) &&
          (samp->dtdy < 2 * FIXED16_ONE || samp->dtdy > 2 * FIXED16_ONE))
         return TRUE;
   }

   /*
    * Must be near a pixel center:
    */
   if (!fixed16_approx(fixed16_frac(samp->s), FIXED16_HALF, FIXED16_TOL) ||
       !fixed16_approx(fixed16_frac(samp->t), FIXED16_HALF, FIXED16_TOL))
      return FALSE;

   /*
    * Must make a full step between pixels:
    */
   if (!fixed16_approx(samp->dsdx, FIXED16_ONE, FIXED16_TOL_DERIV) ||
       !fixed16_approx(samp->dtdy, FIXED16_ONE, FIXED16_TOL_DERIV))
      return FALSE;

   /* Treat it as nearest!
    */
   return TRUE;
}
732 
/* XXX: Lots of static-state parameters being passed in here but very
 * little info is extracted from each one.  Consolidate it all down to
 * something succinct in the prepare phase?
 */

/**
 * Set up a lp_linear_sampler for one tile's worth of fragments.
 *
 * Converts the interpolants (a0/dadx/dady, with coordinates selected
 * through info->coord[]) into 16.16 fixed-point s/t start values and
 * per-pixel derivatives scaled to texel space, then picks the
 * appropriate fetch routine for the format / filter / clamping case.
 *
 * Returns FALSE when the combination is not handled by the linear
 * path (unsupported wrap mode or texture format), TRUE otherwise.
 */
boolean
lp_linear_init_sampler(struct lp_linear_sampler *samp,
                       const struct lp_tgsi_texture_info *info,
                       const struct lp_sampler_static_state *sampler_state,
                       const struct lp_jit_texture *texture,
                       int x0, int y0, int width, int height,
                       const float (*a0)[4],
                       const float (*dadx)[4],
                       const float (*dady)[4])
{
   const struct lp_tgsi_channel_info *schan = &info->coord[0];
   const struct lp_tgsi_channel_info *tchan = &info->coord[1];

   /* Plane equation coefficients for w and for the s/t coordinates.
    * index+1 skips the position attribute at slot 0. */
   float w0   =   a0[0][3];

   float s0   =   a0[schan->u.index+1][schan->swizzle];
   float dsdx = dadx[schan->u.index+1][schan->swizzle];
   float dsdy = dady[schan->u.index+1][schan->swizzle];

   float t0   =   a0[tchan->u.index+1][tchan->swizzle];
   float dtdx = dadx[tchan->u.index+1][tchan->swizzle];
   float dtdy = dady[tchan->u.index+1][tchan->swizzle];

   int mins, mint, maxs, maxt;
   /* Undo the perspective divide once (constant w assumed here) and
    * scale normalized coordinates up to texel units. */
   float oow = 1.0f / w0;
   float width_oow = texture->width * oow;
   float height_oow = texture->height * oow;
   float fdsdx = dsdx * width_oow;
   float fdsdy = dsdy * width_oow;
   float fdtdx = dtdx * height_oow;
   float fdtdy = dtdy * height_oow;
   int fetch_width;
   int fetch_height;
   boolean minify;
   boolean need_wrap;
   boolean is_nearest;

   samp->texture = texture;
   samp->width = width;

   /* Start coordinates at (x0, y0), in 16.16 fixed point texels. */
   samp->s = float_to_fixed16(fdsdx * x0 +
                              fdsdy * y0 +
                              s0 * width_oow);

   samp->t = float_to_fixed16(fdtdx * x0 +
                              fdtdy * y0 +
                              t0 * height_oow);

   samp->dsdx = float_to_fixed16(fdsdx);
   samp->dsdy = float_to_fixed16(fdsdy);
   samp->dtdx = float_to_fixed16(fdtdx);
   samp->dtdy = float_to_fixed16(fdtdy);


   samp->axis_aligned = (samp->dsdy == 0 &&
                         samp->dtdx == 0); // TODO: could be relaxed

   {
      /* rho: largest |derivative|; more than one texel per pixel
       * means we are minifying. */
      int dsdx = samp->dsdx >= 0 ? samp->dsdx : -samp->dsdx;
      int dsdy = samp->dsdy >= 0 ? samp->dsdy : -samp->dsdy;
      int dtdx = samp->dtdx >= 0 ? samp->dtdx : -samp->dtdx;
      int dtdy = samp->dtdy >= 0 ? samp->dtdy : -samp->dtdy;
      int rho = MAX4(dsdx, dsdy, dtdx, dtdy);

      minify = (rho > FIXED16_ONE);
   }

   is_nearest = sampler_is_nearest(samp, sampler_state, minify);

   if (!is_nearest) {
      /* Linear filtering samples between texel centers: shift by half
       * a texel so the fraction bits become the blend weights. */
      samp->s -= FIXED16_HALF;
      samp->t -= FIXED16_HALF;
   }

   /* Check for clamping.  This rarely happens as we're rejecting interpolants
    * which fall outside the 0..1 range.
    */

   if (is_nearest) {
      /* Nearest fetch routines don't employ SSE and always operate one pixel
       * at a time.
       */
      fetch_width = width - 1;
   }
   else {
      /* Linear fetch routines employ SSE, and always fetch groups of four
       * texels.
       */
      fetch_width = align(width, 4) - 1;
   }
   fetch_height = height - 1;

   /* Compute the bounding box of all texel coordinates this tile will
    * touch (corners only are needed since stepping is affine). */
   if (samp->axis_aligned) {
      int s0 = samp->s;
      int s1 = samp->s + fetch_width  * samp->dsdx;
      int t0 = samp->t;
      int t1 = samp->t + fetch_height * samp->dtdy;

      mins = MIN2(s0, s1);
      mint = MIN2(t0, t1);
      maxs = MAX2(s0, s1);
      maxt = MAX2(t0, t1);
   }
   else {
      int s0 = samp->s;
      int s1 = samp->s + fetch_width  * samp->dsdx;
      int s2 = samp->s + fetch_height * samp->dsdy;
      int s3 = samp->s + fetch_width  * samp->dsdx + fetch_height * samp->dsdy;
      int t0 = samp->t;
      int t1 = samp->t + fetch_width  * samp->dtdx;
      int t2 = samp->t + fetch_height * samp->dtdy;
      int t3 = samp->t + fetch_width  * samp->dtdx + fetch_height * samp->dtdy;

      mins = MIN4(s0, s1, s2, s3);
      mint = MIN4(t0, t1, t2, t3);
      maxs = MAX4(s0, s1, s2, s3);
      maxt = MAX4(t0, t1, t2, t3);
   }

   /* Linear filtering also reads the neighbour texel, hence the extra
    * FIXED16_ONE of margin on the max side. */
   if (is_nearest) {
      need_wrap = (mins < 0 ||
                   mint < 0 ||
                   maxs >= (texture->width  << FIXED16_SHIFT) ||
                   maxt >= (texture->height << FIXED16_SHIFT));
   } else {
      need_wrap = (mins < 0 ||
                   mint < 0 ||
                   maxs + FIXED16_ONE >= (texture->width  << FIXED16_SHIFT) ||
                   maxt + FIXED16_ONE >= (texture->height << FIXED16_SHIFT));
   }

   /* Disabled debug dump of the computed texel extents. */
   if (0 && need_wrap) {
      debug_printf("%u x %u %s\n",
                   texture->width, texture->height,
                   is_nearest ? "nearest" : "linear");
      debug_printf("mins = %f\n", mins*1.0f/FIXED16_ONE);
      debug_printf("mint = %f\n", mint*1.0f/FIXED16_ONE);
      debug_printf("maxs = %f\n", maxs*1.0f/FIXED16_ONE);
      debug_printf("maxt = %f\n", maxt*1.0f/FIXED16_ONE);
      debug_printf("\n");
   }

   /* We accept any mode below, but we only implement clamping.
    */
   if (need_wrap &&
       (sampler_state->sampler_state.wrap_s != PIPE_TEX_WRAP_CLAMP_TO_EDGE ||
        sampler_state->sampler_state.wrap_t != PIPE_TEX_WRAP_CLAMP_TO_EDGE)) {
       return FALSE;
   }

   /* Pick the most specialized fetch routine that applies. */
   if (is_nearest) {
      switch (sampler_state->texture_state.format) {
      case PIPE_FORMAT_B8G8R8A8_UNORM:
         if (need_wrap)
            samp->base.fetch = fetch_bgra_clamp;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgra;
         else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
            samp->base.fetch = fetch_bgra_axis_aligned;
         else
            samp->base.fetch = fetch_bgra_memcpy;

         return TRUE;

      case PIPE_FORMAT_B8G8R8X8_UNORM:
         if (need_wrap)
            samp->base.fetch = fetch_bgrx_clamp;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgrx;
         else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
            samp->base.fetch = fetch_bgrx_axis_aligned;
         else
            samp->base.fetch = fetch_bgrx_memcpy;

         return TRUE;

      default:
         break;
      }

      FAIL("unknown format for nearest");
   }
   else {
      /* Invalidate the stretched-row cache used by the linear paths. */
      samp->stretched_row_y[0] = -1;
      samp->stretched_row_y[1] = -1;
      samp->stretched_row_index = 0;

      switch (sampler_state->texture_state.format) {
      case PIPE_FORMAT_B8G8R8A8_UNORM:
         if (need_wrap)
            samp->base.fetch = fetch_bgra_clamp_linear;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgra_linear;
         else
            samp->base.fetch = fetch_bgra_axis_aligned_linear;

         return TRUE;

      case PIPE_FORMAT_B8G8R8X8_UNORM:
         if (need_wrap)
            samp->base.fetch = fetch_bgrx_clamp_linear;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgrx_linear;
         else
            samp->base.fetch = fetch_bgrx_axis_aligned_linear;
         return TRUE;

      default:
         break;
      }

      FAIL("unknown format");
   }
}
951 
952 
953 static const uint32_t *
fetch_noop(struct lp_linear_elem * elem)954 fetch_noop(struct lp_linear_elem *elem)
955 {
956    struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
957    return samp->row;
958 }
959 

/**
 * Install the no-op fetch routine (returns the row buffer unchanged),
 * for cases where no actual texel fetching is required.
 */
void
lp_linear_init_noop_sampler(struct lp_linear_sampler *samp)
{
   samp->base.fetch = fetch_noop;
}
966 
967 /* Check the variant for linear path compatibility.
968  */
969 boolean
lp_linear_check_sampler(const struct lp_sampler_static_state * sampler,const struct lp_tgsi_texture_info * tex)970 lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
971                         const struct lp_tgsi_texture_info *tex)
972 {
973    if (tex->modifier != LP_BLD_TEX_MODIFIER_NONE)
974       return FALSE;
975 
976    if (tex->target != TGSI_TEXTURE_2D)
977       return FALSE;
978 
979    if (tex->coord[0].file != TGSI_FILE_INPUT ||
980        tex->coord[1].file != TGSI_FILE_INPUT)
981       return FALSE;
982 
983    /* These are the only sampling modes we support at the moment.
984     *
985     * Actually we'll accept any mode as we're failing on any
986     * interpolant which exceeds 0..1.  Clamping is applied only to
987     * avoid invalid reads.
988     */
989    if (!is_nearest_sampler(sampler) &&
990        !is_linear_sampler(sampler))
991       return FALSE;
992 
993    /* These are the only texture formats we support at the moment
994     */
995    if (sampler->texture_state.format != PIPE_FORMAT_B8G8R8A8_UNORM &&
996        sampler->texture_state.format != PIPE_FORMAT_B8G8R8X8_UNORM)
997       return FALSE;
998 
999    return TRUE;
1000 }
1001 
#else
/* Non-SSE build: the optimized linear sampling path above is compiled
 * out, so no sampler ever qualifies.
 */
boolean
lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
                        const struct lp_tgsi_texture_info *tex)
{
   return FALSE;
}
#endif
1010