/**************************************************************************
 *
 * Copyright 2010-2021 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 **************************************************************************/


#include "pipe/p_config.h"

#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#include "util/u_pack_color.h"
#include "util/u_rect.h"
#include "util/u_sse.h"

#include "lp_jit.h"
#include "lp_debug.h"
#include "lp_state_fs.h"
#include "lp_linear_priv.h"

#if defined(PIPE_ARCH_SSE)

#define FIXED16_SHIFT  16
#define FIXED16_ONE    (1<<16)
#define FIXED16_HALF   (1<<15)

/*
 * Color tolerance.  Allow 1 bit of error in 8 bit unorm colors.
 */
#define FIXED16_TOL (FIXED16_ONE >> 7)

/*
 * Tolerance for texture coordinate derivatives when doing linear filtering.
 *
 * (Note that extra care needs to be taken when doing linear filtering as
 * coordinates may snap up to neighbour texels inside the tile).
 */
#define FIXED16_TOL_DERIV (FIXED16_TOL / TILE_SIZE)

static inline int
float_to_fixed16(float f)
{
   return f * (float)FIXED16_ONE;
}

static inline int
fixed16_frac(int x)
{
   return x & (FIXED16_ONE - 1);
}

static inline int
fixed16_approx(int x, int y, int tol)
{
   return y - tol <= x && x <= y + tol;
}

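/*
 * Worked examples of the 16.16 helpers above:
 *
 *    float_to_fixed16(1.5f)  == 0x18000   (1.5 * 65536)
 *    fixed16_frac(0x18000)   == 0x8000    (the fractional 0.5)
 *    fixed16_approx(0x18000, FIXED16_ONE + FIXED16_HALF, FIXED16_TOL) == 1
 */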

/*
 * Unstretched blit of a bgra texture.
 */
static const uint32_t *
fetch_bgra_memcpy(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *src_row =
      (const uint32_t *)((const uint8_t *)texture->base +
                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
   const int s     = samp->s;
   const int width = samp->width;
   const uint32_t *row;

   src_row = &src_row[s >> FIXED16_SHIFT];

   if (((uintptr_t)src_row & 0xf) == 0) {
      /* The source texels are already aligned. Return them */
      row = src_row;
   } else {
      memcpy(samp->row, src_row, width * sizeof *row);
      row = samp->row;
   }

   samp->t += samp->dtdy;
   return row;
}


/*
 * Unstretched blit of a bgrx texture.
 */
static const uint32_t *
fetch_bgrx_memcpy(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *src_row =
      (const uint32_t *)((const uint8_t *)texture->base +
                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
   const int s     = samp->s;
   const int width = samp->width;
   uint32_t *row   = samp->row;

   src_row = &src_row[s >> FIXED16_SHIFT];

   for (int i = 0; i < width; i++) {
      row[i] = src_row[i] | 0xff000000;
   }

   samp->t += samp->dtdy;
   return row;
}

/*
 * Perform nearest filtered lookup of a row of texels.  Texture lookup
 * is assumed to be axis aligned but with arbitrary scaling.
 *
 * Texture coordinate interpolation is performed in 16.16 fixed point,
 * not to be confused with the 1.15 format used by the interpolants.
 *
 * After 64 pixels (i.e. in the next tile), the starting point will be
 * recalculated with floating point arithmetic.
 */
static const uint32_t *
fetch_bgra_axis_aligned(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *src_row =
      (const uint32_t *)((const uint8_t *)texture->base +
                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
   const int dsdx  = samp->dsdx;
   const int width = samp->width;
   uint32_t *row   = samp->row;
   int s = samp->s;

   for (int i = 0; i < width; i++) {
      row[i] = src_row[s>>FIXED16_SHIFT];
      s += dsdx;
   }

   samp->t += samp->dtdy;
   return row;
}

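/*
 * For example, an axis-aligned 2x upscale has dsdx == FIXED16_HALF, so the
 * loop above fetches each source texel twice; a 2x downscale has
 * dsdx == 2 * FIXED16_ONE and skips every other source texel.
 */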

static const uint32_t *
fetch_bgrx_axis_aligned(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *src_row =
      (const uint32_t *)((const uint8_t *)texture->base +
                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
   const int dsdx  = samp->dsdx;
   const int width = samp->width;
   uint32_t *row   = samp->row;
   int s = samp->s;

   for (int i = 0; i < width; i++) {
      row[i] = src_row[s>>FIXED16_SHIFT] | 0xff000000;
      s += dsdx;
   }

   samp->t += samp->dtdy;
   return row;
}


/* Non-axis aligned, but no clamping or wrapping required
 */
static const uint32_t *
fetch_bgra(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint8_t *src = texture->base;
   const int stride = texture->row_stride[0];
   const int dsdx  = samp->dsdx;
   const int dtdx  = samp->dtdx;
   const int width = samp->width;
   uint32_t *row   = samp->row;
   int s = samp->s;
   int t = samp->t;

   for (int i = 0; i < width; i++) {
      const uint8_t *texel = (src +
                              (t>>FIXED16_SHIFT) * stride +
                              (s>>FIXED16_SHIFT) * 4);

      row[i] = *(const uint32_t *)texel;

      s += dsdx;
      t += dtdx;
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}

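/*
 * For instance, a 90-degree rotated 1:1 blit has dsdx == 0 and
 * dtdx == +/-FIXED16_ONE: s is constant along a row while t advances, so
 * fetch_bgra() above must step both s and t per pixel, and advance the
 * row start by dsdy/dtdy between rows.
 */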

static const uint32_t *
fetch_bgrx(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint8_t *src = texture->base;
   const int stride = texture->row_stride[0];
   const int dsdx  = samp->dsdx;
   const int dtdx  = samp->dtdx;
   const int width = samp->width;
   uint32_t *row   = samp->row;
   int s = samp->s;
   int t = samp->t;

   for (int i = 0; i < width; i++) {
      const uint8_t *texel = (src +
                              (t>>FIXED16_SHIFT) * stride +
                              (s>>FIXED16_SHIFT) * 4);

      row[i] = (*(const uint32_t *)texel) | 0xff000000;

      s += dsdx;
      t += dtdx;
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}

/* Non-axis aligned, clamped.
 */
static const uint32_t *
fetch_bgra_clamp(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint8_t *src   = texture->base;
   const int stride     = texture->row_stride[0];
   const int tex_height = texture->height - 1;
   const int tex_width  = texture->width - 1;
   const int dsdx  = samp->dsdx;
   const int dtdx  = samp->dtdx;
   const int width = samp->width;
   uint32_t *row   = samp->row;
   int s = samp->s;
   int t = samp->t;

   for (int i = 0; i < width; i++) {
      int ct = CLAMP(t>>FIXED16_SHIFT, 0, tex_height);
      int cs = CLAMP(s>>FIXED16_SHIFT, 0, tex_width);

      const uint8_t *texel = src + ct * stride + cs * 4;

      row[i] = *(const uint32_t *)texel;

      s += dsdx;
      t += dtdx;
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}

static const uint32_t *
fetch_bgrx_clamp(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint8_t *src   = texture->base;
   const int stride     = texture->row_stride[0];
   const int tex_height = texture->height - 1;
   const int tex_width  = texture->width - 1;
   const int dsdx  = samp->dsdx;
   const int dtdx  = samp->dtdx;
   const int width = samp->width;
   uint32_t *row   = samp->row;
   int s = samp->s;
   int t = samp->t;

   for (int i = 0; i < width; i++) {
      int ct = CLAMP(t>>FIXED16_SHIFT, 0, tex_height);
      int cs = CLAMP(s>>FIXED16_SHIFT, 0, tex_width);

      const uint8_t *texel = src + ct * stride + cs * 4;

      row[i] = (*(const uint32_t *)texel) | 0xff000000;

      s += dsdx;
      t += dtdx;
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}

/**
 * Fetch and stretch one row.
 */
static inline const uint32_t *
fetch_and_stretch_bgra_row(struct lp_linear_sampler *samp,
                           int y)
{
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *data = (const uint32_t *)texture->base;
   const int stride = texture->row_stride[0] / sizeof(uint32_t);
   const int width = samp->width;

   /*
    * Search the stretched row cache first.
    */

   if (y == samp->stretched_row_y[0]) {
      samp->stretched_row_index = 1;
      return samp->stretched_row[0];
   }

   if (y == samp->stretched_row_y[1]) {
      samp->stretched_row_index = 0;
      return samp->stretched_row[1];
   }

   /*
    * Replace one entry.
    */

   const uint32_t * restrict src_row = data + y * stride;
   uint32_t * restrict dst_row = samp->stretched_row[samp->stretched_row_index];

   if (fixed16_frac(samp->s) == 0 &&
       samp->dsdx == FIXED16_ONE) { // TODO: could be relaxed
      /*
       * 1:1 blit on the x direction.
       */
      src_row += samp->s >> FIXED16_SHIFT;

      if (((uintptr_t)src_row & 0xf) == 0) {
         /* The source texture is already aligned. Return it */
         return src_row;
      }

      /* Copy the source texture */
      for (int i = 0; i < width; i += 4) {
         __m128i src = _mm_loadu_si128((const __m128i *)&src_row[i]);
         *(__m128i *)&dst_row[i] = src;
      }
   }
   else {
      util_sse2_stretch_row_8unorm((__m128i *)dst_row,
                                   align(width, 4),
                                   src_row, samp->s, samp->dsdx);
   }

   samp->stretched_row_y[samp->stretched_row_index] = y;
   samp->stretched_row_index ^= 1;

   return dst_row;
}

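/*
 * Note on the cache above: an axis-aligned linear blit stretches rows y and
 * y+1 for one output row, and row y+1 is typically reused as the top row of
 * the next output row, so each source row is stretched only once.  The
 * two-entry cache with the ping-ponged stretched_row_index is exactly
 * enough for that access pattern.
 */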

/* Maximise only as we fetch unscaled pixels linearly into a size-64
 * temporary.  For minimise, we will want to either have a bigger
 * temporary or fetch sparsely.
 */
static const uint32_t *
fetch_bgra_axis_aligned_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const int width = samp->width;
   uint32_t * restrict row = samp->row;
   const int y = samp->t >> FIXED16_SHIFT;
   const int w = (samp->t >> 8) & 0xff;

   samp->t += samp->dtdy;

   const uint32_t * restrict src_row0 = fetch_and_stretch_bgra_row(samp, y);

   if (w == 0) {
      return src_row0;
   }

   const uint32_t * restrict src_row1 = fetch_and_stretch_bgra_row(samp, y + 1);

   __m128i wt = _mm_set1_epi16(w);

   /* Combine the two rows using a constant weight.
    */
   for (int i = 0; i < width; i += 4) {
      __m128i srca = _mm_load_si128((const __m128i *)&src_row0[i]);
      __m128i srcb = _mm_load_si128((const __m128i *)&src_row1[i]);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed88(srca, srcb, &wt, &wt);
   }

   return row;
}

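/*
 * The weight w above is the top 8 bits of t's 16-bit fraction: e.g. a
 * fraction of exactly 0x8000 (half way between two rows) yields w == 0x80,
 * which util_sse2_lerp_epi8_fixed88() treats as an even 50/50 blend.
 */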

/* Non-axis-aligned version.  Don't try to take advantage of
 * maximize.
 */
static const uint32_t *
fetch_bgra_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const int stride     = texture->row_stride[0] / sizeof(uint32_t);
   const uint32_t *data = (const uint32_t *)texture->base;
   const int dsdx  = samp->dsdx;
   const int dtdx  = samp->dtdx;
   const int width = samp->width;
   uint32_t *row   = samp->row;
   int s = samp->s;
   int t = samp->t;

   for (int i = 0; i < width; i += 4) {
      union m128i si0, si1, si2, si3, ws, wt;
      __m128i si02, si13;

      for (int j = 0; j < 4; j++) {
         const uint32_t *src = data + (t >> 16) * stride + (s >> 16);

         si0.ui[j] = src[0];
         si1.ui[j] = src[1];
         si2.ui[j] = src[stride + 0];
         si3.ui[j] = src[stride + 1];

         ws.ui[j] = (s>>8) & 0xff;
         wt.ui[j] = (t>>8) & 0xff;

         s += dsdx;
         t += dtdx;
      }

      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 16));
      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 8));

      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 16));
      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 8));

      si02 = util_sse2_lerp_epi8_fixed08(si0.m, si2.m, wt.m);
      si13 = util_sse2_lerp_epi8_fixed08(si1.m, si3.m, wt.m);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed08(si02, si13, ws.m);
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}

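/*
 * The or/slli pairs in fetch_bgra_linear() above replicate each 8-bit
 * weight into every byte of its 32-bit lane, e.g.
 * 0x000000ab -> 0x00ab00ab -> 0xabababab, so a single lerp can weight all
 * four color channels of a texel at once.
 */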

/* Clamped, non-axis-aligned version.  Don't try to take advantage of
 * maximize.
 */
static const uint32_t *
fetch_bgra_clamp_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *data = (const uint32_t *)texture->base;
   const int stride     = texture->row_stride[0] / sizeof(uint32_t);
   const int tex_height = texture->height - 1;
   const int tex_width  = texture->width - 1;
   const int dsdx  = samp->dsdx;
   const int dtdx  = samp->dtdx;
   const int width = samp->width;
   uint32_t *row   = samp->row;
   int s = samp->s;
   int t = samp->t;

   /* width, height, stride (in pixels) must be smaller than 32768 */
   __m128i dsdx4, dtdx4, s4, t4, stride4, w4, h4, zero, one;
   s4 = _mm_set1_epi32(s);
   t4 = _mm_set1_epi32(t);
   s4 = _mm_add_epi32(s4, _mm_set_epi32(3*dsdx, 2*dsdx, dsdx, 0));
   t4 = _mm_add_epi32(t4, _mm_set_epi32(3*dtdx, 2*dtdx, dtdx, 0));
   dsdx4 = _mm_set1_epi32(4*dsdx);
   dtdx4 = _mm_set1_epi32(4*dtdx);
   stride4 = _mm_set1_epi32(stride);
   w4 = _mm_set1_epi32(tex_width);
   h4 = _mm_set1_epi32(tex_height);
   zero = _mm_setzero_si128();
   one = _mm_set1_epi32(1);

   for (int i = 0; i < width; i += 4) {
      union m128i addr[4];
      __m128i ws, wt, wsl, wsh, wtl, wth;
      __m128i s4s, t4s, cs0, cs1, ct0, ct1, tmp, si[4];

      s4s = _mm_srli_epi32(s4, 16);
      t4s = _mm_srli_epi32(t4, 16);
      cs0 = _mm_min_epi16(_mm_max_epi16(s4s, zero), w4);
      cs1 = _mm_add_epi16(s4s, one);
      cs1 = _mm_min_epi16(_mm_max_epi16(cs1, zero), w4);
      ct0 = _mm_min_epi16(_mm_max_epi16(t4s, zero), h4);
      ct1 = _mm_add_epi16(t4s, one);
      ct1 = _mm_min_epi16(_mm_max_epi16(ct1, zero), h4);
      tmp = _mm_madd_epi16(ct0, stride4);
      addr[0].m = _mm_add_epi32(tmp, cs0);
      addr[1].m = _mm_add_epi32(tmp, cs1);
      tmp = _mm_madd_epi16(ct1, stride4);
      addr[2].m = _mm_add_epi32(tmp, cs0);
      addr[3].m = _mm_add_epi32(tmp, cs1);

      for (int j = 0; j < 4; j++) {
         __m128i ld1, ld2, ld3;
         si[j] = _mm_cvtsi32_si128(data[addr[j].ui[0]]);
         ld1 = _mm_cvtsi32_si128(data[addr[j].ui[1]]);
         si[j] = _mm_unpacklo_epi32(si[j], ld1);
         ld2 = _mm_cvtsi32_si128(data[addr[j].ui[2]]);
         ld3 = _mm_cvtsi32_si128(data[addr[j].ui[3]]);
         ld2 = _mm_unpacklo_epi32(ld2, ld3);
         si[j] = _mm_unpacklo_epi64(si[j], ld2);
      }

      ws = _mm_srli_epi32(s4, 8);
      ws = _mm_and_si128(ws, _mm_set1_epi32(0xFF));
      wt = _mm_srli_epi32(t4, 8);
      wt = _mm_and_si128(wt, _mm_set1_epi32(0xFF));

      s4 = _mm_add_epi32(s4, dsdx4);
      t4 = _mm_add_epi32(t4, dtdx4);

#if 0
      /* scalar code for reference */
      for (int j = 0; j < 4; j++) {
         int s0 = s >> FIXED16_SHIFT;
         int t0 = t >> FIXED16_SHIFT;
         int cs0 = CLAMP(s0    , 0, tex_width);
         int cs1 = CLAMP(s0 + 1, 0, tex_width);
         int ct0 = CLAMP(t0    , 0, tex_height);
         int ct1 = CLAMP(t0 + 1, 0, tex_height);

         si0.ui[j] = data[ct0 * stride + cs0];
         si1.ui[j] = data[ct0 * stride + cs1];
         si2.ui[j] = data[ct1 * stride + cs0];
         si3.ui[j] = data[ct1 * stride + cs1];

         ws.ui[j] = (s>>8) & 0xff;
         wt.ui[j] = (t>>8) & 0xff;

         s += dsdx;
         t += dtdx;
      }
#endif

      ws = _mm_or_si128(ws, _mm_slli_epi32(ws, 16));
      wsl = _mm_shuffle_epi32(ws, _MM_SHUFFLE(1,1,0,0));
      wsh = _mm_shuffle_epi32(ws, _MM_SHUFFLE(3,3,2,2));

      wt = _mm_or_si128(wt, _mm_slli_epi32(wt, 16));
      wtl = _mm_shuffle_epi32(wt, _MM_SHUFFLE(1,1,0,0));
      wth = _mm_shuffle_epi32(wt, _MM_SHUFFLE(3,3,2,2));

      *(__m128i *)&row[i] = util_sse2_lerp_2d_epi8_fixed88(si[0], si[2],
                                                           &si[1], &si[3],
                                                           &wtl, &wth,
                                                           &wsl, &wsh);
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;

   return row;
}

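/*
 * The "smaller than 32768" requirement in fetch_bgra_clamp_linear() above
 * stems from the use of 16-bit signed SSE ops: the epi16 min/max clamps
 * and _mm_madd_epi16() only see the low 16 bits of each lane as a signed
 * quantity, and with stride <= 32767 the ct * stride product still fits
 * the 32-bit accumulator.
 */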

static const uint32_t *
fetch_bgrx_axis_aligned_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const __m128i mask = _mm_set1_epi32(0xff000000);
   uint32_t *dst_row = samp->row;
   const uint32_t *src_row = fetch_bgra_axis_aligned_linear(&samp->base);
   const int width = samp->width;

   for (int i = 0; i < width; i += 4) {
      __m128i bgra = *(__m128i *)&src_row[i];
      __m128i bgrx = _mm_or_si128(bgra, mask);
      *(__m128i *)&dst_row[i] = bgrx;
   }

   return dst_row;
}


static const uint32_t *
fetch_bgrx_clamp_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const __m128i mask = _mm_set1_epi32(0xff000000);
   uint32_t *row = samp->row;
   const int width = samp->width;

   fetch_bgra_clamp_linear(&samp->base);

   for (int i = 0; i < width; i += 4) {
      __m128i bgra = *(__m128i *)&row[i];
      __m128i bgrx = _mm_or_si128(bgra, mask);
      *(__m128i *)&row[i] = bgrx;
   }

   return row;
}


static const uint32_t *
fetch_bgrx_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const __m128i mask = _mm_set1_epi32(0xff000000);
   uint32_t *row = samp->row;
   const int width = samp->width;

   fetch_bgra_linear(&samp->base);

   for (int i = 0; i < width; i += 4) {
      __m128i bgra = *(__m128i *)&row[i];
      __m128i bgrx = _mm_or_si128(bgra, mask);
      *(__m128i *)&row[i] = bgrx;
   }

   return row;
}

static boolean
sampler_is_nearest(const struct lp_linear_sampler *samp,
                   const struct lp_sampler_static_state *sampler_state,
                   boolean minify)
{
   unsigned img_filter;

   if (minify)
      img_filter = sampler_state->sampler_state.min_img_filter;
   else
      img_filter = sampler_state->sampler_state.mag_img_filter;

   /* Is it obviously nearest?
    */
   if (img_filter == PIPE_TEX_FILTER_NEAREST)
      return TRUE;

   /* Otherwise look for linear samplers which devolve to nearest.
    */

   /* Needs to be axis aligned.
    */
   if (!samp->axis_aligned)
      return FALSE;

   if (0) {
      /* For maximizing shaders, revert to nearest
       */
      if (samp->dsdx > -FIXED16_HALF && samp->dsdx < FIXED16_HALF &&
          samp->dtdy > -FIXED16_HALF && samp->dtdy < FIXED16_HALF)
         return TRUE;

      /* For severely minimising shaders, revert to nearest:
       */
      if ((samp->dsdx < -2 * FIXED16_ONE || samp->dsdx > 2 * FIXED16_ONE) &&
          (samp->dtdy < -2 * FIXED16_ONE || samp->dtdy > 2 * FIXED16_ONE))
         return TRUE;
   }

   /*
    * Must be near a pixel center:
    */
   if (!fixed16_approx(fixed16_frac(samp->s), FIXED16_HALF, FIXED16_TOL) ||
       !fixed16_approx(fixed16_frac(samp->t), FIXED16_HALF, FIXED16_TOL))
      return FALSE;

   /*
    * Must make a full step between pixels:
    */
   if (!fixed16_approx(samp->dsdx, FIXED16_ONE, FIXED16_TOL_DERIV) ||
       !fixed16_approx(samp->dtdy, FIXED16_ONE, FIXED16_TOL_DERIV))
      return FALSE;

   /* Treat it as nearest!
    */
   return TRUE;
}

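/*
 * Example of the devolution test above: an unscaled, unrotated linear blit
 * samples exactly at pixel centers (s and t fractions of FIXED16_HALF) and
 * steps exactly FIXED16_ONE per pixel, so within the tolerances it is
 * indistinguishable from nearest filtering and can take the cheaper
 * nearest paths.
 */
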
/* XXX: Lots of static-state parameters being passed in here but very
 * little info is extracted from each one.  Consolidate it all down to
 * something succinct in the prepare phase?
 */
boolean
lp_linear_init_sampler(struct lp_linear_sampler *samp,
                       const struct lp_tgsi_texture_info *info,
                       const struct lp_sampler_static_state *sampler_state,
                       const struct lp_jit_texture *texture,
                       int x0, int y0, int width, int height,
                       const float (*a0)[4],
                       const float (*dadx)[4],
                       const float (*dady)[4])
{
   const struct lp_tgsi_channel_info *schan = &info->coord[0];
   const struct lp_tgsi_channel_info *tchan = &info->coord[1];

   assert(schan->file == TGSI_FILE_INPUT);
   assert(tchan->file == TGSI_FILE_INPUT);

   float w0   =   a0[0][3];

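   /* Input slot 0 appears to hold the position attribute (w0 above is read
    * from a0[0][3]), so the texture coordinate inputs seem to start at slot
    * 1, hence the +1 ("foo") offset applied to the input indices below.
    */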
   int foo = 1;
   float s0   =   a0[schan->u.index+foo][schan->swizzle];
   float dsdx = dadx[schan->u.index+foo][schan->swizzle];
   float dsdy = dady[schan->u.index+foo][schan->swizzle];

   float t0   =   a0[tchan->u.index+foo][tchan->swizzle];
   float dtdx = dadx[tchan->u.index+foo][tchan->swizzle];
   float dtdy = dady[tchan->u.index+foo][tchan->swizzle];

   int mins, mint, maxs, maxt;
   float oow = 1.0f / w0;
   float width_oow = texture->width * oow;
   float height_oow = texture->height * oow;
   float fdsdx = dsdx * width_oow;
   float fdsdy = dsdy * width_oow;
   float fdtdx = dtdx * height_oow;
   float fdtdy = dtdy * height_oow;
   int fetch_width;
   int fetch_height;
   boolean minify;
   boolean need_wrap;
   boolean is_nearest;

   samp->texture = texture;
   samp->width = width;

   samp->s = float_to_fixed16(fdsdx * x0 +
                              fdsdy * y0 +
                              s0 * width_oow);

   samp->t = float_to_fixed16(fdtdx * x0 +
                              fdtdy * y0 +
                              t0 * height_oow);

   samp->dsdx = float_to_fixed16(fdsdx);
   samp->dsdy = float_to_fixed16(fdsdy);
   samp->dtdx = float_to_fixed16(fdtdx);
   samp->dtdy = float_to_fixed16(fdtdy);

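   /* E.g. a pure 2x upscale has dsdx == dtdy == FIXED16_HALF and
    * dsdy == dtdx == 0, so it is axis aligned; any rotation makes the
    * cross derivatives non-zero and falls off this fast path.
    */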
   samp->axis_aligned = (samp->dsdy == 0 &&
                         samp->dtdx == 0); // TODO: could be relaxed

   {
      int dsdx = samp->dsdx >= 0 ? samp->dsdx : -samp->dsdx;
      int dsdy = samp->dsdy >= 0 ? samp->dsdy : -samp->dsdy;
      int dtdx = samp->dtdx >= 0 ? samp->dtdx : -samp->dtdx;
      int dtdy = samp->dtdy >= 0 ? samp->dtdy : -samp->dtdy;
      int rho = MAX4(dsdx, dsdy, dtdx, dtdy);

      minify = (rho > FIXED16_ONE);
   }

   is_nearest = sampler_is_nearest(samp, sampler_state, minify);

   if (!is_nearest) {
      samp->s -= FIXED16_HALF;
      samp->t -= FIXED16_HALF;
   }

   /* Check for clamping.  This rarely happens as we're rejecting interpolants
    * which fall outside the 0..1 range.
    */

   if (is_nearest) {
      /* Nearest fetch routines don't employ SSE and always operate one pixel
       * at a time.
       */
      fetch_width = width - 1;
   }
   else {
      /* Linear fetch routines employ SSE, and always fetch groups of four
       * texels.
       */
      fetch_width = align(width, 4) - 1;
   }
   fetch_height = height - 1;

   if (samp->axis_aligned) {
      int s0 = samp->s;
      int s1 = samp->s + fetch_width  * samp->dsdx;
      int t0 = samp->t;
      int t1 = samp->t + fetch_height * samp->dtdy;

      mins = MIN2(s0, s1);
      mint = MIN2(t0, t1);
      maxs = MAX2(s0, s1);
      maxt = MAX2(t0, t1);
   }
   else {
      int s0 = samp->s;
      int s1 = samp->s + fetch_width  * samp->dsdx;
      int s2 = samp->s + fetch_height * samp->dsdy;
      int s3 = samp->s + fetch_width  * samp->dsdx + fetch_height * samp->dsdy;
      int t0 = samp->t;
      int t1 = samp->t + fetch_width  * samp->dtdx;
      int t2 = samp->t + fetch_height * samp->dtdy;
      int t3 = samp->t + fetch_width  * samp->dtdx + fetch_height * samp->dtdy;

      mins = MIN4(s0, s1, s2, s3);
      mint = MIN4(t0, t1, t2, t3);
      maxs = MAX4(s0, s1, s2, s3);
      maxt = MAX4(t0, t1, t2, t3);
   }

   if (is_nearest) {
      need_wrap = (mins < 0 ||
                   mint < 0 ||
                   maxs >= (texture->width  << FIXED16_SHIFT) ||
                   maxt >= (texture->height << FIXED16_SHIFT));
   } else {
      need_wrap = (mins < 0 ||
                   mint < 0 ||
                   maxs + FIXED16_ONE >= (texture->width  << FIXED16_SHIFT) ||
                   maxt + FIXED16_ONE >= (texture->height << FIXED16_SHIFT));
   }

   if (0 && need_wrap) {
      debug_printf("%u x %u %s\n",
                   texture->width, texture->height,
                   is_nearest ? "nearest" : "linear");
      debug_printf("mins = %f\n", mins*1.0f/FIXED16_ONE);
      debug_printf("mint = %f\n", mint*1.0f/FIXED16_ONE);
      debug_printf("maxs = %f\n", maxs*1.0f/FIXED16_ONE);
      debug_printf("maxt = %f\n", maxt*1.0f/FIXED16_ONE);
      debug_printf("\n");
   }

   /* We accept any mode below, but we only implement clamping.
    */
   if (need_wrap &&
       (sampler_state->sampler_state.wrap_s != PIPE_TEX_WRAP_CLAMP_TO_EDGE ||
        sampler_state->sampler_state.wrap_t != PIPE_TEX_WRAP_CLAMP_TO_EDGE)) {
      return FALSE;
   }

   if (is_nearest) {
      switch (sampler_state->texture_state.format) {
      case PIPE_FORMAT_B8G8R8A8_UNORM:
         if (need_wrap)
            samp->base.fetch = fetch_bgra_clamp;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgra;
         else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
            samp->base.fetch = fetch_bgra_axis_aligned;
         else
            samp->base.fetch = fetch_bgra_memcpy;
         return TRUE;
      case PIPE_FORMAT_B8G8R8X8_UNORM:
         if (need_wrap)
            samp->base.fetch = fetch_bgrx_clamp;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgrx;
         else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
            samp->base.fetch = fetch_bgrx_axis_aligned;
         else
            samp->base.fetch = fetch_bgrx_memcpy;
         return TRUE;
      default:
         break;
      }

      FAIL("unknown format for nearest");
   }
   else {
      samp->stretched_row_y[0] = -1;
      samp->stretched_row_y[1] = -1;
      samp->stretched_row_index = 0;

      switch (sampler_state->texture_state.format) {
      case PIPE_FORMAT_B8G8R8A8_UNORM:
         if (need_wrap)
            samp->base.fetch = fetch_bgra_clamp_linear;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgra_linear;
         else
            samp->base.fetch = fetch_bgra_axis_aligned_linear;
         return TRUE;
      case PIPE_FORMAT_B8G8R8X8_UNORM:
         if (need_wrap)
            samp->base.fetch = fetch_bgrx_clamp_linear;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgrx_linear;
         else
            samp->base.fetch = fetch_bgrx_axis_aligned_linear;
         return TRUE;
      default:
         break;
      }

      FAIL("unknown format");
   }
}


static const uint32_t *
fetch_noop(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   return samp->row;
}


void
lp_linear_init_noop_sampler(struct lp_linear_sampler *samp)
{
   samp->base.fetch = fetch_noop;
}


/*
 * Check the given sampler and texture info for linear path compatibility.
 */
boolean
lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
                        const struct lp_tgsi_texture_info *tex)
{
   if (tex->modifier != LP_BLD_TEX_MODIFIER_NONE)
      return FALSE;

   if (tex->target != TGSI_TEXTURE_2D)
      return FALSE;

   if (tex->coord[0].file != TGSI_FILE_INPUT ||
       tex->coord[1].file != TGSI_FILE_INPUT)
      return FALSE;

   /* These are the only sampling modes we support at the moment.
    *
    * Actually we'll accept any mode as we're failing on any
    * interpolant which exceeds 0..1.  Clamping is applied only to
    * avoid invalid reads.
    */
   if (!is_nearest_sampler(sampler) &&
       !is_linear_sampler(sampler))
      return FALSE;

   /* These are the only texture formats we support at the moment
    */
   if (sampler->texture_state.format != PIPE_FORMAT_B8G8R8A8_UNORM &&
       sampler->texture_state.format != PIPE_FORMAT_B8G8R8X8_UNORM)
      return FALSE;

   /* We don't support sampler view swizzling on the linear path */
   if (sampler->texture_state.swizzle_r != PIPE_SWIZZLE_X ||
       sampler->texture_state.swizzle_g != PIPE_SWIZZLE_Y ||
       sampler->texture_state.swizzle_b != PIPE_SWIZZLE_Z ||
       sampler->texture_state.swizzle_a != PIPE_SWIZZLE_W) {
      return FALSE;
   }

   return TRUE;
}

#else
boolean
lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
                        const struct lp_tgsi_texture_info *tex)
{
   return FALSE;
}
#endif