/**************************************************************************
 *
 * Copyright 2010-2021 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 **************************************************************************/


#include "pipe/p_config.h"

#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#include "util/u_pack_color.h"
#include "util/u_rect.h"
#include "util/u_sse.h"

#include "lp_jit.h"
#include "lp_debug.h"
#include "lp_state_fs.h"
#include "lp_linear_priv.h"

#if defined(PIPE_ARCH_SSE)

#define FIXED16_SHIFT 16
#define FIXED16_ONE  (1<<16)
#define FIXED16_HALF (1<<15)

/*
 * Color tolerance.  Allow 1 bit of error in 8 bit unorm colors.
 */
#define FIXED16_TOL (FIXED16_ONE >> 7)

/*
 * Tolerance for texture coordinate derivatives when doing linear filtering.
 *
 * (Note that extra care needs to be taken when doing linear filtering as
 * coordinates may snap up to neighbour texels inside the tile).
 */
#define FIXED16_TOL_DERIV (FIXED16_TOL / TILE_SIZE)
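
/*
 * Illustrative numbers (assuming llvmpipe's usual TILE_SIZE of 64):
 * FIXED16_ONE is 65536, so FIXED16_TOL = 65536 >> 7 = 512, i.e. 1/128 of a
 * texel.  FIXED16_TOL_DERIV = 512 / 64 = 8 then bounds the per-pixel
 * derivative error so that it accumulates to at most FIXED16_TOL across
 * one 64-pixel tile.
 */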

static inline int
float_to_fixed16(float f)
{
   return f * (float)FIXED16_ONE;
}

static inline int
fixed16_frac(int x)
{
   return x & (FIXED16_ONE - 1);
}

static inline int
fixed16_approx(int x, int y, int tol)
{
   return y - tol <= x && x <= y + tol;
}
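
/*
 * Worked example of the helpers above:
 *   float_to_fixed16(1.25f) == 0x14000   (1.25 * 65536)
 *   fixed16_frac(0x14000)   == 0x4000    (the 0.25 fractional part)
 *   fixed16_approx(0x8000 + 100, 0x8000, FIXED16_TOL) is true, as the
 *   values differ by 100 <= 512.
 */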


/*
 * Unstretched blit of a bgra texture.
 */
static const uint32_t *
fetch_bgra_memcpy(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *src_row =
      (const uint32_t *)((const uint8_t *)texture->base +
                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
   const int s = samp->s;
   const int width = samp->width;
   const uint32_t *row;

   src_row = &src_row[s >> FIXED16_SHIFT];

   if (((uintptr_t)src_row & 0xf) == 0) {
      /* The source texels are already aligned.  Return them. */
      row = src_row;
   } else {
      memcpy(samp->row, src_row, width * sizeof *row);
      row = samp->row;
   }

   samp->t += samp->dtdy;
   return row;
}


/*
 * Unstretched blit of a bgrx texture.
 */
static const uint32_t *
fetch_bgrx_memcpy(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *src_row =
      (const uint32_t *)((const uint8_t *)texture->base +
                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
   const int s = samp->s;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int i;

   src_row = &src_row[s >> FIXED16_SHIFT];

   for (i = 0; i < width; i++) {
      row[i] = src_row[i] | 0xff000000;
   }

   samp->t += samp->dtdy;
   return row;
}

/*
 * Perform a nearest-filtered lookup of a row of texels.  The texture
 * lookup is assumed to be axis-aligned but with arbitrary scaling.
 *
 * Texture coordinate interpolation is performed in 16.16 fixed point,
 * not to be confused with the 1.15 format used by the interpolants.
 *
 * After 64 pixels (i.e. in the next tile), the starting point will be
 * recalculated with floating point arithmetic.
 */
static const uint32_t *
fetch_bgra_axis_aligned(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *src_row =
      (const uint32_t *)((const uint8_t *)texture->base +
                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
   const int dsdx = samp->dsdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int i;

   for (i = 0; i < width; i++) {
      row[i] = src_row[s >> FIXED16_SHIFT];
      s += dsdx;
   }

   samp->t += samp->dtdy;
   return row;
}
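
/*
 * For illustration: with dsdx == 0x8000 (half a texel per pixel, i.e. 2x
 * magnification) and s starting at 0, the loop above reads src_row[0],
 * src_row[0], src_row[1], src_row[1], ... as s steps through 0x0000,
 * 0x8000, 0x10000, 0x18000 before the >> 16.
 */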

static const uint32_t *
fetch_bgrx_axis_aligned(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *src_row =
      (const uint32_t *)((const uint8_t *)texture->base +
                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
   const int dsdx = samp->dsdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int i;

   for (i = 0; i < width; i++) {
      row[i] = src_row[s >> FIXED16_SHIFT] | 0xff000000;
      s += dsdx;
   }

   samp->t += samp->dtdy;
   return row;
}

/* Non-axis-aligned, but no clamping or wrapping required.
 */
static const uint32_t *
fetch_bgra(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint8_t *src = texture->base;
   const int stride = texture->row_stride[0];
   const int dsdx = samp->dsdx;
   const int dtdx = samp->dtdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int t = samp->t;
   int i;

   for (i = 0; i < width; i++) {
      const uint8_t *texel = (src +
                              (t >> FIXED16_SHIFT) * stride +
                              (s >> FIXED16_SHIFT) * 4);

      row[i] = *(const uint32_t *)texel;

      s += dsdx;
      t += dtdx;
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}


static const uint32_t *
fetch_bgrx(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint8_t *src = texture->base;
   const int stride = texture->row_stride[0];
   const int dsdx = samp->dsdx;
   const int dtdx = samp->dtdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int t = samp->t;
   int i;

   for (i = 0; i < width; i++) {
      const uint8_t *texel = (src +
                              (t >> FIXED16_SHIFT) * stride +
                              (s >> FIXED16_SHIFT) * 4);

      row[i] = (*(const uint32_t *)texel) | 0xff000000;

      s += dsdx;
      t += dtdx;
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}

/* Non-axis-aligned, clamped.
 */
static const uint32_t *
fetch_bgra_clamp(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint8_t *src = texture->base;
   const int stride = texture->row_stride[0];
   const int tex_height = texture->height - 1;
   const int tex_width = texture->width - 1;
   const int dsdx = samp->dsdx;
   const int dtdx = samp->dtdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int t = samp->t;
   int i;

   for (i = 0; i < width; i++) {
      int ct = CLAMP(t >> FIXED16_SHIFT, 0, tex_height);
      int cs = CLAMP(s >> FIXED16_SHIFT, 0, tex_width);

      const uint8_t *texel = (src +
                              ct * stride +
                              cs * 4);

      row[i] = *(const uint32_t *)texel;

      s += dsdx;
      t += dtdx;
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}

static const uint32_t *
fetch_bgrx_clamp(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint8_t *src = texture->base;
   const int stride = texture->row_stride[0];
   const int tex_height = texture->height - 1;
   const int tex_width = texture->width - 1;
   const int dsdx = samp->dsdx;
   const int dtdx = samp->dtdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int t = samp->t;
   int i;

   for (i = 0; i < width; i++) {
      int ct = CLAMP(t >> FIXED16_SHIFT, 0, tex_height);
      int cs = CLAMP(s >> FIXED16_SHIFT, 0, tex_width);

      const uint8_t *texel = (src +
                              ct * stride +
                              cs * 4);

      row[i] = (*(const uint32_t *)texel) | 0xff000000;

      s += dsdx;
      t += dtdx;
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}

/**
 * Fetch and stretch one row.
 */
static inline const uint32_t *
fetch_and_stretch_bgra_row(struct lp_linear_sampler *samp,
                           int y)
{
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *data = (const uint32_t *)texture->base;
   const int stride = texture->row_stride[0] / sizeof(uint32_t);
   const uint32_t * restrict src_row;
   uint32_t * restrict dst_row;
   const int width = samp->width;

   /*
    * Search the stretched row cache first.
    */
   if (y == samp->stretched_row_y[0]) {
      samp->stretched_row_index = 1;
      return samp->stretched_row[0];
   }

   if (y == samp->stretched_row_y[1]) {
      samp->stretched_row_index = 0;
      return samp->stretched_row[1];
   }

   /*
    * Replace one entry.
    */
   src_row = data + y * stride;

   dst_row = samp->stretched_row[samp->stretched_row_index];

   if (fixed16_frac(samp->s) == 0 &&
       samp->dsdx == FIXED16_ONE) { // TODO: could be relaxed
      /*
       * 1:1 blit on the x direction.
       */
      unsigned i;

      src_row += samp->s >> FIXED16_SHIFT;

      if (((uintptr_t)src_row & 0xf) == 0) {
         /* The source texture is already aligned.  Return it. */
         return src_row;
      }

      /* Copy the source texture. */
      for (i = 0; i < width; i += 4) {
         __m128i src = _mm_loadu_si128((const __m128i *)&src_row[i]);
         *(__m128i *)&dst_row[i] = src;
      }
   }
   else {
      util_sse2_stretch_row_8unorm((__m128i *)dst_row,
                                   align(width, 4),
                                   src_row, samp->s, samp->dsdx);
   }

   samp->stretched_row_y[samp->stretched_row_index] = y;
   samp->stretched_row_index ^= 1;

   return dst_row;
}
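
/*
 * Cache behaviour sketch: stretched_row_y[] holds the two most recently
 * stretched source rows, so the y/y+1 pair requested by the bilinear
 * fetch below usually costs one new stretch per step in y.  E.g. rows
 * (3,4) stretch both entries, then (4,5) hits row 4 in the cache and
 * only stretches row 5.
 */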

/* Handles maximising (magnification) only, as we fetch unscaled pixels
 * linearly into a size-64 temporary.  For minification we would want
 * either a bigger temporary or sparse fetches.
 */
static const uint32_t *
fetch_bgra_axis_aligned_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const int width = samp->width;
   const uint32_t * restrict src_row0;
   const uint32_t * restrict src_row1;
   uint32_t * restrict row = samp->row;
   int y = samp->t >> FIXED16_SHIFT;
   int w = (samp->t >> 8) & 0xff;
   int i;
   __m128i wt;

   samp->t += samp->dtdy;

   src_row0 = fetch_and_stretch_bgra_row(samp, y);

   if (w == 0) {
      return src_row0;
   }

   src_row1 = fetch_and_stretch_bgra_row(samp, y + 1);

   wt = _mm_set1_epi16(w);

   /* Combine the two rows using a constant weight.
    */
   for (i = 0; i < width; i += 4) {
      __m128i srca = _mm_load_si128((const __m128i *)&src_row0[i]);
      __m128i srcb = _mm_load_si128((const __m128i *)&src_row1[i]);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed88(srca, srcb, &wt, &wt);
   }

   return row;
}
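
/*
 * For illustration: t == 0x18000 gives y == 1 and w == 0x80, i.e. a
 * 50/50 blend of rows 1 and 2, with w in the 8.8 fixed-point weight
 * format consumed by util_sse2_lerp_epi8_fixed88() above.
 */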

/* Non-axis-aligned version.  Doesn't try to take advantage of
 * maximisation.
 */
static const uint32_t *
fetch_bgra_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const int stride = texture->row_stride[0] / sizeof(uint32_t);
   const uint32_t *data = (const uint32_t *)texture->base;
   const int dsdx = samp->dsdx;
   const int dtdx = samp->dtdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int t = samp->t;
   int i, j;

   for (i = 0; i < width; i += 4) {
      union m128i si0, si1, si2, si3, ws, wt;
      __m128i si02, si13;

      for (j = 0; j < 4; j++) {
         const uint32_t *src = data + (t >> 16) * stride + (s >> 16);

         si0.ui[j] = src[0];
         si1.ui[j] = src[1];
         si2.ui[j] = src[stride + 0];
         si3.ui[j] = src[stride + 1];

         ws.ui[j] = (s >> 8) & 0xff;
         wt.ui[j] = (t >> 8) & 0xff;

         s += dsdx;
         t += dtdx;
      }

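      /*
       * Broadcast each 8-bit weight across its 32-bit lane, e.g.
       * 0x000000AB -> 0x00AB00AB -> 0xABABABAB, matching the
       * byte-replicated weight form consumed by the fixed08 lerps below.
       */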
      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 16));
      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 8));

      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 16));
      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 8));

      si02 = util_sse2_lerp_epi8_fixed08(si0.m, si2.m, wt.m);
      si13 = util_sse2_lerp_epi8_fixed08(si1.m, si3.m, wt.m);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed08(si02, si13, ws.m);
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}


/* Clamped, non-axis-aligned version.  Doesn't try to take advantage of
 * maximisation.
 */
static const uint32_t *
fetch_bgra_clamp_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *data = (const uint32_t *)texture->base;
   const int stride = texture->row_stride[0] / sizeof(uint32_t);
   const int tex_height = texture->height - 1;
   const int tex_width = texture->width - 1;
   const int dsdx = samp->dsdx;
   const int dtdx = samp->dtdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int t = samp->t;
   int i, j;
   /* width, height, stride (in pixels) must be smaller than 32768 */
   __m128i dsdx4, dtdx4, s4, t4, stride4, w4, h4, zero, one;
   s4 = _mm_set1_epi32(s);
   t4 = _mm_set1_epi32(t);
   s4 = _mm_add_epi32(s4, _mm_set_epi32(3*dsdx, 2*dsdx, dsdx, 0));
   t4 = _mm_add_epi32(t4, _mm_set_epi32(3*dtdx, 2*dtdx, dtdx, 0));
   dsdx4 = _mm_set1_epi32(4*dsdx);
   dtdx4 = _mm_set1_epi32(4*dtdx);
   stride4 = _mm_set1_epi32(stride);
   w4 = _mm_set1_epi32(tex_width);
   h4 = _mm_set1_epi32(tex_height);
   zero = _mm_setzero_si128();
   one = _mm_set1_epi32(1);

   for (i = 0; i < width; i += 4) {
      union m128i addr[4];
      __m128i ws, wt, wsl, wsh, wtl, wth;
      __m128i s4s, t4s, cs0, cs1, ct0, ct1, tmp, si[4];

      s4s = _mm_srli_epi32(s4, 16);
      t4s = _mm_srli_epi32(t4, 16);
      cs0 = _mm_min_epi16(_mm_max_epi16(s4s, zero), w4);
      cs1 = _mm_add_epi16(s4s, one);
      cs1 = _mm_min_epi16(_mm_max_epi16(cs1, zero), w4);
      ct0 = _mm_min_epi16(_mm_max_epi16(t4s, zero), h4);
      ct1 = _mm_add_epi16(t4s, one);
      ct1 = _mm_min_epi16(_mm_max_epi16(ct1, zero), h4);
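      /*
       * _mm_madd_epi16() multiplies the 16-bit halves of each 32-bit lane
       * and adds the two products.  Since the clamped coordinates and the
       * stride all fit in the low 16 bits (see the < 32768 note above)
       * with zero high halves, this yields the 32-bit texel offsets
       * ct * stride + cs in a couple of cheap ops below.
       */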
      tmp = _mm_madd_epi16(ct0, stride4);
      addr[0].m = _mm_add_epi32(tmp, cs0);
      addr[1].m = _mm_add_epi32(tmp, cs1);
      tmp = _mm_madd_epi16(ct1, stride4);
      addr[2].m = _mm_add_epi32(tmp, cs0);
      addr[3].m = _mm_add_epi32(tmp, cs1);

      for (j = 0; j < 4; j++) {
         __m128i ld1, ld2, ld3;
         si[j] = _mm_cvtsi32_si128(data[addr[j].ui[0]]);
         ld1 = _mm_cvtsi32_si128(data[addr[j].ui[1]]);
         si[j] = _mm_unpacklo_epi32(si[j], ld1);
         ld2 = _mm_cvtsi32_si128(data[addr[j].ui[2]]);
         ld3 = _mm_cvtsi32_si128(data[addr[j].ui[3]]);
         ld2 = _mm_unpacklo_epi32(ld2, ld3);
         si[j] = _mm_unpacklo_epi64(si[j], ld2);
      }

      ws = _mm_srli_epi32(s4, 8);
      ws = _mm_and_si128(ws, _mm_set1_epi32(0xFF));
      wt = _mm_srli_epi32(t4, 8);
      wt = _mm_and_si128(wt, _mm_set1_epi32(0xFF));

      s4 = _mm_add_epi32(s4, dsdx4);
      t4 = _mm_add_epi32(t4, dtdx4);

#if 0
      /* scalar code for reference */
      for (j = 0; j < 4; j++) {
         int s0 = s >> FIXED16_SHIFT;
         int t0 = t >> FIXED16_SHIFT;
         int cs0 = CLAMP(s0    , 0, tex_width);
         int cs1 = CLAMP(s0 + 1, 0, tex_width);
         int ct0 = CLAMP(t0    , 0, tex_height);
         int ct1 = CLAMP(t0 + 1, 0, tex_height);

         si0.ui[j] = data[ct0 * stride + cs0];
         si1.ui[j] = data[ct0 * stride + cs1];
         si2.ui[j] = data[ct1 * stride + cs0];
         si3.ui[j] = data[ct1 * stride + cs1];

         ws.ui[j] = (s >> 8) & 0xff;
         wt.ui[j] = (t >> 8) & 0xff;

         s += dsdx;
         t += dtdx;
      }
#endif

      ws = _mm_or_si128(ws, _mm_slli_epi32(ws, 16));
      wsl = _mm_shuffle_epi32(ws, _MM_SHUFFLE(1,1,0,0));
      wsh = _mm_shuffle_epi32(ws, _MM_SHUFFLE(3,3,2,2));

      wt = _mm_or_si128(wt, _mm_slli_epi32(wt, 16));
      wtl = _mm_shuffle_epi32(wt, _MM_SHUFFLE(1,1,0,0));
      wth = _mm_shuffle_epi32(wt, _MM_SHUFFLE(3,3,2,2));

      *(__m128i *)&row[i] = util_sse2_lerp_2d_epi8_fixed88(si[0], si[2],
                                                           &si[1], &si[3],
                                                           &wtl, &wth,
                                                           &wsl, &wsh);
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}

static const uint32_t *
fetch_bgrx_axis_aligned_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const __m128i mask = _mm_set1_epi32(0xff000000);
   uint32_t *dst_row = samp->row;
   const uint32_t *src_row;
   int width = samp->width;
   int i;

   src_row = fetch_bgra_axis_aligned_linear(&samp->base);

   for (i = 0; i < width; i += 4) {
      __m128i bgra = *(__m128i *)&src_row[i];
      __m128i bgrx = _mm_or_si128(bgra, mask);
      *(__m128i *)&dst_row[i] = bgrx;
   }

   return dst_row;
}


static const uint32_t *
fetch_bgrx_clamp_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const __m128i mask = _mm_set1_epi32(0xff000000);
   uint32_t *row = samp->row;
   int width = samp->width;
   int i;

   fetch_bgra_clamp_linear(&samp->base);

   for (i = 0; i < width; i += 4) {
      __m128i bgra = *(__m128i *)&row[i];
      __m128i bgrx = _mm_or_si128(bgra, mask);
      *(__m128i *)&row[i] = bgrx;
   }

   return row;
}


static const uint32_t *
fetch_bgrx_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const __m128i mask = _mm_set1_epi32(0xff000000);
   uint32_t *row = samp->row;
   int width = samp->width;
   int i;

   fetch_bgra_linear(&samp->base);

   for (i = 0; i < width; i += 4) {
      __m128i bgra = *(__m128i *)&row[i];
      __m128i bgrx = _mm_or_si128(bgra, mask);
      *(__m128i *)&row[i] = bgrx;
   }

   return row;
}


static boolean
sampler_is_nearest(const struct lp_linear_sampler *samp,
                   const struct lp_sampler_static_state *sampler_state,
                   boolean minify)
{
   unsigned img_filter;

   if (minify)
      img_filter = sampler_state->sampler_state.min_img_filter;
   else
      img_filter = sampler_state->sampler_state.mag_img_filter;

   /* Is it obviously nearest?
    */
   if (img_filter == PIPE_TEX_FILTER_NEAREST)
      return TRUE;

   /* Otherwise look for linear samplers which devolve to nearest.
    */

   /* Needs to be axis-aligned.
    */
   if (!samp->axis_aligned)
      return FALSE;

   if (0) {
      /* For heavily maximizing shaders, revert to nearest:
       */
      if (samp->dsdx > -FIXED16_HALF && samp->dsdx < FIXED16_HALF &&
          samp->dtdy > -FIXED16_HALF && samp->dtdy < FIXED16_HALF)
         return TRUE;

      /* For severely minimising shaders, revert to nearest:
       */
      if ((samp->dsdx < -2 * FIXED16_ONE || samp->dsdx > 2 * FIXED16_ONE) &&
          (samp->dtdy < -2 * FIXED16_ONE || samp->dtdy > 2 * FIXED16_ONE))
         return TRUE;
   }

   /*
    * Must be near a pixel center:
    */
   if (!fixed16_approx(fixed16_frac(samp->s), FIXED16_HALF, FIXED16_TOL) ||
       !fixed16_approx(fixed16_frac(samp->t), FIXED16_HALF, FIXED16_TOL))
      return FALSE;

   /*
    * Must make a full step between pixels:
    */
   if (!fixed16_approx(samp->dsdx, FIXED16_ONE, FIXED16_TOL_DERIV) ||
       !fixed16_approx(samp->dtdy, FIXED16_ONE, FIXED16_TOL_DERIV))
      return FALSE;

   /* Treat it as nearest!
    */
   return TRUE;
}
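
/*
 * Concretely: an unscaled, unrotated blit samples each pixel at a texel
 * centre (s frac == 0.5) and steps exactly one texel per pixel
 * (dsdx == FIXED16_ONE), so the two linear filter weights degenerate to
 * (1, 0) and a nearest fetch produces identical results.
 */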

/* XXX: Lots of static-state parameters being passed in here but very
 * little info is extracted from each one.  Consolidate it all down to
 * something succinct in the prepare phase?
 */
boolean
lp_linear_init_sampler(struct lp_linear_sampler *samp,
                       const struct lp_tgsi_texture_info *info,
                       const struct lp_sampler_static_state *sampler_state,
                       const struct lp_jit_texture *texture,
                       int x0, int y0, int width, int height,
                       const float (*a0)[4],
                       const float (*dadx)[4],
                       const float (*dady)[4])
{
   const struct lp_tgsi_channel_info *schan = &info->coord[0];
   const struct lp_tgsi_channel_info *tchan = &info->coord[1];

   float w0 = a0[0][3];

   float s0 = a0[schan->u.index+1][schan->swizzle];
   float dsdx = dadx[schan->u.index+1][schan->swizzle];
   float dsdy = dady[schan->u.index+1][schan->swizzle];

   float t0 = a0[tchan->u.index+1][tchan->swizzle];
   float dtdx = dadx[tchan->u.index+1][tchan->swizzle];
   float dtdy = dady[tchan->u.index+1][tchan->swizzle];

   int mins, mint, maxs, maxt;
   float oow = 1.0f / w0;
   float width_oow = texture->width * oow;
   float height_oow = texture->height * oow;
   float fdsdx = dsdx * width_oow;
   float fdsdy = dsdy * width_oow;
   float fdtdx = dtdx * height_oow;
   float fdtdy = dtdy * height_oow;
   int fetch_width;
   int fetch_height;
   boolean minify;
   boolean need_wrap;
   boolean is_nearest;

   samp->texture = texture;
   samp->width = width;

   samp->s = float_to_fixed16(fdsdx * x0 +
                              fdsdy * y0 +
                              s0 * width_oow);

   samp->t = float_to_fixed16(fdtdx * x0 +
                              fdtdy * y0 +
                              t0 * height_oow);

   samp->dsdx = float_to_fixed16(fdsdx);
   samp->dsdy = float_to_fixed16(fdsdy);
   samp->dtdx = float_to_fixed16(fdtdx);
   samp->dtdy = float_to_fixed16(fdtdy);

   samp->axis_aligned = (samp->dsdy == 0 &&
                         samp->dtdx == 0); // TODO: could be relaxed

   {
      int dsdx = samp->dsdx >= 0 ? samp->dsdx : -samp->dsdx;
      int dsdy = samp->dsdy >= 0 ? samp->dsdy : -samp->dsdy;
      int dtdx = samp->dtdx >= 0 ? samp->dtdx : -samp->dtdx;
      int dtdy = samp->dtdy >= 0 ? samp->dtdy : -samp->dtdy;
      int rho = MAX4(dsdx, dsdy, dtdx, dtdy);

      minify = (rho > FIXED16_ONE);
   }
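
   /*
    * Illustration: rho is the largest absolute texel step per pixel in
    * 16.16 form, e.g. a 2x downscale gives dsdx == 2 * FIXED16_ONE, so
    * rho exceeds FIXED16_ONE and the minifying filter is selected.
    */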

   is_nearest = sampler_is_nearest(samp, sampler_state, minify);

   if (!is_nearest) {
      samp->s -= FIXED16_HALF;
      samp->t -= FIXED16_HALF;
   }

   /* Check for clamping.  This rarely happens as we're rejecting
    * interpolants which fall outside the 0..1 range.
    */

   if (is_nearest) {
      /* Nearest fetch routines don't employ SSE and always operate one
       * pixel at a time.
       */
      fetch_width = width - 1;
   }
   else {
      /* Linear fetch routines employ SSE, and always fetch groups of
       * four texels.
       */
      fetch_width = align(width, 4) - 1;
   }
   fetch_height = height - 1;

   if (samp->axis_aligned) {
      int s0 = samp->s;
      int s1 = samp->s + fetch_width * samp->dsdx;
      int t0 = samp->t;
      int t1 = samp->t + fetch_height * samp->dtdy;

      mins = MIN2(s0, s1);
      mint = MIN2(t0, t1);
      maxs = MAX2(s0, s1);
      maxt = MAX2(t0, t1);
   }
   else {
      int s0 = samp->s;
      int s1 = samp->s + fetch_width * samp->dsdx;
      int s2 = samp->s + fetch_height * samp->dsdy;
      int s3 = samp->s + fetch_width * samp->dsdx + fetch_height * samp->dsdy;
      int t0 = samp->t;
      int t1 = samp->t + fetch_width * samp->dtdx;
      int t2 = samp->t + fetch_height * samp->dtdy;
      int t3 = samp->t + fetch_width * samp->dtdx + fetch_height * samp->dtdy;

      mins = MIN4(s0, s1, s2, s3);
      mint = MIN4(t0, t1, t2, t3);
      maxs = MAX4(s0, s1, s2, s3);
      maxt = MAX4(t0, t1, t2, t3);
   }

   if (is_nearest) {
      need_wrap = (mins < 0 ||
                   mint < 0 ||
                   maxs >= (texture->width << FIXED16_SHIFT) ||
                   maxt >= (texture->height << FIXED16_SHIFT));
   } else {
      need_wrap = (mins < 0 ||
                   mint < 0 ||
                   maxs + FIXED16_ONE >= (texture->width << FIXED16_SHIFT) ||
                   maxt + FIXED16_ONE >= (texture->height << FIXED16_SHIFT));
   }

   if (0 && need_wrap) {
      debug_printf("%u x %u %s\n",
                   texture->width, texture->height,
                   is_nearest ? "nearest" : "linear");
      debug_printf("mins = %f\n", mins*1.0f/FIXED16_ONE);
      debug_printf("mint = %f\n", mint*1.0f/FIXED16_ONE);
      debug_printf("maxs = %f\n", maxs*1.0f/FIXED16_ONE);
      debug_printf("maxt = %f\n", maxt*1.0f/FIXED16_ONE);
      debug_printf("\n");
   }

   /* We accept any mode below, but we only implement clamping.
    */
   if (need_wrap &&
       (sampler_state->sampler_state.wrap_s != PIPE_TEX_WRAP_CLAMP_TO_EDGE ||
        sampler_state->sampler_state.wrap_t != PIPE_TEX_WRAP_CLAMP_TO_EDGE)) {
      return FALSE;
   }

   if (is_nearest) {
      switch (sampler_state->texture_state.format) {
      case PIPE_FORMAT_B8G8R8A8_UNORM:
         if (need_wrap)
            samp->base.fetch = fetch_bgra_clamp;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgra;
         else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
            samp->base.fetch = fetch_bgra_axis_aligned;
         else
            samp->base.fetch = fetch_bgra_memcpy;

         return TRUE;

      case PIPE_FORMAT_B8G8R8X8_UNORM:
         if (need_wrap)
            samp->base.fetch = fetch_bgrx_clamp;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgrx;
         else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
            samp->base.fetch = fetch_bgrx_axis_aligned;
         else
            samp->base.fetch = fetch_bgrx_memcpy;

         return TRUE;

      default:
         break;
      }

      FAIL("unknown format for nearest");
   }
   else {
      samp->stretched_row_y[0] = -1;
      samp->stretched_row_y[1] = -1;
      samp->stretched_row_index = 0;

      switch (sampler_state->texture_state.format) {
      case PIPE_FORMAT_B8G8R8A8_UNORM:
         if (need_wrap)
            samp->base.fetch = fetch_bgra_clamp_linear;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgra_linear;
         else
            samp->base.fetch = fetch_bgra_axis_aligned_linear;

         return TRUE;

      case PIPE_FORMAT_B8G8R8X8_UNORM:
         if (need_wrap)
            samp->base.fetch = fetch_bgrx_clamp_linear;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgrx_linear;
         else
            samp->base.fetch = fetch_bgrx_axis_aligned_linear;

         return TRUE;

      default:
         break;
      }

      FAIL("unknown format");
   }
}


static const uint32_t *
fetch_noop(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   return samp->row;
}


void
lp_linear_init_noop_sampler(struct lp_linear_sampler *samp)
{
   samp->base.fetch = fetch_noop;
}

/* Check the variant for linear path compatibility.
 */
boolean
lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
                        const struct lp_tgsi_texture_info *tex)
{
   if (tex->modifier != LP_BLD_TEX_MODIFIER_NONE)
      return FALSE;

   if (tex->target != TGSI_TEXTURE_2D)
      return FALSE;

   if (tex->coord[0].file != TGSI_FILE_INPUT ||
       tex->coord[1].file != TGSI_FILE_INPUT)
      return FALSE;

   /* These are the only sampling modes we support at the moment.
    *
    * Actually we'll accept any mode as we're failing on any
    * interpolant which exceeds 0..1.  Clamping is applied only to
    * avoid invalid reads.
    */
   if (!is_nearest_sampler(sampler) &&
       !is_linear_sampler(sampler))
      return FALSE;

   /* These are the only texture formats we support at the moment.
    */
   if (sampler->texture_state.format != PIPE_FORMAT_B8G8R8A8_UNORM &&
       sampler->texture_state.format != PIPE_FORMAT_B8G8R8X8_UNORM)
      return FALSE;

   return TRUE;
}

#else

boolean
lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
                        const struct lp_tgsi_texture_info *tex)
{
   return FALSE;
}

#endif