1 /**************************************************************************
2 *
3 * Copyright 2010-2021 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 * USE OR OTHER DEALINGS IN THE SOFTWARE.
21 *
22 * The above copyright notice and this permission notice (including the
23 * next paragraph) shall be included in all copies or substantial portions
24 * of the Software.
25 *
26 **************************************************************************/
27
28
29 #include "pipe/p_config.h"
30
31 #include "util/u_math.h"
32 #include "util/u_cpu_detect.h"
33 #include "util/u_pack_color.h"
34 #include "util/u_rect.h"
35 #include "util/u_sse.h"
36
37 #include "lp_jit.h"
38 #include "lp_debug.h"
39 #include "lp_state_fs.h"
40 #include "lp_linear_priv.h"
41
42 #if defined(PIPE_ARCH_SSE)
43
44 #define FIXED16_SHIFT 16
45 #define FIXED16_ONE (1<<16)
46 #define FIXED16_HALF (1<<15)
47
48 /*
49 * Color tolerance. Allow 1 bit of error in 8 bit unorm colors.
50 */
51 #define FIXED16_TOL (FIXED16_ONE >> 7)
52
53 /*
54 * Tolerance for texture coordinate derivatives when doing linear filtering.
55 *
56 * (Note that extra care needs to be taken when doing linear filtering as
57 * coordinates may snap up to neighbour texels inside the tile).
58 */
59 #define FIXED16_TOL_DERIV (FIXED16_TOL / TILE_SIZE)
60
61 static inline int
float_to_fixed16(float f)62 float_to_fixed16(float f)
63 {
64 return f * (float)FIXED16_ONE;
65 }
66
67 static inline int
fixed16_frac(int x)68 fixed16_frac(int x)
69 {
70 return x & (FIXED16_ONE - 1);
71 }
72
/* Return non-zero when x lies within +/- tol of y (inclusive). */
static inline int
fixed16_approx(int x, int y, int tol)
{
   const int lo = y - tol;
   const int hi = y + tol;
   return lo <= x && x <= hi;
}
78
79
80 /*
81 * Unstretched blit of a bgra texture.
82 */
83 static const uint32_t *
fetch_bgra_memcpy(struct lp_linear_elem * elem)84 fetch_bgra_memcpy(struct lp_linear_elem *elem)
85 {
86 struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
87 const struct lp_jit_texture *texture = samp->texture;
88 const uint32_t *src_row =
89 (const uint32_t *)((const uint8_t *)texture->base +
90 (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
91 const int s = samp->s;
92 const int width = samp->width;
93 const uint32_t *row;
94
95 src_row = &src_row[s >> FIXED16_SHIFT];
96
97 if (((uintptr_t)src_row & 0xf) == 0) {
98 /* The source texels are already aligned. Return them */
99 row = src_row;
100 } else {
101 memcpy(samp->row, src_row, width * sizeof *row);
102 row = samp->row;
103 }
104
105 samp->t += samp->dtdy;
106 return row;
107 }
108
109
110 /*
111 * Unstretched blit of a bgrx texture.
112 */
113 static const uint32_t *
fetch_bgrx_memcpy(struct lp_linear_elem * elem)114 fetch_bgrx_memcpy(struct lp_linear_elem *elem)
115 {
116 struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
117 const struct lp_jit_texture *texture = samp->texture;
118 const uint32_t *src_row =
119 (const uint32_t *)((const uint8_t *)texture->base +
120 (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
121 const int s = samp->s;
122 const int width = samp->width;
123 uint32_t *row = samp->row;
124
125 src_row = &src_row[s >> FIXED16_SHIFT];
126
127 for (int i = 0; i < width; i++) {
128 row[i] = src_row[i] | 0xff000000;
129 }
130
131 samp->t += samp->dtdy;
132 return row;
133 }
134
135
136 /*
137 * Perform nearest filtered lookup of a row of texels. Texture lookup
138 * is assumed to be axis aligned but with arbitrary scaling.
139 *
140 * Texture coordinate interpolation is performed in 16.16 fixed point,
141 * not to be confused with the 1.15 format used by the interpolants.
142 *
143 * After 64 pixels (ie. in the next tile), the starting point will be
144 * recalculated with floating point arithmetic.
145 */
146 static const uint32_t *
fetch_bgra_axis_aligned(struct lp_linear_elem * elem)147 fetch_bgra_axis_aligned(struct lp_linear_elem *elem)
148 {
149 struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
150 const struct lp_jit_texture *texture = samp->texture;
151 const uint32_t *src_row =
152 (const uint32_t *)((const uint8_t *)texture->base +
153 (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
154 const int dsdx = samp->dsdx;
155 const int width = samp->width;
156 uint32_t *row = samp->row;
157 int s = samp->s;
158
159 for (int i = 0; i < width; i++) {
160 row[i] = src_row[s>>FIXED16_SHIFT];
161 s += dsdx;
162 }
163
164 samp->t += samp->dtdy;
165 return row;
166 }
167
168
169 static const uint32_t *
fetch_bgrx_axis_aligned(struct lp_linear_elem * elem)170 fetch_bgrx_axis_aligned(struct lp_linear_elem *elem)
171 {
172 struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
173 const struct lp_jit_texture *texture = samp->texture;
174 const uint32_t *src_row =
175 (const uint32_t *)((const uint8_t *)texture->base +
176 (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
177 const int dsdx = samp->dsdx;
178 const int width = samp->width;
179 uint32_t *row = samp->row;
180 int s = samp->s;
181
182 for (int i = 0; i < width; i++) {
183 row[i] = src_row[s>>FIXED16_SHIFT] | 0xff000000;
184 s += dsdx;
185 }
186
187 samp->t += samp->dtdy;
188 return row;
189 }
190
191
192 /* Non-axis aligned, but no clamping or wrapping required
193 */
194 static const uint32_t *
fetch_bgra(struct lp_linear_elem * elem)195 fetch_bgra(struct lp_linear_elem *elem)
196 {
197 struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
198 const struct lp_jit_texture *texture = samp->texture;
199 const uint8_t *src = texture->base;
200 const int stride = texture->row_stride[0];
201 const int dsdx = samp->dsdx;
202 const int dtdx = samp->dtdx;
203 const int width = samp->width;
204 uint32_t *row = samp->row;
205 int s = samp->s;
206 int t = samp->t;
207
208 for (int i = 0; i < width; i++) {
209 const uint8_t *texel = (src +
210 (t>>FIXED16_SHIFT) * stride +
211 (s>>FIXED16_SHIFT) * 4);
212
213 row[i] = *(const uint32_t *)texel;
214
215 s += dsdx;
216 t += dtdx;
217 }
218
219 samp->s += samp->dsdy;
220 samp->t += samp->dtdy;
221 return row;
222 }
223
224
225 static const uint32_t *
fetch_bgrx(struct lp_linear_elem * elem)226 fetch_bgrx(struct lp_linear_elem *elem)
227 {
228 struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
229 const struct lp_jit_texture *texture = samp->texture;
230 const uint8_t *src = texture->base;
231 const int stride = texture->row_stride[0];
232 const int dsdx = samp->dsdx;
233 const int dtdx = samp->dtdx;
234 const int width = samp->width;
235 uint32_t *row = samp->row;
236 int s = samp->s;
237 int t = samp->t;
238
239 for (int i = 0; i < width; i++) {
240 const uint8_t *texel = (src +
241 (t>>FIXED16_SHIFT) * stride +
242 (s>>FIXED16_SHIFT) * 4);
243
244 row[i] = (*(const uint32_t *)texel) | 0xff000000;
245
246 s += dsdx;
247 t += dtdx;
248 }
249
250 samp->s += samp->dsdy;
251 samp->t += samp->dtdy;
252 return row;
253 }
254
255 /* Non-axis aligned, clamped.
256 */
257 static const uint32_t *
fetch_bgra_clamp(struct lp_linear_elem * elem)258 fetch_bgra_clamp(struct lp_linear_elem *elem)
259 {
260 struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
261 const struct lp_jit_texture *texture = samp->texture;
262 const uint8_t *src = texture->base;
263 const int stride = texture->row_stride[0];
264 const int tex_height = texture->height - 1;
265 const int tex_width = texture->width - 1;
266 const int dsdx = samp->dsdx;
267 const int dtdx = samp->dtdx;
268 const int width = samp->width;
269 uint32_t *row = samp->row;
270 int s = samp->s;
271 int t = samp->t;
272
273 for (int i = 0; i < width; i++) {
274 int ct = CLAMP(t>>FIXED16_SHIFT, 0, tex_height);
275 int cs = CLAMP(s>>FIXED16_SHIFT, 0, tex_width);
276
277 const uint8_t *texel = src + ct * stride + cs * 4;
278
279 row[i] = *(const uint32_t *)texel;
280
281 s += dsdx;
282 t += dtdx;
283 }
284
285 samp->s += samp->dsdy;
286 samp->t += samp->dtdy;
287 return row;
288 }
289
290 static const uint32_t *
fetch_bgrx_clamp(struct lp_linear_elem * elem)291 fetch_bgrx_clamp(struct lp_linear_elem *elem)
292 {
293 struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
294 const struct lp_jit_texture *texture = samp->texture;
295 const uint8_t *src = texture->base;
296 const int stride = texture->row_stride[0];
297 const int tex_height = texture->height - 1;
298 const int tex_width = texture->width - 1;
299 const int dsdx = samp->dsdx;
300 const int dtdx = samp->dtdx;
301 const int width = samp->width;
302 uint32_t *row = samp->row;
303 int s = samp->s;
304 int t = samp->t;
305
306 for (int i = 0; i < width; i++) {
307 int ct = CLAMP(t>>FIXED16_SHIFT, 0, tex_height);
308 int cs = CLAMP(s>>FIXED16_SHIFT, 0, tex_width);
309
310 const uint8_t *texel = src + ct * stride + cs * 4;
311
312 row[i] = (*(const uint32_t *)texel) | 0xff000000;
313
314 s += dsdx;
315 t += dtdx;
316 }
317
318 samp->s += samp->dsdy;
319 samp->t += samp->dtdy;
320 return row;
321 }
322
/**
 * Fetch and stretch one row of texels, applying the sampler's horizontal
 * start/step (samp->s / samp->dsdx) but no vertical filtering.
 *
 * Results are cached in the two-entry samp->stretched_row[] cache keyed
 * by source row number, so linear filtering (which reads rows y and y+1)
 * stretches each source row only once as it walks down the texture.
 *
 * The returned pointer may reference the texture directly (1:1, aligned
 * case) rather than the cache.
 */
static inline const uint32_t *
fetch_and_stretch_bgra_row(struct lp_linear_sampler *samp,
                           int y)
{
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *data = (const uint32_t *)texture->base;
   const int stride = texture->row_stride[0] / sizeof(uint32_t);
   const int width = samp->width;

   /*
    * Search the stretched row cache first.
    */

   if (y == samp->stretched_row_y[0]) {
      /* Hit in slot 0: make slot 1 the next replacement victim. */
      samp->stretched_row_index = 1;
      return samp->stretched_row[0];
   }

   if (y == samp->stretched_row_y[1]) {
      /* Hit in slot 1: make slot 0 the next replacement victim. */
      samp->stretched_row_index = 0;
      return samp->stretched_row[1];
   }

   /*
    * Replace one entry.
    */

   const uint32_t * restrict src_row = data + y * stride;
   uint32_t * restrict dst_row = samp->stretched_row[samp->stretched_row_index];

   if (fixed16_frac(samp->s) == 0 &&
       samp->dsdx == FIXED16_ONE) { // TODO: could be relaxed
      /*
       * 1:1 blit on the x direction.
       */
      src_row += samp->s >> FIXED16_SHIFT;

      if (((uintptr_t)src_row & 0xf) == 0) {
         /* The source texture is already aligned. Return it */
         /* (deliberately without updating the cache: no work was done) */
         return src_row;
      }

      /* Copy the source texture */
      /* Unaligned loads from the texture, aligned stores to the cache row. */
      for (int i = 0; i < width; i += 4) {
         __m128i src = _mm_loadu_si128((const __m128i *)&src_row[i]);
         *(__m128i *)&dst_row[i] = src;
      }
   }
   else {
      /* General horizontal stretch, rounded up to a multiple of 4 texels. */
      util_sse2_stretch_row_8unorm((__m128i *)dst_row,
                                   align(width, 4),
                                   src_row, samp->s, samp->dsdx);
   }

   /* Record the new entry and flip the replacement index. */
   samp->stretched_row_y[samp->stretched_row_index] = y;
   samp->stretched_row_index ^= 1;

   return dst_row;
}
385
386
/* Maximise only as we fetch unscaled pixels linearly into a size-64
 * temporary.  For minimise, we will want to either have a bigger
 * temporary or fetch sparsely.
 */
static const uint32_t *
fetch_bgra_axis_aligned_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const int width = samp->width;
   uint32_t * restrict row = samp->row;
   /* Integer source row and 8-bit vertical blend weight taken from the
    * top 8 fraction bits of the 16.16 t coordinate.
    */
   const int y = samp->t >> FIXED16_SHIFT;
   const int w = (samp->t >> 8) & 0xff;

   samp->t += samp->dtdy;

   const uint32_t * restrict src_row0 = fetch_and_stretch_bgra_row(samp, y);

   if (w == 0) {
      /* Exactly on a row: no vertical interpolation needed. */
      return src_row0;
   }

   const uint32_t * restrict src_row1 = fetch_and_stretch_bgra_row(samp, y + 1);

   __m128i wt = _mm_set1_epi16(w);

   /* Combine the two rows using a constant weight.
    */
   for (int i = 0; i < width; i += 4) {
      /* Aligned loads: both the row cache and the direct-texture path
       * are 16 byte aligned here.
       */
      __m128i srca = _mm_load_si128((const __m128i *)&src_row0[i]);
      __m128i srcb = _mm_load_si128((const __m128i *)&src_row1[i]);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed88(srca, srcb, &wt, &wt);
   }

   return row;
}
423
424
/* Non-axis-aligned version. Don't try to take advantage of
 * maximize.
 */
static const uint32_t *
fetch_bgra_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const int stride = texture->row_stride[0] / sizeof(uint32_t);
   const uint32_t *data = (const uint32_t *)texture->base;
   const int dsdx = samp->dsdx;
   const int dtdx = samp->dtdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int t = samp->t;

   /* Produce four destination pixels per iteration. */
   for (int i = 0; i < width; i += 4) {
      union m128i si0, si1, si2, si3, ws, wt;
      __m128i si02, si13;

      /* Scalar gather of the 2x2 texel neighbourhood and the 8-bit
       * s/t blend weights for each of the four pixels.
       */
      for (int j = 0; j < 4; j++) {
         const uint32_t *src = data + (t >> 16) * stride + (s >> 16);

         si0.ui[j] = src[0];
         si1.ui[j] = src[1];
         si2.ui[j] = src[stride + 0];
         si3.ui[j] = src[stride + 1];

         ws.ui[j] = (s>>8) & 0xff;
         wt.ui[j] = (t>>8) & 0xff;

         s += dsdx;
         t += dtdx;
      }

      /* Replicate each 8-bit weight into all four bytes of its lane
       * (w -> w000 | w0w0 -> wwww).
       */
      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 16));
      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 8));

      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 16));
      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 8));

      /* Blend vertically between the two rows, then horizontally. */
      si02 = util_sse2_lerp_epi8_fixed08(si0.m, si2.m, wt.m);
      si13 = util_sse2_lerp_epi8_fixed08(si1.m, si3.m, wt.m);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed08(si02, si13, ws.m);
   }

   /* Advance the start coordinates to the next scanline. */
   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}
477
478
/* Clamped, non-axis-aligned version. Don't try to take advantage of
 * maximize.
 */
static const uint32_t *
fetch_bgra_clamp_linear(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *data = (const uint32_t *)texture->base;
   const int stride = texture->row_stride[0] / sizeof(uint32_t);
   const int tex_height = texture->height - 1;
   const int tex_width = texture->width - 1;
   const int dsdx = samp->dsdx;
   const int dtdx = samp->dtdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int t = samp->t;

   /* width, height, stride (in pixels) must be smaller than 32768 */
   /* (the clamp uses 16-bit min/max and the addressing uses madd_epi16) */
   __m128i dsdx4, dtdx4, s4, t4, stride4, w4, h4, zero, one;
   /* Four pixels' worth of s/t coordinates per vector, stepped by 4*d/dx. */
   s4 = _mm_set1_epi32(s);
   t4 = _mm_set1_epi32(t);
   s4 = _mm_add_epi32(s4, _mm_set_epi32(3*dsdx, 2*dsdx, dsdx, 0));
   t4 = _mm_add_epi32(t4, _mm_set_epi32(3*dtdx, 2*dtdx, dtdx, 0));
   dsdx4 = _mm_set1_epi32(4*dsdx);
   dtdx4 = _mm_set1_epi32(4*dtdx);
   stride4 = _mm_set1_epi32(stride);
   w4 = _mm_set1_epi32(tex_width);
   h4 = _mm_set1_epi32(tex_height);
   zero = _mm_setzero_si128();
   one = _mm_set1_epi32(1);

   for (int i = 0; i < width; i += 4) {
      union m128i addr[4];
      __m128i ws, wt, wsl, wsh, wtl, wth;
      __m128i s4s, t4s, cs0, cs1, ct0, ct1, tmp, si[4];

      /* Integer texel coordinates, clamped to [0, w-1] / [0, h-1];
       * cs1/ct1 are the neighbouring texel for the bilinear footprint.
       */
      s4s = _mm_srli_epi32(s4, 16);
      t4s = _mm_srli_epi32(t4, 16);
      cs0 = _mm_min_epi16(_mm_max_epi16(s4s, zero), w4);
      cs1 = _mm_add_epi16(s4s, one);
      cs1 = _mm_min_epi16(_mm_max_epi16(cs1, zero), w4);
      ct0 = _mm_min_epi16(_mm_max_epi16(t4s, zero), h4);
      ct1 = _mm_add_epi16(t4s, one);
      ct1 = _mm_min_epi16(_mm_max_epi16(ct1, zero), h4);
      /* Linear texel indices: row * stride + column, for the 4 corners. */
      tmp = _mm_madd_epi16(ct0, stride4);
      addr[0].m = _mm_add_epi32(tmp, cs0);
      addr[1].m = _mm_add_epi32(tmp, cs1);
      tmp = _mm_madd_epi16(ct1, stride4);
      addr[2].m = _mm_add_epi32(tmp, cs0);
      addr[3].m = _mm_add_epi32(tmp, cs1);

      /* Scalar gather of the four corners for all four pixels,
       * repacked into one vector per corner.
       */
      for (int j = 0; j < 4; j++) {
         __m128i ld1, ld2, ld3;
         si[j] = _mm_cvtsi32_si128(data[addr[j].ui[0]]);
         ld1 = _mm_cvtsi32_si128(data[addr[j].ui[1]]);
         si[j] = _mm_unpacklo_epi32(si[j], ld1);
         ld2 = _mm_cvtsi32_si128(data[addr[j].ui[2]]);
         ld3 = _mm_cvtsi32_si128(data[addr[j].ui[3]]);
         ld2 = _mm_unpacklo_epi32(ld2, ld3);
         si[j] = _mm_unpacklo_epi64(si[j], ld2);
      }

      /* 8-bit blend weights from the top fraction bits of s and t. */
      ws = _mm_srli_epi32(s4, 8);
      ws = _mm_and_si128(ws, _mm_set1_epi32(0xFF));
      wt = _mm_srli_epi32(t4, 8);
      wt = _mm_and_si128(wt, _mm_set1_epi32(0xFF));

      s4 = _mm_add_epi32(s4, dsdx4);
      t4 = _mm_add_epi32(t4, dtdx4);

#if 0
      /* scalar code for reference */
      for (int j = 0; j < 4; j++) {
         int s0 = s >> FIXED16_SHIFT;
         int t0 = t >> FIXED16_SHIFT;
         int cs0 = CLAMP(s0    , 0, tex_width);
         int cs1 = CLAMP(s0 + 1, 0, tex_width);
         int ct0 = CLAMP(t0    , 0, tex_height);
         int ct1 = CLAMP(t0 + 1, 0, tex_height);

         si0.ui[j] = data[ct0 * stride + cs0];
         si1.ui[j] = data[ct0 * stride + cs1];
         si2.ui[j] = data[ct1 * stride + cs0];
         si3.ui[j] = data[ct1 * stride + cs1];

         ws.ui[j] = (s>>8) & 0xff;
         wt.ui[j] = (t>>8) & 0xff;

         s += dsdx;
         t += dtdx;
      }
#endif

      /* Expand the weights to 16-bit lanes, low/high pixel pairs. */
      ws = _mm_or_si128(ws, _mm_slli_epi32(ws, 16));
      wsl = _mm_shuffle_epi32(ws, _MM_SHUFFLE(1,1,0,0));
      wsh = _mm_shuffle_epi32(ws, _MM_SHUFFLE(3,3,2,2));

      wt = _mm_or_si128(wt, _mm_slli_epi32(wt, 16));
      wtl = _mm_shuffle_epi32(wt, _MM_SHUFFLE(1,1,0,0));
      wth = _mm_shuffle_epi32(wt, _MM_SHUFFLE(3,3,2,2));

      *(__m128i *)&row[i] = util_sse2_lerp_2d_epi8_fixed88(si[0], si[2],
                                                           &si[1], &si[3],
                                                           &wtl, &wth,
                                                           &wsl, &wsh);
   }

   /* Advance the start coordinates to the next scanline. */
   samp->s += samp->dsdy;
   samp->t += samp->dtdy;

   return row;
}
593
594
595 static const uint32_t *
fetch_bgrx_axis_aligned_linear(struct lp_linear_elem * elem)596 fetch_bgrx_axis_aligned_linear(struct lp_linear_elem *elem)
597 {
598 struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
599 const __m128i mask = _mm_set1_epi32(0xff000000);
600 uint32_t *dst_row = samp->row;
601 const uint32_t *src_row = fetch_bgra_axis_aligned_linear(&samp->base);
602 const int width = samp->width;
603
604 for (int i = 0; i < width; i += 4) {
605 __m128i bgra = *(__m128i *)&src_row[i];
606 __m128i bgrx = _mm_or_si128(bgra, mask);
607 *(__m128i *)&dst_row[i] = bgrx;
608 }
609
610 return dst_row;
611 }
612
613
614 static const uint32_t *
fetch_bgrx_clamp_linear(struct lp_linear_elem * elem)615 fetch_bgrx_clamp_linear(struct lp_linear_elem *elem)
616 {
617 struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
618 const __m128i mask = _mm_set1_epi32(0xff000000);
619 uint32_t *row = samp->row;
620 const int width = samp->width;
621
622 fetch_bgra_clamp_linear(&samp->base);
623
624 for (int i = 0; i < width; i += 4) {
625 __m128i bgra = *(__m128i *)&row[i];
626 __m128i bgrx = _mm_or_si128(bgra, mask);
627 *(__m128i *)&row[i] = bgrx;
628 }
629
630 return row;
631 }
632
633
634 static const uint32_t *
fetch_bgrx_linear(struct lp_linear_elem * elem)635 fetch_bgrx_linear(struct lp_linear_elem *elem)
636 {
637 struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
638 const __m128i mask = _mm_set1_epi32(0xff000000);
639 uint32_t *row = samp->row;
640 const int width = samp->width;
641
642 fetch_bgra_linear(&samp->base);
643
644 for (int i = 0; i < width; i += 4) {
645 __m128i bgra = *(__m128i *)&row[i];
646 __m128i bgrx = _mm_or_si128(bgra, mask);
647 *(__m128i *)&row[i] = bgrx;
648 }
649
650 return row;
651 }
652
653
654 static boolean
sampler_is_nearest(const struct lp_linear_sampler * samp,const struct lp_sampler_static_state * sampler_state,boolean minify)655 sampler_is_nearest(const struct lp_linear_sampler *samp,
656 const struct lp_sampler_static_state *sampler_state,
657 boolean minify)
658 {
659 unsigned img_filter;
660
661 if (minify)
662 img_filter = sampler_state->sampler_state.min_img_filter;
663 else
664 img_filter = sampler_state->sampler_state.mag_img_filter;
665
666 /* Is it obviously nearest?
667 */
668 if (img_filter == PIPE_TEX_FILTER_NEAREST)
669 return TRUE;
670
671 /* Otherwise look for linear samplers which devolve to nearest.
672 */
673
674 /* Needs to be axis aligned.
675 */
676 if (!samp->axis_aligned)
677 return FALSE;
678
679 if (0) {
680 /* For maximizing shaders, revert to nearest
681 */
682 if (samp->dsdx < -FIXED16_HALF && samp->dsdx < FIXED16_HALF &&
683 samp->dtdy < -FIXED16_HALF && samp->dtdy < FIXED16_HALF)
684 return TRUE;
685
686 /* For severely minimising shaders, revert to nearest:
687 */
688 if ((samp->dsdx < 2 * FIXED16_ONE || samp->dsdx > 2 * FIXED16_ONE) &&
689 (samp->dtdy < 2 * FIXED16_ONE || samp->dtdy > 2 * FIXED16_ONE))
690 return TRUE;
691 }
692
693 /*
694 * Must be near a pixel center:
695 */
696 if (!fixed16_approx(fixed16_frac(samp->s), FIXED16_HALF, FIXED16_TOL) ||
697 !fixed16_approx(fixed16_frac(samp->t), FIXED16_HALF, FIXED16_TOL))
698 return FALSE;
699
700 /*
701 * Must make a full step between pixels:
702 */
703 if (!fixed16_approx(samp->dsdx, FIXED16_ONE, FIXED16_TOL_DERIV) ||
704 !fixed16_approx(samp->dtdy, FIXED16_ONE, FIXED16_TOL_DERIV))
705 return FALSE;
706
707 /* Treat it as nearest!
708 */
709 return TRUE;
710 }
711
712 /* XXX: Lots of static-state parameters being passed in here but very
713 * little info is extracted from each one. Consolidate it all down to
714 * something succinct in the prepare phase?
715 */
716 boolean
lp_linear_init_sampler(struct lp_linear_sampler * samp,const struct lp_tgsi_texture_info * info,const struct lp_sampler_static_state * sampler_state,const struct lp_jit_texture * texture,int x0,int y0,int width,int height,const float (* a0)[4],const float (* dadx)[4],const float (* dady)[4])717 lp_linear_init_sampler(struct lp_linear_sampler *samp,
718 const struct lp_tgsi_texture_info *info,
719 const struct lp_sampler_static_state *sampler_state,
720 const struct lp_jit_texture *texture,
721 int x0, int y0, int width, int height,
722 const float (*a0)[4],
723 const float (*dadx)[4],
724 const float (*dady)[4])
725 {
726 const struct lp_tgsi_channel_info *schan = &info->coord[0];
727 const struct lp_tgsi_channel_info *tchan = &info->coord[1];
728
729 assert(schan->file == TGSI_FILE_INPUT);
730 assert(tchan->file == TGSI_FILE_INPUT);
731
732 float w0 = a0[0][3];
733
734 int foo = 1;
735 float s0 = a0[schan->u.index+foo][schan->swizzle];
736 float dsdx = dadx[schan->u.index+foo][schan->swizzle];
737 float dsdy = dady[schan->u.index+foo][schan->swizzle];
738
739 float t0 = a0[tchan->u.index+foo][tchan->swizzle];
740 float dtdx = dadx[tchan->u.index+foo][tchan->swizzle];
741 float dtdy = dady[tchan->u.index+foo][tchan->swizzle];
742
743 int mins, mint, maxs, maxt;
744 float oow = 1.0f / w0;
745 float width_oow = texture->width * oow;
746 float height_oow = texture->height * oow;
747 float fdsdx = dsdx * width_oow;
748 float fdsdy = dsdy * width_oow;
749 float fdtdx = dtdx * height_oow;
750 float fdtdy = dtdy * height_oow;
751 int fetch_width;
752 int fetch_height;
753 boolean minify;
754 boolean need_wrap;
755 boolean is_nearest;
756
757 samp->texture = texture;
758 samp->width = width;
759
760 samp->s = float_to_fixed16(fdsdx * x0 +
761 fdsdy * y0 +
762 s0 * width_oow);
763
764 samp->t = float_to_fixed16(fdtdx * x0 +
765 fdtdy * y0 +
766 t0 * height_oow);
767
768 samp->dsdx = float_to_fixed16(fdsdx);
769 samp->dsdy = float_to_fixed16(fdsdy);
770 samp->dtdx = float_to_fixed16(fdtdx);
771 samp->dtdy = float_to_fixed16(fdtdy);
772
773
774 samp->axis_aligned = (samp->dsdy == 0 &&
775 samp->dtdx == 0); // TODO: could be relaxed
776
777 {
778 int dsdx = samp->dsdx >= 0 ? samp->dsdx : -samp->dsdx;
779 int dsdy = samp->dsdy >= 0 ? samp->dsdy : -samp->dsdy;
780 int dtdx = samp->dtdx >= 0 ? samp->dtdx : -samp->dtdx;
781 int dtdy = samp->dtdy >= 0 ? samp->dtdy : -samp->dtdy;
782 int rho = MAX4(dsdx, dsdy, dtdx, dtdy);
783
784 minify = (rho > FIXED16_ONE);
785 }
786
787 is_nearest = sampler_is_nearest(samp, sampler_state, minify);
788
789 if (!is_nearest) {
790 samp->s -= FIXED16_HALF;
791 samp->t -= FIXED16_HALF;
792 }
793
794 /* Check for clamping. This rarely happens as we're rejecting interpolants
795 * which fall outside the 0..1 range.
796 */
797
798 if (is_nearest) {
799 /* Nearest fetch routines don't employ SSE and always operate one pixel
800 * at a time.
801 */
802 fetch_width = width - 1;
803 }
804 else {
805 /* Linear fetch routines employ SSE, and always fetch groups of four
806 * texels.
807 */
808 fetch_width = align(width, 4) - 1;
809 }
810 fetch_height = height - 1;
811
812 if (samp->axis_aligned) {
813 int s0 = samp->s;
814 int s1 = samp->s + fetch_width * samp->dsdx;
815 int t0 = samp->t;
816 int t1 = samp->t + fetch_height * samp->dtdy;
817
818 mins = MIN2(s0, s1);
819 mint = MIN2(t0, t1);
820 maxs = MAX2(s0, s1);
821 maxt = MAX2(t0, t1);
822 }
823 else {
824 int s0 = samp->s;
825 int s1 = samp->s + fetch_width * samp->dsdx;
826 int s2 = samp->s + fetch_height * samp->dsdy;
827 int s3 = samp->s + fetch_width * samp->dsdx + fetch_height * samp->dsdy;
828 int t0 = samp->t;
829 int t1 = samp->t + fetch_width * samp->dtdx;
830 int t2 = samp->t + fetch_height * samp->dtdy;
831 int t3 = samp->t + fetch_width * samp->dtdx + fetch_height * samp->dtdy;
832
833 mins = MIN4(s0, s1, s2, s3);
834 mint = MIN4(t0, t1, t2, t3);
835 maxs = MAX4(s0, s1, s2, s3);
836 maxt = MAX4(t0, t1, t2, t3);
837 }
838
839 if (is_nearest) {
840 need_wrap = (mins < 0 ||
841 mint < 0 ||
842 maxs >= (texture->width << FIXED16_SHIFT) ||
843 maxt >= (texture->height << FIXED16_SHIFT));
844 } else {
845 need_wrap = (mins < 0 ||
846 mint < 0 ||
847 maxs + FIXED16_ONE >= (texture->width << FIXED16_SHIFT) ||
848 maxt + FIXED16_ONE >= (texture->height << FIXED16_SHIFT));
849 }
850
851 if (0 && need_wrap) {
852 debug_printf("%u x %u %s\n",
853 texture->width, texture->height,
854 is_nearest ? "nearest" : "linear");
855 debug_printf("mins = %f\n", mins*1.0f/FIXED16_ONE);
856 debug_printf("mint = %f\n", mint*1.0f/FIXED16_ONE);
857 debug_printf("maxs = %f\n", maxs*1.0f/FIXED16_ONE);
858 debug_printf("maxt = %f\n", maxt*1.0f/FIXED16_ONE);
859 debug_printf("\n");
860 }
861
862 /* We accept any mode below, but we only implement clamping.
863 */
864 if (need_wrap &&
865 (sampler_state->sampler_state.wrap_s != PIPE_TEX_WRAP_CLAMP_TO_EDGE ||
866 sampler_state->sampler_state.wrap_t != PIPE_TEX_WRAP_CLAMP_TO_EDGE)) {
867 return FALSE;
868 }
869
870 if (is_nearest) {
871 switch (sampler_state->texture_state.format) {
872 case PIPE_FORMAT_B8G8R8A8_UNORM:
873 if (need_wrap)
874 samp->base.fetch = fetch_bgra_clamp;
875 else if (!samp->axis_aligned)
876 samp->base.fetch = fetch_bgra;
877 else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
878 samp->base.fetch = fetch_bgra_axis_aligned;
879 else
880 samp->base.fetch = fetch_bgra_memcpy;
881 return TRUE;
882 case PIPE_FORMAT_B8G8R8X8_UNORM:
883 if (need_wrap)
884 samp->base.fetch = fetch_bgrx_clamp;
885 else if (!samp->axis_aligned)
886 samp->base.fetch = fetch_bgrx;
887 else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
888 samp->base.fetch = fetch_bgrx_axis_aligned;
889 else
890 samp->base.fetch = fetch_bgrx_memcpy;
891 return TRUE;
892 default:
893 break;
894 }
895
896 FAIL("unknown format for nearest");
897 }
898 else {
899 samp->stretched_row_y[0] = -1;
900 samp->stretched_row_y[1] = -1;
901 samp->stretched_row_index = 0;
902
903 switch (sampler_state->texture_state.format) {
904 case PIPE_FORMAT_B8G8R8A8_UNORM:
905 if (need_wrap)
906 samp->base.fetch = fetch_bgra_clamp_linear;
907 else if (!samp->axis_aligned)
908 samp->base.fetch = fetch_bgra_linear;
909 else
910 samp->base.fetch = fetch_bgra_axis_aligned_linear;
911 return TRUE;
912 case PIPE_FORMAT_B8G8R8X8_UNORM:
913 if (need_wrap)
914 samp->base.fetch = fetch_bgrx_clamp_linear;
915 else if (!samp->axis_aligned)
916 samp->base.fetch = fetch_bgrx_linear;
917 else
918 samp->base.fetch = fetch_bgrx_axis_aligned_linear;
919 return TRUE;
920 default:
921 break;
922 }
923
924 FAIL("unknown format");
925 }
926 }
927
928
929 static const uint32_t *
fetch_noop(struct lp_linear_elem * elem)930 fetch_noop(struct lp_linear_elem *elem)
931 {
932 struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
933 return samp->row;
934 }
935
936
/**
 * Install a fetch routine that simply returns samp->row without
 * reading any texture data.
 */
void
lp_linear_init_noop_sampler(struct lp_linear_sampler *samp)
{
   samp->base.fetch = fetch_noop;
}
942
943
944 /*
945 * Check the given sampler and texture info for linear path compatibility.
946 */
947 boolean
lp_linear_check_sampler(const struct lp_sampler_static_state * sampler,const struct lp_tgsi_texture_info * tex)948 lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
949 const struct lp_tgsi_texture_info *tex)
950 {
951 if (tex->modifier != LP_BLD_TEX_MODIFIER_NONE)
952 return FALSE;
953
954 if (tex->target != TGSI_TEXTURE_2D)
955 return FALSE;
956
957 if (tex->coord[0].file != TGSI_FILE_INPUT ||
958 tex->coord[1].file != TGSI_FILE_INPUT)
959 return FALSE;
960
961 /* These are the only sampling modes we support at the moment.
962 *
963 * Actually we'll accept any mode as we're failing on any
964 * interpolant which exceeds 0..1. Clamping is applied only to
965 * avoid invalid reads.
966 */
967 if (!is_nearest_sampler(sampler) &&
968 !is_linear_sampler(sampler))
969 return FALSE;
970
971 /* These are the only texture formats we support at the moment
972 */
973 if (sampler->texture_state.format != PIPE_FORMAT_B8G8R8A8_UNORM &&
974 sampler->texture_state.format != PIPE_FORMAT_B8G8R8X8_UNORM)
975 return FALSE;
976
977 /* We don't support sampler view swizzling on the linear path */
978 if (sampler->texture_state.swizzle_r != PIPE_SWIZZLE_X ||
979 sampler->texture_state.swizzle_g != PIPE_SWIZZLE_Y ||
980 sampler->texture_state.swizzle_b != PIPE_SWIZZLE_Z ||
981 sampler->texture_state.swizzle_a != PIPE_SWIZZLE_W) {
982 return FALSE;
983 }
984
985 return TRUE;
986 }
987
988 #else
boolean
lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
                        const struct lp_tgsi_texture_info *tex)
{
   /* Non-SSE build: the linear path is unavailable, reject everything. */
   return FALSE;
}
995 #endif
996