• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "SamplerCore.hpp"
16 
17 #include "Constants.hpp"
18 #include "PixelRoutine.hpp"
19 #include "System/Debug.hpp"
20 #include "Vulkan/VkSampler.hpp"
21 
22 namespace sw {
23 
SamplerCore(Pointer<Byte> & constants,const Sampler & state,SamplerFunction function)24 SamplerCore::SamplerCore(Pointer<Byte> &constants, const Sampler &state, SamplerFunction function)
25     : constants(constants)
26     , state(state)
27     , function(function)
28 {
29 }
30 
sampleTexture(Pointer<Byte> & texture,SIMD::Float uvwa[4],const SIMD::Float & dRef,const Float & lodOrBias,const SIMD::Float & dsx,const SIMD::Float & dsy,SIMD::Int offset[4],const SIMD::Int & sample)31 SIMD::Float4 SamplerCore::sampleTexture(Pointer<Byte> &texture, SIMD::Float uvwa[4], const SIMD::Float &dRef, const Float &lodOrBias, const SIMD::Float &dsx, const SIMD::Float &dsy, SIMD::Int offset[4], const SIMD::Int &sample)
32 {
33 	SIMD::Float4 c;
34 
35 	for(int i = 0; i < SIMD::Width / 4; i++)
36 	{
37 		Float4 uvwa128[4];
38 		uvwa128[0] = Extract128(uvwa[0], i);
39 		uvwa128[1] = Extract128(uvwa[1], i);
40 		uvwa128[2] = Extract128(uvwa[2], i);
41 		uvwa128[3] = Extract128(uvwa[3], i);
42 
43 		Vector4i offset128;
44 		offset128[0] = Extract128(offset[0], i);
45 		offset128[1] = Extract128(offset[1], i);
46 		offset128[2] = Extract128(offset[2], i);
47 		offset128[3] = Extract128(offset[3], i);
48 
49 		Vector4f c128 = sampleTexture128(texture, uvwa128, Extract128(dRef, i), lodOrBias, Extract128(dsx, i), Extract128(dsy, i), offset128, Extract128(sample, i));
50 		c.x = Insert128(c.x, c128.x, i);
51 		c.y = Insert128(c.y, c128.y, i);
52 		c.z = Insert128(c.z, c128.z, i);
53 		c.w = Insert128(c.w, c128.w, i);
54 	}
55 
56 	return c;
57 }
58 
sampleTexture128(Pointer<Byte> & texture,Float4 uvwa[4],const Float4 & dRef,const Float & lodOrBias,const Float4 & dsx,const Float4 & dsy,Vector4i & offset,const Int4 & sample)59 Vector4f SamplerCore::sampleTexture128(Pointer<Byte> &texture, Float4 uvwa[4], const Float4 &dRef, const Float &lodOrBias, const Float4 &dsx, const Float4 &dsy, Vector4i &offset, const Int4 &sample)
60 {
61 	Vector4f c;
62 
63 	Float4 u = uvwa[0];
64 	Float4 v = uvwa[1];
65 	Float4 w = uvwa[2];
66 	Float4 a;  // Array layer coordinate
67 	switch(state.textureType)
68 	{
69 	case VK_IMAGE_VIEW_TYPE_1D_ARRAY: a = uvwa[1]; break;
70 	case VK_IMAGE_VIEW_TYPE_2D_ARRAY: a = uvwa[2]; break;
71 	case VK_IMAGE_VIEW_TYPE_CUBE_ARRAY: a = uvwa[3]; break;
72 	default: break;
73 	}
74 
75 	Float lod;
76 	Float anisotropy;
77 	Float4 uDelta;
78 	Float4 vDelta;
79 	Float4 M;  // Major axis
80 
81 	if(state.isCube())
82 	{
83 		Int4 face = cubeFace(u, v, uvwa[0], uvwa[1], uvwa[2], M);
84 		w = As<Float4>(face);
85 	}
86 
87 	// Determine if we can skip the LOD computation. This is the case when the mipmap has only one level, except for LOD query,
88 	// where we have to return the computed value. Anisotropic filtering requires computing the anisotropy factor even for a single mipmap level.
89 	bool singleMipLevel = (state.minLod == state.maxLod);
90 	bool requiresLodComputation = (function == Query) || (state.textureFilter == FILTER_ANISOTROPIC);
91 	bool skipLodComputation = singleMipLevel && !requiresLodComputation;
92 
93 	if(skipLodComputation)
94 	{
95 		lod = state.minLod;
96 	}
97 	else if(function == Implicit || function == Bias || function == Grad || function == Query)
98 	{
99 		if(state.is1D())
100 		{
101 			computeLod1D(texture, lod, u, dsx, dsy);
102 		}
103 		else if(state.is2D())
104 		{
105 			computeLod2D(texture, lod, anisotropy, uDelta, vDelta, u, v, dsx, dsy);
106 		}
107 		else if(state.isCube())
108 		{
109 			computeLodCube(texture, lod, uvwa[0], uvwa[1], uvwa[2], dsx, dsy, M);
110 		}
111 		else
112 		{
113 			computeLod3D(texture, lod, u, v, w, dsx, dsy);
114 		}
115 
116 		Float bias = state.mipLodBias;
117 
118 		if(function == Bias)
119 		{
120 			// Add SPIR-V Bias operand to the sampler provided bias and clamp to maxSamplerLodBias limit.
121 			bias = Min(Max(bias + lodOrBias, -vk::MAX_SAMPLER_LOD_BIAS), vk::MAX_SAMPLER_LOD_BIAS);
122 		}
123 
124 		lod += bias;
125 	}
126 	else if(function == Lod)
127 	{
128 		// Vulkan 1.1: "The absolute value of mipLodBias must be less than or equal to VkPhysicalDeviceLimits::maxSamplerLodBias"
129 		// Hence no explicit clamping to maxSamplerLodBias is required in this case.
130 		lod = lodOrBias + state.mipLodBias;
131 	}
132 	else if(function == Fetch)
133 	{
134 		// TODO: Eliminate int-float-int conversion.
135 		lod = Float(As<Int>(lodOrBias));
136 	}
137 	else if(function == Base || function == Gather)
138 	{
139 		lod = Float(0);
140 	}
141 	else
142 		UNREACHABLE("Sampler function %d", int(function));
143 
144 	if(function != Base && function != Fetch && function != Gather)
145 	{
146 		if(function == Query)
147 		{
148 			c.y = Float4(lod);  // Unclamped LOD.
149 		}
150 
151 		if(!skipLodComputation)
152 		{
153 			lod = Max(lod, state.minLod);
154 			lod = Min(lod, state.maxLod);
155 		}
156 
157 		if(function == Query)
158 		{
159 			if(state.mipmapFilter == MIPMAP_POINT)
160 			{
161 				lod = Round(lod);  // TODO: Preferred formula is ceil(lod + 0.5) - 1
162 			}
163 
164 			c.x = lod;
165 			//	c.y contains unclamped LOD.
166 
167 			return c;
168 		}
169 	}
170 
171 	bool force32BitFiltering = state.highPrecisionFiltering && !isYcbcrFormat() && (state.textureFilter != FILTER_POINT);
172 	bool use32BitFiltering = hasFloatTexture() || hasUnnormalizedIntegerTexture() || force32BitFiltering ||
173 	                         state.isCube() || state.unnormalizedCoordinates || state.compareEnable ||
174 	                         borderModeActive() || (function == Gather) || (function == Fetch);
175 	int numComponents = (function == Gather) ? 4 : textureComponentCount();
176 
177 	if(use32BitFiltering)
178 	{
179 		c = sampleFloatFilter(texture, u, v, w, a, dRef, offset, sample, lod, anisotropy, uDelta, vDelta);
180 	}
181 	else  // 16-bit filtering.
182 	{
183 		Vector4s cs = sampleFilter(texture, u, v, w, a, offset, sample, lod, anisotropy, uDelta, vDelta);
184 
185 		for(int component = 0; component < numComponents; component++)
186 		{
187 			if(hasUnsignedTextureComponent(component))
188 			{
189 				c[component] = Float4(As<UShort4>(cs[component]));
190 			}
191 			else
192 			{
193 				c[component] = Float4(cs[component]);
194 			}
195 		}
196 	}
197 
198 	if(hasNormalizedFormat() && !state.compareEnable)
199 	{
200 		sw::float4 scale = getComponentScale();
201 
202 		for(int component = 0; component < numComponents; component++)
203 		{
204 			int texelComponent = (function == Gather) ? getGatherComponent() : component;
205 			c[component] *= Float4(1.0f / scale[texelComponent]);
206 		}
207 	}
208 
209 	if(state.textureFormat.isSignedNormalized())
210 	{
211 		for(int component = 0; component < numComponents; component++)
212 		{
213 			c[component] = Max(c[component], Float4(-1.0f));
214 		}
215 	}
216 
217 	if(state.textureFilter != FILTER_GATHER)
218 	{
219 		if((state.swizzle.r != VK_COMPONENT_SWIZZLE_R) ||
220 		   (state.swizzle.g != VK_COMPONENT_SWIZZLE_G) ||
221 		   (state.swizzle.b != VK_COMPONENT_SWIZZLE_B) ||
222 		   (state.swizzle.a != VK_COMPONENT_SWIZZLE_A))
223 		{
224 			const Vector4f col = c;
225 			bool integer = hasUnnormalizedIntegerTexture();
226 			c.x = applySwizzle(col, state.swizzle.r, integer);
227 			c.y = applySwizzle(col, state.swizzle.g, integer);
228 			c.z = applySwizzle(col, state.swizzle.b, integer);
229 			c.w = applySwizzle(col, state.swizzle.a, integer);
230 		}
231 	}
232 	else  // Gather
233 	{
234 		VkComponentSwizzle swizzle = gatherSwizzle();
235 
236 		// R/G/B/A swizzles affect the component collected from each texel earlier.
237 		// Handle the ZERO and ONE cases here because we don't need to know the format.
238 
239 		if(swizzle == VK_COMPONENT_SWIZZLE_ZERO)
240 		{
241 			c.x = c.y = c.z = c.w = Float4(0);
242 		}
243 		else if(swizzle == VK_COMPONENT_SWIZZLE_ONE)
244 		{
245 			bool integer = hasUnnormalizedIntegerTexture();
246 			c.x = c.y = c.z = c.w = integer ? As<Float4>(Int4(1)) : RValue<Float4>(Float4(1.0f));
247 		}
248 	}
249 
250 	return c;
251 }
252 
applySwizzle(const Vector4f & c,VkComponentSwizzle swizzle,bool integer)253 Float4 SamplerCore::applySwizzle(const Vector4f &c, VkComponentSwizzle swizzle, bool integer)
254 {
255 	switch(swizzle)
256 	{
257 	default: UNSUPPORTED("VkComponentSwizzle %d", (int)swizzle);
258 	case VK_COMPONENT_SWIZZLE_R: return c.x;
259 	case VK_COMPONENT_SWIZZLE_G: return c.y;
260 	case VK_COMPONENT_SWIZZLE_B: return c.z;
261 	case VK_COMPONENT_SWIZZLE_A: return c.w;
262 	case VK_COMPONENT_SWIZZLE_ZERO: return Float4(0.0f, 0.0f, 0.0f, 0.0f);
263 	case VK_COMPONENT_SWIZZLE_ONE:
264 		if(integer)
265 		{
266 			return Float4(As<Float4>(sw::Int4(1, 1, 1, 1)));
267 		}
268 		else
269 		{
270 			return Float4(1.0f, 1.0f, 1.0f, 1.0f);
271 		}
272 		break;
273 	}
274 };
275 
offsetSample(Short4 & uvw,Pointer<Byte> & mipmap,int halfOffset,bool wrap,int count,Float & lod)276 Short4 SamplerCore::offsetSample(Short4 &uvw, Pointer<Byte> &mipmap, int halfOffset, bool wrap, int count, Float &lod)
277 {
278 	Short4 offset = *Pointer<Short4>(mipmap + halfOffset);
279 
280 	if(state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
281 	{
282 		offset &= Short4(CmpNLE(Float4(lod), Float4(0.0f)));
283 	}
284 	else if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR)
285 	{
286 		offset &= Short4(CmpLE(Float4(lod), Float4(0.0f)));
287 	}
288 
289 	if(wrap)
290 	{
291 		switch(count)
292 		{
293 		case -1: return uvw - offset;
294 		case 0: return uvw;
295 		case +1: return uvw + offset;
296 		case 2: return uvw + offset + offset;
297 		}
298 	}
299 	else  // Clamp or mirror
300 	{
301 		switch(count)
302 		{
303 		case -1: return SubSat(As<UShort4>(uvw), As<UShort4>(offset));
304 		case 0: return uvw;
305 		case +1: return AddSat(As<UShort4>(uvw), As<UShort4>(offset));
306 		case 2: return AddSat(AddSat(As<UShort4>(uvw), As<UShort4>(offset)), As<UShort4>(offset));
307 		}
308 	}
309 
310 	return uvw;
311 }
312 
sampleFilter(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,Vector4i & offset,const Int4 & sample,Float & lod,Float & anisotropy,Float4 & uDelta,Float4 & vDelta)313 Vector4s SamplerCore::sampleFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta)
314 {
315 	Vector4s c = sampleAniso(texture, u, v, w, a, offset, sample, lod, anisotropy, uDelta, vDelta, false);
316 
317 	if(function == Fetch)
318 	{
319 		return c;
320 	}
321 
322 	if(state.mipmapFilter == MIPMAP_LINEAR)
323 	{
324 		Vector4s cc = sampleAniso(texture, u, v, w, a, offset, sample, lod, anisotropy, uDelta, vDelta, true);
325 
326 		lod *= Float(1 << 16);
327 
328 		UShort4 utri = UShort4(Float4(lod));  // TODO: Optimize
329 		Short4 stri = utri >> 1;              // TODO: Optimize
330 
331 		if(hasUnsignedTextureComponent(0))
332 			cc.x = MulHigh(As<UShort4>(cc.x), utri);
333 		else
334 			cc.x = MulHigh(cc.x, stri);
335 		if(hasUnsignedTextureComponent(1))
336 			cc.y = MulHigh(As<UShort4>(cc.y), utri);
337 		else
338 			cc.y = MulHigh(cc.y, stri);
339 		if(hasUnsignedTextureComponent(2))
340 			cc.z = MulHigh(As<UShort4>(cc.z), utri);
341 		else
342 			cc.z = MulHigh(cc.z, stri);
343 		if(hasUnsignedTextureComponent(3))
344 			cc.w = MulHigh(As<UShort4>(cc.w), utri);
345 		else
346 			cc.w = MulHigh(cc.w, stri);
347 
348 		utri = ~utri;
349 		stri = Short4(0x7FFF) - stri;
350 
351 		if(hasUnsignedTextureComponent(0))
352 			c.x = MulHigh(As<UShort4>(c.x), utri);
353 		else
354 			c.x = MulHigh(c.x, stri);
355 		if(hasUnsignedTextureComponent(1))
356 			c.y = MulHigh(As<UShort4>(c.y), utri);
357 		else
358 			c.y = MulHigh(c.y, stri);
359 		if(hasUnsignedTextureComponent(2))
360 			c.z = MulHigh(As<UShort4>(c.z), utri);
361 		else
362 			c.z = MulHigh(c.z, stri);
363 		if(hasUnsignedTextureComponent(3))
364 			c.w = MulHigh(As<UShort4>(c.w), utri);
365 		else
366 			c.w = MulHigh(c.w, stri);
367 
368 		c.x += cc.x;
369 		c.y += cc.y;
370 		c.z += cc.z;
371 		c.w += cc.w;
372 
373 		if(!hasUnsignedTextureComponent(0)) c.x += c.x;
374 		if(!hasUnsignedTextureComponent(1)) c.y += c.y;
375 		if(!hasUnsignedTextureComponent(2)) c.z += c.z;
376 		if(!hasUnsignedTextureComponent(3)) c.w += c.w;
377 	}
378 
379 	return c;
380 }
381 
sampleAniso(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,Vector4i & offset,const Int4 & sample,Float & lod,Float & anisotropy,Float4 & uDelta,Float4 & vDelta,bool secondLOD)382 Vector4s SamplerCore::sampleAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD)
383 {
384 	Vector4s c;
385 
386 	if(state.textureFilter != FILTER_ANISOTROPIC)
387 	{
388 		c = sampleQuad(texture, u, v, w, a, offset, sample, lod, secondLOD);
389 	}
390 	else
391 	{
392 		Int N = RoundInt(anisotropy);
393 
394 		Vector4s cSum;
395 
396 		cSum.x = Short4(0);
397 		cSum.y = Short4(0);
398 		cSum.z = Short4(0);
399 		cSum.w = Short4(0);
400 
401 		Float4 A = *Pointer<Float4>(constants + OFFSET(Constants, uvWeight) + 16 * N);
402 		Float4 B = *Pointer<Float4>(constants + OFFSET(Constants, uvStart) + 16 * N);
403 		UShort4 cw = *Pointer<UShort4>(constants + OFFSET(Constants, cWeight) + 8 * N);
404 		Short4 sw = Short4(cw >> 1);
405 
406 		Float4 du = uDelta;
407 		Float4 dv = vDelta;
408 
409 		Float4 u0 = u + B * du;
410 		Float4 v0 = v + B * dv;
411 
412 		du *= A;
413 		dv *= A;
414 
415 		Int i = 0;
416 
417 		Do
418 		{
419 			c = sampleQuad(texture, u0, v0, w, a, offset, sample, lod, secondLOD);
420 
421 			u0 += du;
422 			v0 += dv;
423 
424 			if(hasUnsignedTextureComponent(0))
425 				cSum.x += As<Short4>(MulHigh(As<UShort4>(c.x), cw));
426 			else
427 				cSum.x += MulHigh(c.x, sw);
428 			if(hasUnsignedTextureComponent(1))
429 				cSum.y += As<Short4>(MulHigh(As<UShort4>(c.y), cw));
430 			else
431 				cSum.y += MulHigh(c.y, sw);
432 			if(hasUnsignedTextureComponent(2))
433 				cSum.z += As<Short4>(MulHigh(As<UShort4>(c.z), cw));
434 			else
435 				cSum.z += MulHigh(c.z, sw);
436 			if(hasUnsignedTextureComponent(3))
437 				cSum.w += As<Short4>(MulHigh(As<UShort4>(c.w), cw));
438 			else
439 				cSum.w += MulHigh(c.w, sw);
440 
441 			i++;
442 		}
443 		Until(i >= N);
444 
445 		if(hasUnsignedTextureComponent(0))
446 			c.x = cSum.x;
447 		else
448 			c.x = AddSat(cSum.x, cSum.x);
449 		if(hasUnsignedTextureComponent(1))
450 			c.y = cSum.y;
451 		else
452 			c.y = AddSat(cSum.y, cSum.y);
453 		if(hasUnsignedTextureComponent(2))
454 			c.z = cSum.z;
455 		else
456 			c.z = AddSat(cSum.z, cSum.z);
457 		if(hasUnsignedTextureComponent(3))
458 			c.w = cSum.w;
459 		else
460 			c.w = AddSat(cSum.w, cSum.w);
461 	}
462 
463 	return c;
464 }
465 
sampleQuad(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,Vector4i & offset,const Int4 & sample,Float & lod,bool secondLOD)466 Vector4s SamplerCore::sampleQuad(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
467 {
468 	if(state.textureType != VK_IMAGE_VIEW_TYPE_3D)
469 	{
470 		return sampleQuad2D(texture, u, v, w, a, offset, sample, lod, secondLOD);
471 	}
472 	else
473 	{
474 		return sample3D(texture, u, v, w, offset, sample, lod, secondLOD);
475 	}
476 }
477 
sampleQuad2D(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,Vector4i & offset,const Int4 & sample,Float & lod,bool secondLOD)478 Vector4s SamplerCore::sampleQuad2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
479 {
480 	Vector4s c;
481 
482 	int componentCount = textureComponentCount();
483 	bool gather = (state.textureFilter == FILTER_GATHER);
484 
485 	Pointer<Byte> mipmap = selectMipmap(texture, lod, secondLOD);
486 	Pointer<Byte> buffer = *Pointer<Pointer<Byte>>(mipmap + OFFSET(Mipmap, buffer));
487 
488 	Short4 uuuu = address(u, state.addressingModeU, mipmap);
489 	Short4 vvvv = address(v, state.addressingModeV, mipmap);
490 	Short4 wwww = address(w, state.addressingModeW, mipmap);
491 	Short4 layerIndex = computeLayerIndex16(a, mipmap);
492 
493 	if(state.textureFilter == FILTER_POINT)
494 	{
495 		c = sampleTexel(uuuu, vvvv, wwww, layerIndex, offset, sample, mipmap, buffer);
496 	}
497 	else
498 	{
499 		Short4 uuuu0 = offsetSample(uuuu, mipmap, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, -1, lod);
500 		Short4 vvvv0 = offsetSample(vvvv, mipmap, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, -1, lod);
501 		Short4 uuuu1 = offsetSample(uuuu, mipmap, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, +1, lod);
502 		Short4 vvvv1 = offsetSample(vvvv, mipmap, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, +1, lod);
503 
504 		Vector4s c00 = sampleTexel(uuuu0, vvvv0, wwww, layerIndex, offset, sample, mipmap, buffer);
505 		Vector4s c10 = sampleTexel(uuuu1, vvvv0, wwww, layerIndex, offset, sample, mipmap, buffer);
506 		Vector4s c01 = sampleTexel(uuuu0, vvvv1, wwww, layerIndex, offset, sample, mipmap, buffer);
507 		Vector4s c11 = sampleTexel(uuuu1, vvvv1, wwww, layerIndex, offset, sample, mipmap, buffer);
508 
509 		if(!gather)  // Blend
510 		{
511 			// Fractions
512 			UShort4 f0u = As<UShort4>(uuuu0) * UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, width)));
513 			UShort4 f0v = As<UShort4>(vvvv0) * UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, height)));
514 
515 			UShort4 f1u = ~f0u;
516 			UShort4 f1v = ~f0v;
517 
518 			UShort4 f0u0v = MulHigh(f0u, f0v);
519 			UShort4 f1u0v = MulHigh(f1u, f0v);
520 			UShort4 f0u1v = MulHigh(f0u, f1v);
521 			UShort4 f1u1v = MulHigh(f1u, f1v);
522 
523 			// Signed fractions
524 			Short4 f1u1vs;
525 			Short4 f0u1vs;
526 			Short4 f1u0vs;
527 			Short4 f0u0vs;
528 
529 			if(!hasUnsignedTextureComponent(0) || !hasUnsignedTextureComponent(1) || !hasUnsignedTextureComponent(2) || !hasUnsignedTextureComponent(3))
530 			{
531 				f1u1vs = f1u1v >> 1;
532 				f0u1vs = f0u1v >> 1;
533 				f1u0vs = f1u0v >> 1;
534 				f0u0vs = f0u0v >> 1;
535 			}
536 
537 			// Bilinear interpolation
538 			if(componentCount >= 1)
539 			{
540 				if(has16bitTextureComponents() && hasUnsignedTextureComponent(0))
541 				{
542 					c00.x = As<UShort4>(c00.x) - MulHigh(As<UShort4>(c00.x), f0u) + MulHigh(As<UShort4>(c10.x), f0u);
543 					c01.x = As<UShort4>(c01.x) - MulHigh(As<UShort4>(c01.x), f0u) + MulHigh(As<UShort4>(c11.x), f0u);
544 					c.x = As<UShort4>(c00.x) - MulHigh(As<UShort4>(c00.x), f0v) + MulHigh(As<UShort4>(c01.x), f0v);
545 				}
546 				else
547 				{
548 					if(hasUnsignedTextureComponent(0))
549 					{
550 						c00.x = MulHigh(As<UShort4>(c00.x), f1u1v);
551 						c10.x = MulHigh(As<UShort4>(c10.x), f0u1v);
552 						c01.x = MulHigh(As<UShort4>(c01.x), f1u0v);
553 						c11.x = MulHigh(As<UShort4>(c11.x), f0u0v);
554 					}
555 					else
556 					{
557 						c00.x = MulHigh(c00.x, f1u1vs);
558 						c10.x = MulHigh(c10.x, f0u1vs);
559 						c01.x = MulHigh(c01.x, f1u0vs);
560 						c11.x = MulHigh(c11.x, f0u0vs);
561 					}
562 
563 					c.x = (c00.x + c10.x) + (c01.x + c11.x);
564 					if(!hasUnsignedTextureComponent(0)) c.x = AddSat(c.x, c.x);  // Correct for signed fractions
565 				}
566 			}
567 
568 			if(componentCount >= 2)
569 			{
570 				if(has16bitTextureComponents() && hasUnsignedTextureComponent(1))
571 				{
572 					c00.y = As<UShort4>(c00.y) - MulHigh(As<UShort4>(c00.y), f0u) + MulHigh(As<UShort4>(c10.y), f0u);
573 					c01.y = As<UShort4>(c01.y) - MulHigh(As<UShort4>(c01.y), f0u) + MulHigh(As<UShort4>(c11.y), f0u);
574 					c.y = As<UShort4>(c00.y) - MulHigh(As<UShort4>(c00.y), f0v) + MulHigh(As<UShort4>(c01.y), f0v);
575 				}
576 				else
577 				{
578 					if(hasUnsignedTextureComponent(1))
579 					{
580 						c00.y = MulHigh(As<UShort4>(c00.y), f1u1v);
581 						c10.y = MulHigh(As<UShort4>(c10.y), f0u1v);
582 						c01.y = MulHigh(As<UShort4>(c01.y), f1u0v);
583 						c11.y = MulHigh(As<UShort4>(c11.y), f0u0v);
584 					}
585 					else
586 					{
587 						c00.y = MulHigh(c00.y, f1u1vs);
588 						c10.y = MulHigh(c10.y, f0u1vs);
589 						c01.y = MulHigh(c01.y, f1u0vs);
590 						c11.y = MulHigh(c11.y, f0u0vs);
591 					}
592 
593 					c.y = (c00.y + c10.y) + (c01.y + c11.y);
594 					if(!hasUnsignedTextureComponent(1)) c.y = AddSat(c.y, c.y);  // Correct for signed fractions
595 				}
596 			}
597 
598 			if(componentCount >= 3)
599 			{
600 				if(has16bitTextureComponents() && hasUnsignedTextureComponent(2))
601 				{
602 					c00.z = As<UShort4>(c00.z) - MulHigh(As<UShort4>(c00.z), f0u) + MulHigh(As<UShort4>(c10.z), f0u);
603 					c01.z = As<UShort4>(c01.z) - MulHigh(As<UShort4>(c01.z), f0u) + MulHigh(As<UShort4>(c11.z), f0u);
604 					c.z = As<UShort4>(c00.z) - MulHigh(As<UShort4>(c00.z), f0v) + MulHigh(As<UShort4>(c01.z), f0v);
605 				}
606 				else
607 				{
608 					if(hasUnsignedTextureComponent(2))
609 					{
610 						c00.z = MulHigh(As<UShort4>(c00.z), f1u1v);
611 						c10.z = MulHigh(As<UShort4>(c10.z), f0u1v);
612 						c01.z = MulHigh(As<UShort4>(c01.z), f1u0v);
613 						c11.z = MulHigh(As<UShort4>(c11.z), f0u0v);
614 					}
615 					else
616 					{
617 						c00.z = MulHigh(c00.z, f1u1vs);
618 						c10.z = MulHigh(c10.z, f0u1vs);
619 						c01.z = MulHigh(c01.z, f1u0vs);
620 						c11.z = MulHigh(c11.z, f0u0vs);
621 					}
622 
623 					c.z = (c00.z + c10.z) + (c01.z + c11.z);
624 					if(!hasUnsignedTextureComponent(2)) c.z = AddSat(c.z, c.z);  // Correct for signed fractions
625 				}
626 			}
627 
628 			if(componentCount >= 4)
629 			{
630 				if(has16bitTextureComponents() && hasUnsignedTextureComponent(3))
631 				{
632 					c00.w = As<UShort4>(c00.w) - MulHigh(As<UShort4>(c00.w), f0u) + MulHigh(As<UShort4>(c10.w), f0u);
633 					c01.w = As<UShort4>(c01.w) - MulHigh(As<UShort4>(c01.w), f0u) + MulHigh(As<UShort4>(c11.w), f0u);
634 					c.w = As<UShort4>(c00.w) - MulHigh(As<UShort4>(c00.w), f0v) + MulHigh(As<UShort4>(c01.w), f0v);
635 				}
636 				else
637 				{
638 					if(hasUnsignedTextureComponent(3))
639 					{
640 						c00.w = MulHigh(As<UShort4>(c00.w), f1u1v);
641 						c10.w = MulHigh(As<UShort4>(c10.w), f0u1v);
642 						c01.w = MulHigh(As<UShort4>(c01.w), f1u0v);
643 						c11.w = MulHigh(As<UShort4>(c11.w), f0u0v);
644 					}
645 					else
646 					{
647 						c00.w = MulHigh(c00.w, f1u1vs);
648 						c10.w = MulHigh(c10.w, f0u1vs);
649 						c01.w = MulHigh(c01.w, f1u0vs);
650 						c11.w = MulHigh(c11.w, f0u0vs);
651 					}
652 
653 					c.w = (c00.w + c10.w) + (c01.w + c11.w);
654 					if(!hasUnsignedTextureComponent(3)) c.w = AddSat(c.w, c.w);  // Correct for signed fractions
655 				}
656 			}
657 		}
658 		else  // Gather
659 		{
660 			VkComponentSwizzle swizzle = gatherSwizzle();
661 			switch(swizzle)
662 			{
663 			case VK_COMPONENT_SWIZZLE_ZERO:
664 			case VK_COMPONENT_SWIZZLE_ONE:
665 				// Handled at the final component swizzle.
666 				break;
667 			default:
668 				c.x = c01[swizzle - VK_COMPONENT_SWIZZLE_R];
669 				c.y = c11[swizzle - VK_COMPONENT_SWIZZLE_R];
670 				c.z = c10[swizzle - VK_COMPONENT_SWIZZLE_R];
671 				c.w = c00[swizzle - VK_COMPONENT_SWIZZLE_R];
672 				break;
673 			}
674 		}
675 	}
676 
677 	return c;
678 }
679 
sample3D(Pointer<Byte> & texture,Float4 & u_,Float4 & v_,Float4 & w_,Vector4i & offset,const Int4 & sample,Float & lod,bool secondLOD)680 Vector4s SamplerCore::sample3D(Pointer<Byte> &texture, Float4 &u_, Float4 &v_, Float4 &w_, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
681 {
682 	Vector4s c_;
683 
684 	int componentCount = textureComponentCount();
685 
686 	Pointer<Byte> mipmap = selectMipmap(texture, lod, secondLOD);
687 	Pointer<Byte> buffer = *Pointer<Pointer<Byte>>(mipmap + OFFSET(Mipmap, buffer));
688 
689 	Short4 uuuu = address(u_, state.addressingModeU, mipmap);
690 	Short4 vvvv = address(v_, state.addressingModeV, mipmap);
691 	Short4 wwww = address(w_, state.addressingModeW, mipmap);
692 
693 	if(state.textureFilter == FILTER_POINT)
694 	{
695 		c_ = sampleTexel(uuuu, vvvv, wwww, 0, offset, sample, mipmap, buffer);
696 	}
697 	else
698 	{
699 		Vector4s c[2][2][2];
700 
701 		Short4 u[2][2][2];
702 		Short4 v[2][2][2];
703 		Short4 s[2][2][2];
704 
705 		for(int i = 0; i < 2; i++)
706 		{
707 			for(int j = 0; j < 2; j++)
708 			{
709 				for(int k = 0; k < 2; k++)
710 				{
711 					u[i][j][k] = offsetSample(uuuu, mipmap, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, i * 2 - 1, lod);
712 					v[i][j][k] = offsetSample(vvvv, mipmap, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, j * 2 - 1, lod);
713 					s[i][j][k] = offsetSample(wwww, mipmap, OFFSET(Mipmap, wHalf), state.addressingModeW == ADDRESSING_WRAP, k * 2 - 1, lod);
714 				}
715 			}
716 		}
717 
718 		// Fractions
719 		UShort4 f0u = As<UShort4>(u[0][0][0]) * UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, width)));
720 		UShort4 f0v = As<UShort4>(v[0][0][0]) * UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, height)));
721 		UShort4 f0s = As<UShort4>(s[0][0][0]) * UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, depth)));
722 
723 		UShort4 f1u = ~f0u;
724 		UShort4 f1v = ~f0v;
725 		UShort4 f1s = ~f0s;
726 
727 		UShort4 f[2][2][2];
728 		Short4 fs[2][2][2];
729 
730 		f[1][1][1] = MulHigh(f1u, f1v);
731 		f[0][1][1] = MulHigh(f0u, f1v);
732 		f[1][0][1] = MulHigh(f1u, f0v);
733 		f[0][0][1] = MulHigh(f0u, f0v);
734 		f[1][1][0] = MulHigh(f1u, f1v);
735 		f[0][1][0] = MulHigh(f0u, f1v);
736 		f[1][0][0] = MulHigh(f1u, f0v);
737 		f[0][0][0] = MulHigh(f0u, f0v);
738 
739 		f[1][1][1] = MulHigh(f[1][1][1], f1s);
740 		f[0][1][1] = MulHigh(f[0][1][1], f1s);
741 		f[1][0][1] = MulHigh(f[1][0][1], f1s);
742 		f[0][0][1] = MulHigh(f[0][0][1], f1s);
743 		f[1][1][0] = MulHigh(f[1][1][0], f0s);
744 		f[0][1][0] = MulHigh(f[0][1][0], f0s);
745 		f[1][0][0] = MulHigh(f[1][0][0], f0s);
746 		f[0][0][0] = MulHigh(f[0][0][0], f0s);
747 
748 		// Signed fractions
749 		if(!hasUnsignedTextureComponent(0) || !hasUnsignedTextureComponent(1) || !hasUnsignedTextureComponent(2) || !hasUnsignedTextureComponent(3))
750 		{
751 			fs[0][0][0] = f[0][0][0] >> 1;
752 			fs[0][0][1] = f[0][0][1] >> 1;
753 			fs[0][1][0] = f[0][1][0] >> 1;
754 			fs[0][1][1] = f[0][1][1] >> 1;
755 			fs[1][0][0] = f[1][0][0] >> 1;
756 			fs[1][0][1] = f[1][0][1] >> 1;
757 			fs[1][1][0] = f[1][1][0] >> 1;
758 			fs[1][1][1] = f[1][1][1] >> 1;
759 		}
760 
761 		for(int i = 0; i < 2; i++)
762 		{
763 			for(int j = 0; j < 2; j++)
764 			{
765 				for(int k = 0; k < 2; k++)
766 				{
767 					c[i][j][k] = sampleTexel(u[i][j][k], v[i][j][k], s[i][j][k], 0, offset, sample, mipmap, buffer);
768 
769 					if(componentCount >= 1)
770 					{
771 						if(hasUnsignedTextureComponent(0))
772 							c[i][j][k].x = MulHigh(As<UShort4>(c[i][j][k].x), f[1 - i][1 - j][1 - k]);
773 						else
774 							c[i][j][k].x = MulHigh(c[i][j][k].x, fs[1 - i][1 - j][1 - k]);
775 					}
776 					if(componentCount >= 2)
777 					{
778 						if(hasUnsignedTextureComponent(1))
779 							c[i][j][k].y = MulHigh(As<UShort4>(c[i][j][k].y), f[1 - i][1 - j][1 - k]);
780 						else
781 							c[i][j][k].y = MulHigh(c[i][j][k].y, fs[1 - i][1 - j][1 - k]);
782 					}
783 					if(componentCount >= 3)
784 					{
785 						if(hasUnsignedTextureComponent(2))
786 							c[i][j][k].z = MulHigh(As<UShort4>(c[i][j][k].z), f[1 - i][1 - j][1 - k]);
787 						else
788 							c[i][j][k].z = MulHigh(c[i][j][k].z, fs[1 - i][1 - j][1 - k]);
789 					}
790 					if(componentCount >= 4)
791 					{
792 						if(hasUnsignedTextureComponent(3))
793 							c[i][j][k].w = MulHigh(As<UShort4>(c[i][j][k].w), f[1 - i][1 - j][1 - k]);
794 						else
795 							c[i][j][k].w = MulHigh(c[i][j][k].w, fs[1 - i][1 - j][1 - k]);
796 					}
797 
798 					if(i != 0 || j != 0 || k != 0)
799 					{
800 						if(componentCount >= 1) c[0][0][0].x += c[i][j][k].x;
801 						if(componentCount >= 2) c[0][0][0].y += c[i][j][k].y;
802 						if(componentCount >= 3) c[0][0][0].z += c[i][j][k].z;
803 						if(componentCount >= 4) c[0][0][0].w += c[i][j][k].w;
804 					}
805 				}
806 			}
807 		}
808 
809 		if(componentCount >= 1) c_.x = c[0][0][0].x;
810 		if(componentCount >= 2) c_.y = c[0][0][0].y;
811 		if(componentCount >= 3) c_.z = c[0][0][0].z;
812 		if(componentCount >= 4) c_.w = c[0][0][0].w;
813 
814 		// Correct for signed fractions
815 		if(componentCount >= 1)
816 			if(!hasUnsignedTextureComponent(0)) c_.x = AddSat(c_.x, c_.x);
817 		if(componentCount >= 2)
818 			if(!hasUnsignedTextureComponent(1)) c_.y = AddSat(c_.y, c_.y);
819 		if(componentCount >= 3)
820 			if(!hasUnsignedTextureComponent(2)) c_.z = AddSat(c_.z, c_.z);
821 		if(componentCount >= 4)
822 			if(!hasUnsignedTextureComponent(3)) c_.w = AddSat(c_.w, c_.w);
823 	}
824 
825 	return c_;
826 }
827 
sampleFloatFilter(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,const Float4 & dRef,Vector4i & offset,const Int4 & sample,Float & lod,Float & anisotropy,Float4 & uDelta,Float4 & vDelta)828 Vector4f SamplerCore::sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta)
829 {
830 	Vector4f c = sampleFloatAniso(texture, u, v, w, a, dRef, offset, sample, lod, anisotropy, uDelta, vDelta, false);
831 
832 	if(function == Fetch)
833 	{
834 		return c;
835 	}
836 
837 	if(state.mipmapFilter == MIPMAP_LINEAR)
838 	{
839 		Vector4f cc = sampleFloatAniso(texture, u, v, w, a, dRef, offset, sample, lod, anisotropy, uDelta, vDelta, true);
840 
841 		Float4 lod4 = Float4(Frac(lod));
842 
843 		c.x = (cc.x - c.x) * lod4 + c.x;
844 		c.y = (cc.y - c.y) * lod4 + c.y;
845 		c.z = (cc.z - c.z) * lod4 + c.z;
846 		c.w = (cc.w - c.w) * lod4 + c.w;
847 	}
848 
849 	return c;
850 }
851 
// Samples a texture with floating-point components at one mip level, applying
// anisotropic filtering when enabled. For FILTER_ANISOTROPIC, N = round(anisotropy)
// samples are taken along the footprint's major axis (uDelta/vDelta) and averaged,
// using per-N weight (uvWeight) and start-offset (uvStart) tables from the
// constants block.
Vector4f SamplerCore::sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD)
{
	Vector4f c;

	if(state.textureFilter != FILTER_ANISOTROPIC)
	{
		c = sampleFloat(texture, u, v, w, a, dRef, offset, sample, lod, secondLOD);
	}
	else
	{
		Int N = RoundInt(anisotropy);

		Vector4f cSum;

		cSum.x = Float4(0.0f);
		cSum.y = Float4(0.0f);
		cSum.z = Float4(0.0f);
		cSum.w = Float4(0.0f);

		// A is the per-sample weight and B the starting offset along the major
		// axis, both indexed by the sample count N.
		Float4 A = *Pointer<Float4>(constants + OFFSET(Constants, uvWeight) + 16 * N);
		Float4 B = *Pointer<Float4>(constants + OFFSET(Constants, uvStart) + 16 * N);

		Float4 du = uDelta;
		Float4 dv = vDelta;

		Float4 u0 = u + B * du;
		Float4 v0 = v + B * dv;

		du *= A;
		dv *= A;

		Int i = 0;

		// Reactor-emitted loop: takes N samples stepping along the major axis,
		// accumulating the weighted sum.
		Do
		{
			c = sampleFloat(texture, u0, v0, w, a, dRef, offset, sample, lod, secondLOD);

			u0 += du;
			v0 += dv;

			cSum.x += c.x * A;
			cSum.y += c.y * A;
			cSum.z += c.z * A;
			cSum.w += c.w * A;

			i++;
		}
		Until(i >= N);

		c.x = cSum.x;
		c.y = cSum.y;
		c.z = cSum.z;
		c.w = cSum.w;
	}

	return c;
}
909 
sampleFloat(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,const Float4 & dRef,Vector4i & offset,const Int4 & sample,Float & lod,bool secondLOD)910 Vector4f SamplerCore::sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
911 {
912 	if(state.textureType != VK_IMAGE_VIEW_TYPE_3D)
913 	{
914 		return sampleFloat2D(texture, u, v, w, a, dRef, offset, sample, lod, secondLOD);
915 	}
916 	else
917 	{
918 		return sampleFloat3D(texture, u, v, w, dRef, offset, sample, lod, secondLOD);
919 	}
920 }
921 
// Samples a 1D/2D/cube/array texture with floating-point components at one mip
// level, with point filtering, bilinear filtering, or gather. For cube and
// arrayed images a slice offset z is computed from the face (carried in w) and
// the layer index (carried in a).
Vector4f SamplerCore::sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
{
	Vector4f c;

	int componentCount = textureComponentCount();
	bool gather = (state.textureFilter == FILTER_GATHER);

	Pointer<Byte> mipmap = selectMipmap(texture, lod, secondLOD);
	Pointer<Byte> buffer = *Pointer<Pointer<Byte>>(mipmap + OFFSET(Mipmap, buffer));

	// Compute the two texel columns (x0/x1) and rows (y0/y1) straddling the
	// sample point, plus the bilinear fractions fu/fv.
	Int4 x0, x1, y0, y1;
	Float4 fu, fv;
	Int4 filter = computeFilterOffset(lod);
	address(u, x0, x1, fu, mipmap, offset.x, filter, OFFSET(Mipmap, width), state.addressingModeU);
	address(v, y0, y1, fv, mipmap, offset.y, filter, OFFSET(Mipmap, height), state.addressingModeV);

	// Pre-scale row indices by the row pitch so sampleTexel receives linear offsets.
	Int4 pitchP = As<Int4>(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, pitchP), 16));
	y0 *= pitchP;

	Int4 z;
	if(state.isCube() || state.isArrayed())
	{
		Int4 face = As<Int4>(w);
		Int4 layerIndex = computeLayerIndex(a, mipmap);

		// For cube maps, the layer argument is per cube, each of which has 6 layers
		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
		{
			layerIndex *= Int4(6);
		}

		z = state.isCube() ? face : layerIndex;

		// Cube arrays select the face within the chosen cube's 6-layer range.
		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
		{
			z += layerIndex;
		}

		z *= *Pointer<Int4>(mipmap + OFFSET(Mipmap, sliceP), 16);
	}

	if(state.textureFilter == FILTER_POINT || (function == Fetch))
	{
		c = sampleTexel(x0, y0, z, dRef, sample, mipmap, buffer);
	}
	else
	{
		y1 *= pitchP;

		// Fetch the 2x2 footprint.
		Vector4f c00 = sampleTexel(x0, y0, z, dRef, sample, mipmap, buffer);
		Vector4f c10 = sampleTexel(x1, y0, z, dRef, sample, mipmap, buffer);
		Vector4f c01 = sampleTexel(x0, y1, z, dRef, sample, mipmap, buffer);
		Vector4f c11 = sampleTexel(x1, y1, z, dRef, sample, mipmap, buffer);

		if(!gather)  // Blend
		{
			// Bilinear interpolation: lerp along u, then along v.
			if(componentCount >= 1) c00.x = c00.x + fu * (c10.x - c00.x);
			if(componentCount >= 2) c00.y = c00.y + fu * (c10.y - c00.y);
			if(componentCount >= 3) c00.z = c00.z + fu * (c10.z - c00.z);
			if(componentCount >= 4) c00.w = c00.w + fu * (c10.w - c00.w);

			if(componentCount >= 1) c01.x = c01.x + fu * (c11.x - c01.x);
			if(componentCount >= 2) c01.y = c01.y + fu * (c11.y - c01.y);
			if(componentCount >= 3) c01.z = c01.z + fu * (c11.z - c01.z);
			if(componentCount >= 4) c01.w = c01.w + fu * (c11.w - c01.w);

			if(componentCount >= 1) c.x = c00.x + fv * (c01.x - c00.x);
			if(componentCount >= 2) c.y = c00.y + fv * (c01.y - c00.y);
			if(componentCount >= 3) c.z = c00.z + fv * (c01.z - c00.z);
			if(componentCount >= 4) c.w = c00.w + fv * (c01.w - c00.w);
		}
		else  // Gather
		{
			// Gather returns one swizzled component of all four footprint texels.
			VkComponentSwizzle swizzle = gatherSwizzle();
			switch(swizzle)
			{
			case VK_COMPONENT_SWIZZLE_ZERO:
			case VK_COMPONENT_SWIZZLE_ONE:
				// Handled at the final component swizzle.
				break;
			default:
				c.x = c01[swizzle - VK_COMPONENT_SWIZZLE_R];
				c.y = c11[swizzle - VK_COMPONENT_SWIZZLE_R];
				c.z = c10[swizzle - VK_COMPONENT_SWIZZLE_R];
				c.w = c00[swizzle - VK_COMPONENT_SWIZZLE_R];
				break;
			}
		}
	}

	return c;
}
1014 
// Samples a 3D texture with floating-point components at one mip level, with
// point or trilinear filtering across the 2x2x2 texel footprint.
Vector4f SamplerCore::sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
{
	Vector4f c;

	int componentCount = textureComponentCount();

	Pointer<Byte> mipmap = selectMipmap(texture, lod, secondLOD);
	Pointer<Byte> buffer = *Pointer<Pointer<Byte>>(mipmap + OFFSET(Mipmap, buffer));

	// Compute the texel coordinates straddling the sample point in all three
	// dimensions, plus the interpolation fractions fu/fv/fw.
	Int4 x0, x1, y0, y1, z0, z1;
	Float4 fu, fv, fw;
	Int4 filter = computeFilterOffset(lod);
	address(u, x0, x1, fu, mipmap, offset.x, filter, OFFSET(Mipmap, width), state.addressingModeU);
	address(v, y0, y1, fv, mipmap, offset.y, filter, OFFSET(Mipmap, height), state.addressingModeV);
	address(w, z0, z1, fw, mipmap, offset.z, filter, OFFSET(Mipmap, depth), state.addressingModeW);

	// Pre-scale row and slice indices to linear offsets.
	Int4 pitchP = As<Int4>(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, pitchP), 16));
	Int4 sliceP = As<Int4>(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP), 16));
	y0 *= pitchP;
	z0 *= sliceP;

	if(state.textureFilter == FILTER_POINT || (function == Fetch))
	{
		c = sampleTexel(x0, y0, z0, dRef, sample, mipmap, buffer);
	}
	else
	{
		y1 *= pitchP;
		z1 *= sliceP;

		// Fetch the 2x2x2 footprint.
		Vector4f c000 = sampleTexel(x0, y0, z0, dRef, sample, mipmap, buffer);
		Vector4f c100 = sampleTexel(x1, y0, z0, dRef, sample, mipmap, buffer);
		Vector4f c010 = sampleTexel(x0, y1, z0, dRef, sample, mipmap, buffer);
		Vector4f c110 = sampleTexel(x1, y1, z0, dRef, sample, mipmap, buffer);
		Vector4f c001 = sampleTexel(x0, y0, z1, dRef, sample, mipmap, buffer);
		Vector4f c101 = sampleTexel(x1, y0, z1, dRef, sample, mipmap, buffer);
		Vector4f c011 = sampleTexel(x0, y1, z1, dRef, sample, mipmap, buffer);
		Vector4f c111 = sampleTexel(x1, y1, z1, dRef, sample, mipmap, buffer);

		// Blend first slice
		if(componentCount >= 1) c000.x = c000.x + fu * (c100.x - c000.x);
		if(componentCount >= 2) c000.y = c000.y + fu * (c100.y - c000.y);
		if(componentCount >= 3) c000.z = c000.z + fu * (c100.z - c000.z);
		if(componentCount >= 4) c000.w = c000.w + fu * (c100.w - c000.w);

		if(componentCount >= 1) c010.x = c010.x + fu * (c110.x - c010.x);
		if(componentCount >= 2) c010.y = c010.y + fu * (c110.y - c010.y);
		if(componentCount >= 3) c010.z = c010.z + fu * (c110.z - c010.z);
		if(componentCount >= 4) c010.w = c010.w + fu * (c110.w - c010.w);

		if(componentCount >= 1) c000.x = c000.x + fv * (c010.x - c000.x);
		if(componentCount >= 2) c000.y = c000.y + fv * (c010.y - c000.y);
		if(componentCount >= 3) c000.z = c000.z + fv * (c010.z - c000.z);
		if(componentCount >= 4) c000.w = c000.w + fv * (c010.w - c000.w);

		// Blend second slice
		if(componentCount >= 1) c001.x = c001.x + fu * (c101.x - c001.x);
		if(componentCount >= 2) c001.y = c001.y + fu * (c101.y - c001.y);
		if(componentCount >= 3) c001.z = c001.z + fu * (c101.z - c001.z);
		if(componentCount >= 4) c001.w = c001.w + fu * (c101.w - c001.w);

		if(componentCount >= 1) c011.x = c011.x + fu * (c111.x - c011.x);
		if(componentCount >= 2) c011.y = c011.y + fu * (c111.y - c011.y);
		if(componentCount >= 3) c011.z = c011.z + fu * (c111.z - c011.z);
		if(componentCount >= 4) c011.w = c011.w + fu * (c111.w - c011.w);

		if(componentCount >= 1) c001.x = c001.x + fv * (c011.x - c001.x);
		if(componentCount >= 2) c001.y = c001.y + fv * (c011.y - c001.y);
		if(componentCount >= 3) c001.z = c001.z + fv * (c011.z - c001.z);
		if(componentCount >= 4) c001.w = c001.w + fv * (c011.w - c001.w);

		// Blend slices
		if(componentCount >= 1) c.x = c000.x + fw * (c001.x - c000.x);
		if(componentCount >= 2) c.y = c000.y + fw * (c001.y - c000.y);
		if(componentCount >= 3) c.z = c000.z + fw * (c001.z - c000.z);
		if(componentCount >= 4) c.w = c000.w + fw * (c001.w - c000.w);
	}

	return c;
}
1095 
// Fast approximation of log2(sqrt(lod)) using the IEEE-754 bit pattern of the
// float. The exact statement order and constants are load-bearing; do not
// reorder or "simplify".
static Float log2sqrt(Float lod)
{
	// log2(sqrt(lod))                              // Equals 0.25 * log2(lod^2).
	lod *= lod;                                     // Squaring doubles the exponent and produces an extra bit of precision.
	lod = Float(As<Int>(lod)) - Float(0x3F800000);  // Interpret as integer and subtract the exponent bias.
	lod *= As<Float>(Int(0x33000000));              // Scale by 0.25 * 2^-23 (mantissa length).

	return lod;
}
1105 
// Fast approximation of log2(lod) using the IEEE-754 bit pattern of the float.
// Equals 0.5 * log2(lod^2); squaring first gains a bit of precision, mirroring
// log2sqrt() above but with a 0.5 scale instead of 0.25.
static Float log2(Float lod)
{
	lod *= lod;                                     // Squaring doubles the exponent and produces an extra bit of precision.
	lod = Float(As<Int>(lod)) - Float(0x3F800000);  // Interpret as integer and subtract the exponent bias.
	lod *= As<Float>(Int(0x33800000));              // Scale by 0.5 * 2^-23 (mantissa length).

	return lod;
}
1114 
// Computes the level of detail for a 1D texture from the u-coordinate
// derivatives. Implicit derivatives are taken as lane differences of the SIMD
// quad (uuuu.yz - uuuu.xx); explicit Grad derivatives come from dsx/dsy.
void SamplerCore::computeLod1D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, const Float4 &dsx, const Float4 &dsy)
{
	Float4 dudxy;

	if(function != Grad)  // Implicit
	{
		dudxy = uuuu.yz - uuuu.xx;
	}
	else
	{
		dudxy = UnpackLow(dsx, dsy);
	}

	// Scale by texture dimensions.
	Float4 dUdxy = dudxy * *Pointer<Float4>(texture + OFFSET(Texture, widthWidthHeightHeight));

	// Note we could take the absolute value here and omit the square root below,
	// but this is more consistent with the 2D calculation and still cheap.
	Float4 dU2dxy = dUdxy * dUdxy;

	// LOD is log2 of the larger derivative magnitude (log2sqrt undoes the squaring).
	lod = Max(Float(dU2dxy.x), Float(dU2dxy.y));
	lod = log2sqrt(lod);
}
1138 
// Computes the level of detail for a 2D texture, and, for anisotropic
// filtering, the anisotropy ratio and the footprint's major-axis step vector
// (uDelta/vDelta) used by sampleFloatAniso()/sampleAniso().
void SamplerCore::computeLod2D(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &uuuu, Float4 &vvvv, const Float4 &dsx, const Float4 &dsy)
{
	Float4 duvdxy;

	if(function != Grad)  // Implicit
	{
		// Lane differences of the SIMD quad yield (du/dx, du/dy, dv/dx, dv/dy).
		duvdxy = Float4(uuuu.yz, vvvv.yz) - Float4(uuuu.xx, vvvv.xx);
	}
	else
	{
		Float4 dudxy = Float4(dsx.xx, dsy.xx);
		Float4 dvdxy = Float4(dsx.yy, dsy.yy);

		duvdxy = Float4(dudxy.xz, dvdxy.xz);
	}

	// Scale by texture dimensions.
	Float4 dUVdxy = duvdxy * *Pointer<Float4>(texture + OFFSET(Texture, widthWidthHeightHeight));

	Float4 dUV2dxy = dUVdxy * dUVdxy;
	Float4 dUV2 = dUV2dxy.xy + dUV2dxy.zw;

	lod = Max(Float(dUV2.x), Float(dUV2.y));  // Square length of major axis

	if(state.textureFilter == FILTER_ANISOTROPIC)
	{
		// det is the (absolute) area of the parallelogram spanned by the
		// derivative vectors, used to derive the anisotropy ratio.
		Float det = Abs(Float(dUVdxy.x) * Float(dUVdxy.w) - Float(dUVdxy.y) * Float(dUVdxy.z));

		Float4 dudx = duvdxy.xxxx;
		Float4 dudy = duvdxy.yyyy;
		Float4 dvdx = duvdxy.zzzz;
		Float4 dvdy = duvdxy.wwww;

		// Select the x or y derivative pair as the major axis, whichever is longer.
		Int4 mask = As<Int4>(CmpNLT(dUV2.x, dUV2.y));
		uDelta = As<Float4>((As<Int4>(dudx) & mask) | ((As<Int4>(dudy) & ~mask)));
		vDelta = As<Float4>((As<Int4>(dvdx) & mask) | ((As<Int4>(dvdy) & ~mask)));

		anisotropy = lod * Rcp(det, true /* relaxedPrecision */);
		anisotropy = Min(anisotropy, state.maxAnisotropy);

		// TODO(b/151263485): While we always need `lod` above, when there's only
		// a single mipmap level the following calculations could be skipped.
		lod *= Rcp(anisotropy * anisotropy, true /* relaxedPrecision */);
	}

	lod = log2sqrt(lod);  // log2(sqrt(lod))
}
1186 
// Computes the level of detail for a cube map. Coordinates are first projected
// onto the major-axis face (scaled by M, the reciprocal-major-axis factor from
// cubeFace()); the footprint size is then estimated via the largest
// two-dimensional Manhattan distance of the derivatives.
void SamplerCore::computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, const Float4 &dsx, const Float4 &dsy, Float4 &M)
{
	Float4 dudxy, dvdxy, dsdxy;

	if(function != Grad)  // Implicit
	{
		Float4 U = u * M;
		Float4 V = v * M;
		Float4 W = w * M;

		// Lane differences against lane x give per-axis derivative magnitudes.
		dudxy = Abs(U - U.xxxx);
		dvdxy = Abs(V - V.xxxx);
		dsdxy = Abs(W - W.xxxx);
	}
	else
	{
		dudxy = Float4(dsx.xx, dsy.xx);
		dvdxy = Float4(dsx.yy, dsy.yy);
		dsdxy = Float4(dsx.zz, dsy.zz);

		dudxy = Abs(dudxy * Float4(M.x));
		dvdxy = Abs(dvdxy * Float4(M.x));
		dsdxy = Abs(dsdxy * Float4(M.x));
	}

	// Compute the largest Manhattan distance in two dimensions.
	// This takes the footprint across adjacent faces into account.
	Float4 duvdxy = dudxy + dvdxy;
	Float4 dusdxy = dudxy + dsdxy;
	Float4 dvsdxy = dvdxy + dsdxy;

	dudxy = Max(Max(duvdxy, dusdxy), dvsdxy);

	lod = Max(Float(dudxy.y), Float(dudxy.z));  // TODO: Max(dudxy.y, dudxy.z);

	// Scale by texture dimension.
	lod *= *Pointer<Float>(texture + OFFSET(Texture, width));

	lod = log2(lod);
}
1227 
// Computes the level of detail for a 3D texture from the squared length of the
// three-component derivative vectors, taking the larger of the x and y screen
// derivatives.
void SamplerCore::computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, Float4 &vvvv, Float4 &wwww, const Float4 &dsx, const Float4 &dsy)
{
	Float4 dudxy, dvdxy, dsdxy;

	if(function != Grad)  // Implicit
	{
		// Lane differences of the SIMD quad provide the screen-space derivatives.
		dudxy = uuuu - uuuu.xxxx;
		dvdxy = vvvv - vvvv.xxxx;
		dsdxy = wwww - wwww.xxxx;
	}
	else
	{
		dudxy = Float4(dsx.xx, dsy.xx);
		dvdxy = Float4(dsx.yy, dsy.yy);
		dsdxy = Float4(dsx.zz, dsy.zz);
	}

	// Scale by texture dimensions.
	dudxy *= *Pointer<Float4>(texture + OFFSET(Texture, width));
	dvdxy *= *Pointer<Float4>(texture + OFFSET(Texture, height));
	dsdxy *= *Pointer<Float4>(texture + OFFSET(Texture, depth));

	// Accumulate squared lengths: dudxy becomes du^2 + dv^2 + ds^2 per lane.
	dudxy *= dudxy;
	dvdxy *= dvdxy;
	dsdxy *= dsdxy;

	dudxy += dvdxy;
	dudxy += dsdxy;

	lod = Max(Float(dudxy.y), Float(dudxy.z));  // TODO: Max(dudxy.y, dudxy.z);

	lod = log2sqrt(lod);  // log2(sqrt(lod))
}
1261 
// Selects the cube map face for each lane of the (x, y, z) direction vector
// and projects the coordinates onto that face, returning the face index and
// writing the normalized face coordinates U/V and the half-reciprocal of the
// major axis into M. Face selection is done branchlessly via sign/majority
// masks and small lookup tables in the constants block.
Int4 SamplerCore::cubeFace(Float4 &U, Float4 &V, Float4 &x, Float4 &y, Float4 &z, Float4 &M)
{
	// TODO: Comply with Vulkan recommendation:
	// Vulkan 1.1: "The rules should have as the first rule that rz wins over ry and rx, and the second rule that ry wins over rx."

	Int4 xn = CmpLT(x, 0.0f);  // x < 0
	Int4 yn = CmpLT(y, 0.0f);  // y < 0
	Int4 zn = CmpLT(z, 0.0f);  // z < 0

	Float4 absX = Abs(x);
	Float4 absY = Abs(y);
	Float4 absZ = Abs(z);

	Int4 xy = CmpNLE(absX, absY);  // abs(x) > abs(y)
	Int4 yz = CmpNLE(absY, absZ);  // abs(y) > abs(z)
	Int4 zx = CmpNLE(absZ, absX);  // abs(z) > abs(x)
	Int4 xMajor = xy & ~zx;        // abs(x) > abs(y) && abs(x) > abs(z)
	Int4 yMajor = yz & ~xy;        // abs(y) > abs(z) && abs(y) > abs(x)
	Int4 zMajor = zx & ~yz;        // abs(z) > abs(x) && abs(z) > abs(y)

	// FACE_POSITIVE_X = 000b
	// FACE_NEGATIVE_X = 001b
	// FACE_POSITIVE_Y = 010b
	// FACE_NEGATIVE_Y = 011b
	// FACE_POSITIVE_Z = 100b
	// FACE_NEGATIVE_Z = 101b

	Int yAxis = SignMask(yMajor);
	Int zAxis = SignMask(zMajor);

	// n holds the sign bit of the major-axis component, per lane.
	Int4 n = ((xn & xMajor) | (yn & yMajor) | (zn & zMajor)) & Int4(0x80000000);
	Int negative = SignMask(n);

	// Assemble the 3-bit face index per lane from precomputed bit tables,
	// indexed by the per-lane sign masks.
	Int faces = *Pointer<Int>(constants + OFFSET(Constants, transposeBit0) + negative * 4);
	faces |= *Pointer<Int>(constants + OFFSET(Constants, transposeBit1) + yAxis * 4);
	faces |= *Pointer<Int>(constants + OFFSET(Constants, transposeBit2) + zAxis * 4);

	Int4 face;
	face.x = faces & 0x7;
	face.y = (faces >> 4) & 0x7;
	face.z = (faces >> 8) & 0x7;
	face.w = (faces >> 12) & 0x7;

	M = Max(Max(absX, absY), absZ);

	// U = xMajor ? (neg ^ -z) : ((zMajor & neg) ^ x)
	U = As<Float4>((xMajor & (n ^ As<Int4>(-z))) | (~xMajor & ((zMajor & n) ^ As<Int4>(x))));

	// V = !yMajor ? -y : (n ^ z)
	V = As<Float4>((~yMajor & As<Int4>(-y)) | (yMajor & (n ^ As<Int4>(z))));

	// Map from [-major, major] to [0, 1] face coordinates.
	M = reciprocal(M) * 0.5f;
	U = U * M + 0.5f;
	V = V * M + 0.5f;

	return face;
}
1319 
// Applies a texel offset (e.g. from OpImageSample*'s ConstOffset) to an
// unsigned 16-bit texel coordinate, wrapping or clamping per the addressing
// mode, against the texture dimension in whd.
Short4 SamplerCore::applyOffset(Short4 &uvw, Int4 &offset, const Int4 &whd, AddressingMode mode)
{
	// Widen to 32-bit so the offset addition cannot overflow 16 bits.
	Int4 tmp = Int4(As<UShort4>(uvw));
	tmp = tmp + offset;

	switch(mode)
	{
	case AddressingMode::ADDRESSING_WRAP:
		// Bias by -MIN_TEXEL_OFFSET * whd so the operand of % is non-negative.
		tmp = (tmp + whd * Int4(-MIN_TEXEL_OFFSET)) % whd;
		break;
	case AddressingMode::ADDRESSING_CLAMP:
	case AddressingMode::ADDRESSING_MIRROR:
	case AddressingMode::ADDRESSING_MIRRORONCE:
	case AddressingMode::ADDRESSING_BORDER:  // TODO(b/29069044): Implement and test ADDRESSING_MIRROR, ADDRESSING_MIRRORONCE, ADDRESSING_BORDER
		tmp = Min(Max(tmp, Int4(0)), whd - Int4(1));
		break;
	case AddressingMode::ADDRESSING_SEAMLESS:
		ASSERT(false);  // Cube sampling doesn't support offset.
		// Deliberate fall-through to the default assert.
	default:
		ASSERT(false);
	}

	return As<Short4>(UShort4(tmp));
}
1344 
// Computes the four linear texel indices for fixed-point (unorm16) coordinates.
// MulHigh by the dimension converts each normalized 16-bit coordinate into an
// integer texel coordinate; rows, slices, layers and the sample index are then
// folded in using the mipmap's pitch/slice/sample strides.
void SamplerCore::computeIndices(UInt index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, const Short4 &layerIndex, Vector4i &offset, const Int4 &sample, const Pointer<Byte> &mipmap)
{
	uuuu = MulHigh(As<UShort4>(uuuu), UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, width))));

	if(function.offset)
	{
		uuuu = applyOffset(uuuu, offset.x, *Pointer<UInt4>(mipmap + OFFSET(Mipmap, width)), state.addressingModeU);
	}

	UInt4 indices = Int4(uuuu);

	if(state.is2D() || state.is3D() || state.isCube())
	{
		vvvv = MulHigh(As<UShort4>(vvvv), UShort4(*Pointer<UInt4>(mipmap + OFFSET(Mipmap, height))));

		if(function.offset)
		{
			vvvv = applyOffset(vvvv, offset.y, *Pointer<UInt4>(mipmap + OFFSET(Mipmap, height)), state.addressingModeV);
		}

		// Interleave (u, v) pairs and compute u + v * pitch in one MulAdd
		// against the (1, pitchP) constant pair.
		Short4 uv0uv1 = As<Short4>(UnpackLow(uuuu, vvvv));
		Short4 uv2uv3 = As<Short4>(UnpackHigh(uuuu, vvvv));
		Int2 i01 = MulAdd(uv0uv1, *Pointer<Short4>(mipmap + OFFSET(Mipmap, onePitchP)));
		Int2 i23 = MulAdd(uv2uv3, *Pointer<Short4>(mipmap + OFFSET(Mipmap, onePitchP)));

		indices = UInt4(As<UInt2>(i01), As<UInt2>(i23));
	}

	if(state.is3D())
	{
		wwww = MulHigh(As<UShort4>(wwww), UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap, depth))));

		if(function.offset)
		{
			wwww = applyOffset(wwww, offset.z, *Pointer<Int4>(mipmap + OFFSET(Mipmap, depth)), state.addressingModeW);
		}

		indices += As<UInt4>(Int4(As<UShort4>(wwww))) * *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP));
	}

	if(state.isArrayed())
	{
		Int4 layer = Int4(As<UShort4>(layerIndex));

		// Cube arrays have 6 layers per cube.
		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
		{
			layer *= Int4(6);
		}

		UInt4 layerOffset = As<UInt4>(layer) * *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP));

		indices += layerOffset;
	}

	if(function.sample)
	{
		// Clamp the sample index to sampleMax before applying the sample stride.
		UInt4 sampleOffset = Min(As<UInt4>(sample), *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sampleMax), 16)) *
		                     *Pointer<UInt4>(mipmap + OFFSET(Mipmap, samplePitchP), 16);
		indices += sampleOffset;
	}

	index[0] = Extract(indices, 0);
	index[1] = Extract(indices, 1);
	index[2] = Extract(indices, 2);
	index[3] = Extract(indices, 3);
}
1411 
// Computes the four linear texel indices from integer coordinates that have
// already been scaled to linear offsets (vvvv by pitch, wwww by slice stride).
// `valid` is an all-ones/all-zeros lane mask of in-range texels.
void SamplerCore::computeIndices(UInt index[4], Int4 uuuu, Int4 vvvv, Int4 wwww, const Int4 &sample, Int4 valid, const Pointer<Byte> &mipmap)
{
	UInt4 indices = uuuu;

	if(state.is2D() || state.is3D() || state.isCube())
	{
		indices += As<UInt4>(vvvv);
	}

	if(state.is3D() || state.isCube() || state.isArrayed())
	{
		indices += As<UInt4>(wwww);
	}

	if(function.sample)
	{
		// Clamp the sample index to sampleMax before applying the sample stride.
		indices += Min(As<UInt4>(sample), *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sampleMax), 16)) *
		           *Pointer<UInt4>(mipmap + OFFSET(Mipmap, samplePitchP), 16);
	}

	if(borderModeActive())
	{
		// Texels out of range are still sampled before being replaced
		// with the border color, so sample them at linear index 0.
		indices &= As<UInt4>(valid);
	}

	for(int i = 0; i < 4; i++)
	{
		index[i] = Extract(As<Int4>(indices), i);
	}
}
1444 
// Loads and unpacks four texels at linear indices index[0..3] into 16-bit
// per-component form. For unorm formats the component value is shifted into
// the most significant bits of each 16-bit lane (so 0xFF00/0xFFC0/0xF800
// represent 1.0 depending on bit depth); integer formats keep their raw value
// in the low bits. sRGB components are converted to linear at the end.
Vector4s SamplerCore::sampleTexel(UInt index[4], Pointer<Byte> buffer)
{
	Vector4s c;

	if(has16bitPackedTextureFormat())
	{
		// Load four 16-bit packed texels, then shift/mask each component into
		// the high bits of its own 16-bit lane.
		c.x = Insert(c.x, Pointer<Short>(buffer)[index[0]], 0);
		c.x = Insert(c.x, Pointer<Short>(buffer)[index[1]], 1);
		c.x = Insert(c.x, Pointer<Short>(buffer)[index[2]], 2);
		c.x = Insert(c.x, Pointer<Short>(buffer)[index[3]], 3);

		switch(state.textureFormat)
		{
		case VK_FORMAT_R5G6B5_UNORM_PACK16:
			c.z = (c.x & Short4(0x001Fu)) << 11;
			c.y = (c.x & Short4(0x07E0u)) << 5;
			c.x = (c.x & Short4(0xF800u));
			break;
		case VK_FORMAT_B5G6R5_UNORM_PACK16:
			c.z = (c.x & Short4(0xF800u));
			c.y = (c.x & Short4(0x07E0u)) << 5;
			c.x = (c.x & Short4(0x001Fu)) << 11;
			break;
		case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
			c.w = (c.x << 12) & Short4(0xF000u);
			c.z = (c.x << 8) & Short4(0xF000u);
			c.y = (c.x << 4) & Short4(0xF000u);
			c.x = (c.x) & Short4(0xF000u);
			break;
		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
			c.w = (c.x << 12) & Short4(0xF000u);
			c.z = (c.x) & Short4(0xF000u);
			c.y = (c.x << 4) & Short4(0xF000u);
			c.x = (c.x << 8) & Short4(0xF000u);
			break;
		case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
			c.w = (c.x) & Short4(0xF000u);
			c.z = (c.x << 12) & Short4(0xF000u);
			c.y = (c.x << 8) & Short4(0xF000u);
			c.x = (c.x << 4) & Short4(0xF000u);
			break;
		case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
			c.w = (c.x) & Short4(0xF000u);
			c.z = (c.x << 4) & Short4(0xF000u);
			c.y = (c.x << 8) & Short4(0xF000u);
			c.x = (c.x << 12) & Short4(0xF000u);
			break;
		case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
			c.w = (c.x << 15) & Short4(0x8000u);
			c.z = (c.x << 10) & Short4(0xF800u);
			c.y = (c.x << 5) & Short4(0xF800u);
			c.x = (c.x) & Short4(0xF800u);
			break;
		case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
			c.w = (c.x << 15) & Short4(0x8000u);
			c.z = (c.x) & Short4(0xF800u);
			c.y = (c.x << 5) & Short4(0xF800u);
			c.x = (c.x << 10) & Short4(0xF800u);
			break;
		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
			c.w = (c.x) & Short4(0x8000u);
			c.z = (c.x << 11) & Short4(0xF800u);
			c.y = (c.x << 6) & Short4(0xF800u);
			c.x = (c.x << 1) & Short4(0xF800u);
			break;
		default:
			ASSERT(false);
		}
	}
	else if(has8bitTextureComponents())
	{
		switch(textureComponentCount())
		{
		case 4:
			{
				// Load four 32-bit texels and transpose the 4x4 byte matrix
				// into per-component Short4 vectors via the unpack sequences.
				Byte4 c0 = Pointer<Byte4>(buffer)[index[0]];
				Byte4 c1 = Pointer<Byte4>(buffer)[index[1]];
				Byte4 c2 = Pointer<Byte4>(buffer)[index[2]];
				Byte4 c3 = Pointer<Byte4>(buffer)[index[3]];
				c.x = Unpack(c0, c1);
				c.y = Unpack(c2, c3);

				switch(state.textureFormat)
				{
				case VK_FORMAT_B8G8R8A8_UNORM:
				case VK_FORMAT_B8G8R8A8_SRGB:
					// BGRA: swap the B and R lanes while widening bytes into
					// the high halves of 16-bit lanes (unorm in MSBs).
					c.z = As<Short4>(UnpackLow(c.x, c.y));
					c.x = As<Short4>(UnpackHigh(c.x, c.y));
					c.y = c.z;
					c.w = c.x;
					c.z = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.z));
					c.y = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.y));
					c.x = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.x));
					c.w = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.w));
					break;
				case VK_FORMAT_R8G8B8A8_UNORM:
				case VK_FORMAT_R8G8B8A8_SNORM:
				case VK_FORMAT_R8G8B8A8_SINT:
				case VK_FORMAT_R8G8B8A8_SRGB:
				case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
				case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
				case VK_FORMAT_A8B8G8R8_SINT_PACK32:
				case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
					c.z = As<Short4>(UnpackHigh(c.x, c.y));
					c.x = As<Short4>(UnpackLow(c.x, c.y));
					c.y = c.x;
					c.w = c.z;
					c.x = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.x));
					c.y = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.y));
					c.z = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.z));
					c.w = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.w));
					// Propagate sign bit
					if(state.textureFormat == VK_FORMAT_R8G8B8A8_SINT ||
					   state.textureFormat == VK_FORMAT_A8B8G8R8_SINT_PACK32)
					{
						c.x >>= 8;
						c.y >>= 8;
						c.z >>= 8;
						c.w >>= 8;
					}
					break;
				case VK_FORMAT_R8G8B8A8_UINT:
				case VK_FORMAT_A8B8G8R8_UINT_PACK32:
					// Unsigned integer: zero-extend into the low byte instead.
					c.z = As<Short4>(UnpackHigh(c.x, c.y));
					c.x = As<Short4>(UnpackLow(c.x, c.y));
					c.y = c.x;
					c.w = c.z;
					c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(Short4(0)));
					c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(Short4(0)));
					c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(Short4(0)));
					c.w = UnpackHigh(As<Byte8>(c.w), As<Byte8>(Short4(0)));
					break;
				default:
					ASSERT(false);
				}
			}
			break;
		case 2:
			// Each texel is two bytes; load as 16-bit and split the lanes.
			c.x = Insert(c.x, Pointer<Short>(buffer)[index[0]], 0);
			c.x = Insert(c.x, Pointer<Short>(buffer)[index[1]], 1);
			c.x = Insert(c.x, Pointer<Short>(buffer)[index[2]], 2);
			c.x = Insert(c.x, Pointer<Short>(buffer)[index[3]], 3);

			switch(state.textureFormat)
			{
			case VK_FORMAT_R8G8_UNORM:
			case VK_FORMAT_R8G8_SNORM:
			case VK_FORMAT_R8G8_SRGB:
				c.y = (c.x & Short4(0xFF00u));
				c.x = (c.x << 8);
				break;
			case VK_FORMAT_R8G8_SINT:
				c.y = c.x >> 8;
				c.x = (c.x << 8) >> 8;  // Propagate sign bit
				break;
			case VK_FORMAT_R8G8_UINT:
				c.y = As<Short4>(As<UShort4>(c.x) >> 8);
				c.x &= Short4(0x00FFu);
				break;
			default:
				ASSERT(false);
			}
			break;
		case 1:
			{
				// Gather the four single-byte texels into one 32-bit word.
				Int c0 = Int(*Pointer<Byte>(buffer + index[0]));
				Int c1 = Int(*Pointer<Byte>(buffer + index[1]));
				Int c2 = Int(*Pointer<Byte>(buffer + index[2]));
				Int c3 = Int(*Pointer<Byte>(buffer + index[3]));
				c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);

				switch(state.textureFormat)
				{
				case VK_FORMAT_R8_SINT:
				case VK_FORMAT_R8_UINT:
				case VK_FORMAT_S8_UINT:
					{
						Int zero(0);
						c.x = Unpack(As<Byte4>(c0), As<Byte4>(zero));
						// Propagate sign bit
						if(state.textureFormat == VK_FORMAT_R8_SINT)
						{
							c.x = (c.x << 8) >> 8;
						}
					}
					break;
				case VK_FORMAT_R8_SNORM:
				case VK_FORMAT_R8_UNORM:
				case VK_FORMAT_R8_SRGB:
					// TODO: avoid populating the low bits at all.
					c.x = Unpack(As<Byte4>(c0));
					c.x &= Short4(0xFF00u);
					break;
				default:
					c.x = Unpack(As<Byte4>(c0));
					break;
				}
			}
			break;
		default:
			ASSERT(false);
		}
	}
	else if(has16bitTextureComponents())
	{
		switch(textureComponentCount())
		{
		case 4:
			// Load four 64-bit texels and transpose to per-component vectors.
			c.x = Pointer<Short4>(buffer)[index[0]];
			c.y = Pointer<Short4>(buffer)[index[1]];
			c.z = Pointer<Short4>(buffer)[index[2]];
			c.w = Pointer<Short4>(buffer)[index[3]];
			transpose4x4(c.x, c.y, c.z, c.w);
			break;
		case 2:
			// Each texel is 4 bytes (two 16-bit components); interleave pairs.
			c.x = *Pointer<Short4>(buffer + 4 * index[0]);
			c.x = As<Short4>(UnpackLow(c.x, *Pointer<Short4>(buffer + 4 * index[1])));
			c.z = *Pointer<Short4>(buffer + 4 * index[2]);
			c.z = As<Short4>(UnpackLow(c.z, *Pointer<Short4>(buffer + 4 * index[3])));
			c.y = c.x;
			c.x = UnpackLow(As<Int2>(c.x), As<Int2>(c.z));
			c.y = UnpackHigh(As<Int2>(c.y), As<Int2>(c.z));
			break;
		case 1:
			c.x = Insert(c.x, Pointer<Short>(buffer)[index[0]], 0);
			c.x = Insert(c.x, Pointer<Short>(buffer)[index[1]], 1);
			c.x = Insert(c.x, Pointer<Short>(buffer)[index[2]], 2);
			c.x = Insert(c.x, Pointer<Short>(buffer)[index[3]], 3);
			break;
		default:
			ASSERT(false);
		}
	}
	else if(state.textureFormat == VK_FORMAT_A2B10G10R10_UNORM_PACK32)
	{
		Int4 cc;
		cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
		cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
		cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
		cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);

		// Shift each 10-bit component into the high bits of a 16-bit lane.
		c.x = Short4(cc << 6) & Short4(0xFFC0u);
		c.y = Short4(cc >> 4) & Short4(0xFFC0u);
		c.z = Short4(cc >> 14) & Short4(0xFFC0u);
		c.w = Short4(cc >> 16) & Short4(0xC000u);
	}
	else if(state.textureFormat == VK_FORMAT_A2R10G10B10_UNORM_PACK32)
	{
		Int4 cc;
		cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
		cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
		cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
		cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);

		c.x = Short4(cc >> 14) & Short4(0xFFC0u);
		c.y = Short4(cc >> 4) & Short4(0xFFC0u);
		c.z = Short4(cc << 6) & Short4(0xFFC0u);
		c.w = Short4(cc >> 16) & Short4(0xC000u);
	}
	else if(state.textureFormat == VK_FORMAT_A2B10G10R10_UINT_PACK32)
	{
		Int4 cc;
		cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
		cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
		cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
		cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);

		// Integer format: keep raw component values in the low bits.
		c.x = Short4(cc & Int4(0x3FF));
		c.y = Short4((cc >> 10) & Int4(0x3FF));
		c.z = Short4((cc >> 20) & Int4(0x3FF));
		c.w = Short4((cc >> 30) & Int4(0x3));
	}
	else if(state.textureFormat == VK_FORMAT_A2R10G10B10_UINT_PACK32)
	{
		Int4 cc;
		cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
		cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
		cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
		cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);

		c.z = Short4((cc & Int4(0x3FF)));
		c.y = Short4(((cc >> 10) & Int4(0x3FF)));
		c.x = Short4(((cc >> 20) & Int4(0x3FF)));
		c.w = Short4(((cc >> 30) & Int4(0x3)));
	}
	else
		ASSERT(false);

	if(state.textureFormat.isSRGBformat())
	{
		for(int i = 0; i < textureComponentCount(); i++)
		{
			if(isRGBComponent(i))
			{
				// The current table-based sRGB conversion requires 0xFF00 to represent 1.0.
				ASSERT(state.textureFormat.has8bitTextureComponents());

				sRGBtoLinearFF00(c[i]);
			}
		}
	}

	return c;
}
1749 
// Fetches four texels as 16-bit fixed-point components.
// For YCbCr formats the luma and chroma planes are read separately,
// optionally range-expanded (studio swing), and converted to RGB according
// to the sampler's YCbCr model, generating 15-bit output. All other formats
// are delegated to the index-based sampleTexel() overload.
Vector4s SamplerCore::sampleTexel(Short4 &uuuu, Short4 &vvvv, Short4 &wwww, const Short4 &layerIndex, Vector4i &offset, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer)
{
	Vector4s c;

	UInt index[4];
	computeIndices(index, uuuu, vvvv, wwww, layerIndex, offset, sample, mipmap);

	if(isYcbcrFormat())
	{
		// Generates 15-bit output.

		// Pointers to the planes of YCbCr images are stored in consecutive mipmap levels.
		Pointer<Byte> bufferY = buffer;                                                                         // *Pointer<Pointer<Byte>>(mipmap + 0 * sizeof(Mipmap) + OFFSET(Mipmap, buffer));
		Pointer<Byte> bufferU = *Pointer<Pointer<Byte>>(mipmap + 1 * sizeof(Mipmap) + OFFSET(Mipmap, buffer));  // U/V for 2-plane interleaved formats.
		Pointer<Byte> bufferV = *Pointer<Pointer<Byte>>(mipmap + 2 * sizeof(Mipmap) + OFFSET(Mipmap, buffer));

		// Luminance (either 8-bit or 10-bit in bottom bits).
		UShort4 Y;
		{
			switch(state.textureFormat)
			{
			case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
			case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
				{
					Y = Insert(Y, UShort(bufferY[index[0]]), 0);
					Y = Insert(Y, UShort(bufferY[index[1]]), 1);
					Y = Insert(Y, UShort(bufferY[index[2]]), 2);
					Y = Insert(Y, UShort(bufferY[index[3]]), 3);
				}
				break;
			case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16:
				{
					Y = Insert(Y, Pointer<UShort>(bufferY)[index[0]], 0);
					Y = Insert(Y, Pointer<UShort>(bufferY)[index[1]], 1);
					Y = Insert(Y, Pointer<UShort>(bufferY)[index[2]], 2);
					Y = Insert(Y, Pointer<UShort>(bufferY)[index[3]], 3);
					// Top 10 bits of each 16 bits:
					Y = (Y & UShort4(0xFFC0u)) >> 6;
				}
				break;
			default:
				UNSUPPORTED("state.textureFormat %d", (int)state.textureFormat);
				break;
			}
		}

		// Chroma (either 8-bit or 10-bit in bottom bits).
		UShort4 Cb, Cr;
		{
			// Recompute texel indices against the chroma plane's descriptor,
			// which is stored as the next consecutive mipmap level (see above).
			computeIndices(index, uuuu, vvvv, wwww, layerIndex, offset, sample, mipmap + sizeof(Mipmap));
			UShort4 U, V;

			switch(state.textureFormat)
			{
			case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
				{
					U = Insert(U, UShort(bufferU[index[0]]), 0);
					U = Insert(U, UShort(bufferU[index[1]]), 1);
					U = Insert(U, UShort(bufferU[index[2]]), 2);
					U = Insert(U, UShort(bufferU[index[3]]), 3);

					V = Insert(V, UShort(bufferV[index[0]]), 0);
					V = Insert(V, UShort(bufferV[index[1]]), 1);
					V = Insert(V, UShort(bufferV[index[2]]), 2);
					V = Insert(V, UShort(bufferV[index[3]]), 3);
				}
				break;
			case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
				{
					UShort4 UV;
					UV = Insert(UV, Pointer<UShort>(bufferU)[index[0]], 0);
					UV = Insert(UV, Pointer<UShort>(bufferU)[index[1]], 1);
					UV = Insert(UV, Pointer<UShort>(bufferU)[index[2]], 2);
					UV = Insert(UV, Pointer<UShort>(bufferU)[index[3]], 3);

					// Interleaved U (low byte) / V (high byte).
					U = (UV & UShort4(0x00FFu));
					V = (UV & UShort4(0xFF00u)) >> 8;
				}
				break;
			case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16:
				{
					UInt4 UV;
					UV = Insert(UV, Pointer<UInt>(bufferU)[index[0]], 0);
					UV = Insert(UV, Pointer<UInt>(bufferU)[index[1]], 1);
					UV = Insert(UV, Pointer<UInt>(bufferU)[index[2]], 2);
					UV = Insert(UV, Pointer<UInt>(bufferU)[index[3]], 3);
					// Top 10 bits of first 16-bits:
					U = UShort4((UV & UInt4(0x0000FFC0u)) >> 6);
					// Top 10 bits of second 16-bits:
					V = UShort4((UV & UInt4(0xFFC00000u)) >> 22);
				}
				break;
			default:
				UNSUPPORTED("state.textureFormat %d", (int)state.textureFormat);
				break;
			}

			// Swap Cb/Cr when the chroma channels are stored in reversed order.
			if(!state.swappedChroma)
			{
				Cb = U;
				Cr = V;
			}
			else
			{
				Cb = V;
				Cr = U;
			}
		}

		// Stored bit depth of the components, used for range normalization below.
		uint8_t lumaBits = 8;
		uint8_t chromaBits = 8;
		switch(state.textureFormat)
		{
		case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
		case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
			lumaBits = 8;
			chromaBits = 8;
			break;
		case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16:
			lumaBits = 10;
			chromaBits = 10;
			break;
		default:
			UNSUPPORTED("state.textureFormat %d", (int)state.textureFormat);
			break;
		}

		if(state.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY)
		{
			// Scale to the output 15-bit. Component order: x = Cr, y = Y, z = Cb.
			c.x = Cr << (15 - chromaBits);
			c.y = Y << (15 - lumaBits);
			c.z = Cb << (15 - chromaBits);
		}
		else
		{
			const float twoPowLumaBits = static_cast<float>(0x1u << lumaBits);
			const float twoPowLumaBitsMinus8 = static_cast<float>(0x1u << (lumaBits - 8));
			const float twoPowChromaBits = static_cast<float>(0x1u << chromaBits);
			const float twoPowChromaBitsMinus1 = static_cast<float>(0x1u << (chromaBits - 1));
			const float twoPowChromaBitsMinus8 = static_cast<float>(0x1u << (chromaBits - 8));

			// Perform range expansion and model conversion in floating point.
			Float4 y = Float4(Y);
			Float4 u = Float4(Cb);
			Float4 v = Float4(Cr);

			if(state.studioSwing)
			{
				// See https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.html#QUANTIZATION_NARROW
				y = ((y / Float4(twoPowLumaBitsMinus8)) - Float4(16.0f)) / Float4(219.0f);
				u = ((u / Float4(twoPowChromaBitsMinus8)) - Float4(128.0f)) / Float4(224.0f);
				v = ((v / Float4(twoPowChromaBitsMinus8)) - Float4(128.0f)) / Float4(224.0f);
			}
			else
			{
				// See https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.html#QUANTIZATION_FULL
				y = y / Float4(twoPowLumaBits - 1.0f);
				u = (u - Float4(twoPowChromaBitsMinus1)) / Float4(twoPowChromaBits - 1.0f);
				v = (v - Float4(twoPowChromaBitsMinus1)) / Float4(twoPowChromaBits - 1.0f);
			}

			// Now, `y` is in [0, 1] and `u` and `v` are in [-0.5, 0.5].

			if(state.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_IDENTITY)
			{
				c.x = Short4(v * static_cast<float>(0x7FFF));
				c.y = Short4(y * static_cast<float>(0x7FFF));
				c.z = Short4(u * static_cast<float>(0x7FFF));
			}
			else
			{
				// Generic YCbCr to RGB transformation:
				// R = Y                               +           2 * (1 - Kr) * Cr
				// G = Y - 2 * Kb * (1 - Kb) / Kg * Cb - 2 * Kr * (1 - Kr) / Kg * Cr
				// B = Y +           2 * (1 - Kb) * Cb

				float Kb = 0.114f;
				float Kr = 0.299f;

				switch(state.ycbcrModel)
				{
				case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_709:
					Kb = 0.0722f;
					Kr = 0.2126f;
					break;
				case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601:
					Kb = 0.114f;
					Kr = 0.299f;
					break;
				case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_2020:
					Kb = 0.0593f;
					Kr = 0.2627f;
					break;
				default:
					UNSUPPORTED("ycbcrModel %d", int(state.ycbcrModel));
				}

				const float Kg = 1.0f - Kr - Kb;

				const float Rr = 2 * (1 - Kr);
				const float Gb = -2 * Kb * (1 - Kb) / Kg;
				const float Gr = -2 * Kr * (1 - Kr) / Kg;
				const float Bb = 2 * (1 - Kb);

				Float4 r = y + Float4(Rr) * v;
				Float4 g = y + Float4(Gb) * u + Float4(Gr) * v;
				Float4 b = y + Float4(Bb) * u;

				// Scale to 15-bit fixed-point output.
				c.x = Short4(r * static_cast<float>(0x7FFF));
				c.y = Short4(g * static_cast<float>(0x7FFF));
				c.z = Short4(b * static_cast<float>(0x7FFF));
			}
		}
	}
	else
	{
		return sampleTexel(index, buffer);
	}

	return c;
}
1971 
// Fetches four texels for floating-point and 32-bit-integer formats,
// returning them as Vector4f. Narrower fixed-point formats are read via the
// 16-bit sampleTexel() path and converted here. Also performs border texel
// replacement and, when enabled, the depth-compare operation.
Vector4f SamplerCore::sampleTexel(Int4 &uuuu, Int4 &vvvv, Int4 &wwww, const Float4 &dRef, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer)
{
	Int4 valid;

	if(borderModeActive())
	{
		// Valid texels have positive coordinates.
		Int4 negative = uuuu;
		if(state.is2D() || state.is3D() || state.isCube()) negative |= vvvv;
		if(state.is3D() || state.isCube() || state.isArrayed()) negative |= wwww;
		valid = CmpNLT(negative, Int4(0));
	}

	UInt index[4];
	computeIndices(index, uuuu, vvvv, wwww, sample, valid, mipmap);

	Vector4f c;

	if(hasFloatTexture() || has32bitIntegerTextureComponents())
	{
		UInt4 t0, t1, t2, t3;

		switch(state.textureFormat)
		{
		case VK_FORMAT_R16_SFLOAT:
			t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 2));
			t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 2));
			t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 2));
			t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 2));

			c.x.x = Extract(As<Float4>(halfToFloatBits(t0)), 0);
			c.x.y = Extract(As<Float4>(halfToFloatBits(t1)), 0);
			c.x.z = Extract(As<Float4>(halfToFloatBits(t2)), 0);
			c.x.w = Extract(As<Float4>(halfToFloatBits(t3)), 0);
			break;
		case VK_FORMAT_R16G16_SFLOAT:
			t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 4));
			t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 4));
			t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 4));
			t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 4));

			// TODO: shuffles
			c.x = As<Float4>(halfToFloatBits(t0));
			c.y = As<Float4>(halfToFloatBits(t1));
			c.z = As<Float4>(halfToFloatBits(t2));
			c.w = As<Float4>(halfToFloatBits(t3));
			transpose4x4(c.x, c.y, c.z, c.w);
			break;
		case VK_FORMAT_R16G16B16A16_SFLOAT:
			t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 8));
			t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 8));
			t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 8));
			t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 8));

			c.x = As<Float4>(halfToFloatBits(t0));
			c.y = As<Float4>(halfToFloatBits(t1));
			c.z = As<Float4>(halfToFloatBits(t2));
			c.w = As<Float4>(halfToFloatBits(t3));
			transpose4x4(c.x, c.y, c.z, c.w);
			break;
		case VK_FORMAT_R32_SFLOAT:
		case VK_FORMAT_R32_SINT:
		case VK_FORMAT_R32_UINT:
		case VK_FORMAT_D32_SFLOAT:
			// TODO: Optimal shuffling?
			c.x.x = *Pointer<Float>(buffer + index[0] * 4);
			c.x.y = *Pointer<Float>(buffer + index[1] * 4);
			c.x.z = *Pointer<Float>(buffer + index[2] * 4);
			c.x.w = *Pointer<Float>(buffer + index[3] * 4);
			break;
		case VK_FORMAT_R32G32_SFLOAT:
		case VK_FORMAT_R32G32_SINT:
		case VK_FORMAT_R32G32_UINT:
			// TODO: Optimal shuffling?
			// Texels 1 and 3 are loaded with a -8 byte offset so their data
			// aligns with the upper (.zw) lanes of the 16-byte load.
			c.x.xy = *Pointer<Float4>(buffer + index[0] * 8);
			c.x.zw = *Pointer<Float4>(buffer + index[1] * 8 - 8);
			c.z.xy = *Pointer<Float4>(buffer + index[2] * 8);
			c.z.zw = *Pointer<Float4>(buffer + index[3] * 8 - 8);
			c.y = c.x;
			c.x = Float4(c.x.xz, c.z.xz);
			c.y = Float4(c.y.yw, c.z.yw);
			break;
		case VK_FORMAT_R32G32B32A32_SFLOAT:
		case VK_FORMAT_R32G32B32A32_SINT:
		case VK_FORMAT_R32G32B32A32_UINT:
			c.x = *Pointer<Float4>(buffer + index[0] * 16, 16);
			c.y = *Pointer<Float4>(buffer + index[1] * 16, 16);
			c.z = *Pointer<Float4>(buffer + index[2] * 16, 16);
			c.w = *Pointer<Float4>(buffer + index[3] * 16, 16);
			transpose4x4(c.x, c.y, c.z, c.w);
			break;
		case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
			{
				// Shared-exponent format: c.w holds the decoded 2^(exp-24) scale
				// applied to the three 9-bit mantissas.
				Float4 t;  // TODO: add Insert(UInt4, RValue<UInt>)
				t.x = *Pointer<Float>(buffer + index[0] * 4);
				t.y = *Pointer<Float>(buffer + index[1] * 4);
				t.z = *Pointer<Float>(buffer + index[2] * 4);
				t.w = *Pointer<Float>(buffer + index[3] * 4);
				t0 = As<UInt4>(t);
				c.w = Float4(UInt4(1) << ((t0 >> 27) & UInt4(0x1F))) * Float4(1.0f / (1 << 24));
				c.x = Float4(t0 & UInt4(0x1FF)) * c.w;
				c.y = Float4((t0 >> 9) & UInt4(0x1FF)) * c.w;
				c.z = Float4((t0 >> 18) & UInt4(0x1FF)) * c.w;
			}
			break;
		case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
			{
				// Each packed 10/11-bit float is expanded to a half-float bit
				// pattern, then converted to full float.
				Float4 t;  // TODO: add Insert(UInt4, RValue<UInt>)
				t.x = *Pointer<Float>(buffer + index[0] * 4);
				t.y = *Pointer<Float>(buffer + index[1] * 4);
				t.z = *Pointer<Float>(buffer + index[2] * 4);
				t.w = *Pointer<Float>(buffer + index[3] * 4);
				t0 = As<UInt4>(t);
				c.x = As<Float4>(halfToFloatBits((t0 << 4) & UInt4(0x7FF0)));
				c.y = As<Float4>(halfToFloatBits((t0 >> 7) & UInt4(0x7FF0)));
				c.z = As<Float4>(halfToFloatBits((t0 >> 17) & UInt4(0x7FE0)));
			}
			break;
		default:
			UNSUPPORTED("Format %d", VkFormat(state.textureFormat));
		}
	}
	else
	{
		ASSERT(!isYcbcrFormat());

		// Fixed-point formats: fetch as 16-bit and widen to float/int32.
		Vector4s cs = sampleTexel(index, buffer);

		bool isInteger = state.textureFormat.isUnnormalizedInteger();
		int componentCount = textureComponentCount();
		for(int n = 0; n < componentCount; n++)
		{
			if(hasUnsignedTextureComponent(n))
			{
				if(isInteger)
				{
					c[n] = As<Float4>(Int4(As<UShort4>(cs[n])));
				}
				else
				{
					c[n] = Float4(As<UShort4>(cs[n]));
				}
			}
			else
			{
				if(isInteger)
				{
					c[n] = As<Float4>(Int4(cs[n]));
				}
				else
				{
					c[n] = Float4(cs[n]);
				}
			}
		}
	}

	if(borderModeActive())
	{
		c = replaceBorderTexel(c, valid);
	}

	if(state.compareEnable)
	{
		Float4 ref = dRef;

		if(!hasFloatTexture())
		{
			// D16_UNORM: clamp reference, normalize texel value
			ref = Min(Max(ref, Float4(0.0f)), Float4(1.0f));
			c.x = c.x * Float4(1.0f / 0xFFFF);
		}

		Int4 boolean;

		// Compare the reference against the first component per VkCompareOp,
		// producing a per-lane all-ones/all-zeros mask.
		switch(state.compareOp)
		{
		case VK_COMPARE_OP_LESS_OR_EQUAL: boolean = CmpLE(ref, c.x); break;
		case VK_COMPARE_OP_GREATER_OR_EQUAL: boolean = CmpNLT(ref, c.x); break;
		case VK_COMPARE_OP_LESS: boolean = CmpLT(ref, c.x); break;
		case VK_COMPARE_OP_GREATER: boolean = CmpNLE(ref, c.x); break;
		case VK_COMPARE_OP_EQUAL: boolean = CmpEQ(ref, c.x); break;
		case VK_COMPARE_OP_NOT_EQUAL: boolean = CmpNEQ(ref, c.x); break;
		case VK_COMPARE_OP_ALWAYS: boolean = Int4(-1); break;
		case VK_COMPARE_OP_NEVER: boolean = Int4(0); break;
		default: ASSERT(false);
		}

		// Result is 1.0 or 0.0 in x; y/z/w are fixed.
		c.x = As<Float4>(boolean & As<Int4>(Float4(1.0f)));
		c.y = Float4(0.0f);
		c.z = Float4(0.0f);
		c.w = Float4(1.0f);
	}

	return c;
}
2168 
replaceBorderTexel(const Vector4f & c,Int4 valid)2169 Vector4f SamplerCore::replaceBorderTexel(const Vector4f &c, Int4 valid)
2170 {
2171 	Vector4i border;
2172 
2173 	const bool scaled = hasNormalizedFormat();
2174 	const sw::float4 scaleComp = scaled ? getComponentScale() : sw::float4(1.0f, 1.0f, 1.0f, 1.0f);
2175 
2176 	switch(state.border)
2177 	{
2178 	case VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK:
2179 	case VK_BORDER_COLOR_INT_TRANSPARENT_BLACK:
2180 		border.x = Int4(0);
2181 		border.y = Int4(0);
2182 		border.z = Int4(0);
2183 		border.w = Int4(0);
2184 		break;
2185 	case VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK:
2186 		border.x = Int4(0);
2187 		border.y = Int4(0);
2188 		border.z = Int4(0);
2189 		border.w = Int4(bit_cast<int>(scaleComp.w));
2190 		break;
2191 	case VK_BORDER_COLOR_INT_OPAQUE_BLACK:
2192 		border.x = Int4(0);
2193 		border.y = Int4(0);
2194 		border.z = Int4(0);
2195 		border.w = Int4(1);
2196 		break;
2197 	case VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE:
2198 		border.x = Int4(bit_cast<int>(scaleComp.x));
2199 		border.y = Int4(bit_cast<int>(scaleComp.y));
2200 		border.z = Int4(bit_cast<int>(scaleComp.z));
2201 		border.w = Int4(bit_cast<int>(scaleComp.w));
2202 		break;
2203 	case VK_BORDER_COLOR_INT_OPAQUE_WHITE:
2204 		border.x = Int4(1);
2205 		border.y = Int4(1);
2206 		border.z = Int4(1);
2207 		border.w = Int4(1);
2208 		break;
2209 	case VK_BORDER_COLOR_FLOAT_CUSTOM_EXT:
2210 		// This bit-casts from float to int in C++ code instead of Reactor code
2211 		// because Reactor does not guarantee preserving infinity (b/140302841).
2212 		border.x = Int4(bit_cast<int>(scaleComp.x * state.customBorder.float32[0]));
2213 		border.y = Int4(bit_cast<int>(scaleComp.y * state.customBorder.float32[1]));
2214 		border.z = Int4(bit_cast<int>(scaleComp.z * state.customBorder.float32[2]));
2215 		border.w = Int4(bit_cast<int>(scaleComp.w * state.customBorder.float32[3]));
2216 		break;
2217 	case VK_BORDER_COLOR_INT_CUSTOM_EXT:
2218 		border.x = Int4(state.customBorder.int32[0]);
2219 		border.y = Int4(state.customBorder.int32[1]);
2220 		border.z = Int4(state.customBorder.int32[2]);
2221 		border.w = Int4(state.customBorder.int32[3]);
2222 		break;
2223 	default:
2224 		UNSUPPORTED("sint/uint/sfloat border: %u", state.border);
2225 	}
2226 
2227 	Vector4f out;
2228 	out.x = As<Float4>((valid & As<Int4>(c.x)) | (~valid & border.x));  // TODO: IfThenElse()
2229 	out.y = As<Float4>((valid & As<Int4>(c.y)) | (~valid & border.y));
2230 	out.z = As<Float4>((valid & As<Int4>(c.z)) | (~valid & border.z));
2231 	out.w = As<Float4>((valid & As<Int4>(c.w)) | (~valid & border.w));
2232 
2233 	return out;
2234 }
2235 
// Returns a pointer to the Mipmap descriptor selected by the level of
// detail. When secondLOD is set, the next (smaller) level is returned,
// which is used for the second tap of trilinear filtering.
Pointer<Byte> SamplerCore::selectMipmap(const Pointer<Byte> &texture, const Float &lod, bool secondLOD)
{
	Pointer<Byte> baseLevel = texture + OFFSET(Texture, mipmap[0]);

	if(state.mipmapFilter == MIPMAP_NONE)
	{
		return baseLevel;
	}

	Int level;

	if(state.mipmapFilter == MIPMAP_POINT)
	{
		// TODO: Preferred formula is ceil(lod + 0.5) - 1
		level = RoundInt(lod);
	}
	else  // MIPMAP_LINEAR
	{
		level = Int(lod);
	}

	const int lodStep = secondLOD ? 1 : 0;

	return baseLevel + (level + lodStep) * sizeof(Mipmap);
}
2259 
computeFilterOffset(Float & lod)2260 Int4 SamplerCore::computeFilterOffset(Float &lod)
2261 {
2262 	if(state.textureFilter == FILTER_POINT)
2263 	{
2264 		return Int4(0);
2265 	}
2266 	else if(state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
2267 	{
2268 		return CmpNLE(Float4(lod), Float4(0.0f));
2269 	}
2270 	else if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR)
2271 	{
2272 		return CmpLE(Float4(lod), Float4(0.0f));
2273 	}
2274 
2275 	return Int4(~0);
2276 }
2277 
// Converts a normalized texture coordinate to a 16-bit fixed-point texel
// coordinate (1.0 maps to 0x10000, modulo wrapping), applying the given
// addressing mode. Used by the 16-bit sampling path.
Short4 SamplerCore::address(const Float4 &uw, AddressingMode addressingMode, Pointer<Byte> &mipmap)
{
	if(addressingMode == ADDRESSING_UNUSED)
	{
		return Short4(0);  // TODO(b/134669567): Optimize for 1D filtering
	}
	else if(addressingMode == ADDRESSING_CLAMP || addressingMode == ADDRESSING_BORDER)
	{
		// Clamp to just below 1.0 so the scaled value fits in 16 bits.
		Float4 clamp = Min(Max(uw, Float4(0.0f)), Float4(65535.0f / 65536.0f));

		return Short4(Int4(clamp * Float4(1 << 16)));
	}
	else if(addressingMode == ADDRESSING_MIRROR)
	{
		Int4 convert = Int4(uw * Float4(1 << 16));
		// Replicate bit 16 (the parity of the integer part, i.e. odd/even
		// mirror period) into a full lane mask.
		Int4 mirror = (convert << 15) >> 31;

		// XOR bit-inverts the fractional bits within odd periods, reflecting
		// the coordinate.
		convert ^= mirror;

		return Short4(convert);
	}
	else if(addressingMode == ADDRESSING_MIRRORONCE)
	{
		// Absolute value
		Int4 convert = Int4(Abs(uw * Float4(1 << 16)));

		// Clamp
		// Biasing by -0x8000 and packing with signed saturation effectively
		// clamps the unbiased value to [0, 0xFFFF]; the bias is undone below.
		convert -= Int4(0x00008000, 0x00008000, 0x00008000, 0x00008000);
		convert = As<Int4>(PackSigned(convert, convert));

		return As<Short4>(Int2(convert)) + Short4(0x8000u);
	}
	else  // Wrap
	{
		// Truncation to 16 bits performs the modulo-1.0 wrap.
		return Short4(Int4(uw * Float4(1 << 16)));
	}
}
2315 
computeLayerIndex16(const Float4 & a,Pointer<Byte> & mipmap)2316 Short4 SamplerCore::computeLayerIndex16(const Float4 &a, Pointer<Byte> &mipmap)
2317 {
2318 	if(!state.isArrayed())
2319 	{
2320 		return {};
2321 	}
2322 
2323 	Int4 layers = *Pointer<Int4>(mipmap + OFFSET(Mipmap, depth));
2324 
2325 	return Short4(Min(Max(RoundInt(a), Int4(0)), layers - Int4(1)));
2326 }
2327 
2328 // TODO: Eliminate when the gather + mirror addressing case is handled by mirroring the footprint.
mirror(Int4 n)2329 static Int4 mirror(Int4 n)
2330 {
2331 	auto positive = CmpNLT(n, Int4(0));
2332 	return (positive & n) | (~positive & (-(Int4(1) + n)));
2333 }
2334 
mod(Int4 n,Int4 d)2335 static Int4 mod(Int4 n, Int4 d)
2336 {
2337 	auto x = n % d;
2338 	auto positive = CmpNLT(x, Int4(0));
2339 	return (positive & x) | (~positive & (x + d));
2340 }
2341 
// Computes, for one coordinate axis, the integer texel coordinates xyz0
// (first tap) and xyz1 (second tap for linear filtering) and the
// interpolation fraction f, applying the given addressing mode.
// 'whd' is the byte offset of this axis' size field (width/height/depth)
// within the Mipmap structure. 'filter' is the mask from
// computeFilterOffset(): 0 for a single tap, ~0 where a second tap is needed.
void SamplerCore::address(const Float4 &uvw, Int4 &xyz0, Int4 &xyz1, Float4 &f, Pointer<Byte> &mipmap, Int4 &offset, Int4 &filter, int whd, AddressingMode addressingMode)
{
	if(addressingMode == ADDRESSING_UNUSED)
	{
		f = Float4(0.0f);  // TODO(b/134669567): Optimize for 1D filtering
		return;
	}

	Int4 dim = As<Int4>(*Pointer<UInt4>(mipmap + whd, 16));
	Int4 maxXYZ = dim - Int4(1);

	if(function == Fetch)  // Unnormalized coordinates
	{
		Int4 xyz = function.offset ? As<Int4>(uvw) + offset : As<Int4>(uvw);
		xyz0 = Min(Max(xyz, Int4(0)), maxXYZ);

		// VK_EXT_image_robustness requires checking for out-of-bounds accesses.
		// TODO(b/162327166): Only perform bounds checks when VK_EXT_image_robustness is enabled.
		// If the above clamping altered the result, the access is out-of-bounds.
		// In that case set the coordinate to -1 to perform texel replacement later.
		Int4 outOfBounds = CmpNEQ(xyz, xyz0);
		xyz0 |= outOfBounds;
	}
	else if(addressingMode == ADDRESSING_CUBEFACE)
	{
		// Cube face coordinates were already converted to texel space upstream.
		xyz0 = As<Int4>(uvw);
	}
	else
	{
		const int oneBits = 0x3F7FFFFF;  // Value just under 1.0f

		Float4 coord = uvw;

		if(state.unnormalizedCoordinates)
		{
			switch(addressingMode)
			{
			case ADDRESSING_CLAMP:
				coord = Min(Max(coord, Float4(0.0f)), Float4(dim) * As<Float4>(Int4(oneBits)));
				break;
			case ADDRESSING_BORDER:
				// Don't map to a valid range here.
				break;
			default:
				// "If unnormalizedCoordinates is VK_TRUE, addressModeU and addressModeV must each be
				//  either VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE or VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER"
				UNREACHABLE("addressingMode %d", int(addressingMode));
				break;
			}
		}
		else if(state.textureFilter == FILTER_GATHER && addressingMode == ADDRESSING_MIRROR)
		{
			// Gather requires the 'footprint' of the texels from which a component is taken, to also mirror around.
			// Therefore we can't just compute one texel's location and find the other ones at +1 offsets from it.
			// Here we handle that case separately by doing the mirroring per texel coordinate.
			// TODO: Mirror the footprint by adjusting the sign of the 0.5f and 1 offsets.

			coord = coord * Float4(dim);
			coord -= Float4(0.5f);
			Float4 floor = Floor(coord);
			xyz0 = Int4(floor);

			if(function.offset)
			{
				xyz0 += offset;
			}

			xyz1 = xyz0 + Int4(1);

			// Reduce to one 2*dim period, then mirror each coordinate into range.
			xyz0 = (maxXYZ)-mirror(mod(xyz0, Int4(2) * dim) - dim);
			xyz1 = (maxXYZ)-mirror(mod(xyz1, Int4(2) * dim) - dim);

			return;
		}
		else
		{
			if(!function.offset)
			{
				switch(addressingMode)
				{
				case ADDRESSING_CLAMP:
				case ADDRESSING_SEAMLESS:
					// While cube face coordinates are nominally already in the [0.0, 1.0] range
					// due to the projection, and numerical imprecision is tolerated due to the
					// border of pixels for seamless filtering, the projection doesn't cause
					// range normalization for Inf and NaN values. So we always clamp.
					{
						Float4 one = As<Float4>(Int4(oneBits));
						coord = Min(Max(coord, Float4(0.0f)), one);
					}
					break;
				case ADDRESSING_MIRROR:
					{
						// Fold the coordinate into [0, 1) with alternating direction.
						Float4 one = As<Float4>(Int4(oneBits));
						coord = coord * Float4(0.5f);
						coord = Float4(2.0f) * Abs(coord - Round(coord));
						coord = Min(coord, one);
					}
					break;
				case ADDRESSING_MIRRORONCE:
					{
						Float4 one = As<Float4>(Int4(oneBits));
						coord = Min(Abs(coord), one);
					}
					break;
				case ADDRESSING_BORDER:
					// Don't map to a valid range here.
					break;
				default:  // Wrap
					coord = Frac(coord);
					break;
				}
			}

			// Scale from normalized coordinates to texel space.
			coord = coord * Float4(dim);
		}

		if(state.textureFilter == FILTER_POINT)
		{
			if(addressingMode == ADDRESSING_BORDER || function.offset)
			{
				xyz0 = Int4(Floor(coord));
			}
			else  // Can't have negative coordinates, so floor() is redundant when casting to int.
			{
				xyz0 = Int4(coord);
			}
		}
		else
		{
			if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR ||
			   state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
			{
				// Only apply the half-texel shift on lanes where the linear tap is active.
				coord -= As<Float4>(As<Int4>(Float4(0.5f)) & filter);
			}
			else
			{
				coord -= Float4(0.5f);
			}

			Float4 floor = Floor(coord);
			xyz0 = Int4(floor);
			f = coord - floor;
		}

		if(function.offset)
		{
			xyz0 += offset;
		}

		if(addressingMode == ADDRESSING_SEAMLESS)  // Adjust for border.
		{
			xyz0 += Int4(1);
		}

		// filter is 0 or ~0 (-1) per lane, so subtracting it increments xyz0
		// by one exactly where a second tap is needed.
		xyz1 = xyz0 - filter;  // Increment

		if(addressingMode == ADDRESSING_BORDER)
		{
			// Replace the coordinates with -1 if they're out of range.
			Int4 border0 = CmpLT(xyz0, Int4(0)) | CmpNLT(xyz0, dim);
			Int4 border1 = CmpLT(xyz1, Int4(0)) | CmpNLT(xyz1, dim);
			xyz0 |= border0;
			xyz1 |= border1;
		}
		else if(function.offset)
		{
			switch(addressingMode)
			{
			case ADDRESSING_SEAMLESS:
				UNREACHABLE("addressingMode %d", int(addressingMode));  // Cube sampling doesn't support offset.
			case ADDRESSING_MIRROR:
			case ADDRESSING_MIRRORONCE:
				// TODO(b/29069044): Implement ADDRESSING_MIRROR and ADDRESSING_MIRRORONCE.
				// Fall through to Clamp.
			case ADDRESSING_CLAMP:
				xyz0 = Min(Max(xyz0, Int4(0)), maxXYZ);
				xyz1 = Min(Max(xyz1, Int4(0)), maxXYZ);
				break;
			default:  // Wrap
				xyz0 = mod(xyz0, dim);
				xyz1 = mod(xyz1, dim);
				break;
			}
		}
		else if(state.textureFilter != FILTER_POINT)
		{
			// Linear filtering can step one texel outside the valid range;
			// bring both taps back in range per the addressing mode.
			switch(addressingMode)
			{
			case ADDRESSING_SEAMLESS:
				break;
			case ADDRESSING_MIRROR:
			case ADDRESSING_MIRRORONCE:
			case ADDRESSING_CLAMP:
				xyz0 = Max(xyz0, Int4(0));
				xyz1 = Min(xyz1, maxXYZ);
				break;
			default:  // Wrap
				{
					Int4 under = CmpLT(xyz0, Int4(0));
					xyz0 = (under & maxXYZ) | (~under & xyz0);  // xyz < 0 ? dim - 1 : xyz   // TODO: IfThenElse()

					Int4 nover = CmpLT(xyz1, dim);
					xyz1 = nover & xyz1;  // xyz >= dim ? 0 : xyz
				}
				break;
			}
		}
	}
}
2552 
computeLayerIndex(const Float4 & a,Pointer<Byte> & mipmap)2553 Int4 SamplerCore::computeLayerIndex(const Float4 &a, Pointer<Byte> &mipmap)
2554 {
2555 	if(!state.isArrayed())
2556 	{
2557 		return {};
2558 	}
2559 
2560 	Int4 layers = *Pointer<Int4>(mipmap + OFFSET(Mipmap, depth), 16);
2561 	Int4 maxLayer = layers - Int4(1);
2562 
2563 	if(function == Fetch)  // Unnormalized coordinates
2564 	{
2565 		Int4 xyz = As<Int4>(a);
2566 		Int4 xyz0 = Min(Max(xyz, Int4(0)), maxLayer);
2567 
2568 		// VK_EXT_image_robustness requires checking for out-of-bounds accesses.
2569 		// TODO(b/162327166): Only perform bounds checks when VK_EXT_image_robustness is enabled.
2570 		// If the above clamping altered the result, the access is out-of-bounds.
2571 		// In that case set the coordinate to -1 to perform texel replacement later.
2572 		Int4 outOfBounds = CmpNEQ(xyz, xyz0);
2573 		xyz0 |= outOfBounds;
2574 
2575 		return xyz0;
2576 	}
2577 	else
2578 	{
2579 		return Min(Max(RoundInt(a), Int4(0)), maxLayer);
2580 	}
2581 }
2582 
sRGBtoLinearFF00(Short4 & c)2583 void SamplerCore::sRGBtoLinearFF00(Short4 &c)
2584 {
2585 	c = As<UShort4>(c) >> 8;
2586 
2587 	Pointer<Byte> LUT = Pointer<Byte>(constants + OFFSET(Constants, sRGBtoLinearFF_FF00));
2588 
2589 	c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 0))), 0);
2590 	c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 1))), 1);
2591 	c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 2))), 2);
2592 	c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 3))), 3);
2593 }
2594 
// True for normalized fixed-point (UNORM/SNORM) texture formats.
bool SamplerCore::hasNormalizedFormat() const
{
	return state.textureFormat.isSignedNormalized() || state.textureFormat.isUnsignedNormalized();
}
2599 
// True for floating-point texture formats.
bool SamplerCore::hasFloatTexture() const
{
	return state.textureFormat.isFloatFormat();
}
2604 
// True for unnormalized integer (UINT/SINT) texture formats.
bool SamplerCore::hasUnnormalizedIntegerTexture() const
{
	return state.textureFormat.isUnnormalizedInteger();
}
2609 
// True when the given component (0-3) of the texture format is unsigned.
bool SamplerCore::hasUnsignedTextureComponent(int component) const
{
	return state.textureFormat.isUnsignedComponent(component);
}
2614 
// Number of components in the texture format.
int SamplerCore::textureComponentCount() const
{
	return state.textureFormat.componentCount();
}
2619 
has16bitPackedTextureFormat() const2620 bool SamplerCore::has16bitPackedTextureFormat() const
2621 {
2622 	return state.textureFormat.has16bitPackedTextureFormat();
2623 }
2624 
has8bitTextureComponents() const2625 bool SamplerCore::has8bitTextureComponents() const
2626 {
2627 	return state.textureFormat.has8bitTextureComponents();
2628 }
2629 
has16bitTextureComponents() const2630 bool SamplerCore::has16bitTextureComponents() const
2631 {
2632 	return state.textureFormat.has16bitTextureComponents();
2633 }
2634 
has32bitIntegerTextureComponents() const2635 bool SamplerCore::has32bitIntegerTextureComponents() const
2636 {
2637 	return state.textureFormat.has32bitIntegerTextureComponents();
2638 }
2639 
isYcbcrFormat() const2640 bool SamplerCore::isYcbcrFormat() const
2641 {
2642 	return state.textureFormat.isYcbcrFormat();
2643 }
2644 
isRGBComponent(int component) const2645 bool SamplerCore::isRGBComponent(int component) const
2646 {
2647 	return state.textureFormat.isRGBComponent(component);
2648 }
2649 
borderModeActive() const2650 bool SamplerCore::borderModeActive() const
2651 {
2652 	return state.addressingModeU == ADDRESSING_BORDER ||
2653 	       state.addressingModeV == ADDRESSING_BORDER ||
2654 	       state.addressingModeW == ADDRESSING_BORDER;
2655 }
2656 
gatherSwizzle() const2657 VkComponentSwizzle SamplerCore::gatherSwizzle() const
2658 {
2659 	switch(state.gatherComponent)
2660 	{
2661 	case 0: return state.swizzle.r;
2662 	case 1: return state.swizzle.g;
2663 	case 2: return state.swizzle.b;
2664 	case 3: return state.swizzle.a;
2665 	default:
2666 		UNREACHABLE("Invalid component");
2667 		return VK_COMPONENT_SWIZZLE_R;
2668 	}
2669 }
2670 
getComponentScale() const2671 sw::float4 SamplerCore::getComponentScale() const
2672 {
2673 	// TODO(b/204709464): Unlike other formats, the fixed-point representation of the formats below are handled with bit extension.
2674 	// This special handling of such formats should be removed later.
2675 	switch(state.textureFormat)
2676 	{
2677 	case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
2678 	case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
2679 	case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16:
2680 		return sw::float4(0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF);
2681 	default:
2682 		break;
2683 	};
2684 
2685 	const sw::int4 bits = state.textureFormat.bitsPerComponent();
2686 	const sw::int4 shift = sw::int4(16 - bits.x, 16 - bits.y, 16 - bits.z, 16 - bits.w);
2687 	const uint16_t sign = state.textureFormat.isUnsigned() ? 0xFFFF : 0x7FFF;
2688 
2689 	return sw::float4(static_cast<uint16_t>(0xFFFF << shift.x) & sign,
2690 	                  static_cast<uint16_t>(0xFFFF << shift.y) & sign,
2691 	                  static_cast<uint16_t>(0xFFFF << shift.z) & sign,
2692 	                  static_cast<uint16_t>(0xFFFF << shift.w) & sign);
2693 }
2694 
getGatherComponent() const2695 int SamplerCore::getGatherComponent() const
2696 {
2697 	VkComponentSwizzle swizzle = gatherSwizzle();
2698 
2699 	switch(swizzle)
2700 	{
2701 	default: UNSUPPORTED("VkComponentSwizzle %d", (int)swizzle); return 0;
2702 	case VK_COMPONENT_SWIZZLE_R:
2703 	case VK_COMPONENT_SWIZZLE_G:
2704 	case VK_COMPONENT_SWIZZLE_B:
2705 	case VK_COMPONENT_SWIZZLE_A:
2706 		// Normalize all components using the gather component scale.
2707 		return swizzle - VK_COMPONENT_SWIZZLE_R;
2708 	case VK_COMPONENT_SWIZZLE_ZERO:
2709 	case VK_COMPONENT_SWIZZLE_ONE:
2710 		// These cases are handled later.
2711 		return 0;
2712 	}
2713 
2714 	return 0;
2715 }
2716 
2717 }  // namespace sw
2718