1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "Surface.hpp"
16 
17 #include "Color.hpp"
18 #include "Context.hpp"
19 #include "ETC_Decoder.hpp"
20 #include "Renderer.hpp"
21 #include "Common/Half.hpp"
22 #include "Common/Memory.hpp"
23 #include "Common/CPUID.hpp"
24 #include "Common/Resource.hpp"
25 #include "Common/Debug.hpp"
26 #include "Reactor/Reactor.hpp"
27 
28 #if defined(__i386__) || defined(__x86_64__)
29 	#include <xmmintrin.h>
30 	#include <emmintrin.h>
31 #endif
32 
33 #undef min
34 #undef max
35 
36 namespace sw
37 {
38 	extern bool quadLayoutEnabled;
39 	extern bool complementaryDepthBuffer;
40 	extern TranscendentalPrecision logPrecision;
41 
42 	unsigned int *Surface::palette = 0;
43 	unsigned int Surface::paletteID = 0;
44 
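	// Write 'color' to every sample of the texel at (x, y, z); consecutive samples are sliceB bytes apart.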
45 	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
46 	{
47 		ASSERT((x >= -border) && (x < (width + border)));
48 		ASSERT((y >= -border) && (y < (height + border)));
49 		ASSERT((z >= 0) && (z < depth));
50 
51 		byte *element = (byte*)buffer + (x + border) * bytes + (y + border) * pitchB + z * samples * sliceB;
52 
53 		for(int i = 0; i < samples; i++)
54 		{
55 			write(element, color);
56 			element += sliceB;
57 		}
58 	}
59 
60 	void Surface::Buffer::write(int x, int y, const Color<float> &color)
61 	{
62 		ASSERT((x >= -border) && (x < (width + border)));
63 		ASSERT((y >= -border) && (y < (height + border)));
64 
65 		byte *element = (byte*)buffer + (x + border) * bytes + (y + border) * pitchB;
66 
67 		for(int i = 0; i < samples; i++)
68 		{
69 			write(element, color);
70 			element += sliceB;
71 		}
72 	}
73 
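	// Convert 'color' to this buffer's format and store it at 'element'; sRGB formats are encoded from linear before packing.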
74 	inline void Surface::Buffer::write(void *element, const Color<float> &color)
75 	{
76 		float r = color.r;
77 		float g = color.g;
78 		float b = color.b;
79 		float a = color.a;
80 
81 		if(isSRGBformat(format))
82 		{
83 			r = linearToSRGB(r);
84 			g = linearToSRGB(g);
85 			b = linearToSRGB(b);
86 		}
87 
88 		switch(format)
89 		{
90 		case FORMAT_A8:
91 			*(unsigned char*)element = unorm<8>(a);
92 			break;
93 		case FORMAT_R8_SNORM:
94 			*(char*)element = snorm<8>(r);
95 			break;
96 		case FORMAT_R8:
97 			*(unsigned char*)element = unorm<8>(r);
98 			break;
99 		case FORMAT_R8I:
100 			*(char*)element = scast<8>(r);
101 			break;
102 		case FORMAT_R8UI:
103 			*(unsigned char*)element = ucast<8>(r);
104 			break;
105 		case FORMAT_R16I:
106 			*(short*)element = scast<16>(r);
107 			break;
108 		case FORMAT_R16UI:
109 			*(unsigned short*)element = ucast<16>(r);
110 			break;
111 		case FORMAT_R32I:
112 			*(int*)element = static_cast<int>(r);
113 			break;
114 		case FORMAT_R32UI:
115 			*(unsigned int*)element = static_cast<unsigned int>(r);
116 			break;
117 		case FORMAT_R3G3B2:
118 			*(unsigned char*)element = (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
119 			break;
120 		case FORMAT_A8R3G3B2:
121 			*(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
122 			break;
123 		case FORMAT_X4R4G4B4:
124 			*(unsigned short*)element = 0xF000 | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
125 			break;
126 		case FORMAT_A4R4G4B4:
127 			*(unsigned short*)element = (unorm<4>(a) << 12) | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
128 			break;
129 		case FORMAT_R4G4B4A4:
130 			*(unsigned short*)element = (unorm<4>(r) << 12) | (unorm<4>(g) << 8) | (unorm<4>(b) << 4) | (unorm<4>(a) << 0);
131 			break;
132 		case FORMAT_R5G6B5:
133 			*(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<6>(g) << 5) | (unorm<5>(b) << 0);
134 			break;
135 		case FORMAT_A1R5G5B5:
136 			*(unsigned short*)element = (unorm<1>(a) << 15) | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
137 			break;
138 		case FORMAT_R5G5B5A1:
139 			*(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<5>(g) << 6) | (unorm<5>(b) << 1) | (unorm<1>(a) << 0);
140 			break;
141 		case FORMAT_X1R5G5B5:
142 			*(unsigned short*)element = 0x8000 | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
143 			break;
144 		case FORMAT_A8R8G8B8:
145 			*(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
146 			break;
147 		case FORMAT_X8R8G8B8:
148 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
149 			break;
150 		case FORMAT_A8B8G8R8_SNORM:
151 			*(unsigned int*)element = (static_cast<unsigned int>(snorm<8>(a)) << 24) |
152 			                          (static_cast<unsigned int>(snorm<8>(b)) << 16) |
153 			                          (static_cast<unsigned int>(snorm<8>(g)) << 8) |
154 			                          (static_cast<unsigned int>(snorm<8>(r)) << 0);
155 			break;
156 		case FORMAT_A8B8G8R8:
157 		case FORMAT_SRGB8_A8:
158 			*(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
159 			break;
160 		case FORMAT_A8B8G8R8I:
161 			*(unsigned int*)element = (static_cast<unsigned int>(scast<8>(a)) << 24) |
162 			                          (static_cast<unsigned int>(scast<8>(b)) << 16) |
163 			                          (static_cast<unsigned int>(scast<8>(g)) << 8) |
164 			                          (static_cast<unsigned int>(scast<8>(r)) << 0);
165 			break;
166 		case FORMAT_A8B8G8R8UI:
167 			*(unsigned int*)element = (ucast<8>(a) << 24) | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
168 			break;
169 		case FORMAT_X8B8G8R8_SNORM:
170 			*(unsigned int*)element = 0x7F000000 |
171 			                          (static_cast<unsigned int>(snorm<8>(b)) << 16) |
172 			                          (static_cast<unsigned int>(snorm<8>(g)) << 8) |
173 			                          (static_cast<unsigned int>(snorm<8>(r)) << 0);
174 			break;
175 		case FORMAT_X8B8G8R8:
176 		case FORMAT_SRGB8_X8:
177 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
178 			break;
179 		case FORMAT_X8B8G8R8I:
180 			*(unsigned int*)element = 0x7F000000 |
181 			                          (static_cast<unsigned int>(scast<8>(b)) << 16) |
182 			                          (static_cast<unsigned int>(scast<8>(g)) << 8) |
183 			                          (static_cast<unsigned int>(scast<8>(r)) << 0);
			break;
184 		case FORMAT_X8B8G8R8UI:
185 			*(unsigned int*)element = 0xFF000000 | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
186 			break;
187 		case FORMAT_A2R10G10B10:
188 			*(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(r) << 20) | (unorm<10>(g) << 10) | (unorm<10>(b) << 0);
189 			break;
190 		case FORMAT_A2B10G10R10:
191 		case FORMAT_A2B10G10R10UI:
192 			*(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(b) << 20) | (unorm<10>(g) << 10) | (unorm<10>(r) << 0);
193 			break;
194 		case FORMAT_G8R8_SNORM:
195 			*(unsigned short*)element = (static_cast<unsigned short>(snorm<8>(g)) << 8) |
196 			                            (static_cast<unsigned short>(snorm<8>(r)) << 0);
197 			break;
198 		case FORMAT_G8R8:
199 			*(unsigned short*)element = (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
200 			break;
201 		case FORMAT_G8R8I:
202 			*(unsigned short*)element = (static_cast<unsigned short>(scast<8>(g)) << 8) |
203 			                            (static_cast<unsigned short>(scast<8>(r)) << 0);
204 			break;
205 		case FORMAT_G8R8UI:
206 			*(unsigned short*)element = (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
207 			break;
208 		case FORMAT_G16R16:
209 			*(unsigned int*)element = (unorm<16>(g) << 16) | (unorm<16>(r) << 0);
210 			break;
211 		case FORMAT_G16R16I:
212 			*(unsigned int*)element = (static_cast<unsigned int>(scast<16>(g)) << 16) |
213 			                          (static_cast<unsigned int>(scast<16>(r)) << 0);
214 			break;
215 		case FORMAT_G16R16UI:
216 			*(unsigned int*)element = (ucast<16>(g) << 16) | (ucast<16>(r) << 0);
217 			break;
218 		case FORMAT_G32R32I:
219 		case FORMAT_G32R32UI:
220 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
221 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
222 			break;
223 		case FORMAT_A16B16G16R16:
224 			((unsigned short*)element)[0] = unorm<16>(r);
225 			((unsigned short*)element)[1] = unorm<16>(g);
226 			((unsigned short*)element)[2] = unorm<16>(b);
227 			((unsigned short*)element)[3] = unorm<16>(a);
228 			break;
229 		case FORMAT_A16B16G16R16I:
230 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
231 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
232 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
233 			((unsigned short*)element)[3] = static_cast<unsigned short>(scast<16>(a));
234 			break;
235 		case FORMAT_A16B16G16R16UI:
236 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
237 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
238 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
239 			((unsigned short*)element)[3] = static_cast<unsigned short>(ucast<16>(a));
240 			break;
241 		case FORMAT_X16B16G16R16I:
242 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
243 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
244 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
245 			break;
246 		case FORMAT_X16B16G16R16UI:
247 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
248 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
249 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
250 			break;
251 		case FORMAT_A32B32G32R32I:
252 		case FORMAT_A32B32G32R32UI:
253 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
254 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
255 			((unsigned int*)element)[2] = static_cast<unsigned int>(b);
256 			((unsigned int*)element)[3] = static_cast<unsigned int>(a);
257 			break;
258 		case FORMAT_X32B32G32R32I:
259 		case FORMAT_X32B32G32R32UI:
260 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
261 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
262 			((unsigned int*)element)[2] = static_cast<unsigned int>(b);
263 			break;
264 		case FORMAT_V8U8:
265 			*(unsigned short*)element = (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
266 			break;
267 		case FORMAT_L6V5U5:
268 			*(unsigned short*)element = (unorm<6>(b) << 10) | (snorm<5>(g) << 5) | (snorm<5>(r) << 0);
269 			break;
270 		case FORMAT_Q8W8V8U8:
271 			*(unsigned int*)element = (snorm<8>(a) << 24) | (snorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
272 			break;
273 		case FORMAT_X8L8V8U8:
274 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
275 			break;
276 		case FORMAT_V16U16:
277 			*(unsigned int*)element = (snorm<16>(g) << 16) | (snorm<16>(r) << 0);
278 			break;
279 		case FORMAT_A2W10V10U10:
280 			*(unsigned int*)element = (unorm<2>(a) << 30) | (snorm<10>(b) << 20) | (snorm<10>(g) << 10) | (snorm<10>(r) << 0);
281 			break;
282 		case FORMAT_A16W16V16U16:
283 			((unsigned short*)element)[0] = snorm<16>(r);
284 			((unsigned short*)element)[1] = snorm<16>(g);
285 			((unsigned short*)element)[2] = snorm<16>(b);
286 			((unsigned short*)element)[3] = unorm<16>(a);
287 			break;
288 		case FORMAT_Q16W16V16U16:
289 			((unsigned short*)element)[0] = snorm<16>(r);
290 			((unsigned short*)element)[1] = snorm<16>(g);
291 			((unsigned short*)element)[2] = snorm<16>(b);
292 			((unsigned short*)element)[3] = snorm<16>(a);
293 			break;
294 		case FORMAT_R8G8B8:
295 			((unsigned char*)element)[0] = unorm<8>(b);
296 			((unsigned char*)element)[1] = unorm<8>(g);
297 			((unsigned char*)element)[2] = unorm<8>(r);
298 			break;
299 		case FORMAT_B8G8R8:
300 			((unsigned char*)element)[0] = unorm<8>(r);
301 			((unsigned char*)element)[1] = unorm<8>(g);
302 			((unsigned char*)element)[2] = unorm<8>(b);
303 			break;
304 		case FORMAT_R16F:
305 			*(half*)element = (half)r;
306 			break;
307 		case FORMAT_A16F:
308 			*(half*)element = (half)a;
309 			break;
310 		case FORMAT_G16R16F:
311 			((half*)element)[0] = (half)r;
312 			((half*)element)[1] = (half)g;
313 			break;
314 		case FORMAT_X16B16G16R16F_UNSIGNED:
315 			r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
316 			// Fall through to FORMAT_X16B16G16R16F.
317 		case FORMAT_X16B16G16R16F:
318 			((half*)element)[3] = 1.0f;
319 			// Fall through to FORMAT_B16G16R16F.
320 		case FORMAT_B16G16R16F:
321 			((half*)element)[0] = (half)r;
322 			((half*)element)[1] = (half)g;
323 			((half*)element)[2] = (half)b;
324 			break;
325 		case FORMAT_A16B16G16R16F:
326 			((half*)element)[0] = (half)r;
327 			((half*)element)[1] = (half)g;
328 			((half*)element)[2] = (half)b;
329 			((half*)element)[3] = (half)a;
330 			break;
331 		case FORMAT_A32F:
332 			*(float*)element = a;
333 			break;
334 		case FORMAT_R32F:
335 			*(float*)element = r;
336 			break;
337 		case FORMAT_G32R32F:
338 			((float*)element)[0] = r;
339 			((float*)element)[1] = g;
340 			break;
341 		case FORMAT_X32B32G32R32F_UNSIGNED:
342 			r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
343 			// Fall through to FORMAT_X32B32G32R32F.
344 		case FORMAT_X32B32G32R32F:
345 			((float*)element)[3] = 1.0f;
346 			// Fall through to FORMAT_B32G32R32F.
347 		case FORMAT_B32G32R32F:
348 			((float*)element)[0] = r;
349 			((float*)element)[1] = g;
350 			((float*)element)[2] = b;
351 			break;
352 		case FORMAT_A32B32G32R32F:
353 			((float*)element)[0] = r;
354 			((float*)element)[1] = g;
355 			((float*)element)[2] = b;
356 			((float*)element)[3] = a;
357 			break;
358 		case FORMAT_D32F:
359 		case FORMAT_D32FS8:
360 		case FORMAT_D32F_LOCKABLE:
361 		case FORMAT_D32FS8_TEXTURE:
362 		case FORMAT_D32F_SHADOW:
363 		case FORMAT_D32FS8_SHADOW:
364 			*((float*)element) = r;
365 			break;
366 		case FORMAT_D32F_COMPLEMENTARY:
367 		case FORMAT_D32FS8_COMPLEMENTARY:
368 			*((float*)element) = 1 - r;
369 			break;
370 		case FORMAT_S8:
371 			*((unsigned char*)element) = unorm<8>(r);
372 			break;
373 		case FORMAT_L8:
374 			*(unsigned char*)element = unorm<8>(r);
375 			break;
376 		case FORMAT_A4L4:
377 			*(unsigned char*)element = (unorm<4>(a) << 4) | (unorm<4>(r) << 0);
378 			break;
379 		case FORMAT_L16:
380 			*(unsigned short*)element = unorm<16>(r);
381 			break;
382 		case FORMAT_A8L8:
383 			*(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<8>(r) << 0);
384 			break;
385 		case FORMAT_L16F:
386 			*(half*)element = (half)r;
387 			break;
388 		case FORMAT_A16L16F:
389 			((half*)element)[0] = (half)r;
390 			((half*)element)[1] = (half)a;
391 			break;
392 		case FORMAT_L32F:
393 			*(float*)element = r;
394 			break;
395 		case FORMAT_A32L32F:
396 			((float*)element)[0] = r;
397 			((float*)element)[1] = a;
398 			break;
399 		default:
400 			ASSERT(false);
401 		}
402 	}
403 
404 	Color<float> Surface::Buffer::read(int x, int y, int z) const
405 	{
406 		ASSERT((x >= -border) && (x < (width + border)));
407 		ASSERT((y >= -border) && (y < (height + border)));
408 		ASSERT((z >= 0) && (z < depth));
409 
410 		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB + z * samples * sliceB;
411 
412 		return read(element);
413 	}
414 
415 	Color<float> Surface::Buffer::read(int x, int y) const
416 	{
417 		ASSERT((x >= -border) && (x < (width + border)));
418 		ASSERT((y >= -border) && (y < (height + border)));
419 
420 		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB;
421 
422 		return read(element);
423 	}
424 
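	// Decode the texel at 'element' into a floating-point color. Channels the format lacks keep the defaults (0, 0, 0, 1), and sRGB formats are converted back to linear.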
425 	inline Color<float> Surface::Buffer::read(void *element) const
426 	{
427 		float r = 0.0f;
428 		float g = 0.0f;
429 		float b = 0.0f;
430 		float a = 1.0f;
431 
432 		switch(format)
433 		{
434 		case FORMAT_P8:
435 			{
436 				ASSERT(palette);
437 
438 				unsigned int abgr = palette[*(unsigned char*)element];
439 
440 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
441 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
442 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
443 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
444 			}
445 			break;
446 		case FORMAT_A8P8:
447 			{
448 				ASSERT(palette);
449 
450 				unsigned int bgr = palette[((unsigned char*)element)[0]];
451 
452 				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
453 				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
454 				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
455 				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
456 			}
457 			break;
458 		case FORMAT_A8:
459 			r = 0;
460 			g = 0;
461 			b = 0;
462 			a = *(unsigned char*)element * (1.0f / 0xFF);
463 			break;
464 		case FORMAT_R8_SNORM:
465 			r = max((*(signed char*)element) * (1.0f / 0x7F), -1.0f);
466 			break;
467 		case FORMAT_R8:
468 			r = *(unsigned char*)element * (1.0f / 0xFF);
469 			break;
470 		case FORMAT_R8I:
471 			r = *(signed char*)element;
472 			break;
473 		case FORMAT_R8UI:
474 			r = *(unsigned char*)element;
475 			break;
476 		case FORMAT_R3G3B2:
477 			{
478 				unsigned char rgb = *(unsigned char*)element;
479 
480 				r = (rgb & 0xE0) * (1.0f / 0xE0);
481 				g = (rgb & 0x1C) * (1.0f / 0x1C);
482 				b = (rgb & 0x03) * (1.0f / 0x03);
483 			}
484 			break;
485 		case FORMAT_A8R3G3B2:
486 			{
487 				unsigned short argb = *(unsigned short*)element;
488 
489 				a = (argb & 0xFF00) * (1.0f / 0xFF00);
490 				r = (argb & 0x00E0) * (1.0f / 0x00E0);
491 				g = (argb & 0x001C) * (1.0f / 0x001C);
492 				b = (argb & 0x0003) * (1.0f / 0x0003);
493 			}
494 			break;
495 		case FORMAT_X4R4G4B4:
496 			{
497 				unsigned short rgb = *(unsigned short*)element;
498 
499 				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
500 				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
501 				b = (rgb & 0x000F) * (1.0f / 0x000F);
502 			}
503 			break;
504 		case FORMAT_A4R4G4B4:
505 			{
506 				unsigned short argb = *(unsigned short*)element;
507 
508 				a = (argb & 0xF000) * (1.0f / 0xF000);
509 				r = (argb & 0x0F00) * (1.0f / 0x0F00);
510 				g = (argb & 0x00F0) * (1.0f / 0x00F0);
511 				b = (argb & 0x000F) * (1.0f / 0x000F);
512 			}
513 			break;
514 		case FORMAT_R4G4B4A4:
515 			{
516 				unsigned short rgba = *(unsigned short*)element;
517 
518 				r = (rgba & 0xF000) * (1.0f / 0xF000);
519 				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
520 				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
521 				a = (rgba & 0x000F) * (1.0f / 0x000F);
522 			}
523 			break;
524 		case FORMAT_R5G6B5:
525 			{
526 				unsigned short rgb = *(unsigned short*)element;
527 
528 				r = (rgb & 0xF800) * (1.0f / 0xF800);
529 				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
530 				b = (rgb & 0x001F) * (1.0f / 0x001F);
531 			}
532 			break;
533 		case FORMAT_A1R5G5B5:
534 			{
535 				unsigned short argb = *(unsigned short*)element;
536 
537 				a = (argb & 0x8000) * (1.0f / 0x8000);
538 				r = (argb & 0x7C00) * (1.0f / 0x7C00);
539 				g = (argb & 0x03E0) * (1.0f / 0x03E0);
540 				b = (argb & 0x001F) * (1.0f / 0x001F);
541 			}
542 			break;
543 		case FORMAT_R5G5B5A1:
544 			{
545 				unsigned short rgba = *(unsigned short*)element;
546 
547 				r = (rgba & 0xF800) * (1.0f / 0xF800);
548 				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
549 				b = (rgba & 0x003E) * (1.0f / 0x003E);
550 				a = (rgba & 0x0001) * (1.0f / 0x0001);
551 			}
552 			break;
553 		case FORMAT_X1R5G5B5:
554 			{
555 				unsigned short xrgb = *(unsigned short*)element;
556 
557 				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
558 				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
559 				b = (xrgb & 0x001F) * (1.0f / 0x001F);
560 			}
561 			break;
562 		case FORMAT_A8R8G8B8:
563 			{
564 				unsigned int argb = *(unsigned int*)element;
565 
566 				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
567 				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
568 				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
569 				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
570 			}
571 			break;
572 		case FORMAT_X8R8G8B8:
573 			{
574 				unsigned int xrgb = *(unsigned int*)element;
575 
576 				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
577 				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
578 				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
579 			}
580 			break;
581 		case FORMAT_A8B8G8R8_SNORM:
582 			{
583 				signed char* abgr = (signed char*)element;
584 
585 				r = max(abgr[0] * (1.0f / 0x7F), -1.0f);
586 				g = max(abgr[1] * (1.0f / 0x7F), -1.0f);
587 				b = max(abgr[2] * (1.0f / 0x7F), -1.0f);
588 				a = max(abgr[3] * (1.0f / 0x7F), -1.0f);
589 			}
590 			break;
591 		case FORMAT_A8B8G8R8:
592 		case FORMAT_SRGB8_A8:
593 			{
594 				unsigned int abgr = *(unsigned int*)element;
595 
596 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
597 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
598 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
599 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
600 			}
601 			break;
602 		case FORMAT_A8B8G8R8I:
603 			{
604 				signed char* abgr = (signed char*)element;
605 
606 				r = abgr[0];
607 				g = abgr[1];
608 				b = abgr[2];
609 				a = abgr[3];
610 			}
611 			break;
612 		case FORMAT_A8B8G8R8UI:
613 			{
614 				unsigned char* abgr = (unsigned char*)element;
615 
616 				r = abgr[0];
617 				g = abgr[1];
618 				b = abgr[2];
619 				a = abgr[3];
620 			}
621 			break;
622 		case FORMAT_X8B8G8R8_SNORM:
623 			{
624 				signed char* bgr = (signed char*)element;
625 
626 				r = max(bgr[0] * (1.0f / 0x7F), -1.0f);
627 				g = max(bgr[1] * (1.0f / 0x7F), -1.0f);
628 				b = max(bgr[2] * (1.0f / 0x7F), -1.0f);
629 			}
630 			break;
631 		case FORMAT_X8B8G8R8:
632 		case FORMAT_SRGB8_X8:
633 			{
634 				unsigned int xbgr = *(unsigned int*)element;
635 
636 				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
637 				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
638 				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
639 			}
640 			break;
641 		case FORMAT_X8B8G8R8I:
642 			{
643 				signed char* bgr = (signed char*)element;
644 
645 				r = bgr[0];
646 				g = bgr[1];
647 				b = bgr[2];
648 			}
649 			break;
650 		case FORMAT_X8B8G8R8UI:
651 			{
652 				unsigned char* bgr = (unsigned char*)element;
653 
654 				r = bgr[0];
655 				g = bgr[1];
656 				b = bgr[2];
657 			}
658 			break;
659 		case FORMAT_G8R8_SNORM:
660 			{
661 				signed char* gr = (signed char*)element;
662 
663 				r = max(gr[0] * (1.0f / 0x7F), -1.0f);
664 				g = max(gr[1] * (1.0f / 0x7F), -1.0f);
665 			}
666 			break;
667 		case FORMAT_G8R8:
668 			{
669 				unsigned short gr = *(unsigned short*)element;
670 
671 				g = (gr & 0xFF00) * (1.0f / 0xFF00);
672 				r = (gr & 0x00FF) * (1.0f / 0x00FF);
673 			}
674 			break;
675 		case FORMAT_G8R8I:
676 			{
677 				signed char* gr = (signed char*)element;
678 
679 				r = gr[0];
680 				g = gr[1];
681 			}
682 			break;
683 		case FORMAT_G8R8UI:
684 			{
685 				unsigned char* gr = (unsigned char*)element;
686 
687 				r = gr[0];
688 				g = gr[1];
689 			}
690 			break;
691 		case FORMAT_R16I:
692 			r = *((short*)element);
693 			break;
694 		case FORMAT_R16UI:
695 			r = *((unsigned short*)element);
696 			break;
697 		case FORMAT_G16R16I:
698 			{
699 				short* gr = (short*)element;
700 
701 				r = gr[0];
702 				g = gr[1];
703 			}
704 			break;
705 		case FORMAT_G16R16:
706 			{
707 				unsigned int gr = *(unsigned int*)element;
708 
709 				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
710 				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
711 			}
712 			break;
713 		case FORMAT_G16R16UI:
714 			{
715 				unsigned short* gr = (unsigned short*)element;
716 
717 				r = gr[0];
718 				g = gr[1];
719 			}
720 			break;
721 		case FORMAT_A2R10G10B10:
722 			{
723 				unsigned int argb = *(unsigned int*)element;
724 
725 				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
726 				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
727 				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
728 				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
729 			}
730 			break;
731 		case FORMAT_A2B10G10R10:
732 			{
733 				unsigned int abgr = *(unsigned int*)element;
734 
735 				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
736 				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
737 				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
738 				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
739 			}
740 			break;
741 		case FORMAT_A2B10G10R10UI:
742 			{
743 				unsigned int abgr = *(unsigned int*)element;
744 
745 				a = static_cast<float>((abgr & 0xC0000000) >> 30);
746 				b = static_cast<float>((abgr & 0x3FF00000) >> 20);
747 				g = static_cast<float>((abgr & 0x000FFC00) >> 10);
748 				r = static_cast<float>(abgr & 0x000003FF);
749 			}
750 			break;
751 		case FORMAT_A16B16G16R16I:
752 			{
753 				short* abgr = (short*)element;
754 
755 				r = abgr[0];
756 				g = abgr[1];
757 				b = abgr[2];
758 				a = abgr[3];
759 			}
760 			break;
761 		case FORMAT_A16B16G16R16:
762 			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
763 			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
764 			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
765 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
766 			break;
767 		case FORMAT_A16B16G16R16UI:
768 			{
769 				unsigned short* abgr = (unsigned short*)element;
770 
771 				r = abgr[0];
772 				g = abgr[1];
773 				b = abgr[2];
774 				a = abgr[3];
775 			}
776 			break;
777 		case FORMAT_X16B16G16R16I:
778 			{
779 				short* bgr = (short*)element;
780 
781 				r = bgr[0];
782 				g = bgr[1];
783 				b = bgr[2];
784 			}
785 			break;
786 		case FORMAT_X16B16G16R16UI:
787 			{
788 				unsigned short* bgr = (unsigned short*)element;
789 
790 				r = bgr[0];
791 				g = bgr[1];
792 				b = bgr[2];
793 			}
794 			break;
795 		case FORMAT_A32B32G32R32I:
796 			{
797 				int* abgr = (int*)element;
798 
799 				r = static_cast<float>(abgr[0]);
800 				g = static_cast<float>(abgr[1]);
801 				b = static_cast<float>(abgr[2]);
802 				a = static_cast<float>(abgr[3]);
803 			}
804 			break;
805 		case FORMAT_A32B32G32R32UI:
806 			{
807 				unsigned int* abgr = (unsigned int*)element;
808 
809 				r = static_cast<float>(abgr[0]);
810 				g = static_cast<float>(abgr[1]);
811 				b = static_cast<float>(abgr[2]);
812 				a = static_cast<float>(abgr[3]);
813 			}
814 			break;
815 		case FORMAT_X32B32G32R32I:
816 			{
817 				int* bgr = (int*)element;
818 
819 				r = static_cast<float>(bgr[0]);
820 				g = static_cast<float>(bgr[1]);
821 				b = static_cast<float>(bgr[2]);
822 			}
823 			break;
824 		case FORMAT_X32B32G32R32UI:
825 			{
826 				unsigned int* bgr = (unsigned int*)element;
827 
828 				r = static_cast<float>(bgr[0]);
829 				g = static_cast<float>(bgr[1]);
830 				b = static_cast<float>(bgr[2]);
831 			}
832 			break;
833 		case FORMAT_G32R32I:
834 			{
835 				int* gr = (int*)element;
836 
837 				r = static_cast<float>(gr[0]);
838 				g = static_cast<float>(gr[1]);
839 			}
840 			break;
841 		case FORMAT_G32R32UI:
842 			{
843 				unsigned int* gr = (unsigned int*)element;
844 
845 				r = static_cast<float>(gr[0]);
846 				g = static_cast<float>(gr[1]);
847 			}
848 			break;
849 		case FORMAT_R32I:
850 			r = static_cast<float>(*((int*)element));
851 			break;
852 		case FORMAT_R32UI:
853 			r = static_cast<float>(*((unsigned int*)element));
854 			break;
855 		case FORMAT_V8U8:
856 			{
857 				unsigned short vu = *(unsigned short*)element;
858 
859 				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
860 				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
861 			}
862 			break;
863 		case FORMAT_L6V5U5:
864 			{
865 				unsigned short lvu = *(unsigned short*)element;
866 
867 				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
868 				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
869 				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
870 			}
871 			break;
872 		case FORMAT_Q8W8V8U8:
873 			{
874 				unsigned int qwvu = *(unsigned int*)element;
875 
876 				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
877 				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
878 				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
879 				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
880 			}
881 			break;
882 		case FORMAT_X8L8V8U8:
883 			{
884 				unsigned int xlvu = *(unsigned int*)element;
885 
886 				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
887 				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
888 				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
889 			}
890 			break;
891 		case FORMAT_R8G8B8:
892 			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
893 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
894 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
895 			break;
896 		case FORMAT_B8G8R8:
897 			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
898 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
899 			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
900 			break;
901 		case FORMAT_V16U16:
902 			{
903 				unsigned int vu = *(unsigned int*)element;
904 
905 				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
906 				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
907 			}
908 			break;
909 		case FORMAT_A2W10V10U10:
910 			{
911 				unsigned int awvu = *(unsigned int*)element;
912 
913 				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
914 				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
915 				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
916 				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
917 			}
918 			break;
919 		case FORMAT_A16W16V16U16:
920 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
921 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
922 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
923 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
924 			break;
925 		case FORMAT_Q16W16V16U16:
926 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
927 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
928 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
929 			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
930 			break;
931 		case FORMAT_L8:
932 			r =
933 			g =
934 			b = *(unsigned char*)element * (1.0f / 0xFF);
935 			break;
936 		case FORMAT_A4L4:
937 			{
938 				unsigned char al = *(unsigned char*)element;
939 
940 				r =
941 				g =
942 				b = (al & 0x0F) * (1.0f / 0x0F);
943 				a = (al & 0xF0) * (1.0f / 0xF0);
944 			}
945 			break;
946 		case FORMAT_L16:
947 			r =
948 			g =
949 			b = *(unsigned short*)element * (1.0f / 0xFFFF);
950 			break;
951 		case FORMAT_A8L8:
952 			r =
953 			g =
954 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
955 			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
956 			break;
957 		case FORMAT_L16F:
958 			r =
959 			g =
960 			b = *(half*)element;
961 			break;
962 		case FORMAT_A16L16F:
963 			r =
964 			g =
965 			b = ((half*)element)[0];
966 			a = ((half*)element)[1];
967 			break;
968 		case FORMAT_L32F:
969 			r =
970 			g =
971 			b = *(float*)element;
972 			break;
973 		case FORMAT_A32L32F:
974 			r =
975 			g =
976 			b = ((float*)element)[0];
977 			a = ((float*)element)[1];
978 			break;
979 		case FORMAT_A16F:
980 			a = *(half*)element;
981 			break;
982 		case FORMAT_R16F:
983 			r = *(half*)element;
984 			break;
985 		case FORMAT_G16R16F:
986 			r = ((half*)element)[0];
987 			g = ((half*)element)[1];
988 			break;
989 		case FORMAT_X16B16G16R16F:
990 		case FORMAT_X16B16G16R16F_UNSIGNED:
991 		case FORMAT_B16G16R16F:
992 			r = ((half*)element)[0];
993 			g = ((half*)element)[1];
994 			b = ((half*)element)[2];
995 			break;
996 		case FORMAT_A16B16G16R16F:
997 			r = ((half*)element)[0];
998 			g = ((half*)element)[1];
999 			b = ((half*)element)[2];
1000 			a = ((half*)element)[3];
1001 			break;
1002 		case FORMAT_A32F:
1003 			a = *(float*)element;
1004 			break;
1005 		case FORMAT_R32F:
1006 			r = *(float*)element;
1007 			break;
1008 		case FORMAT_G32R32F:
1009 			r = ((float*)element)[0];
1010 			g = ((float*)element)[1];
1011 			break;
1012 		case FORMAT_X32B32G32R32F:
1013 		case FORMAT_X32B32G32R32F_UNSIGNED:
1014 		case FORMAT_B32G32R32F:
1015 			r = ((float*)element)[0];
1016 			g = ((float*)element)[1];
1017 			b = ((float*)element)[2];
1018 			break;
1019 		case FORMAT_A32B32G32R32F:
1020 			r = ((float*)element)[0];
1021 			g = ((float*)element)[1];
1022 			b = ((float*)element)[2];
1023 			a = ((float*)element)[3];
1024 			break;
1025 		case FORMAT_D32F:
1026 		case FORMAT_D32FS8:
1027 		case FORMAT_D32F_LOCKABLE:
1028 		case FORMAT_D32FS8_TEXTURE:
1029 		case FORMAT_D32F_SHADOW:
1030 		case FORMAT_D32FS8_SHADOW:
1031 			r = *(float*)element;
1032 			g = r;
1033 			b = r;
1034 			a = r;
1035 			break;
1036 		case FORMAT_D32F_COMPLEMENTARY:
1037 		case FORMAT_D32FS8_COMPLEMENTARY:
1038 			r = 1.0f - *(float*)element;
1039 			g = r;
1040 			b = r;
1041 			a = r;
1042 			break;
1043 		case FORMAT_S8:
1044 			r = *(unsigned char*)element * (1.0f / 0xFF);
1045 			break;
1046 		default:
1047 			ASSERT(false);
1048 		}
1049 
1050 		if(isSRGBformat(format))
1051 		{
1052 			r = sRGBtoLinear(r);
1053 			g = sRGBtoLinear(g);
1054 			b = sRGBtoLinear(b);
1055 		}
1056 
1057 		return Color<float>(r, g, b, a);
1058 	}
1059 
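	// Sample at unnormalized (texel-space) coordinates using trilinear interpolation, clamped to the buffer edges.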
1060 	Color<float> Surface::Buffer::sample(float x, float y, float z) const
1061 	{
1062 		x -= 0.5f;
1063 		y -= 0.5f;
1064 		z -= 0.5f;
1065 
1066 		int x0 = clamp((int)x, 0, width - 1);
1067 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1068 
1069 		int y0 = clamp((int)y, 0, height - 1);
1070 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1071 
1072 		int z0 = clamp((int)z, 0, depth - 1);
1073 		int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
1074 
1075 		Color<float> c000 = read(x0, y0, z0);
1076 		Color<float> c100 = read(x1, y0, z0);
1077 		Color<float> c010 = read(x0, y1, z0);
1078 		Color<float> c110 = read(x1, y1, z0);
1079 		Color<float> c001 = read(x0, y0, z1);
1080 		Color<float> c101 = read(x1, y0, z1);
1081 		Color<float> c011 = read(x0, y1, z1);
1082 		Color<float> c111 = read(x1, y1, z1);
1083 
1084 		float fx = x - x0;
1085 		float fy = y - y0;
1086 		float fz = z - z0;
1087 
1088 		c000 *= (1 - fx) * (1 - fy) * (1 - fz);
1089 		c100 *= fx * (1 - fy) * (1 - fz);
1090 		c010 *= (1 - fx) * fy * (1 - fz);
1091 		c110 *= fx * fy * (1 - fz);
1092 		c001 *= (1 - fx) * (1 - fy) * fz;
1093 		c101 *= fx * (1 - fy) * fz;
1094 		c011 *= (1 - fx) * fy * fz;
1095 		c111 *= fx * fy * fz;
1096 
1097 		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
1098 	}
1099 
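	// Sample the given layer at unnormalized coordinates using bilinear interpolation, clamped to the buffer edges.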
1100 	Color<float> Surface::Buffer::sample(float x, float y, int layer) const
1101 	{
1102 		x -= 0.5f;
1103 		y -= 0.5f;
1104 
1105 		int x0 = clamp((int)x, 0, width - 1);
1106 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1107 
1108 		int y0 = clamp((int)y, 0, height - 1);
1109 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1110 
1111 		Color<float> c00 = read(x0, y0, layer);
1112 		Color<float> c10 = read(x1, y0, layer);
1113 		Color<float> c01 = read(x0, y1, layer);
1114 		Color<float> c11 = read(x1, y1, layer);
1115 
1116 		float fx = x - x0;
1117 		float fy = y - y0;
1118 
1119 		c00 *= (1 - fx) * (1 - fy);
1120 		c10 *= fx * (1 - fy);
1121 		c01 *= (1 - fx) * fy;
1122 		c11 *= fx * fy;
1123 
1124 		return c00 + c10 + c01 + c11;
1125 	}
1126 
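	// Return a pointer to the texel at (x, y, z), or to its containing block for compressed formats, and record the lock mode; write locks mark the buffer dirty.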
1127 	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
1128 	{
1129 		this->lock = lock;
1130 
1131 		switch(lock)
1132 		{
1133 		case LOCK_UNLOCKED:
1134 		case LOCK_READONLY:
1135 		case LOCK_UPDATE:
1136 			break;
1137 		case LOCK_WRITEONLY:
1138 		case LOCK_READWRITE:
1139 		case LOCK_DISCARD:
1140 			dirty = true;
1141 			break;
1142 		default:
1143 			ASSERT(false);
1144 		}
1145 
1146 		if(buffer)
1147 		{
1148 			x += border;
1149 			y += border;
1150 
1151 			switch(format)
1152 			{
1153 			case FORMAT_DXT1:
1154 			case FORMAT_ATI1:
1155 			case FORMAT_ETC1:
1156 			case FORMAT_R11_EAC:
1157 			case FORMAT_SIGNED_R11_EAC:
1158 			case FORMAT_RGB8_ETC2:
1159 			case FORMAT_SRGB8_ETC2:
1160 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1161 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1162 				return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1163 			case FORMAT_RG11_EAC:
1164 			case FORMAT_SIGNED_RG11_EAC:
1165 			case FORMAT_RGBA8_ETC2_EAC:
1166 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1167 			case FORMAT_RGBA_ASTC_4x4_KHR:
1168 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1169 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1170 			case FORMAT_RGBA_ASTC_5x4_KHR:
1171 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1172 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 4) * pitchB + z * sliceB;
1173 			case FORMAT_RGBA_ASTC_5x5_KHR:
1174 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1175 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 5) * pitchB + z * sliceB;
1176 			case FORMAT_RGBA_ASTC_6x5_KHR:
1177 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1178 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 5) * pitchB + z * sliceB;
1179 			case FORMAT_RGBA_ASTC_6x6_KHR:
1180 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1181 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 6) * pitchB + z * sliceB;
1182 			case FORMAT_RGBA_ASTC_8x5_KHR:
1183 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1184 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 5) * pitchB + z * sliceB;
1185 			case FORMAT_RGBA_ASTC_8x6_KHR:
1186 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1187 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 6) * pitchB + z * sliceB;
1188 			case FORMAT_RGBA_ASTC_8x8_KHR:
1189 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1190 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 8) * pitchB + z * sliceB;
1191 			case FORMAT_RGBA_ASTC_10x5_KHR:
1192 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1193 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 5) * pitchB + z * sliceB;
1194 			case FORMAT_RGBA_ASTC_10x6_KHR:
1195 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1196 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 6) * pitchB + z * sliceB;
1197 			case FORMAT_RGBA_ASTC_10x8_KHR:
1198 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1199 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 8) * pitchB + z * sliceB;
1200 			case FORMAT_RGBA_ASTC_10x10_KHR:
1201 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1202 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 10) * pitchB + z * sliceB;
1203 			case FORMAT_RGBA_ASTC_12x10_KHR:
1204 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1205 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 10) * pitchB + z * sliceB;
1206 			case FORMAT_RGBA_ASTC_12x12_KHR:
1207 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1208 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 12) * pitchB + z * sliceB;
1209 			case FORMAT_DXT3:
1210 			case FORMAT_DXT5:
1211 			case FORMAT_ATI2:
1212 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1213 			default:
1214 				return (unsigned char*)buffer + x * bytes + y * pitchB + z * samples * sliceB;
1215 			}
1216 		}
1217 
1218 		return nullptr;
1219 	}
1220 
1221 	void Surface::Buffer::unlockRect()
1222 	{
1223 		lock = LOCK_UNLOCKED;
1224 	}
1225 
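	// Concrete Surface; its lockInternal()/unlockInternal() overrides simply forward to the base class implementations.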
1226 	class SurfaceImplementation : public Surface
1227 	{
1228 	public:
1229 		SurfaceImplementation(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
1230 			: Surface(width, height, depth, format, pixels, pitch, slice) {}
1231 		SurfaceImplementation(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchP = 0)
1232 			: Surface(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchP) {}
1233 		~SurfaceImplementation() override {}
1234 
1235 		void *lockInternal(int x, int y, int z, Lock lock, Accessor client) override
1236 		{
1237 			return Surface::lockInternal(x, y, z, lock, client);
1238 		}
1239 
1240 		void unlockInternal() override
1241 		{
1242 			Surface::unlockInternal();
1243 		}
1244 	};
1245 
1246 	Surface *Surface::create(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
1247 	{
1248 		return new SurfaceImplementation(width, height, depth, format, pixels, pitch, slice);
1249 	}
1250 
1251 	Surface *Surface::create(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided)
1252 	{
1253 		return new SurfaceImplementation(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchPprovided);
1254 	}
1255 
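	// Wrap client-provided pixel data: the external buffer aliases 'pixels' and starts out dirty so the internal copy is refreshed on first use.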
1256 	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
1257 	{
1258 		resource = new Resource(0);
1259 		hasParent = false;
1260 		ownExternal = false;
1261 		depth = max(1, depth);
1262 
1263 		external.buffer = pixels;
1264 		external.width = width;
1265 		external.height = height;
1266 		external.depth = depth;
1267 		external.samples = 1;
1268 		external.format = format;
1269 		external.bytes = bytes(external.format);
1270 		external.pitchB = pitch;
1271 		external.pitchP = external.bytes ? pitch / external.bytes : 0;
1272 		external.sliceB = slice;
1273 		external.sliceP = external.bytes ? slice / external.bytes : 0;
1274 		external.border = 0;
1275 		external.lock = LOCK_UNLOCKED;
1276 		external.dirty = true;
1277 
1278 		internal.buffer = nullptr;
1279 		internal.width = width;
1280 		internal.height = height;
1281 		internal.depth = depth;
1282 		internal.samples = 1;
1283 		internal.format = selectInternalFormat(format);
1284 		internal.bytes = bytes(internal.format);
1285 		internal.pitchB = pitchB(internal.width, 0, internal.format, false);
1286 		internal.pitchP = pitchP(internal.width, 0, internal.format, false);
1287 		internal.sliceB = sliceB(internal.width, internal.height, 0, internal.format, false);
1288 		internal.sliceP = sliceP(internal.width, internal.height, 0, internal.format, false);
1289 		internal.border = 0;
1290 		internal.lock = LOCK_UNLOCKED;
1291 		internal.dirty = false;
1292 
1293 		stencil.buffer = nullptr;
1294 		stencil.width = width;
1295 		stencil.height = height;
1296 		stencil.depth = depth;
1297 		stencil.samples = 1;
1298 		stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
1299 		stencil.bytes = bytes(stencil.format);
1300 		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, false);
1301 		stencil.pitchP = pitchP(stencil.width, 0, stencil.format, false);
1302 		stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, false);
1303 		stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, false);
1304 		stencil.border = 0;
1305 		stencil.lock = LOCK_UNLOCKED;
1306 		stencil.dirty = false;
1307 
1308 		dirtyContents = true;
1309 		paletteUsed = 0;
1310 	}
1311 
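	// Create a surface that owns its storage; the external, internal and stencil buffers are allocated lazily on first lock.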
1312 	Surface::Surface(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided) : lockable(lockable), renderTarget(renderTarget)
1313 	{
1314 		resource = texture ? texture : new Resource(0);
1315 		hasParent = texture != nullptr;
1316 		ownExternal = true;
1317 		depth = max(1, depth);
1318 		samples = max(1, samples);
1319 
1320 		external.buffer = nullptr;
1321 		external.width = width;
1322 		external.height = height;
1323 		external.depth = depth;
1324 		external.samples = (short)samples;
1325 		external.format = format;
1326 		external.bytes = bytes(external.format);
1327 		external.pitchB = !pitchPprovided ? pitchB(external.width, 0, external.format, renderTarget && !texture) : pitchPprovided * external.bytes;
1328 		external.pitchP = !pitchPprovided ? pitchP(external.width, 0, external.format, renderTarget && !texture) : pitchPprovided;
1329 		external.sliceB = sliceB(external.width, external.height, 0, external.format, renderTarget && !texture);
1330 		external.sliceP = sliceP(external.width, external.height, 0, external.format, renderTarget && !texture);
1331 		external.border = 0;
1332 		external.lock = LOCK_UNLOCKED;
1333 		external.dirty = false;
1334 
1335 		internal.buffer = nullptr;
1336 		internal.width = width;
1337 		internal.height = height;
1338 		internal.depth = depth;
1339 		internal.samples = (short)samples;
1340 		internal.format = selectInternalFormat(format);
1341 		internal.bytes = bytes(internal.format);
1342 		internal.pitchB = !pitchPprovided ? pitchB(internal.width, border, internal.format, renderTarget) : pitchPprovided * internal.bytes;
1343 		internal.pitchP = !pitchPprovided ? pitchP(internal.width, border, internal.format, renderTarget) : pitchPprovided;
1344 		internal.sliceB = sliceB(internal.width, internal.height, border, internal.format, renderTarget);
1345 		internal.sliceP = sliceP(internal.width, internal.height, border, internal.format, renderTarget);
1346 		internal.border = (short)border;
1347 		internal.lock = LOCK_UNLOCKED;
1348 		internal.dirty = false;
1349 
1350 		stencil.buffer = nullptr;
1351 		stencil.width = width;
1352 		stencil.height = height;
1353 		stencil.depth = depth;
1354 		stencil.samples = (short)samples;
1355 		stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
1356 		stencil.bytes = bytes(stencil.format);
1357 		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, renderTarget);
1358 		stencil.pitchP = pitchP(stencil.width, 0, stencil.format, renderTarget);
1359 		stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, renderTarget);
1360 		stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, renderTarget);
1361 		stencil.border = 0;
1362 		stencil.lock = LOCK_UNLOCKED;
1363 		stencil.dirty = false;
1364 
1365 		dirtyContents = true;
1366 		paletteUsed = 0;
1367 	}
1368 
1369 	Surface::~Surface()
1370 	{
1371 		// sync() must be called before this destructor to ensure all locks have been released.
1372 		// We can't call it here because the parent resource may already have been destroyed.
1373 		ASSERT(isUnlocked());
1374 
1375 		if(!hasParent)
1376 		{
1377 			resource->destruct();
1378 		}
1379 
1380 		if(ownExternal)
1381 		{
1382 			deallocate(external.buffer);
1383 		}
1384 
1385 		if(internal.buffer != external.buffer)
1386 		{
1387 			deallocate(internal.buffer);
1388 		}
1389 
1390 		deallocate(stencil.buffer);
1391 
1392 		external.buffer = nullptr;
1393 		internal.buffer = nullptr;
1394 		stencil.buffer = nullptr;
1395 	}
1396 
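	// Lock the client-visible buffer, allocating it on demand and updating it from the internal buffer when that holds newer data.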
1397 	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
1398 	{
1399 		resource->lock(client);
1400 
1401 		if(!external.buffer)
1402 		{
1403 			if(internal.buffer && identicalBuffers())
1404 			{
1405 				external.buffer = internal.buffer;
1406 			}
1407 			else
1408 			{
1409 				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.border, external.samples, external.format);
1410 			}
1411 		}
1412 
1413 		if(internal.dirty)
1414 		{
1415 			if(lock != LOCK_DISCARD)
1416 			{
1417 				update(external, internal);
1418 			}
1419 
1420 			internal.dirty = false;
1421 		}
1422 
1423 		switch(lock)
1424 		{
1425 		case LOCK_READONLY:
1426 			break;
1427 		case LOCK_WRITEONLY:
1428 		case LOCK_READWRITE:
1429 		case LOCK_DISCARD:
1430 			dirtyContents = true;
1431 			break;
1432 		default:
1433 			ASSERT(false);
1434 		}
1435 
1436 		return external.lockRect(x, y, z, lock);
1437 	}
1438 
1439 	void Surface::unlockExternal()
1440 	{
1441 		external.unlockRect();
1442 
1443 		resource->unlock();
1444 	}
1445 
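	// Lock the internal (device-format) buffer, allocating it on demand and updating it from the external buffer when the client has written newer data.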
1446 	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
1447 	{
1448 		if(lock != LOCK_UNLOCKED)
1449 		{
1450 			resource->lock(client);
1451 		}
1452 
1453 		if(!internal.buffer)
1454 		{
1455 			if(external.buffer && identicalBuffers())
1456 			{
1457 				internal.buffer = external.buffer;
1458 			}
1459 			else
1460 			{
1461 				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.border, internal.samples, internal.format);
1462 			}
1463 		}
1464 
1465 		// FIXME: WHQL requires conversion to lower external precision and back
1466 		if(logPrecision >= WHQL)
1467 		{
1468 			if(internal.dirty && renderTarget && internal.format != external.format)
1469 			{
1470 				if(lock != LOCK_DISCARD)
1471 				{
1472 					switch(external.format)
1473 					{
1474 					case FORMAT_R3G3B2:
1475 					case FORMAT_A8R3G3B2:
1476 					case FORMAT_A1R5G5B5:
1477 					case FORMAT_A2R10G10B10:
1478 					case FORMAT_A2B10G10R10:
1479 						lockExternal(0, 0, 0, LOCK_READWRITE, client);
1480 						unlockExternal();
1481 						break;
1482 					default:
1483 						// Difference passes WHQL
1484 						break;
1485 					}
1486 				}
1487 			}
1488 		}
1489 
1490 		if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
1491 		{
1492 			if(lock != LOCK_DISCARD)
1493 			{
1494 				update(internal, external);
1495 			}
1496 
1497 			external.dirty = false;
1498 			paletteUsed = Surface::paletteID;
1499 		}
1500 
1501 		switch(lock)
1502 		{
1503 		case LOCK_UNLOCKED:
1504 		case LOCK_READONLY:
1505 			break;
1506 		case LOCK_WRITEONLY:
1507 		case LOCK_READWRITE:
1508 		case LOCK_DISCARD:
1509 			dirtyContents = true;
1510 			break;
1511 		default:
1512 			ASSERT(false);
1513 		}
1514 
1515 		if(lock == LOCK_READONLY && client == PUBLIC)
1516 		{
1517 			resolve();
1518 		}
1519 
1520 		return internal.lockRect(x, y, z, lock);
1521 	}
1522 
1523 	void Surface::unlockInternal()
1524 	{
1525 		internal.unlockRect();
1526 
1527 		resource->unlock();
1528 	}
1529 
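	// Lock the separate stencil plane, allocating it on demand; returns nullptr for formats without stencil.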
1530 	void *Surface::lockStencil(int x, int y, int front, Accessor client)
1531 	{
1532 		resource->lock(client);
1533 
1534 		if(stencil.format == FORMAT_NULL)
1535 		{
1536 			return nullptr;
1537 		}
1538 
1539 		if(!stencil.buffer)
1540 		{
1541 			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.border, stencil.samples, stencil.format);
1542 		}
1543 
1544 		return stencil.lockRect(x, y, front, LOCK_READWRITE);   // FIXME
1545 	}
1546 
1547 	void Surface::unlockStencil()
1548 	{
1549 		stencil.unlockRect();
1550 
1551 		resource->unlock();
1552 	}
1553 
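	// Storage size of one texel in bytes; block-compressed formats report the bytes covering a one-texel-wide column of a block (ASTC is not yet handled).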
1554 	int Surface::bytes(Format format)
1555 	{
1556 		switch(format)
1557 		{
1558 		case FORMAT_NULL:				return 0;
1559 		case FORMAT_P8:					return 1;
1560 		case FORMAT_A8P8:				return 2;
1561 		case FORMAT_A8:					return 1;
1562 		case FORMAT_R8I:				return 1;
1563 		case FORMAT_R8:					return 1;
1564 		case FORMAT_R3G3B2:				return 1;
1565 		case FORMAT_R16I:				return 2;
1566 		case FORMAT_R16UI:				return 2;
1567 		case FORMAT_A8R3G3B2:			return 2;
1568 		case FORMAT_R5G6B5:				return 2;
1569 		case FORMAT_A1R5G5B5:			return 2;
1570 		case FORMAT_X1R5G5B5:			return 2;
1571 		case FORMAT_R5G5B5A1:           return 2;
1572 		case FORMAT_X4R4G4B4:			return 2;
1573 		case FORMAT_A4R4G4B4:			return 2;
1574 		case FORMAT_R4G4B4A4:           return 2;
1575 		case FORMAT_R8G8B8:				return 3;
1576 		case FORMAT_B8G8R8:             return 3;
1577 		case FORMAT_R32I:				return 4;
1578 		case FORMAT_R32UI:				return 4;
1579 		case FORMAT_X8R8G8B8:			return 4;
1580 	//	case FORMAT_X8G8R8B8Q:			return 4;
1581 		case FORMAT_A8R8G8B8:			return 4;
1582 	//	case FORMAT_A8G8R8B8Q:			return 4;
1583 		case FORMAT_X8B8G8R8I:			return 4;
1584 		case FORMAT_X8B8G8R8:			return 4;
1585 		case FORMAT_SRGB8_X8:			return 4;
1586 		case FORMAT_SRGB8_A8:			return 4;
1587 		case FORMAT_A8B8G8R8I:			return 4;
1588 		case FORMAT_R8UI:				return 1;
1589 		case FORMAT_G8R8UI:				return 2;
1590 		case FORMAT_X8B8G8R8UI:			return 4;
1591 		case FORMAT_A8B8G8R8UI:			return 4;
1592 		case FORMAT_A8B8G8R8:			return 4;
1593 		case FORMAT_R8_SNORM:			return 1;
1594 		case FORMAT_G8R8_SNORM:		return 2;
1595 		case FORMAT_X8B8G8R8_SNORM:	return 4;
1596 		case FORMAT_A8B8G8R8_SNORM:	return 4;
1597 		case FORMAT_A2R10G10B10:		return 4;
1598 		case FORMAT_A2B10G10R10:		return 4;
1599 		case FORMAT_A2B10G10R10UI:		return 4;
1600 		case FORMAT_G8R8I:				return 2;
1601 		case FORMAT_G8R8:				return 2;
1602 		case FORMAT_G16R16I:			return 4;
1603 		case FORMAT_G16R16UI:			return 4;
1604 		case FORMAT_G16R16:				return 4;
1605 		case FORMAT_G32R32I:			return 8;
1606 		case FORMAT_G32R32UI:			return 8;
1607 		case FORMAT_X16B16G16R16I:		return 8;
1608 		case FORMAT_X16B16G16R16UI:		return 8;
1609 		case FORMAT_A16B16G16R16I:		return 8;
1610 		case FORMAT_A16B16G16R16UI:		return 8;
1611 		case FORMAT_A16B16G16R16:		return 8;
1612 		case FORMAT_X32B32G32R32I:		return 16;
1613 		case FORMAT_X32B32G32R32UI:		return 16;
1614 		case FORMAT_A32B32G32R32I:		return 16;
1615 		case FORMAT_A32B32G32R32UI:		return 16;
1616 		// Compressed formats
1617 		case FORMAT_DXT1:				return 2;   // Column of four pixels
1618 		case FORMAT_DXT3:				return 4;   // Column of four pixels
1619 		case FORMAT_DXT5:				return 4;   // Column of four pixels
1620 		case FORMAT_ATI1:				return 2;   // Column of four pixels
1621 		case FORMAT_ATI2:				return 4;   // Column of four pixels
1622 		case FORMAT_ETC1:				return 2;   // Column of four pixels
1623 		case FORMAT_R11_EAC:			return 2;
1624 		case FORMAT_SIGNED_R11_EAC:		return 2;
1625 		case FORMAT_RG11_EAC:			return 4;
1626 		case FORMAT_SIGNED_RG11_EAC:	return 4;
1627 		case FORMAT_RGB8_ETC2:			return 2;
1628 		case FORMAT_SRGB8_ETC2:			return 2;
1629 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1630 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1631 		case FORMAT_RGBA8_ETC2_EAC:			return 4;
1632 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:	return 4;
1633 		case FORMAT_RGBA_ASTC_4x4_KHR:
1634 		case FORMAT_RGBA_ASTC_5x4_KHR:
1635 		case FORMAT_RGBA_ASTC_5x5_KHR:
1636 		case FORMAT_RGBA_ASTC_6x5_KHR:
1637 		case FORMAT_RGBA_ASTC_6x6_KHR:
1638 		case FORMAT_RGBA_ASTC_8x5_KHR:
1639 		case FORMAT_RGBA_ASTC_8x6_KHR:
1640 		case FORMAT_RGBA_ASTC_8x8_KHR:
1641 		case FORMAT_RGBA_ASTC_10x5_KHR:
1642 		case FORMAT_RGBA_ASTC_10x6_KHR:
1643 		case FORMAT_RGBA_ASTC_10x8_KHR:
1644 		case FORMAT_RGBA_ASTC_10x10_KHR:
1645 		case FORMAT_RGBA_ASTC_12x10_KHR:
1646 		case FORMAT_RGBA_ASTC_12x12_KHR:
1647 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1648 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1649 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1650 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1651 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1652 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1653 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1654 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1655 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1656 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1657 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1658 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1659 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1660 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: return 0; // FIXME
1661 		// Bumpmap formats
1662 		case FORMAT_V8U8:				return 2;
1663 		case FORMAT_L6V5U5:				return 2;
1664 		case FORMAT_Q8W8V8U8:			return 4;
1665 		case FORMAT_X8L8V8U8:			return 4;
1666 		case FORMAT_A2W10V10U10:		return 4;
1667 		case FORMAT_V16U16:				return 4;
1668 		case FORMAT_A16W16V16U16:		return 8;
1669 		case FORMAT_Q16W16V16U16:		return 8;
1670 		// Luminance formats
1671 		case FORMAT_L8:					return 1;
1672 		case FORMAT_A4L4:				return 1;
1673 		case FORMAT_L16:				return 2;
1674 		case FORMAT_A8L8:				return 2;
1675 		case FORMAT_L16F:               return 2;
1676 		case FORMAT_A16L16F:            return 4;
1677 		case FORMAT_L32F:               return 4;
1678 		case FORMAT_A32L32F:            return 8;
1679 		// Floating-point formats
1680 		case FORMAT_A16F:				return 2;
1681 		case FORMAT_R16F:				return 2;
1682 		case FORMAT_G16R16F:			return 4;
1683 		case FORMAT_B16G16R16F:			return 6;
1684 		case FORMAT_X16B16G16R16F:		return 8;
1685 		case FORMAT_A16B16G16R16F:		return 8;
1686 		case FORMAT_X16B16G16R16F_UNSIGNED: return 8;
1687 		case FORMAT_A32F:				return 4;
1688 		case FORMAT_R32F:				return 4;
1689 		case FORMAT_G32R32F:			return 8;
1690 		case FORMAT_B32G32R32F:			return 12;
1691 		case FORMAT_X32B32G32R32F:		return 16;
1692 		case FORMAT_A32B32G32R32F:		return 16;
1693 		case FORMAT_X32B32G32R32F_UNSIGNED: return 16;
1694 		// Depth/stencil formats
1695 		case FORMAT_D16:				return 2;
1696 		case FORMAT_D32:				return 4;
1697 		case FORMAT_D24X8:				return 4;
1698 		case FORMAT_D24S8:				return 4;
1699 		case FORMAT_D24FS8:				return 4;
1700 		case FORMAT_D32F:				return 4;
1701 		case FORMAT_D32FS8:				return 4;
1702 		case FORMAT_D32F_COMPLEMENTARY:	return 4;
1703 		case FORMAT_D32FS8_COMPLEMENTARY: return 4;
1704 		case FORMAT_D32F_LOCKABLE:		return 4;
1705 		case FORMAT_D32FS8_TEXTURE:		return 4;
1706 		case FORMAT_D32F_SHADOW:		return 4;
1707 		case FORMAT_D32FS8_SHADOW:		return 4;
1708 		case FORMAT_DF24S8:				return 4;
1709 		case FORMAT_DF16S8:				return 2;
1710 		case FORMAT_INTZ:				return 4;
1711 		case FORMAT_S8:					return 1;
1712 		case FORMAT_YV12_BT601:         return 1;   // Y plane only
1713 		case FORMAT_YV12_BT709:         return 1;   // Y plane only
1714 		case FORMAT_YV12_JFIF:          return 1;   // Y plane only
1715 		default:
1716 			ASSERT(false);
1717 		}
1718 
1719 		return 0;
1720 	}
1721 
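	// Computes the row pitch in bytes, including the border. Render targets and
	// depth/stencil surfaces are rounded up to an even width (2x2 quads), and
	// block-compressed formats return the byte pitch of one row of blocks.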
1722 	int Surface::pitchB(int width, int border, Format format, bool target)
1723 	{
1724 		width += 2 * border;
1725 
1726 		// Render targets require 2x2 quads
1727 		if(target || isDepth(format) || isStencil(format))
1728 		{
1729 			width = align<2>(width);
1730 		}
1731 
1732 		switch(format)
1733 		{
1734 		case FORMAT_DXT1:
1735 		case FORMAT_ETC1:
1736 		case FORMAT_R11_EAC:
1737 		case FORMAT_SIGNED_R11_EAC:
1738 		case FORMAT_RGB8_ETC2:
1739 		case FORMAT_SRGB8_ETC2:
1740 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1741 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1742 			return 8 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per 4 rows
1743 		case FORMAT_RG11_EAC:
1744 		case FORMAT_SIGNED_RG11_EAC:
1745 		case FORMAT_RGBA8_ETC2_EAC:
1746 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1747 		case FORMAT_RGBA_ASTC_4x4_KHR:
1748 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1749 			return 16 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per 4 rows
1750 		case FORMAT_RGBA_ASTC_5x4_KHR:
1751 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1752 		case FORMAT_RGBA_ASTC_5x5_KHR:
1753 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1754 			return 16 * ((width + 4) / 5);
1755 		case FORMAT_RGBA_ASTC_6x5_KHR:
1756 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1757 		case FORMAT_RGBA_ASTC_6x6_KHR:
1758 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1759 			return 16 * ((width + 5) / 6);
1760 		case FORMAT_RGBA_ASTC_8x5_KHR:
1761 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1762 		case FORMAT_RGBA_ASTC_8x6_KHR:
1763 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1764 		case FORMAT_RGBA_ASTC_8x8_KHR:
1765 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1766 			return 16 * ((width + 7) / 8);
1767 		case FORMAT_RGBA_ASTC_10x5_KHR:
1768 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1769 		case FORMAT_RGBA_ASTC_10x6_KHR:
1770 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1771 		case FORMAT_RGBA_ASTC_10x8_KHR:
1772 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1773 		case FORMAT_RGBA_ASTC_10x10_KHR:
1774 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1775 			return 16 * ((width + 9) / 10);
1776 		case FORMAT_RGBA_ASTC_12x10_KHR:
1777 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1778 		case FORMAT_RGBA_ASTC_12x12_KHR:
1779 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1780 			return 16 * ((width + 11) / 12);
1781 		case FORMAT_DXT3:
1782 		case FORMAT_DXT5:
1783 			return 16 * ((width + 3) / 4);   // 128 bit per 4x4 block, computed per 4 rows
1784 		case FORMAT_ATI1:
1785 			return 2 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per row
1786 		case FORMAT_ATI2:
1787 			return 4 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per row
1788 		case FORMAT_YV12_BT601:
1789 		case FORMAT_YV12_BT709:
1790 		case FORMAT_YV12_JFIF:
1791 			return align<16>(width);
1792 		default:
1793 			return bytes(format) * width;
1794 		}
1795 	}
1796 
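	// Row pitch in elements (pixels): the byte pitch divided by the per-element
	// size, or 0 for formats without a fixed per-element size.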
1797 	int Surface::pitchP(int width, int border, Format format, bool target)
1798 	{
1799 		int B = bytes(format);
1800 
1801 		return B > 0 ? pitchB(width, border, format, target) / B : 0;
1802 	}
1803 
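	// Computes the size of one depth slice in bytes. For block-compressed formats
	// the height is rounded up to a whole number of block rows.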
1804 	int Surface::sliceB(int width, int height, int border, Format format, bool target)
1805 	{
1806 		height += 2 * border;
1807 
1808 		// Render targets require 2x2 quads
1809 		if(target || isDepth(format) || isStencil(format))
1810 		{
1811 			height = align<2>(height);
1812 		}
1813 
1814 		switch(format)
1815 		{
1816 		case FORMAT_DXT1:
1817 		case FORMAT_DXT3:
1818 		case FORMAT_DXT5:
1819 		case FORMAT_ETC1:
1820 		case FORMAT_R11_EAC:
1821 		case FORMAT_SIGNED_R11_EAC:
1822 		case FORMAT_RG11_EAC:
1823 		case FORMAT_SIGNED_RG11_EAC:
1824 		case FORMAT_RGB8_ETC2:
1825 		case FORMAT_SRGB8_ETC2:
1826 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1827 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1828 		case FORMAT_RGBA8_ETC2_EAC:
1829 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1830 		case FORMAT_RGBA_ASTC_4x4_KHR:
1831 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1832 		case FORMAT_RGBA_ASTC_5x4_KHR:
1833 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1834 			return pitchB(width, border, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
1835 		case FORMAT_RGBA_ASTC_5x5_KHR:
1836 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1837 		case FORMAT_RGBA_ASTC_6x5_KHR:
1838 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1839 		case FORMAT_RGBA_ASTC_8x5_KHR:
1840 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1841 		case FORMAT_RGBA_ASTC_10x5_KHR:
1842 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1843 			return pitchB(width, border, format, target) * ((height + 4) / 5);   // Pitch computed per 5 rows
1844 		case FORMAT_RGBA_ASTC_6x6_KHR:
1845 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1846 		case FORMAT_RGBA_ASTC_8x6_KHR:
1847 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1848 		case FORMAT_RGBA_ASTC_10x6_KHR:
1849 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1850 			return pitchB(width, border, format, target) * ((height + 5) / 6);   // Pitch computed per 6 rows
1851 		case FORMAT_RGBA_ASTC_8x8_KHR:
1852 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1853 		case FORMAT_RGBA_ASTC_10x8_KHR:
1854 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1855 			return pitchB(width, border, format, target) * ((height + 7) / 8);   // Pitch computed per 8 rows
1856 		case FORMAT_RGBA_ASTC_10x10_KHR:
1857 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1858 		case FORMAT_RGBA_ASTC_12x10_KHR:
1859 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1860 			return pitchB(width, border, format, target) * ((height + 9) / 10);   // Pitch computed per 10 rows
1861 		case FORMAT_RGBA_ASTC_12x12_KHR:
1862 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1863 			return pitchB(width, border, format, target) * ((height + 11) / 12);   // Pitch computed per 12 rows
1864 		case FORMAT_ATI1:
1865 		case FORMAT_ATI2:
1866 			return pitchB(width, border, format, target) * align<4>(height);   // Pitch computed per row
1867 		default:
1868 			return pitchB(width, border, format, target) * height;   // Pitch computed per row
1869 		}
1870 	}
1871 
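	// Slice size in elements (pixels): the byte slice size divided by the
	// per-element size, or 0 for formats without a fixed per-element size.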
1872 	int Surface::sliceP(int width, int height, int border, Format format, bool target)
1873 	{
1874 		int B = bytes(format);
1875 
1876 		return B > 0 ? sliceB(width, height, border, format, target) / B : 0;
1877 	}
1878 
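	// Copies the source buffer into the destination buffer, decoding palettized,
	// packed 16-bit and block-compressed source formats into the internal layout.
	// Buffers sharing the same storage are left untouched.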
1879 	void Surface::update(Buffer &destination, Buffer &source)
1880 	{
1881 	//	ASSERT(source.lock != LOCK_UNLOCKED);
1882 	//	ASSERT(destination.lock != LOCK_UNLOCKED);
1883 
1884 		if(destination.buffer != source.buffer)
1885 		{
1886 			ASSERT(source.dirty && !destination.dirty);
1887 
1888 			switch(source.format)
1889 			{
1890 			case FORMAT_R8G8B8:		decodeR8G8B8(destination, source);		break;   // FIXME: Check destination format
1891 			case FORMAT_X1R5G5B5:	decodeX1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1892 			case FORMAT_A1R5G5B5:	decodeA1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1893 			case FORMAT_X4R4G4B4:	decodeX4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1894 			case FORMAT_A4R4G4B4:	decodeA4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1895 			case FORMAT_P8:			decodeP8(destination, source);			break;   // FIXME: Check destination format
1896 			case FORMAT_DXT1:		decodeDXT1(destination, source);		break;   // FIXME: Check destination format
1897 			case FORMAT_DXT3:		decodeDXT3(destination, source);		break;   // FIXME: Check destination format
1898 			case FORMAT_DXT5:		decodeDXT5(destination, source);		break;   // FIXME: Check destination format
1899 			case FORMAT_ATI1:		decodeATI1(destination, source);		break;   // FIXME: Check destination format
1900 			case FORMAT_ATI2:		decodeATI2(destination, source);		break;   // FIXME: Check destination format
1901 			case FORMAT_R11_EAC:         decodeEAC(destination, source, 1, false); break; // FIXME: Check destination format
1902 			case FORMAT_SIGNED_R11_EAC:  decodeEAC(destination, source, 1, true);  break; // FIXME: Check destination format
1903 			case FORMAT_RG11_EAC:        decodeEAC(destination, source, 2, false); break; // FIXME: Check destination format
1904 			case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true);  break; // FIXME: Check destination format
1905 			case FORMAT_ETC1:
1906 			case FORMAT_RGB8_ETC2:                      decodeETC2(destination, source, 0, false); break; // FIXME: Check destination format
1907 			case FORMAT_SRGB8_ETC2:                     decodeETC2(destination, source, 0, true);  break; // FIXME: Check destination format
1908 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:  decodeETC2(destination, source, 1, false); break; // FIXME: Check destination format
1909 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, true);  break; // FIXME: Check destination format
1910 			case FORMAT_RGBA8_ETC2_EAC:                 decodeETC2(destination, source, 8, false); break; // FIXME: Check destination format
1911 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:          decodeETC2(destination, source, 8, true);  break; // FIXME: Check destination format
1912 			case FORMAT_RGBA_ASTC_4x4_KHR:           decodeASTC(destination, source, 4,  4,  1, false); break; // FIXME: Check destination format
1913 			case FORMAT_RGBA_ASTC_5x4_KHR:           decodeASTC(destination, source, 5,  4,  1, false); break; // FIXME: Check destination format
1914 			case FORMAT_RGBA_ASTC_5x5_KHR:           decodeASTC(destination, source, 5,  5,  1, false); break; // FIXME: Check destination format
1915 			case FORMAT_RGBA_ASTC_6x5_KHR:           decodeASTC(destination, source, 6,  5,  1, false); break; // FIXME: Check destination format
1916 			case FORMAT_RGBA_ASTC_6x6_KHR:           decodeASTC(destination, source, 6,  6,  1, false); break; // FIXME: Check destination format
1917 			case FORMAT_RGBA_ASTC_8x5_KHR:           decodeASTC(destination, source, 8,  5,  1, false); break; // FIXME: Check destination format
1918 			case FORMAT_RGBA_ASTC_8x6_KHR:           decodeASTC(destination, source, 8,  6,  1, false); break; // FIXME: Check destination format
1919 			case FORMAT_RGBA_ASTC_8x8_KHR:           decodeASTC(destination, source, 8,  8,  1, false); break; // FIXME: Check destination format
1920 			case FORMAT_RGBA_ASTC_10x5_KHR:          decodeASTC(destination, source, 10, 5,  1, false); break; // FIXME: Check destination format
1921 			case FORMAT_RGBA_ASTC_10x6_KHR:          decodeASTC(destination, source, 10, 6,  1, false); break; // FIXME: Check destination format
1922 			case FORMAT_RGBA_ASTC_10x8_KHR:          decodeASTC(destination, source, 10, 8,  1, false); break; // FIXME: Check destination format
1923 			case FORMAT_RGBA_ASTC_10x10_KHR:         decodeASTC(destination, source, 10, 10, 1, false); break; // FIXME: Check destination format
1924 			case FORMAT_RGBA_ASTC_12x10_KHR:         decodeASTC(destination, source, 12, 10, 1, false); break; // FIXME: Check destination format
1925 			case FORMAT_RGBA_ASTC_12x12_KHR:         decodeASTC(destination, source, 12, 12, 1, false); break; // FIXME: Check destination format
1926 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:   decodeASTC(destination, source, 4,  4,  1, true);  break; // FIXME: Check destination format
1927 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:   decodeASTC(destination, source, 5,  4,  1, true);  break; // FIXME: Check destination format
1928 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:   decodeASTC(destination, source, 5,  5,  1, true);  break; // FIXME: Check destination format
1929 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:   decodeASTC(destination, source, 6,  5,  1, true);  break; // FIXME: Check destination format
1930 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:   decodeASTC(destination, source, 6,  6,  1, true);  break; // FIXME: Check destination format
1931 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:   decodeASTC(destination, source, 8,  5,  1, true);  break; // FIXME: Check destination format
1932 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:   decodeASTC(destination, source, 8,  6,  1, true);  break; // FIXME: Check destination format
1933 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:   decodeASTC(destination, source, 8,  8,  1, true);  break; // FIXME: Check destination format
1934 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:  decodeASTC(destination, source, 10, 5,  1, true);  break; // FIXME: Check destination format
1935 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:  decodeASTC(destination, source, 10, 6,  1, true);  break; // FIXME: Check destination format
1936 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:  decodeASTC(destination, source, 10, 8,  1, true);  break; // FIXME: Check destination format
1937 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, true);  break; // FIXME: Check destination format
1938 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, true);  break; // FIXME: Check destination format
1939 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, true);  break; // FIXME: Check destination format
1940 			default:				genericUpdate(destination, source);		break;
1941 			}
1942 		}
1943 	}
1944 
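	// Fallback path for update(): rows are copied verbatim when the formats
	// match, otherwise every texel is converted through the generic
	// read()/write() routines.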
1945 	void Surface::genericUpdate(Buffer &destination, Buffer &source)
1946 	{
1947 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
1948 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
1949 
1950 		int depth = min(destination.depth, source.depth);
1951 		int height = min(destination.height, source.height);
1952 		int width = min(destination.width, source.width);
1953 		int rowBytes = width * source.bytes;
1954 
1955 		for(int z = 0; z < depth; z++)
1956 		{
1957 			unsigned char *sourceRow = sourceSlice;
1958 			unsigned char *destinationRow = destinationSlice;
1959 
1960 			for(int y = 0; y < height; y++)
1961 			{
1962 				if(source.format == destination.format)
1963 				{
1964 					memcpy(destinationRow, sourceRow, rowBytes);
1965 				}
1966 				else
1967 				{
1968 					unsigned char *sourceElement = sourceRow;
1969 					unsigned char *destinationElement = destinationRow;
1970 
1971 					for(int x = 0; x < width; x++)
1972 					{
1973 						Color<float> color = source.read(sourceElement);
1974 						destination.write(destinationElement, color);
1975 
1976 						sourceElement += source.bytes;
1977 						destinationElement += destination.bytes;
1978 					}
1979 				}
1980 
1981 				sourceRow += source.pitchB;
1982 				destinationRow += destination.pitchB;
1983 			}
1984 
1985 			sourceSlice += source.sliceB;
1986 			destinationSlice += destination.sliceB;
1987 		}
1988 
1989 		source.unlockRect();
1990 		destination.unlockRect();
1991 	}
1992 
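	// Expands 24-bit R8G8B8 texels (stored as B, G, R bytes) into 32-bit texels
	// with the alpha forced to 0xFF.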
1993 	void Surface::decodeR8G8B8(Buffer &destination, Buffer &source)
1994 	{
1995 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
1996 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
1997 
1998 		int depth = min(destination.depth, source.depth);
1999 		int height = min(destination.height, source.height);
2000 		int width = min(destination.width, source.width);
2001 
2002 		for(int z = 0; z < depth; z++)
2003 		{
2004 			unsigned char *sourceRow = sourceSlice;
2005 			unsigned char *destinationRow = destinationSlice;
2006 
2007 			for(int y = 0; y < height; y++)
2008 			{
2009 				unsigned char *sourceElement = sourceRow;
2010 				unsigned char *destinationElement = destinationRow;
2011 
2012 				for(int x = 0; x < width; x++)
2013 				{
2014 					unsigned int b = sourceElement[0];
2015 					unsigned int g = sourceElement[1];
2016 					unsigned int r = sourceElement[2];
2017 
2018 					*(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
2019 
2020 					sourceElement += source.bytes;
2021 					destinationElement += destination.bytes;
2022 				}
2023 
2024 				sourceRow += source.pitchB;
2025 				destinationRow += destination.pitchB;
2026 			}
2027 
2028 			sourceSlice += source.sliceB;
2029 			destinationSlice += destination.sliceB;
2030 		}
2031 
2032 		source.unlockRect();
2033 		destination.unlockRect();
2034 	}
2035 
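	// Expands X1R5G5B5 texels to 32-bit with opaque alpha. The multipliers below
	// appear to be fixed-point approximations of the 5-bit to 8-bit expansion
	// factor 255 / 31, pre-shifted for each field's position and rounded, so a
	// field value of 31 maps to 255.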
2036 	void Surface::decodeX1R5G5B5(Buffer &destination, Buffer &source)
2037 	{
2038 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2039 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2040 
2041 		int depth = min(destination.depth, source.depth);
2042 		int height = min(destination.height, source.height);
2043 		int width = min(destination.width, source.width);
2044 
2045 		for(int z = 0; z < depth; z++)
2046 		{
2047 			unsigned char *sourceRow = sourceSlice;
2048 			unsigned char *destinationRow = destinationSlice;
2049 
2050 			for(int y = 0; y < height; y++)
2051 			{
2052 				unsigned char *sourceElement = sourceRow;
2053 				unsigned char *destinationElement = destinationRow;
2054 
2055 				for(int x = 0; x < width; x++)
2056 				{
2057 					unsigned int xrgb = *(unsigned short*)sourceElement;
2058 
2059 					unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
2060 					unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
2061 					unsigned int b = (((xrgb & 0x001F) * 2106  + 0x80) >> 8);
2062 
2063 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
2064 
2065 					sourceElement += source.bytes;
2066 					destinationElement += destination.bytes;
2067 				}
2068 
2069 				sourceRow += source.pitchB;
2070 				destinationRow += destination.pitchB;
2071 			}
2072 
2073 			sourceSlice += source.sliceB;
2074 			destinationSlice += destination.sliceB;
2075 		}
2076 
2077 		source.unlockRect();
2078 		destination.unlockRect();
2079 	}
2080 
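	// Like decodeX1R5G5B5, but the top bit supplies the alpha channel:
	// 0x8000 * 130560 == 0xFF000000, so a set alpha bit expands to 255.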
2081 	void Surface::decodeA1R5G5B5(Buffer &destination, Buffer &source)
2082 	{
2083 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2084 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2085 
2086 		int depth = min(destination.depth, source.depth);
2087 		int height = min(destination.height, source.height);
2088 		int width = min(destination.width, source.width);
2089 
2090 		for(int z = 0; z < depth; z++)
2091 		{
2092 			unsigned char *sourceRow = sourceSlice;
2093 			unsigned char *destinationRow = destinationSlice;
2094 
2095 			for(int y = 0; y < height; y++)
2096 			{
2097 				unsigned char *sourceElement = sourceRow;
2098 				unsigned char *destinationElement = destinationRow;
2099 
2100 				for(int x = 0; x < width; x++)
2101 				{
2102 					unsigned int argb = *(unsigned short*)sourceElement;
2103 
2104 					unsigned int a =   (argb & 0x8000) * 130560;
2105 					unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
2106 					unsigned int g = (((argb & 0x03E0) * 16846  + 0x8000) >> 8) & 0x0000FF00;
2107 					unsigned int b = (((argb & 0x001F) * 2106   + 0x80) >> 8);
2108 
2109 					*(unsigned int*)destinationElement = a | r | g | b;
2110 
2111 					sourceElement += source.bytes;
2112 					destinationElement += destination.bytes;
2113 				}
2114 
2115 				sourceRow += source.pitchB;
2116 				destinationRow += destination.pitchB;
2117 			}
2118 
2119 			sourceSlice += source.sliceB;
2120 			destinationSlice += destination.sliceB;
2121 		}
2122 
2123 		source.unlockRect();
2124 		destination.unlockRect();
2125 	}
2126 
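	// Expands X4R4G4B4 texels to 32-bit with opaque alpha. Multiplying a 4-bit
	// field by 0x11 (shifted to the field's position) replicates the nibble into
	// a full byte, e.g. 0xF becomes 0xFF.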
2127 	void Surface::decodeX4R4G4B4(Buffer &destination, Buffer &source)
2128 	{
2129 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2130 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2131 
2132 		int depth = min(destination.depth, source.depth);
2133 		int height = min(destination.height, source.height);
2134 		int width = min(destination.width, source.width);
2135 
2136 		for(int z = 0; z < depth; z++)
2137 		{
2138 			unsigned char *sourceRow = sourceSlice;
2139 			unsigned char *destinationRow = destinationSlice;
2140 
2141 			for(int y = 0; y < height; y++)
2142 			{
2143 				unsigned char *sourceElement = sourceRow;
2144 				unsigned char *destinationElement = destinationRow;
2145 
2146 				for(int x = 0; x < width; x++)
2147 				{
2148 					unsigned int xrgb = *(unsigned short*)sourceElement;
2149 
2150 					unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
2151 					unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
2152 					unsigned int b =  (xrgb & 0x000F) * 0x00000011;
2153 
2154 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
2155 
2156 					sourceElement += source.bytes;
2157 					destinationElement += destination.bytes;
2158 				}
2159 
2160 				sourceRow += source.pitchB;
2161 				destinationRow += destination.pitchB;
2162 			}
2163 
2164 			sourceSlice += source.sliceB;
2165 			destinationSlice += destination.sliceB;
2166 		}
2167 
2168 		source.unlockRect();
2169 		destination.unlockRect();
2170 	}
2171 
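	// Same as decodeX4R4G4B4, but the top nibble supplies the alpha channel.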
2172 	void Surface::decodeA4R4G4B4(Buffer &destination, Buffer &source)
2173 	{
2174 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2175 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2176 
2177 		int depth = min(destination.depth, source.depth);
2178 		int height = min(destination.height, source.height);
2179 		int width = min(destination.width, source.width);
2180 
2181 		for(int z = 0; z < depth; z++)
2182 		{
2183 			unsigned char *sourceRow = sourceSlice;
2184 			unsigned char *destinationRow = destinationSlice;
2185 
2186 			for(int y = 0; y < height; y++)
2187 			{
2188 				unsigned char *sourceElement = sourceRow;
2189 				unsigned char *destinationElement = destinationRow;
2190 
2191 				for(int x = 0; x < width; x++)
2192 				{
2193 					unsigned int argb = *(unsigned short*)sourceElement;
2194 
2195 					unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
2196 					unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
2197 					unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
2198 					unsigned int b =  (argb & 0x000F) * 0x00000011;
2199 
2200 					*(unsigned int*)destinationElement = a | r | g | b;
2201 
2202 					sourceElement += source.bytes;
2203 					destinationElement += destination.bytes;
2204 				}
2205 
2206 				sourceRow += source.pitchB;
2207 				destinationRow += destination.pitchB;
2208 			}
2209 
2210 			sourceSlice += source.sliceB;
2211 			destinationSlice += destination.sliceB;
2212 		}
2213 
2214 		source.unlockRect();
2215 		destination.unlockRect();
2216 	}
2217 
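	// Resolves 8-bit palette indices through the static palette (ABGR entries)
	// and swizzles each entry so that red ends up in bits 16-23 of the 32-bit
	// destination texel.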
2218 	void Surface::decodeP8(Buffer &destination, Buffer &source)
2219 	{
2220 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2221 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2222 
2223 		int depth = min(destination.depth, source.depth);
2224 		int height = min(destination.height, source.height);
2225 		int width = min(destination.width, source.width);
2226 
2227 		for(int z = 0; z < depth; z++)
2228 		{
2229 			unsigned char *sourceRow = sourceSlice;
2230 			unsigned char *destinationRow = destinationSlice;
2231 
2232 			for(int y = 0; y < height; y++)
2233 			{
2234 				unsigned char *sourceElement = sourceRow;
2235 				unsigned char *destinationElement = destinationRow;
2236 
2237 				for(int x = 0; x < width; x++)
2238 				{
2239 					unsigned int abgr = palette[*(unsigned char*)sourceElement];
2240 
2241 					unsigned int r = (abgr & 0x000000FF) << 16;
2242 					unsigned int g = (abgr & 0x0000FF00) << 0;
2243 					unsigned int b = (abgr & 0x00FF0000) >> 16;
2244 					unsigned int a = (abgr & 0xFF000000) >> 0;
2245 
2246 					*(unsigned int*)destinationElement = a | r | g | b;
2247 
2248 					sourceElement += source.bytes;
2249 					destinationElement += destination.bytes;
2250 				}
2251 
2252 				sourceRow += source.pitchB;
2253 				destinationRow += destination.pitchB;
2254 			}
2255 
2256 			sourceSlice += source.sliceB;
2257 			destinationSlice += destination.sliceB;
2258 		}
2259 
2260 		source.unlockRect();
2261 		destination.unlockRect();
2262 	}
2263 
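	// Decodes DXT1/BC1: each 4x4 block stores two 565 base colors and a 2-bit
	// index per texel. When c0 <= c1 the third color is the average of the base
	// colors and the fourth is transparent black.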
2264 	void Surface::decodeDXT1(Buffer &internal, Buffer &external)
2265 	{
2266 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2267 		const DXT1 *source = (const DXT1*)external.lockRect(0, 0, 0, LOCK_READONLY);
2268 
2269 		for(int z = 0; z < external.depth; z++)
2270 		{
2271 			unsigned int *dest = destSlice;
2272 
2273 			for(int y = 0; y < external.height; y += 4)
2274 			{
2275 				for(int x = 0; x < external.width; x += 4)
2276 				{
2277 					Color<byte> c[4];
2278 
2279 					c[0] = source->c0;
2280 					c[1] = source->c1;
2281 
2282 					if(source->c0 > source->c1)   // No transparency
2283 					{
2284 						// c2 = 2 / 3 * c0 + 1 / 3 * c1
2285 						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2286 						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2287 						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2288 						c[2].a = 0xFF;
2289 
2290 						// c3 = 1 / 3 * c0 + 2 / 3 * c1
2291 						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2292 						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2293 						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2294 						c[3].a = 0xFF;
2295 					}
2296 					else   // c3 transparent
2297 					{
2298 						// c2 = 1 / 2 * c0 + 1 / 2 * c1
2299 						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
2300 						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
2301 						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
2302 						c[2].a = 0xFF;
2303 
2304 						c[3].r = 0;
2305 						c[3].g = 0;
2306 						c[3].b = 0;
2307 						c[3].a = 0;
2308 					}
2309 
2310 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2311 					{
2312 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2313 						{
2314 							dest[(x + i) + (y + j) * internal.pitchP] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
2315 						}
2316 					}
2317 
2318 					source++;
2319 				}
2320 			}
2321 
2322 			(byte*&)destSlice += internal.sliceB;
2323 		}
2324 
2325 		external.unlockRect();
2326 		internal.unlockRect();
2327 	}
2328 
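	// Decodes DXT3/BC2: colors as in DXT1 (always four interpolated entries),
	// plus an explicit 4-bit alpha per texel, replicated to 8 bits.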
2329 	void Surface::decodeDXT3(Buffer &internal, Buffer &external)
2330 	{
2331 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2332 		const DXT3 *source = (const DXT3*)external.lockRect(0, 0, 0, LOCK_READONLY);
2333 
2334 		for(int z = 0; z < external.depth; z++)
2335 		{
2336 			unsigned int *dest = destSlice;
2337 
2338 			for(int y = 0; y < external.height; y += 4)
2339 			{
2340 				for(int x = 0; x < external.width; x += 4)
2341 				{
2342 					Color<byte> c[4];
2343 
2344 					c[0] = source->c0;
2345 					c[1] = source->c1;
2346 
2347 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2348 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2349 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2350 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2351 
2352 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2353 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2354 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2355 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2356 
2357 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2358 					{
2359 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2360 						{
2361 							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
2362 							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
2363 
2364 							dest[(x + i) + (y + j) * internal.pitchP] = color;
2365 						}
2366 					}
2367 
2368 					source++;
2369 				}
2370 			}
2371 
2372 			(byte*&)destSlice += internal.sliceB;
2373 		}
2374 
2375 		external.unlockRect();
2376 		internal.unlockRect();
2377 	}
2378 
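	// Decodes DXT5/BC3: colors as in DXT3, with two 8-bit alpha endpoints and a
	// 3-bit index per texel selecting from an eight-entry alpha table.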
2379 	void Surface::decodeDXT5(Buffer &internal, Buffer &external)
2380 	{
2381 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2382 		const DXT5 *source = (const DXT5*)external.lockRect(0, 0, 0, LOCK_READONLY);
2383 
2384 		for(int z = 0; z < external.depth; z++)
2385 		{
2386 			unsigned int *dest = destSlice;
2387 
2388 			for(int y = 0; y < external.height; y += 4)
2389 			{
2390 				for(int x = 0; x < external.width; x += 4)
2391 				{
2392 					Color<byte> c[4];
2393 
2394 					c[0] = source->c0;
2395 					c[1] = source->c1;
2396 
2397 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2398 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2399 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2400 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2401 
2402 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2403 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2404 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2405 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2406 
2407 					byte a[8];
2408 
2409 					a[0] = source->a0;
2410 					a[1] = source->a1;
2411 
2412 					if(a[0] > a[1])
2413 					{
2414 						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
2415 						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
2416 						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
2417 						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
2418 						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
2419 						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
2420 					}
2421 					else
2422 					{
2423 						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
2424 						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
2425 						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
2426 						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
2427 						a[6] = 0;
2428 						a[7] = 0xFF;
2429 					}
2430 
2431 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2432 					{
2433 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2434 						{
2435 							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
2436 							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
2437 
2438 							dest[(x + i) + (y + j) * internal.pitchP] = color;
2439 						}
2440 					}
2441 
2442 					source++;
2443 				}
2444 			}
2445 
2446 			(byte*&)destSlice += internal.sliceB;
2447 		}
2448 
2449 		external.unlockRect();
2450 		internal.unlockRect();
2451 	}
2452 
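	// Decodes ATI1/BC4: a single channel built from two 8-bit endpoints and a
	// 3-bit index per texel, written to an 8-bit destination.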
2453 	void Surface::decodeATI1(Buffer &internal, Buffer &external)
2454 	{
2455 		byte *destSlice = (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2456 		const ATI1 *source = (const ATI1*)external.lockRect(0, 0, 0, LOCK_READONLY);
2457 
2458 		for(int z = 0; z < external.depth; z++)
2459 		{
2460 			byte *dest = destSlice;
2461 
2462 			for(int y = 0; y < external.height; y += 4)
2463 			{
2464 				for(int x = 0; x < external.width; x += 4)
2465 				{
2466 					byte r[8];
2467 
2468 					r[0] = source->r0;
2469 					r[1] = source->r1;
2470 
2471 					if(r[0] > r[1])
2472 					{
2473 						r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
2474 						r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
2475 						r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
2476 						r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
2477 						r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
2478 						r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
2479 					}
2480 					else
2481 					{
2482 						r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
2483 						r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
2484 						r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
2485 						r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
2486 						r[6] = 0;
2487 						r[7] = 0xFF;
2488 					}
2489 
2490 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2491 					{
2492 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2493 						{
2494 							dest[(x + i) + (y + j) * internal.pitchP] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
2495 						}
2496 					}
2497 
2498 					source++;
2499 				}
2500 			}
2501 
2502 			destSlice += internal.sliceB;
2503 		}
2504 
2505 		external.unlockRect();
2506 		internal.unlockRect();
2507 	}
2508 
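	// Decodes ATI2/BC5: two channels, each decoded like ATI1, packed into a
	// 16-bit destination texel as (g << 8) | r.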
2509 	void Surface::decodeATI2(Buffer &internal, Buffer &external)
2510 	{
2511 		word *destSlice = (word*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2512 		const ATI2 *source = (const ATI2*)external.lockRect(0, 0, 0, LOCK_READONLY);
2513 
2514 		for(int z = 0; z < external.depth; z++)
2515 		{
2516 			word *dest = destSlice;
2517 
2518 			for(int y = 0; y < external.height; y += 4)
2519 			{
2520 				for(int x = 0; x < external.width; x += 4)
2521 				{
2522 					byte X[8];
2523 
2524 					X[0] = source->x0;
2525 					X[1] = source->x1;
2526 
2527 					if(X[0] > X[1])
2528 					{
2529 						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
2530 						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
2531 						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
2532 						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
2533 						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
2534 						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
2535 					}
2536 					else
2537 					{
2538 						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
2539 						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
2540 						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
2541 						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
2542 						X[6] = 0;
2543 						X[7] = 0xFF;
2544 					}
2545 
2546 					byte Y[8];
2547 
2548 					Y[0] = source->y0;
2549 					Y[1] = source->y1;
2550 
2551 					if(Y[0] > Y[1])
2552 					{
2553 						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
2554 						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
2555 						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
2556 						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
2557 						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
2558 						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
2559 					}
2560 					else
2561 					{
2562 						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
2563 						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
2564 						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
2565 						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
2566 						Y[6] = 0;
2567 						Y[7] = 0xFF;
2568 					}
2569 
2570 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2571 					{
2572 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2573 						{
2574 							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
2575 							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
2576 
2577 							dest[(x + i) + (y + j) * internal.pitchP] = (g << 8) + r;
2578 						}
2579 					}
2580 
2581 					source++;
2582 				}
2583 			}
2584 
2585 			(byte*&)destSlice += internal.sliceB;
2586 		}
2587 
2588 		external.unlockRect();
2589 		internal.unlockRect();
2590 	}
2591 
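	// Decodes ETC1/ETC2 blocks via ETC_Decoder. For sRGB variants the decoded
	// color bytes are converted to linear space in place, using a lazily built
	// 256-entry lookup table.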
2592 	void Surface::decodeETC2(Buffer &internal, Buffer &external, int nbAlphaBits, bool isSRGB)
2593 	{
2594 		ETC_Decoder::Decode((const byte*)external.lockRect(0, 0, 0, LOCK_READONLY), (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE), external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2595 		                    (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));
2596 		external.unlockRect();
2597 		internal.unlockRect();
2598 
2599 		if(isSRGB)
2600 		{
2601 			static byte sRGBtoLinearTable[256];
2602 			static bool sRGBtoLinearTableDirty = true;
2603 			if(sRGBtoLinearTableDirty)
2604 			{
2605 				for(int i = 0; i < 256; i++)
2606 				{
2607 					sRGBtoLinearTable[i] = static_cast<byte>(sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
2608 				}
2609 				sRGBtoLinearTableDirty = false;
2610 			}
2611 
2612 			// Perform sRGB conversion in place after decoding
2613 			byte *src = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);
2614 			for(int y = 0; y < internal.height; y++)
2615 			{
2616 				byte *srcRow = src + y * internal.pitchB;
2617 				for(int x = 0; x < internal.width; x++)
2618 				{
2619 					byte *srcPix = srcRow + x * internal.bytes;
2620 					for(int i = 0; i < 3; i++)
2621 					{
2622 						srcPix[i] = sRGBtoLinearTable[srcPix[i]];
2623 					}
2624 				}
2625 			}
2626 			internal.unlockRect();
2627 		}
2628 	}
2629 
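	// Decodes EAC R11/RG11 blocks via ETC_Decoder, then rewrites each decoded
	// integer texel in place as a clamped, normalized float (see the FIXME
	// below).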
2630 	void Surface::decodeEAC(Buffer &internal, Buffer &external, int nbChannels, bool isSigned)
2631 	{
2632 		ASSERT(nbChannels == 1 || nbChannels == 2);
2633 
2634 		byte *src = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);
2635 		ETC_Decoder::Decode((const byte*)external.lockRect(0, 0, 0, LOCK_READONLY), src, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2636 		                    (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));
2637 		external.unlockRect();
2638 
2639 		// FIXME: We convert EAC data to float, until signed short internal formats are supported
2640 			//        This code can be removed if ETC2 images are decoded to internal 16-bit signed R/RG formats
2641 		const float normalization = isSigned ? (1.0f / (8.0f * 127.875f)) : (1.0f / (8.0f * 255.875f));
2642 		for(int y = 0; y < internal.height; y++)
2643 		{
2644 			byte* srcRow = src + y * internal.pitchB;
2645 			for(int x = internal.width - 1; x >= 0; x--)
2646 			{
2647 				int* srcPix = reinterpret_cast<int*>(srcRow + x * internal.bytes);
2648 				float* dstPix = reinterpret_cast<float*>(srcPix);
2649 				for(int c = nbChannels - 1; c >= 0; c--)
2650 				{
2651 					dstPix[c] = clamp(static_cast<float>(srcPix[c]) * normalization, -1.0f, 1.0f);
2652 				}
2653 			}
2654 		}
2655 
2656 		internal.unlockRect();
2657 	}
2658 
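	// ASTC decoding is not implemented; this is currently a no-op.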
2659 	void Surface::decodeASTC(Buffer &internal, Buffer &external, int xBlockSize, int yBlockSize, int zBlockSize, bool isSRGB)
2660 	{
2661 	}
2662 
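	// Total allocation size in bytes for the given dimensions, border and sample
	// count, including a small padding margin (see the comments below). YV12
	// formats are sized as a full-resolution Y plane plus two half-resolution
	// chroma planes.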
2663 	size_t Surface::size(int width, int height, int depth, int border, int samples, Format format)
2664 	{
2665 		samples = max(1, samples);
2666 
2667 		switch(format)
2668 		{
2669 		default:
2670 			{
2671 				uint64_t size = (uint64_t)sliceB(width, height, border, format, true) * depth * samples;
2672 
2673 				// We can only sample buffers smaller than 2 GiB, due to signed 32-bit offset calculations.
2674 				// Returning the maximum size_t forces an out-of-memory condition, letting the caller report an error.
2675 				if(size >= 0x80000000u)
2676 				{
2677 					return std::numeric_limits<size_t>::max();
2678 				}
2679 
2680 				// Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
2681 				// and stencil operations also read 8 bytes per four 8-bit stencil values,
2682 				// so we have to allocate 4 extra bytes to avoid buffer overruns.
2683 				// TODO(b/145229887): Eliminate if possible, or don't hard-code.
2684 				return size + 4;
2685 			}
2686 		case FORMAT_YV12_BT601:
2687 		case FORMAT_YV12_BT709:
2688 		case FORMAT_YV12_JFIF:
2689 			{
2690 				width += 2 * border;
2691 				height += 2 * border;
2692 
2693 				size_t YStride = align<16>(width);
2694 				size_t YSize = YStride * height;
2695 				size_t CStride = align<16>(YStride / 2);
2696 				size_t CSize = CStride * height / 2;
2697 
2698 				return YSize + 2 * CSize;
2699 			}
2700 		}
2701 	}
2702 
2703 	bool Surface::isStencil(Format format)
2704 	{
2705 		switch(format)
2706 		{
2707 		case FORMAT_D32:
2708 		case FORMAT_D16:
2709 		case FORMAT_D24X8:
2710 		case FORMAT_D32F:
2711 		case FORMAT_D32F_COMPLEMENTARY:
2712 		case FORMAT_D32F_LOCKABLE:
2713 		case FORMAT_D32F_SHADOW:
2714 			return false;
2715 		case FORMAT_D24S8:
2716 		case FORMAT_D24FS8:
2717 		case FORMAT_S8:
2718 		case FORMAT_DF24S8:
2719 		case FORMAT_DF16S8:
2720 		case FORMAT_D32FS8_TEXTURE:
2721 		case FORMAT_D32FS8_SHADOW:
2722 		case FORMAT_D32FS8:
2723 		case FORMAT_D32FS8_COMPLEMENTARY:
2724 		case FORMAT_INTZ:
2725 			return true;
2726 		default:
2727 			return false;
2728 		}
2729 	}
2730 
2731 	bool Surface::isDepth(Format format)
2732 	{
2733 		switch(format)
2734 		{
2735 		case FORMAT_D32:
2736 		case FORMAT_D16:
2737 		case FORMAT_D24X8:
2738 		case FORMAT_D24S8:
2739 		case FORMAT_D24FS8:
2740 		case FORMAT_D32F:
2741 		case FORMAT_D32FS8:
2742 		case FORMAT_D32F_COMPLEMENTARY:
2743 		case FORMAT_D32FS8_COMPLEMENTARY:
2744 		case FORMAT_D32F_LOCKABLE:
2745 		case FORMAT_DF24S8:
2746 		case FORMAT_DF16S8:
2747 		case FORMAT_D32FS8_TEXTURE:
2748 		case FORMAT_D32F_SHADOW:
2749 		case FORMAT_D32FS8_SHADOW:
2750 		case FORMAT_INTZ:
2751 			return true;
2752 		case FORMAT_S8:
2753 			return false;
2754 		default:
2755 			return false;
2756 		}
2757 	}
2758 
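	// Indicates whether the format is stored in the renderer's 2x2 quad layout.
	// This covers most depth/stencil formats and the quad-layout color variants;
	// lockable, texture and shadow depth formats do not use it.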
2759 	bool Surface::hasQuadLayout(Format format)
2760 	{
2761 		switch(format)
2762 		{
2763 		case FORMAT_D32:
2764 		case FORMAT_D16:
2765 		case FORMAT_D24X8:
2766 		case FORMAT_D24S8:
2767 		case FORMAT_D24FS8:
2768 		case FORMAT_D32F:
2769 		case FORMAT_D32FS8:
2770 		case FORMAT_D32F_COMPLEMENTARY:
2771 		case FORMAT_D32FS8_COMPLEMENTARY:
2772 		case FORMAT_DF24S8:
2773 		case FORMAT_DF16S8:
2774 		case FORMAT_INTZ:
2775 		case FORMAT_S8:
2776 		case FORMAT_A8G8R8B8Q:
2777 		case FORMAT_X8G8R8B8Q:
2778 			return true;
2779 		case FORMAT_D32F_LOCKABLE:
2780 		case FORMAT_D32FS8_TEXTURE:
2781 		case FORMAT_D32F_SHADOW:
2782 		case FORMAT_D32FS8_SHADOW:
2783 		default:
2784 			break;
2785 		}
2786 
2787 		return false;
2788 	}
2789 
2790 	bool Surface::isPalette(Format format)
2791 	{
2792 		switch(format)
2793 		{
2794 		case FORMAT_P8:
2795 		case FORMAT_A8P8:
2796 			return true;
2797 		default:
2798 			return false;
2799 		}
2800 	}
2801 
2802 	bool Surface::isFloatFormat(Format format)
2803 	{
2804 		switch(format)
2805 		{
2806 		case FORMAT_R5G6B5:
2807 		case FORMAT_R8G8B8:
2808 		case FORMAT_B8G8R8:
2809 		case FORMAT_X8R8G8B8:
2810 		case FORMAT_X8B8G8R8I:
2811 		case FORMAT_X8B8G8R8:
2812 		case FORMAT_A8R8G8B8:
2813 		case FORMAT_SRGB8_X8:
2814 		case FORMAT_SRGB8_A8:
2815 		case FORMAT_A8B8G8R8I:
2816 		case FORMAT_R8UI:
2817 		case FORMAT_G8R8UI:
2818 		case FORMAT_X8B8G8R8UI:
2819 		case FORMAT_A8B8G8R8UI:
2820 		case FORMAT_A8B8G8R8:
2821 		case FORMAT_G8R8I:
2822 		case FORMAT_G8R8:
2823 		case FORMAT_A2B10G10R10:
2824 		case FORMAT_A2B10G10R10UI:
2825 		case FORMAT_R8_SNORM:
2826 		case FORMAT_G8R8_SNORM:
2827 		case FORMAT_X8B8G8R8_SNORM:
2828 		case FORMAT_A8B8G8R8_SNORM:
2829 		case FORMAT_R16I:
2830 		case FORMAT_R16UI:
2831 		case FORMAT_G16R16I:
2832 		case FORMAT_G16R16UI:
2833 		case FORMAT_G16R16:
2834 		case FORMAT_X16B16G16R16I:
2835 		case FORMAT_X16B16G16R16UI:
2836 		case FORMAT_A16B16G16R16I:
2837 		case FORMAT_A16B16G16R16UI:
2838 		case FORMAT_A16B16G16R16:
2839 		case FORMAT_V8U8:
2840 		case FORMAT_Q8W8V8U8:
2841 		case FORMAT_X8L8V8U8:
2842 		case FORMAT_V16U16:
2843 		case FORMAT_A16W16V16U16:
2844 		case FORMAT_Q16W16V16U16:
2845 		case FORMAT_A8:
2846 		case FORMAT_R8I:
2847 		case FORMAT_R8:
2848 		case FORMAT_S8:
2849 		case FORMAT_L8:
2850 		case FORMAT_L16:
2851 		case FORMAT_A8L8:
2852 		case FORMAT_YV12_BT601:
2853 		case FORMAT_YV12_BT709:
2854 		case FORMAT_YV12_JFIF:
2855 		case FORMAT_R32I:
2856 		case FORMAT_R32UI:
2857 		case FORMAT_G32R32I:
2858 		case FORMAT_G32R32UI:
2859 		case FORMAT_X32B32G32R32I:
2860 		case FORMAT_X32B32G32R32UI:
2861 		case FORMAT_A32B32G32R32I:
2862 		case FORMAT_A32B32G32R32UI:
2863 			return false;
2864 		case FORMAT_R16F:
2865 		case FORMAT_G16R16F:
2866 		case FORMAT_B16G16R16F:
2867 		case FORMAT_X16B16G16R16F:
2868 		case FORMAT_A16B16G16R16F:
2869 		case FORMAT_X16B16G16R16F_UNSIGNED:
2870 		case FORMAT_R32F:
2871 		case FORMAT_G32R32F:
2872 		case FORMAT_B32G32R32F:
2873 		case FORMAT_X32B32G32R32F:
2874 		case FORMAT_A32B32G32R32F:
2875 		case FORMAT_X32B32G32R32F_UNSIGNED:
2876 		case FORMAT_D32F:
2877 		case FORMAT_D32FS8:
2878 		case FORMAT_D32F_COMPLEMENTARY:
2879 		case FORMAT_D32FS8_COMPLEMENTARY:
2880 		case FORMAT_D32F_LOCKABLE:
2881 		case FORMAT_D32FS8_TEXTURE:
2882 		case FORMAT_D32F_SHADOW:
2883 		case FORMAT_D32FS8_SHADOW:
2884 		case FORMAT_L16F:
2885 		case FORMAT_A16L16F:
2886 		case FORMAT_L32F:
2887 		case FORMAT_A32L32F:
2888 			return true;
2889 		default:
2890 			ASSERT(false);
2891 		}
2892 
2893 		return false;
2894 	}
2895 
2896 	bool Surface::isUnsignedComponent(Format format, int component)
2897 	{
2898 		switch(format)
2899 		{
2900 		case FORMAT_NULL:
2901 		case FORMAT_R5G6B5:
2902 		case FORMAT_R8G8B8:
2903 		case FORMAT_B8G8R8:
2904 		case FORMAT_X8R8G8B8:
2905 		case FORMAT_X8B8G8R8:
2906 		case FORMAT_A8R8G8B8:
2907 		case FORMAT_A8B8G8R8:
2908 		case FORMAT_SRGB8_X8:
2909 		case FORMAT_SRGB8_A8:
2910 		case FORMAT_G8R8:
2911 		case FORMAT_A2B10G10R10:
2912 		case FORMAT_A2B10G10R10UI:
2913 		case FORMAT_R16UI:
2914 		case FORMAT_G16R16:
2915 		case FORMAT_G16R16UI:
2916 		case FORMAT_X16B16G16R16UI:
2917 		case FORMAT_A16B16G16R16:
2918 		case FORMAT_A16B16G16R16UI:
2919 		case FORMAT_R32UI:
2920 		case FORMAT_G32R32UI:
2921 		case FORMAT_X32B32G32R32UI:
2922 		case FORMAT_A32B32G32R32UI:
2923 		case FORMAT_X32B32G32R32F_UNSIGNED:
2924 		case FORMAT_R8UI:
2925 		case FORMAT_G8R8UI:
2926 		case FORMAT_X8B8G8R8UI:
2927 		case FORMAT_A8B8G8R8UI:
2928 		case FORMAT_D32F:
2929 		case FORMAT_D32FS8:
2930 		case FORMAT_D32F_COMPLEMENTARY:
2931 		case FORMAT_D32FS8_COMPLEMENTARY:
2932 		case FORMAT_D32F_LOCKABLE:
2933 		case FORMAT_D32FS8_TEXTURE:
2934 		case FORMAT_D32F_SHADOW:
2935 		case FORMAT_D32FS8_SHADOW:
2936 		case FORMAT_A8:
2937 		case FORMAT_R8:
2938 		case FORMAT_L8:
2939 		case FORMAT_L16:
2940 		case FORMAT_A8L8:
2941 		case FORMAT_YV12_BT601:
2942 		case FORMAT_YV12_BT709:
2943 		case FORMAT_YV12_JFIF:
2944 			return true;
2945 		case FORMAT_A8B8G8R8I:
2946 		case FORMAT_A16B16G16R16I:
2947 		case FORMAT_A32B32G32R32I:
2948 		case FORMAT_A8B8G8R8_SNORM:
2949 		case FORMAT_Q8W8V8U8:
2950 		case FORMAT_Q16W16V16U16:
2951 		case FORMAT_A32B32G32R32F:
2952 			return false;
2953 		case FORMAT_R32F:
2954 		case FORMAT_R8I:
2955 		case FORMAT_R16I:
2956 		case FORMAT_R32I:
2957 		case FORMAT_R8_SNORM:
2958 			return component >= 1;
2959 		case FORMAT_V8U8:
2960 		case FORMAT_X8L8V8U8:
2961 		case FORMAT_V16U16:
2962 		case FORMAT_G32R32F:
2963 		case FORMAT_G8R8I:
2964 		case FORMAT_G16R16I:
2965 		case FORMAT_G32R32I:
2966 		case FORMAT_G8R8_SNORM:
2967 			return component >= 2;
2968 		case FORMAT_A16W16V16U16:
2969 		case FORMAT_B32G32R32F:
2970 		case FORMAT_X32B32G32R32F:
2971 		case FORMAT_X8B8G8R8I:
2972 		case FORMAT_X16B16G16R16I:
2973 		case FORMAT_X32B32G32R32I:
2974 		case FORMAT_X8B8G8R8_SNORM:
2975 			return component >= 3;
2976 		default:
2977 			ASSERT(false);
2978 		}
2979 
2980 		return false;
2981 	}
2982 
2983 	bool Surface::isSRGBreadable(Format format)
2984 	{
2985 		// Keep in sync with Capabilities::isSRGBreadable
2986 		switch(format)
2987 		{
2988 		case FORMAT_L8:
2989 		case FORMAT_A8L8:
2990 		case FORMAT_R8G8B8:
2991 		case FORMAT_A8R8G8B8:
2992 		case FORMAT_X8R8G8B8:
2993 		case FORMAT_A8B8G8R8:
2994 		case FORMAT_X8B8G8R8:
2995 		case FORMAT_SRGB8_X8:
2996 		case FORMAT_SRGB8_A8:
2997 		case FORMAT_R5G6B5:
2998 		case FORMAT_X1R5G5B5:
2999 		case FORMAT_A1R5G5B5:
3000 		case FORMAT_A4R4G4B4:
3001 		case FORMAT_DXT1:
3002 		case FORMAT_DXT3:
3003 		case FORMAT_DXT5:
3004 		case FORMAT_ATI1:
3005 		case FORMAT_ATI2:
3006 			return true;
3007 		default:
3008 			return false;
3009 		}
3010 	}
3011 
3012 	bool Surface::isSRGBwritable(Format format)
3013 	{
3014 		// Keep in sync with Capabilities::isSRGBwritable
3015 		switch(format)
3016 		{
3017 		case FORMAT_NULL:
3018 		case FORMAT_A8R8G8B8:
3019 		case FORMAT_X8R8G8B8:
3020 		case FORMAT_A8B8G8R8:
3021 		case FORMAT_X8B8G8R8:
3022 		case FORMAT_SRGB8_X8:
3023 		case FORMAT_SRGB8_A8:
3024 		case FORMAT_R5G6B5:
3025 			return true;
3026 		default:
3027 			return false;
3028 		}
3029 	}
3030 
3031 	bool Surface::isSRGBformat(Format format)
3032 	{
3033 		switch(format)
3034 		{
3035 		case FORMAT_SRGB8_X8:
3036 		case FORMAT_SRGB8_A8:
3037 			return true;
3038 		default:
3039 			return false;
3040 		}
3041 	}
3042 
3043 	bool Surface::isCompressed(Format format)
3044 	{
3045 		switch(format)
3046 		{
3047 		case FORMAT_DXT1:
3048 		case FORMAT_DXT3:
3049 		case FORMAT_DXT5:
3050 		case FORMAT_ATI1:
3051 		case FORMAT_ATI2:
3052 		case FORMAT_ETC1:
3053 		case FORMAT_R11_EAC:
3054 		case FORMAT_SIGNED_R11_EAC:
3055 		case FORMAT_RG11_EAC:
3056 		case FORMAT_SIGNED_RG11_EAC:
3057 		case FORMAT_RGB8_ETC2:
3058 		case FORMAT_SRGB8_ETC2:
3059 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3060 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3061 		case FORMAT_RGBA8_ETC2_EAC:
3062 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
3063 		case FORMAT_RGBA_ASTC_4x4_KHR:
3064 		case FORMAT_RGBA_ASTC_5x4_KHR:
3065 		case FORMAT_RGBA_ASTC_5x5_KHR:
3066 		case FORMAT_RGBA_ASTC_6x5_KHR:
3067 		case FORMAT_RGBA_ASTC_6x6_KHR:
3068 		case FORMAT_RGBA_ASTC_8x5_KHR:
3069 		case FORMAT_RGBA_ASTC_8x6_KHR:
3070 		case FORMAT_RGBA_ASTC_8x8_KHR:
3071 		case FORMAT_RGBA_ASTC_10x5_KHR:
3072 		case FORMAT_RGBA_ASTC_10x6_KHR:
3073 		case FORMAT_RGBA_ASTC_10x8_KHR:
3074 		case FORMAT_RGBA_ASTC_10x10_KHR:
3075 		case FORMAT_RGBA_ASTC_12x10_KHR:
3076 		case FORMAT_RGBA_ASTC_12x12_KHR:
3077 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
3078 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
3079 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
3080 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
3081 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
3082 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
3083 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
3084 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
3085 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
3086 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
3087 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
3088 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
3089 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
3090 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
3091 			return true;
3092 		default:
3093 			return false;
3094 		}
3095 	}
3096 
3097 	bool Surface::isSignedNonNormalizedInteger(Format format)
3098 	{
3099 		switch(format)
3100 		{
3101 		case FORMAT_A8B8G8R8I:
3102 		case FORMAT_X8B8G8R8I:
3103 		case FORMAT_G8R8I:
3104 		case FORMAT_R8I:
3105 		case FORMAT_A16B16G16R16I:
3106 		case FORMAT_X16B16G16R16I:
3107 		case FORMAT_G16R16I:
3108 		case FORMAT_R16I:
3109 		case FORMAT_A32B32G32R32I:
3110 		case FORMAT_X32B32G32R32I:
3111 		case FORMAT_G32R32I:
3112 		case FORMAT_R32I:
3113 			return true;
3114 		default:
3115 			return false;
3116 		}
3117 	}
3118 
3119 	bool Surface::isUnsignedNonNormalizedInteger(Format format)
3120 	{
3121 		switch(format)
3122 		{
3123 		case FORMAT_A8B8G8R8UI:
3124 		case FORMAT_X8B8G8R8UI:
3125 		case FORMAT_G8R8UI:
3126 		case FORMAT_R8UI:
3127 		case FORMAT_A16B16G16R16UI:
3128 		case FORMAT_X16B16G16R16UI:
3129 		case FORMAT_G16R16UI:
3130 		case FORMAT_R16UI:
3131 		case FORMAT_A32B32G32R32UI:
3132 		case FORMAT_X32B32G32R32UI:
3133 		case FORMAT_G32R32UI:
3134 		case FORMAT_R32UI:
3135 			return true;
3136 		default:
3137 			return false;
3138 		}
3139 	}
3140 
3141 	bool Surface::isNonNormalizedInteger(Format format)
3142 	{
3143 		return isSignedNonNormalizedInteger(format) ||
3144 		       isUnsignedNonNormalizedInteger(format);
3145 	}
3146 
3147 	bool Surface::isNormalizedInteger(Format format)
3148 	{
3149 		return !isFloatFormat(format) &&
3150 		       !isNonNormalizedInteger(format) &&
3151 		       !isCompressed(format) &&
3152 		       !isDepth(format) &&
3153 		       !isStencil(format);
3154 	}
3155 
3156 	int Surface::componentCount(Format format)
3157 	{
3158 		switch(format)
3159 		{
3160 		case FORMAT_R5G6B5:         return 3;
3161 		case FORMAT_X8R8G8B8:       return 3;
3162 		case FORMAT_X8B8G8R8I:      return 3;
3163 		case FORMAT_X8B8G8R8:       return 3;
3164 		case FORMAT_A8R8G8B8:       return 4;
3165 		case FORMAT_SRGB8_X8:       return 3;
3166 		case FORMAT_SRGB8_A8:       return 4;
3167 		case FORMAT_A8B8G8R8I:      return 4;
3168 		case FORMAT_A8B8G8R8:       return 4;
3169 		case FORMAT_G8R8I:          return 2;
3170 		case FORMAT_G8R8:           return 2;
3171 		case FORMAT_R8_SNORM:       return 1;
3172 		case FORMAT_G8R8_SNORM:     return 2;
3173 		case FORMAT_X8B8G8R8_SNORM: return 3;
3174 		case FORMAT_A8B8G8R8_SNORM: return 4;
3175 		case FORMAT_R8UI:           return 1;
3176 		case FORMAT_G8R8UI:         return 2;
3177 		case FORMAT_X8B8G8R8UI:     return 3;
3178 		case FORMAT_A8B8G8R8UI:     return 4;
3179 		case FORMAT_A2B10G10R10:    return 4;
3180 		case FORMAT_A2B10G10R10UI:  return 4;
3181 		case FORMAT_G16R16I:        return 2;
3182 		case FORMAT_G16R16UI:       return 2;
3183 		case FORMAT_G16R16:         return 2;
3184 		case FORMAT_G32R32I:        return 2;
3185 		case FORMAT_G32R32UI:       return 2;
3186 		case FORMAT_X16B16G16R16I:  return 3;
3187 		case FORMAT_X16B16G16R16UI: return 3;
3188 		case FORMAT_A16B16G16R16I:  return 4;
3189 		case FORMAT_A16B16G16R16UI: return 4;
3190 		case FORMAT_A16B16G16R16:   return 4;
3191 		case FORMAT_X32B32G32R32I:  return 3;
3192 		case FORMAT_X32B32G32R32UI: return 3;
3193 		case FORMAT_A32B32G32R32I:  return 4;
3194 		case FORMAT_A32B32G32R32UI: return 4;
3195 		case FORMAT_V8U8:           return 2;
3196 		case FORMAT_Q8W8V8U8:       return 4;
3197 		case FORMAT_X8L8V8U8:       return 3;
3198 		case FORMAT_V16U16:         return 2;
3199 		case FORMAT_A16W16V16U16:   return 4;
3200 		case FORMAT_Q16W16V16U16:   return 4;
3201 		case FORMAT_R32F:           return 1;
3202 		case FORMAT_G32R32F:        return 2;
3203 		case FORMAT_X32B32G32R32F:  return 3;
3204 		case FORMAT_A32B32G32R32F:  return 4;
3205 		case FORMAT_X32B32G32R32F_UNSIGNED: return 3;
3206 		case FORMAT_D32F:           return 1;
3207 		case FORMAT_D32FS8:         return 1;
3208 		case FORMAT_D32F_LOCKABLE:  return 1;
3209 		case FORMAT_D32FS8_TEXTURE: return 1;
3210 		case FORMAT_D32F_SHADOW:    return 1;
3211 		case FORMAT_D32FS8_SHADOW:  return 1;
3212 		case FORMAT_A8:             return 1;
3213 		case FORMAT_R8I:            return 1;
3214 		case FORMAT_R8:             return 1;
3215 		case FORMAT_R16I:           return 1;
3216 		case FORMAT_R16UI:          return 1;
3217 		case FORMAT_R32I:           return 1;
3218 		case FORMAT_R32UI:          return 1;
3219 		case FORMAT_L8:             return 1;
3220 		case FORMAT_L16:            return 1;
3221 		case FORMAT_A8L8:           return 2;
3222 		case FORMAT_YV12_BT601:     return 3;
3223 		case FORMAT_YV12_BT709:     return 3;
3224 		case FORMAT_YV12_JFIF:      return 3;
3225 		default:
3226 			ASSERT(false);
3227 		}
3228 
3229 		return 1;
3230 	}
3231 
3232 	void *Surface::allocateBuffer(int width, int height, int depth, int border, int samples, Format format)
3233 	{
3234 		return allocate(size(width, height, depth, border, samples, format));
3235 	}
3236 
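	// Fills 'bytes' bytes at 'buffer' with a repeating 4-byte pattern. The pointer is first
	// aligned to 2-, 4- and 16-byte boundaries in stages so that the SSE path below can use
	// aligned non-temporal stores; any remaining tail bytes are written with scalar stores.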
3237 	void Surface::memfill4(void *buffer, int pattern, int bytes)
3238 	{
3239 		while((size_t)buffer & 0x1 && bytes >= 1)
3240 		{
3241 			*(char*)buffer = (char)pattern;
3242 			(char*&)buffer += 1;
3243 			bytes -= 1;
3244 		}
3245 
3246 		while((size_t)buffer & 0x3 && bytes >= 2)
3247 		{
3248 			*(short*)buffer = (short)pattern;
3249 			(short*&)buffer += 1;
3250 			bytes -= 2;
3251 		}
3252 
3253 		#if defined(__i386__) || defined(__x86_64__)
3254 			if(CPUID::supportsSSE())
3255 			{
3256 				while((size_t)buffer & 0xF && bytes >= 4)
3257 				{
3258 					*(int*)buffer = pattern;
3259 					(int*&)buffer += 1;
3260 					bytes -= 4;
3261 				}
3262 
3263 				__m128 quad = _mm_set_ps1((float&)pattern);
3264 
3265 				float *pointer = (float*)buffer;
3266 				int qxwords = bytes / 64;
3267 				bytes -= qxwords * 64;
3268 
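				// Stream 64 bytes (four 16-byte stores) per iteration, bypassing the cache.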
3269 				while(qxwords--)
3270 				{
3271 					_mm_stream_ps(pointer + 0, quad);
3272 					_mm_stream_ps(pointer + 4, quad);
3273 					_mm_stream_ps(pointer + 8, quad);
3274 					_mm_stream_ps(pointer + 12, quad);
3275 
3276 					pointer += 16;
3277 				}
3278 
3279 				buffer = pointer;
3280 			}
3281 		#endif
3282 
3283 		while(bytes >= 4)
3284 		{
3285 			*(int*)buffer = (int)pattern;
3286 			(int*&)buffer += 1;
3287 			bytes -= 4;
3288 		}
3289 
3290 		while(bytes >= 2)
3291 		{
3292 			*(short*)buffer = (short)pattern;
3293 			(short*&)buffer += 1;
3294 			bytes -= 2;
3295 		}
3296 
3297 		while(bytes >= 1)
3298 		{
3299 			*(char*)buffer = (char)pattern;
3300 			(char*&)buffer += 1;
3301 			bytes -= 1;
3302 		}
3303 	}
3304 
3305 	void Surface::sync()
3306 	{
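		// Acquiring and immediately releasing an exclusive lock waits for any outstanding
		// accesses to the resource to complete.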
3307 		resource->lock(EXCLUSIVE);
3308 		resource->unlock();
3309 	}
3310 
3311 	bool Surface::isEntire(const Rect& rect) const
3312 	{
3313 		return (rect.x0 == 0 && rect.y0 == 0 && rect.x1 == internal.width && rect.y1 == internal.height && internal.depth == 1);
3314 	}
3315 
3316 	Rect Surface::getRect() const
3317 	{
3318 		return Rect(0, 0, internal.width, internal.height);
3319 	}
3320 
3321 	void Surface::clearDepth(float depth, int x0, int y0, int width, int height)
3322 	{
3323 		if(width == 0 || height == 0)
3324 		{
3325 			return;
3326 		}
3327 
3328 		if(internal.format == FORMAT_NULL)
3329 		{
3330 			return;
3331 		}
3332 
3333 		// Not overlapping
3334 		if(x0 > internal.width) return;
3335 		if(y0 > internal.height) return;
3336 		if(x0 + width < 0) return;
3337 		if(y0 + height < 0) return;
3338 
3339 		// Clip against dimensions
3340 		if(x0 < 0) {width += x0; x0 = 0;}
3341 		if(x0 + width > internal.width) width = internal.width - x0;
3342 		if(y0 < 0) {height += y0; y0 = 0;}
3343 		if(y0 + height > internal.height) height = internal.height - y0;
3344 
3345 		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
3346 		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
3347 
3348 		int x1 = x0 + width;
3349 		int y1 = y0 + height;
3350 
3351 		if(!hasQuadLayout(internal.format))
3352 		{
3353 			float *target = (float*)lockInternal(x0, y0, 0, lock, PUBLIC);
3354 
3355 			for(int z = 0; z < internal.samples; z++)
3356 			{
3357 				float *row = target;
3358 				for(int y = y0; y < y1; y++)
3359 				{
3360 					memfill4(row, (int&)depth, width * sizeof(float));
3361 					row += internal.pitchP;
3362 				}
3363 				target += internal.sliceP;
3364 			}
3365 
3366 			unlockInternal();
3367 		}
3368 		else   // Quad layout
3369 		{
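			// A complementary depth buffer stores 1 - z, so the clear value is complemented to match.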
3370 			if(complementaryDepthBuffer)
3371 			{
3372 				depth = 1 - depth;
3373 			}
3374 
3375 			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
3376 
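			// Quad layout interleaves each 2x2 block of pixels into four consecutive elements.
			// oddX0/oddX1 are the element offsets of a partially covered leading/trailing column,
			// written per element below; the fully covered middle region [evenX0, oddX1) spans
			// whole quads and can be filled contiguously with memfill4.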
3377 			int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3378 			int oddX1 = (x1 & ~1) * 2;
3379 			int evenX0 = ((x0 + 1) & ~1) * 2;
3380 			int evenBytes = (oddX1 - evenX0) * sizeof(float);
3381 
3382 			for(int z = 0; z < internal.samples; z++)
3383 			{
3384 				for(int y = y0; y < y1; y++)
3385 				{
3386 					float *target = buffer + (y & ~1) * internal.pitchP + (y & 1) * 2;
3387 
3388 					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
3389 					{
3390 						if((x0 & 1) != 0)
3391 						{
3392 							target[oddX0 + 0] = depth;
3393 							target[oddX0 + 2] = depth;
3394 						}
3395 
3396 					//	for(int x2 = evenX0; x2 < x1 * 2; x2 += 4)
3397 					//	{
3398 					//		target[x2 + 0] = depth;
3399 					//		target[x2 + 1] = depth;
3400 					//		target[x2 + 2] = depth;
3401 					//		target[x2 + 3] = depth;
3402 					//	}
3403 
3404 					//	__asm
3405 					//	{
3406 					//		movss xmm0, depth
3407 					//		shufps xmm0, xmm0, 0x00
3408 					//
3409 					//		mov eax, x0
3410 					//		add eax, 1
3411 					//		and eax, 0xFFFFFFFE
3412 					//		cmp eax, x1
3413 					//		jge qEnd
3414 					//
3415 					//		mov edi, target
3416 					//
3417 					//	qLoop:
3418 					//		movntps [edi+8*eax], xmm0
3419 					//
3420 					//		add eax, 2
3421 					//		cmp eax, x1
3422 					//		jl qLoop
3423 					//	qEnd:
3424 					//	}
3425 
3426 						memfill4(&target[evenX0], (int&)depth, evenBytes);
3427 
3428 						if((x1 & 1) != 0)
3429 						{
3430 							target[oddX1 + 0] = depth;
3431 							target[oddX1 + 2] = depth;
3432 						}
3433 
3434 						y++;
3435 					}
3436 					else
3437 					{
3438 						for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
3439 						{
3440 							target[i] = depth;
3441 						}
3442 					}
3443 				}
3444 
3445 				buffer += internal.sliceP;
3446 			}
3447 
3448 			unlockInternal();
3449 		}
3450 	}
3451 
3452 	void Surface::clearStencil(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
3453 	{
3454 		if(mask == 0 || width == 0 || height == 0)
3455 		{
3456 			return;
3457 		}
3458 
3459 		if(stencil.format == FORMAT_NULL)
3460 		{
3461 			return;
3462 		}
3463 
3464 		// Not overlapping
3465 		if(x0 > internal.width) return;
3466 		if(y0 > internal.height) return;
3467 		if(x0 + width < 0) return;
3468 		if(y0 + height < 0) return;
3469 
3470 		// Clip against dimensions
3471 		if(x0 < 0) {width += x0; x0 = 0;}
3472 		if(x0 + width > internal.width) width = internal.width - x0;
3473 		if(y0 < 0) {height += y0; y0 = 0;}
3474 		if(y0 + height > internal.height) height = internal.height - y0;
3475 
3476 		int x1 = x0 + width;
3477 		int y1 = y0 + height;
3478 
3479 		int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3480 		int oddX1 = (x1 & ~1) * 2;
3481 		int evenX0 = ((x0 + 1) & ~1) * 2;
3482 		int evenBytes = oddX1 - evenX0;
3483 
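		// Replicate the masked stencil value into all four bytes so the fast path can write it
		// as a 32-bit pattern with memfill4.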
3484 		unsigned char maskedS = s & mask;
3485 		unsigned char invMask = ~mask;
3486 		unsigned int fill = maskedS;
3487 		fill = fill | (fill << 8) | (fill << 16) | (fill << 24);
3488 
3489 		char *buffer = (char*)lockStencil(0, 0, 0, PUBLIC);
3490 
3491 		// Stencil buffers are assumed to use quad layout
3492 		for(int z = 0; z < stencil.samples; z++)
3493 		{
3494 			for(int y = y0; y < y1; y++)
3495 			{
3496 				char *target = buffer + (y & ~1) * stencil.pitchP + (y & 1) * 2;
3497 
3498 				if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
3499 				{
3500 					if((x0 & 1) != 0)
3501 					{
3502 						target[oddX0 + 0] = fill;
3503 						target[oddX0 + 2] = fill;
3504 					}
3505 
3506 					memfill4(&target[evenX0], fill, evenBytes);
3507 
3508 					if((x1 & 1) != 0)
3509 					{
3510 						target[oddX1 + 0] = fill;
3511 						target[oddX1 + 2] = fill;
3512 					}
3513 
3514 					y++;
3515 				}
3516 				else
3517 				{
3518 					for(int x = x0; x < x1; x++)
3519 					{
3520 						int i = (x & ~1) * 2 + (x & 1);
3521 						target[i] = maskedS | (target[i] & invMask);
3522 					}
3523 				}
3524 			}
3525 
3526 			buffer += stencil.sliceP;
3527 		}
3528 
3529 		unlockStencil();
3530 	}
3531 
3532 	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
3533 	{
3534 		unsigned char *row;
3535 		Buffer *buffer;
3536 
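		// Write to whichever copy currently holds the up-to-date contents: the internal buffer
		// when it is dirty, otherwise the external buffer.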
3537 		if(internal.dirty)
3538 		{
3539 			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3540 			buffer = &internal;
3541 		}
3542 		else
3543 		{
3544 			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3545 			buffer = &external;
3546 		}
3547 
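		// Fast path for formats of at most 4 bytes per pixel: write a single pixel into 'c',
		// replicate it to a full 32-bit word and fill each row with memfill4. Larger formats
		// fall through to the generic per-pixel loop below.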
3548 		if(buffer->bytes <= 4)
3549 		{
3550 			int c;
3551 			buffer->write(&c, color);
3552 
3553 			if(buffer->bytes <= 1) c = (c << 8)  | c;
3554 			if(buffer->bytes <= 2) c = (c << 16) | c;
3555 
3556 			for(int y = 0; y < height; y++)
3557 			{
3558 				memfill4(row, c, width * buffer->bytes);
3559 
3560 				row += buffer->pitchB;
3561 			}
3562 		}
3563 		else   // Generic
3564 		{
3565 			for(int y = 0; y < height; y++)
3566 			{
3567 				unsigned char *element = row;
3568 
3569 				for(int x = 0; x < width; x++)
3570 				{
3571 					buffer->write(element, color);
3572 
3573 					element += buffer->bytes;
3574 				}
3575 
3576 				row += buffer->pitchB;
3577 			}
3578 		}
3579 
3580 		if(buffer == &internal)
3581 		{
3582 			unlockInternal();
3583 		}
3584 		else
3585 		{
3586 			unlockExternal();
3587 		}
3588 	}
3589 
3590 	void Surface::copyInternal(const Surface *source, int x, int y, float srcX, float srcY, bool filter)
3591 	{
3592 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3593 
3594 		sw::Color<float> color;
3595 
3596 		if(!filter)
3597 		{
3598 			color = source->internal.read((int)srcX, (int)srcY, 0);
3599 		}
3600 		else   // Bilinear filtering
3601 		{
3602 			color = source->internal.sample(srcX, srcY, 0);
3603 		}
3604 
3605 		internal.write(x, y, color);
3606 	}
3607 
3608 	void Surface::copyInternal(const Surface *source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
3609 	{
3610 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3611 
3612 		sw::Color<float> color;
3613 
3614 		if(!filter)
3615 		{
3616 			color = source->internal.read((int)srcX, (int)srcY, int(srcZ));
3617 		}
3618 		else   // Bilinear filtering
3619 		{
3620 			color = source->internal.sample(srcX, srcY, srcZ);
3621 		}
3622 
3623 		internal.write(x, y, z, color);
3624 	}
3625 
3626 	void Surface::copyCubeEdge(Edge dstEdge, Surface *src, Edge srcEdge)
3627 	{
3628 		Surface *dst = this;
3629 
3630 		// Figure out whether the edges are to be copied in reverse order with respect to one another.
3631 		// The copy should be reversed whenever the same edges are contiguous or if we're
3632 		// copying top <-> right or bottom <-> left. This is explained by the layout, which is:
3633 		//
3634 		//      | +y |
3635 		// | -x | +z | +x | -z |
3636 		//      | -y |
3637 
3638 		bool reverse = (srcEdge == dstEdge) ||
3639 		               ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
3640 		               ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
3641 		               ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
3642 		               ((srcEdge == LEFT) && (dstEdge == BOTTOM));
3643 
3644 		int srcBytes = src->bytes(src->Surface::getInternalFormat());
3645 		int srcPitch = src->getInternalPitchB();
3646 		int dstBytes = dst->bytes(dst->Surface::getInternalFormat());
3647 		int dstPitch = dst->getInternalPitchB();
3648 
3649 		int srcW = src->getWidth();
3650 		int srcH = src->getHeight();
3651 		int dstW = dst->getWidth();
3652 		int dstH = dst->getHeight();
3653 
3654 		ASSERT(srcW == srcH && dstW == dstH && srcW == dstW && srcBytes == dstBytes);
3655 
3656 		// Src is expressed in the regular [0, width-1], [0, height-1] space
3657 		int srcDelta = ((srcEdge == TOP) || (srcEdge == BOTTOM)) ? srcBytes : srcPitch;
3658 		int srcStart = ((srcEdge == BOTTOM) ? srcPitch * (srcH - 1) : ((srcEdge == RIGHT) ? srcBytes * (srcW - 1) : 0));
3659 
3660 		// Dst contains borders, so it is expressed in the [-1, width+1], [-1, height+1] space
3661 		int dstDelta = (((dstEdge == TOP) || (dstEdge == BOTTOM)) ? dstBytes : dstPitch) * (reverse ? -1 : 1);
3662 		int dstStart = ((dstEdge == BOTTOM) ? dstPitch * (dstH + 1) : ((dstEdge == RIGHT) ? dstBytes * (dstW + 1) : 0)) + (reverse ? dstW * -dstDelta : dstDelta);
3663 
3664 		char *srcBuf = (char*)src->lockInternal(0, 0, 0, sw::LOCK_READONLY, sw::PRIVATE) + srcStart;
3665 		char *dstBuf = (char*)dst->lockInternal(-1, -1, 0, sw::LOCK_READWRITE, sw::PRIVATE) + dstStart;
3666 
3667 		for(int i = 0; i < srcW; ++i, dstBuf += dstDelta, srcBuf += srcDelta)
3668 		{
3669 			memcpy(dstBuf, srcBuf, srcBytes);
3670 		}
3671 
3672 		if(dstEdge == LEFT || dstEdge == RIGHT)
3673 		{
3674 			// TOP and BOTTOM are already set, let's average out the corners
3675 			int x0 = (dstEdge == RIGHT) ? dstW : -1;
3676 			int y0 = -1;
3677 			int x1 = (dstEdge == RIGHT) ? dstW - 1 : 0;
3678 			int y1 = 0;
3679 			dst->computeCubeCorner(x0, y0, x1, y1);
3680 			y0 = dstH;
3681 			y1 = dstH - 1;
3682 			dst->computeCubeCorner(x0, y0, x1, y1);
3683 		}
3684 
3685 		src->unlockInternal();
3686 		dst->unlockInternal();
3687 	}
3688 
3689 	void Surface::computeCubeCorner(int x0, int y0, int x1, int y1)
3690 	{
3691 		ASSERT(internal.lock != LOCK_UNLOCKED);
3692 
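		// The corner border texel has no single neighbor across the cube edge; approximate it
		// with the average of its three adjacent texels.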
3693 		sw::Color<float> color = internal.read(x0, y1);
3694 		color += internal.read(x1, y0);
3695 		color += internal.read(x1, y1);
3696 		color *= (1.0f / 3.0f);
3697 
3698 		internal.write(x0, y0, color);
3699 	}
3700 
3701 	bool Surface::hasStencil() const
3702 	{
3703 		return isStencil(external.format);
3704 	}
3705 
3706 	bool Surface::hasDepth() const
3707 	{
3708 		return isDepth(external.format);
3709 	}
3710 
3711 	bool Surface::hasPalette() const
3712 	{
3713 		return isPalette(external.format);
3714 	}
3715 
3716 	bool Surface::isRenderTarget() const
3717 	{
3718 		return renderTarget;
3719 	}
3720 
3721 	bool Surface::hasDirtyContents() const
3722 	{
3723 		return dirtyContents;
3724 	}
3725 
3726 	void Surface::markContentsClean()
3727 	{
3728 		dirtyContents = false;
3729 	}
3730 
3731 	Resource *Surface::getResource()
3732 	{
3733 		return resource;
3734 	}
3735 
3736 	bool Surface::identicalBuffers() const
3737 	{
3738 		return external.format == internal.format &&
3739 		       external.width  == internal.width &&
3740 		       external.height == internal.height &&
3741 		       external.depth  == internal.depth &&
3742 		       external.pitchB == internal.pitchB &&
3743 		       external.sliceB == internal.sliceB &&
3744 		       external.border == internal.border &&
3745 		       external.samples == internal.samples;
3746 	}
3747 
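	// Maps an external (API-facing) format to the format used for internal storage. Formats
	// without a matching internal representation are promoted to a wider one, e.g. compressed
	// formats decompress to 8-bit or floating-point RGBA.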
3748 	Format Surface::selectInternalFormat(Format format) const
3749 	{
3750 		switch(format)
3751 		{
3752 		case FORMAT_NULL:
3753 			return FORMAT_NULL;
3754 		case FORMAT_P8:
3755 		case FORMAT_A8P8:
3756 		case FORMAT_A4R4G4B4:
3757 		case FORMAT_A1R5G5B5:
3758 		case FORMAT_A8R3G3B2:
3759 			return FORMAT_A8R8G8B8;
3760 		case FORMAT_A8:
3761 			return FORMAT_A8;
3762 		case FORMAT_R8I:
3763 			return FORMAT_R8I;
3764 		case FORMAT_R8UI:
3765 			return FORMAT_R8UI;
3766 		case FORMAT_R8_SNORM:
3767 			return FORMAT_R8_SNORM;
3768 		case FORMAT_R8:
3769 			return FORMAT_R8;
3770 		case FORMAT_R16I:
3771 			return FORMAT_R16I;
3772 		case FORMAT_R16UI:
3773 			return FORMAT_R16UI;
3774 		case FORMAT_R32I:
3775 			return FORMAT_R32I;
3776 		case FORMAT_R32UI:
3777 			return FORMAT_R32UI;
3778 		case FORMAT_X16B16G16R16I:
3779 			return FORMAT_X16B16G16R16I;
3780 		case FORMAT_A16B16G16R16I:
3781 			return FORMAT_A16B16G16R16I;
3782 		case FORMAT_X16B16G16R16UI:
3783 			return FORMAT_X16B16G16R16UI;
3784 		case FORMAT_A16B16G16R16UI:
3785 			return FORMAT_A16B16G16R16UI;
3786 		case FORMAT_A2R10G10B10:
3787 		case FORMAT_A2B10G10R10:
3788 		case FORMAT_A16B16G16R16:
3789 			return FORMAT_A16B16G16R16;
3790 		case FORMAT_A2B10G10R10UI:
3791 			return FORMAT_A16B16G16R16UI;
3792 		case FORMAT_X32B32G32R32I:
3793 			return FORMAT_X32B32G32R32I;
3794 		case FORMAT_A32B32G32R32I:
3795 			return FORMAT_A32B32G32R32I;
3796 		case FORMAT_X32B32G32R32UI:
3797 			return FORMAT_X32B32G32R32UI;
3798 		case FORMAT_A32B32G32R32UI:
3799 			return FORMAT_A32B32G32R32UI;
3800 		case FORMAT_G8R8I:
3801 			return FORMAT_G8R8I;
3802 		case FORMAT_G8R8UI:
3803 			return FORMAT_G8R8UI;
3804 		case FORMAT_G8R8_SNORM:
3805 			return FORMAT_G8R8_SNORM;
3806 		case FORMAT_G8R8:
3807 			return FORMAT_G8R8;
3808 		case FORMAT_G16R16I:
3809 			return FORMAT_G16R16I;
3810 		case FORMAT_G16R16UI:
3811 			return FORMAT_G16R16UI;
3812 		case FORMAT_G16R16:
3813 			return FORMAT_G16R16;
3814 		case FORMAT_G32R32I:
3815 			return FORMAT_G32R32I;
3816 		case FORMAT_G32R32UI:
3817 			return FORMAT_G32R32UI;
3818 		case FORMAT_A8R8G8B8:
3819 			if(lockable || !quadLayoutEnabled)
3820 			{
3821 				return FORMAT_A8R8G8B8;
3822 			}
3823 			else
3824 			{
3825 				return FORMAT_A8G8R8B8Q;
3826 			}
3827 		case FORMAT_A8B8G8R8I:
3828 			return FORMAT_A8B8G8R8I;
3829 		case FORMAT_A8B8G8R8UI:
3830 			return FORMAT_A8B8G8R8UI;
3831 		case FORMAT_A8B8G8R8_SNORM:
3832 			return FORMAT_A8B8G8R8_SNORM;
3833 		case FORMAT_R5G5B5A1:
3834 		case FORMAT_R4G4B4A4:
3835 		case FORMAT_A8B8G8R8:
3836 			return FORMAT_A8B8G8R8;
3837 		case FORMAT_R5G6B5:
3838 			return FORMAT_R5G6B5;
3839 		case FORMAT_R3G3B2:
3840 		case FORMAT_R8G8B8:
3841 		case FORMAT_X4R4G4B4:
3842 		case FORMAT_X1R5G5B5:
3843 		case FORMAT_X8R8G8B8:
3844 			if(lockable || !quadLayoutEnabled)
3845 			{
3846 				return FORMAT_X8R8G8B8;
3847 			}
3848 			else
3849 			{
3850 				return FORMAT_X8G8R8B8Q;
3851 			}
3852 		case FORMAT_X8B8G8R8I:
3853 			return FORMAT_X8B8G8R8I;
3854 		case FORMAT_X8B8G8R8UI:
3855 			return FORMAT_X8B8G8R8UI;
3856 		case FORMAT_X8B8G8R8_SNORM:
3857 			return FORMAT_X8B8G8R8_SNORM;
3858 		case FORMAT_B8G8R8:
3859 		case FORMAT_X8B8G8R8:
3860 			return FORMAT_X8B8G8R8;
3861 		case FORMAT_SRGB8_X8:
3862 			return FORMAT_SRGB8_X8;
3863 		case FORMAT_SRGB8_A8:
3864 			return FORMAT_SRGB8_A8;
3865 		// Compressed formats
3866 		case FORMAT_DXT1:
3867 		case FORMAT_DXT3:
3868 		case FORMAT_DXT5:
3869 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3870 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3871 		case FORMAT_RGBA8_ETC2_EAC:
3872 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
3873 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
3874 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
3875 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
3876 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
3877 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
3878 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
3879 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
3880 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
3881 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
3882 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
3883 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
3884 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
3885 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
3886 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
3887 			return FORMAT_A8R8G8B8;
3888 		case FORMAT_RGBA_ASTC_4x4_KHR:
3889 		case FORMAT_RGBA_ASTC_5x4_KHR:
3890 		case FORMAT_RGBA_ASTC_5x5_KHR:
3891 		case FORMAT_RGBA_ASTC_6x5_KHR:
3892 		case FORMAT_RGBA_ASTC_6x6_KHR:
3893 		case FORMAT_RGBA_ASTC_8x5_KHR:
3894 		case FORMAT_RGBA_ASTC_8x6_KHR:
3895 		case FORMAT_RGBA_ASTC_8x8_KHR:
3896 		case FORMAT_RGBA_ASTC_10x5_KHR:
3897 		case FORMAT_RGBA_ASTC_10x6_KHR:
3898 		case FORMAT_RGBA_ASTC_10x8_KHR:
3899 		case FORMAT_RGBA_ASTC_10x10_KHR:
3900 		case FORMAT_RGBA_ASTC_12x10_KHR:
3901 		case FORMAT_RGBA_ASTC_12x12_KHR:
3902 			// ASTC supports HDR, so a floating point format is required to represent it properly
3903 			return FORMAT_A32B32G32R32F; // FIXME: 16FP is probably sufficient, but it's currently unsupported
3904 		case FORMAT_ATI1:
3905 			return FORMAT_R8;
3906 		case FORMAT_R11_EAC:
3907 		case FORMAT_SIGNED_R11_EAC:
3908 			return FORMAT_R32F; // FIXME: Signed 8bit format would be sufficient
3909 		case FORMAT_ATI2:
3910 			return FORMAT_G8R8;
3911 		case FORMAT_RG11_EAC:
3912 		case FORMAT_SIGNED_RG11_EAC:
3913 			return FORMAT_G32R32F; // FIXME: Signed 8bit format would be sufficient
3914 		case FORMAT_ETC1:
3915 		case FORMAT_RGB8_ETC2:
3916 		case FORMAT_SRGB8_ETC2:
3917 			return FORMAT_X8R8G8B8;
3918 		// Bumpmap formats
3919 		case FORMAT_V8U8:			return FORMAT_V8U8;
3920 		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
3921 		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
3922 		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
3923 		case FORMAT_V16U16:			return FORMAT_V16U16;
3924 		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
3925 		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
3926 		// Floating-point formats
3927 		case FORMAT_A16F:			return FORMAT_A32B32G32R32F;
3928 		case FORMAT_R16F:			return FORMAT_R32F;
3929 		case FORMAT_G16R16F:		return FORMAT_G32R32F;
3930 		case FORMAT_B16G16R16F:     return FORMAT_X32B32G32R32F;
3931 		case FORMAT_X16B16G16R16F:	return FORMAT_X32B32G32R32F;
3932 		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
3933 		case FORMAT_X16B16G16R16F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
3934 		case FORMAT_A32F:			return FORMAT_A32B32G32R32F;
3935 		case FORMAT_R32F:			return FORMAT_R32F;
3936 		case FORMAT_G32R32F:		return FORMAT_G32R32F;
3937 		case FORMAT_B32G32R32F:     return FORMAT_X32B32G32R32F;
3938 		case FORMAT_X32B32G32R32F:  return FORMAT_X32B32G32R32F;
3939 		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
3940 		case FORMAT_X32B32G32R32F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
3941 		// Luminance formats
3942 		case FORMAT_L8:				return FORMAT_L8;
3943 		case FORMAT_A4L4:			return FORMAT_A8L8;
3944 		case FORMAT_L16:			return FORMAT_L16;
3945 		case FORMAT_A8L8:			return FORMAT_A8L8;
3946 		case FORMAT_L16F:           return FORMAT_X32B32G32R32F;
3947 		case FORMAT_A16L16F:        return FORMAT_A32B32G32R32F;
3948 		case FORMAT_L32F:           return FORMAT_X32B32G32R32F;
3949 		case FORMAT_A32L32F:        return FORMAT_A32B32G32R32F;
3950 		// Depth/stencil formats
3951 		case FORMAT_D16:
3952 		case FORMAT_D32:
3953 		case FORMAT_D24X8:
3954 			if(hasParent)   // Texture
3955 			{
3956 				return FORMAT_D32F_SHADOW;
3957 			}
3958 			else if(complementaryDepthBuffer)
3959 			{
3960 				return FORMAT_D32F_COMPLEMENTARY;
3961 			}
3962 			else
3963 			{
3964 				return FORMAT_D32F;
3965 			}
3966 		case FORMAT_D24S8:
3967 		case FORMAT_D24FS8:
3968 			if(hasParent)   // Texture
3969 			{
3970 				return FORMAT_D32FS8_SHADOW;
3971 			}
3972 			else if(complementaryDepthBuffer)
3973 			{
3974 				return FORMAT_D32FS8_COMPLEMENTARY;
3975 			}
3976 			else
3977 			{
3978 				return FORMAT_D32FS8;
3979 			}
3980 		case FORMAT_D32F:           return FORMAT_D32F;
3981 		case FORMAT_D32FS8:         return FORMAT_D32FS8;
3982 		case FORMAT_D32F_LOCKABLE:  return FORMAT_D32F_LOCKABLE;
3983 		case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
3984 		case FORMAT_INTZ:           return FORMAT_D32FS8_TEXTURE;
3985 		case FORMAT_DF24S8:         return FORMAT_D32FS8_SHADOW;
3986 		case FORMAT_DF16S8:         return FORMAT_D32FS8_SHADOW;
3987 		case FORMAT_S8:             return FORMAT_S8;
3988 		// YUV formats
3989 		case FORMAT_YV12_BT601:     return FORMAT_YV12_BT601;
3990 		case FORMAT_YV12_BT709:     return FORMAT_YV12_BT709;
3991 		case FORMAT_YV12_JFIF:      return FORMAT_YV12_JFIF;
3992 		default:
3993 			ASSERT(false);
3994 		}
3995 
3996 		return FORMAT_NULL;
3997 	}
3998 
3999 	void Surface::setTexturePalette(unsigned int *palette)
4000 	{
4001 		Surface::palette = palette;
4002 		Surface::paletteID++;
4003 	}
4004 
4005 	void Surface::resolve()
4006 	{
4007 		if(internal.samples <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
4008 		{
4009 			return;
4010 		}
4011 
4012 		ASSERT(internal.depth == 1);  // Unimplemented
4013 
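		// Resolve the multisampled color buffer in place: the per-sample slices are averaged
		// pairwise in a balanced reduction tree and the result is stored back into sample 0.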
4014 		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
4015 
4016 		int width = internal.width;
4017 		int height = internal.height;
4018 		int pitch = internal.pitchB;
4019 		int slice = internal.sliceB;
4020 
4021 		unsigned char *source0 = (unsigned char*)source;
4022 		unsigned char *source1 = source0 + slice;
4023 		unsigned char *source2 = source1 + slice;
4024 		unsigned char *source3 = source2 + slice;
4025 		unsigned char *source4 = source3 + slice;
4026 		unsigned char *source5 = source4 + slice;
4027 		unsigned char *source6 = source5 + slice;
4028 		unsigned char *source7 = source6 + slice;
4029 		unsigned char *source8 = source7 + slice;
4030 		unsigned char *source9 = source8 + slice;
4031 		unsigned char *sourceA = source9 + slice;
4032 		unsigned char *sourceB = sourceA + slice;
4033 		unsigned char *sourceC = sourceB + slice;
4034 		unsigned char *sourceD = sourceC + slice;
4035 		unsigned char *sourceE = sourceD + slice;
4036 		unsigned char *sourceF = sourceE + slice;
4037 
4038 		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 ||
4039 		   internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8 ||
4040 		   internal.format == FORMAT_SRGB8_X8 || internal.format == FORMAT_SRGB8_A8)
4041 		{
4042 			#if defined(__i386__) || defined(__x86_64__)
4043 				if(CPUID::supportsSSE2() && (width % 4) == 0)
4044 				{
4045 					if(internal.samples == 2)
4046 					{
4047 						for(int y = 0; y < height; y++)
4048 						{
4049 							for(int x = 0; x < width; x += 4)
4050 							{
4051 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4052 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4053 
4054 								c0 = _mm_avg_epu8(c0, c1);
4055 
4056 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4057 							}
4058 
4059 							source0 += pitch;
4060 							source1 += pitch;
4061 						}
4062 					}
4063 					else if(internal.samples == 4)
4064 					{
4065 						for(int y = 0; y < height; y++)
4066 						{
4067 							for(int x = 0; x < width; x += 4)
4068 							{
4069 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4070 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4071 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4072 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4073 
4074 								c0 = _mm_avg_epu8(c0, c1);
4075 								c2 = _mm_avg_epu8(c2, c3);
4076 								c0 = _mm_avg_epu8(c0, c2);
4077 
4078 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4079 							}
4080 
4081 							source0 += pitch;
4082 							source1 += pitch;
4083 							source2 += pitch;
4084 							source3 += pitch;
4085 						}
4086 					}
4087 					else if(internal.samples == 8)
4088 					{
4089 						for(int y = 0; y < height; y++)
4090 						{
4091 							for(int x = 0; x < width; x += 4)
4092 							{
4093 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4094 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4095 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4096 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4097 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4098 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4099 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4100 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4101 
4102 								c0 = _mm_avg_epu8(c0, c1);
4103 								c2 = _mm_avg_epu8(c2, c3);
4104 								c4 = _mm_avg_epu8(c4, c5);
4105 								c6 = _mm_avg_epu8(c6, c7);
4106 								c0 = _mm_avg_epu8(c0, c2);
4107 								c4 = _mm_avg_epu8(c4, c6);
4108 								c0 = _mm_avg_epu8(c0, c4);
4109 
4110 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4111 							}
4112 
4113 							source0 += pitch;
4114 							source1 += pitch;
4115 							source2 += pitch;
4116 							source3 += pitch;
4117 							source4 += pitch;
4118 							source5 += pitch;
4119 							source6 += pitch;
4120 							source7 += pitch;
4121 						}
4122 					}
4123 					else if(internal.samples == 16)
4124 					{
4125 						for(int y = 0; y < height; y++)
4126 						{
4127 							for(int x = 0; x < width; x += 4)
4128 							{
4129 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4130 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4131 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4132 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4133 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4134 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4135 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4136 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4137 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
4138 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
4139 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
4140 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
4141 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
4142 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
4143 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
4144 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
4145 
4146 								c0 = _mm_avg_epu8(c0, c1);
4147 								c2 = _mm_avg_epu8(c2, c3);
4148 								c4 = _mm_avg_epu8(c4, c5);
4149 								c6 = _mm_avg_epu8(c6, c7);
4150 								c8 = _mm_avg_epu8(c8, c9);
4151 								cA = _mm_avg_epu8(cA, cB);
4152 								cC = _mm_avg_epu8(cC, cD);
4153 								cE = _mm_avg_epu8(cE, cF);
4154 								c0 = _mm_avg_epu8(c0, c2);
4155 								c4 = _mm_avg_epu8(c4, c6);
4156 								c8 = _mm_avg_epu8(c8, cA);
4157 								cC = _mm_avg_epu8(cC, cE);
4158 								c0 = _mm_avg_epu8(c0, c4);
4159 								c8 = _mm_avg_epu8(c8, cC);
4160 								c0 = _mm_avg_epu8(c0, c8);
4161 
4162 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4163 							}
4164 
4165 							source0 += pitch;
4166 							source1 += pitch;
4167 							source2 += pitch;
4168 							source3 += pitch;
4169 							source4 += pitch;
4170 							source5 += pitch;
4171 							source6 += pitch;
4172 							source7 += pitch;
4173 							source8 += pitch;
4174 							source9 += pitch;
4175 							sourceA += pitch;
4176 							sourceB += pitch;
4177 							sourceC += pitch;
4178 							sourceD += pitch;
4179 							sourceE += pitch;
4180 							sourceF += pitch;
4181 						}
4182 					}
4183 					else ASSERT(false);
4184 				}
4185 				else
4186 			#endif
4187 			{
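				// Per-byte average with round-up, computed without carrying into the neighboring
				// byte: (x & y) + ((x ^ y) >> 1) is floor((x + y) / 2); adding the low bit of
				// (x ^ y) rounds up, matching _mm_avg_epu8.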
4188 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
4189 
4190 				if(internal.samples == 2)
4191 				{
4192 					for(int y = 0; y < height; y++)
4193 					{
4194 						for(int x = 0; x < width; x++)
4195 						{
4196 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4197 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4198 
4199 							c0 = AVERAGE(c0, c1);
4200 
4201 							*(unsigned int*)(source0 + 4 * x) = c0;
4202 						}
4203 
4204 						source0 += pitch;
4205 						source1 += pitch;
4206 					}
4207 				}
4208 				else if(internal.samples == 4)
4209 				{
4210 					for(int y = 0; y < height; y++)
4211 					{
4212 						for(int x = 0; x < width; x++)
4213 						{
4214 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4215 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4216 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4217 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4218 
4219 							c0 = AVERAGE(c0, c1);
4220 							c2 = AVERAGE(c2, c3);
4221 							c0 = AVERAGE(c0, c2);
4222 
4223 							*(unsigned int*)(source0 + 4 * x) = c0;
4224 						}
4225 
4226 						source0 += pitch;
4227 						source1 += pitch;
4228 						source2 += pitch;
4229 						source3 += pitch;
4230 					}
4231 				}
4232 				else if(internal.samples == 8)
4233 				{
4234 					for(int y = 0; y < height; y++)
4235 					{
4236 						for(int x = 0; x < width; x++)
4237 						{
4238 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4239 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4240 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4241 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4242 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4243 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4244 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4245 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4246 
4247 							c0 = AVERAGE(c0, c1);
4248 							c2 = AVERAGE(c2, c3);
4249 							c4 = AVERAGE(c4, c5);
4250 							c6 = AVERAGE(c6, c7);
4251 							c0 = AVERAGE(c0, c2);
4252 							c4 = AVERAGE(c4, c6);
4253 							c0 = AVERAGE(c0, c4);
4254 
4255 							*(unsigned int*)(source0 + 4 * x) = c0;
4256 						}
4257 
4258 						source0 += pitch;
4259 						source1 += pitch;
4260 						source2 += pitch;
4261 						source3 += pitch;
4262 						source4 += pitch;
4263 						source5 += pitch;
4264 						source6 += pitch;
4265 						source7 += pitch;
4266 					}
4267 				}
4268 				else if(internal.samples == 16)
4269 				{
4270 					for(int y = 0; y < height; y++)
4271 					{
4272 						for(int x = 0; x < width; x++)
4273 						{
4274 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4275 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4276 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4277 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4278 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4279 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4280 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4281 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4282 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4283 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4284 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4285 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4286 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4287 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4288 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4289 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4290 
4291 							c0 = AVERAGE(c0, c1);
4292 							c2 = AVERAGE(c2, c3);
4293 							c4 = AVERAGE(c4, c5);
4294 							c6 = AVERAGE(c6, c7);
4295 							c8 = AVERAGE(c8, c9);
4296 							cA = AVERAGE(cA, cB);
4297 							cC = AVERAGE(cC, cD);
4298 							cE = AVERAGE(cE, cF);
4299 							c0 = AVERAGE(c0, c2);
4300 							c4 = AVERAGE(c4, c6);
4301 							c8 = AVERAGE(c8, cA);
4302 							cC = AVERAGE(cC, cE);
4303 							c0 = AVERAGE(c0, c4);
4304 							c8 = AVERAGE(c8, cC);
4305 							c0 = AVERAGE(c0, c8);
4306 
4307 							*(unsigned int*)(source0 + 4 * x) = c0;
4308 						}
4309 
4310 						source0 += pitch;
4311 						source1 += pitch;
4312 						source2 += pitch;
4313 						source3 += pitch;
4314 						source4 += pitch;
4315 						source5 += pitch;
4316 						source6 += pitch;
4317 						source7 += pitch;
4318 						source8 += pitch;
4319 						source9 += pitch;
4320 						sourceA += pitch;
4321 						sourceB += pitch;
4322 						sourceC += pitch;
4323 						sourceD += pitch;
4324 						sourceE += pitch;
4325 						sourceF += pitch;
4326 					}
4327 				}
4328 				else ASSERT(false);
4329 
4330 				#undef AVERAGE
4331 			}
4332 		}
4333 		else if(internal.format == FORMAT_G16R16)
4334 		{
4335 
4336 			#if defined(__i386__) || defined(__x86_64__)
4337 				if(CPUID::supportsSSE2() && (width % 4) == 0)
4338 				{
4339 					if(internal.samples == 2)
4340 					{
4341 						for(int y = 0; y < height; y++)
4342 						{
4343 							for(int x = 0; x < width; x += 4)
4344 							{
4345 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4346 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4347 
4348 								c0 = _mm_avg_epu16(c0, c1);
4349 
4350 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4351 							}
4352 
4353 							source0 += pitch;
4354 							source1 += pitch;
4355 						}
4356 					}
4357 					else if(internal.samples == 4)
4358 					{
4359 						for(int y = 0; y < height; y++)
4360 						{
4361 							for(int x = 0; x < width; x += 4)
4362 							{
4363 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4364 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4365 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4366 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4367 
4368 								c0 = _mm_avg_epu16(c0, c1);
4369 								c2 = _mm_avg_epu16(c2, c3);
4370 								c0 = _mm_avg_epu16(c0, c2);
4371 
4372 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4373 							}
4374 
4375 							source0 += pitch;
4376 							source1 += pitch;
4377 							source2 += pitch;
4378 							source3 += pitch;
4379 						}
4380 					}
4381 					else if(internal.samples == 8)
4382 					{
4383 						for(int y = 0; y < height; y++)
4384 						{
4385 							for(int x = 0; x < width; x += 4)
4386 							{
4387 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4388 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4389 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4390 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4391 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4392 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4393 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4394 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4395 
4396 								c0 = _mm_avg_epu16(c0, c1);
4397 								c2 = _mm_avg_epu16(c2, c3);
4398 								c4 = _mm_avg_epu16(c4, c5);
4399 								c6 = _mm_avg_epu16(c6, c7);
4400 								c0 = _mm_avg_epu16(c0, c2);
4401 								c4 = _mm_avg_epu16(c4, c6);
4402 								c0 = _mm_avg_epu16(c0, c4);
4403 
4404 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4405 							}
4406 
4407 							source0 += pitch;
4408 							source1 += pitch;
4409 							source2 += pitch;
4410 							source3 += pitch;
4411 							source4 += pitch;
4412 							source5 += pitch;
4413 							source6 += pitch;
4414 							source7 += pitch;
4415 						}
4416 					}
4417 					else if(internal.samples == 16)
4418 					{
4419 						for(int y = 0; y < height; y++)
4420 						{
4421 							for(int x = 0; x < width; x += 4)
4422 							{
4423 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4424 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4425 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4426 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4427 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4428 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4429 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4430 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4431 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
4432 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
4433 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
4434 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
4435 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
4436 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
4437 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
4438 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
4439 
4440 								c0 = _mm_avg_epu16(c0, c1);
4441 								c2 = _mm_avg_epu16(c2, c3);
4442 								c4 = _mm_avg_epu16(c4, c5);
4443 								c6 = _mm_avg_epu16(c6, c7);
4444 								c8 = _mm_avg_epu16(c8, c9);
4445 								cA = _mm_avg_epu16(cA, cB);
4446 								cC = _mm_avg_epu16(cC, cD);
4447 								cE = _mm_avg_epu16(cE, cF);
4448 								c0 = _mm_avg_epu16(c0, c2);
4449 								c4 = _mm_avg_epu16(c4, c6);
4450 								c8 = _mm_avg_epu16(c8, cA);
4451 								cC = _mm_avg_epu16(cC, cE);
4452 								c0 = _mm_avg_epu16(c0, c4);
4453 								c8 = _mm_avg_epu16(c8, cC);
4454 								c0 = _mm_avg_epu16(c0, c8);
4455 
4456 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4457 							}
4458 
4459 							source0 += pitch;
4460 							source1 += pitch;
4461 							source2 += pitch;
4462 							source3 += pitch;
4463 							source4 += pitch;
4464 							source5 += pitch;
4465 							source6 += pitch;
4466 							source7 += pitch;
4467 							source8 += pitch;
4468 							source9 += pitch;
4469 							sourceA += pitch;
4470 							sourceB += pitch;
4471 							sourceC += pitch;
4472 							sourceD += pitch;
4473 							sourceE += pitch;
4474 							sourceF += pitch;
4475 						}
4476 					}
4477 					else ASSERT(false);
4478 				}
4479 				else
4480 			#endif
4481 			{
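				// Per-channel 16-bit average with round-up, analogous to _mm_avg_epu16.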
4482 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4483 
4484 				if(internal.samples == 2)
4485 				{
4486 					for(int y = 0; y < height; y++)
4487 					{
4488 						for(int x = 0; x < width; x++)
4489 						{
4490 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4491 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4492 
4493 							c0 = AVERAGE(c0, c1);
4494 
4495 							*(unsigned int*)(source0 + 4 * x) = c0;
4496 						}
4497 
4498 						source0 += pitch;
4499 						source1 += pitch;
4500 					}
4501 				}
4502 				else if(internal.samples == 4)
4503 				{
4504 					for(int y = 0; y < height; y++)
4505 					{
4506 						for(int x = 0; x < width; x++)
4507 						{
4508 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4509 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4510 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4511 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4512 
4513 							c0 = AVERAGE(c0, c1);
4514 							c2 = AVERAGE(c2, c3);
4515 							c0 = AVERAGE(c0, c2);
4516 
4517 							*(unsigned int*)(source0 + 4 * x) = c0;
4518 						}
4519 
4520 						source0 += pitch;
4521 						source1 += pitch;
4522 						source2 += pitch;
4523 						source3 += pitch;
4524 					}
4525 				}
4526 				else if(internal.samples == 8)
4527 				{
4528 					for(int y = 0; y < height; y++)
4529 					{
4530 						for(int x = 0; x < width; x++)
4531 						{
4532 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4533 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4534 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4535 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4536 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4537 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4538 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4539 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4540 
4541 							c0 = AVERAGE(c0, c1);
4542 							c2 = AVERAGE(c2, c3);
4543 							c4 = AVERAGE(c4, c5);
4544 							c6 = AVERAGE(c6, c7);
4545 							c0 = AVERAGE(c0, c2);
4546 							c4 = AVERAGE(c4, c6);
4547 							c0 = AVERAGE(c0, c4);
4548 
4549 							*(unsigned int*)(source0 + 4 * x) = c0;
4550 						}
4551 
4552 						source0 += pitch;
4553 						source1 += pitch;
4554 						source2 += pitch;
4555 						source3 += pitch;
4556 						source4 += pitch;
4557 						source5 += pitch;
4558 						source6 += pitch;
4559 						source7 += pitch;
4560 					}
4561 				}
4562 				else if(internal.samples == 16)
4563 				{
4564 					for(int y = 0; y < height; y++)
4565 					{
4566 						for(int x = 0; x < width; x++)
4567 						{
4568 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4569 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4570 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4571 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4572 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4573 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4574 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4575 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4576 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4577 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4578 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4579 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4580 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4581 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4582 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4583 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4584 
4585 							c0 = AVERAGE(c0, c1);
4586 							c2 = AVERAGE(c2, c3);
4587 							c4 = AVERAGE(c4, c5);
4588 							c6 = AVERAGE(c6, c7);
4589 							c8 = AVERAGE(c8, c9);
4590 							cA = AVERAGE(cA, cB);
4591 							cC = AVERAGE(cC, cD);
4592 							cE = AVERAGE(cE, cF);
4593 							c0 = AVERAGE(c0, c2);
4594 							c4 = AVERAGE(c4, c6);
4595 							c8 = AVERAGE(c8, cA);
4596 							cC = AVERAGE(cC, cE);
4597 							c0 = AVERAGE(c0, c4);
4598 							c8 = AVERAGE(c8, cC);
4599 							c0 = AVERAGE(c0, c8);
4600 
4601 							*(unsigned int*)(source0 + 4 * x) = c0;
4602 						}
4603 
4604 						source0 += pitch;
4605 						source1 += pitch;
4606 						source2 += pitch;
4607 						source3 += pitch;
4608 						source4 += pitch;
4609 						source5 += pitch;
4610 						source6 += pitch;
4611 						source7 += pitch;
4612 						source8 += pitch;
4613 						source9 += pitch;
4614 						sourceA += pitch;
4615 						sourceB += pitch;
4616 						sourceC += pitch;
4617 						sourceD += pitch;
4618 						sourceE += pitch;
4619 						sourceF += pitch;
4620 					}
4621 				}
4622 				else ASSERT(false);
4623 
4624 				#undef AVERAGE
4625 			}
4626 		}
4627 		else if(internal.format == FORMAT_A16B16G16R16)
4628 		{
4629 			#if defined(__i386__) || defined(__x86_64__)
4630 				if(CPUID::supportsSSE2() && (width % 2) == 0)
4631 				{
4632 					if(internal.samples == 2)
4633 					{
4634 						for(int y = 0; y < height; y++)
4635 						{
4636 							for(int x = 0; x < width; x += 2)
4637 							{
4638 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4639 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4640 
4641 								c0 = _mm_avg_epu16(c0, c1);
4642 
4643 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4644 							}
4645 
4646 							source0 += pitch;
4647 							source1 += pitch;
4648 						}
4649 					}
4650 					else if(internal.samples == 4)
4651 					{
4652 						for(int y = 0; y < height; y++)
4653 						{
4654 							for(int x = 0; x < width; x += 2)
4655 							{
4656 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4657 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4658 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4659 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4660 
4661 								c0 = _mm_avg_epu16(c0, c1);
4662 								c2 = _mm_avg_epu16(c2, c3);
4663 								c0 = _mm_avg_epu16(c0, c2);
4664 
4665 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4666 							}
4667 
4668 							source0 += pitch;
4669 							source1 += pitch;
4670 							source2 += pitch;
4671 							source3 += pitch;
4672 						}
4673 					}
4674 					else if(internal.samples == 8)
4675 					{
4676 						for(int y = 0; y < height; y++)
4677 						{
4678 							for(int x = 0; x < width; x += 2)
4679 							{
4680 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4681 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4682 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4683 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4684 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4685 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4686 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4687 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4688 
4689 								c0 = _mm_avg_epu16(c0, c1);
4690 								c2 = _mm_avg_epu16(c2, c3);
4691 								c4 = _mm_avg_epu16(c4, c5);
4692 								c6 = _mm_avg_epu16(c6, c7);
4693 								c0 = _mm_avg_epu16(c0, c2);
4694 								c4 = _mm_avg_epu16(c4, c6);
4695 								c0 = _mm_avg_epu16(c0, c4);
4696 
4697 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4698 							}
4699 
4700 							source0 += pitch;
4701 							source1 += pitch;
4702 							source2 += pitch;
4703 							source3 += pitch;
4704 							source4 += pitch;
4705 							source5 += pitch;
4706 							source6 += pitch;
4707 							source7 += pitch;
4708 						}
4709 					}
4710 					else if(internal.samples == 16)
4711 					{
4712 						for(int y = 0; y < height; y++)
4713 						{
4714 							for(int x = 0; x < width; x += 2)
4715 							{
4716 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4717 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4718 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4719 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4720 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4721 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4722 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4723 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4724 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
4725 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
4726 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
4727 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
4728 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
4729 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
4730 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
4731 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
4732 
4733 								c0 = _mm_avg_epu16(c0, c1);
4734 								c2 = _mm_avg_epu16(c2, c3);
4735 								c4 = _mm_avg_epu16(c4, c5);
4736 								c6 = _mm_avg_epu16(c6, c7);
4737 								c8 = _mm_avg_epu16(c8, c9);
4738 								cA = _mm_avg_epu16(cA, cB);
4739 								cC = _mm_avg_epu16(cC, cD);
4740 								cE = _mm_avg_epu16(cE, cF);
4741 								c0 = _mm_avg_epu16(c0, c2);
4742 								c4 = _mm_avg_epu16(c4, c6);
4743 								c8 = _mm_avg_epu16(c8, cA);
4744 								cC = _mm_avg_epu16(cC, cE);
4745 								c0 = _mm_avg_epu16(c0, c4);
4746 								c8 = _mm_avg_epu16(c8, cC);
4747 								c0 = _mm_avg_epu16(c0, c8);
4748 
4749 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4750 							}
4751 
4752 							source0 += pitch;
4753 							source1 += pitch;
4754 							source2 += pitch;
4755 							source3 += pitch;
4756 							source4 += pitch;
4757 							source5 += pitch;
4758 							source6 += pitch;
4759 							source7 += pitch;
4760 							source8 += pitch;
4761 							source9 += pitch;
4762 							sourceA += pitch;
4763 							sourceB += pitch;
4764 							sourceC += pitch;
4765 							sourceD += pitch;
4766 							sourceE += pitch;
4767 							sourceF += pitch;
4768 						}
4769 					}
4770 					else ASSERT(false);
4771 				}
4772 				else
4773 			#endif
4774 			{
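				// Per-channel 16-bit average with round-up, analogous to _mm_avg_epu16.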
4775 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4776 
4777 				if(internal.samples == 2)
4778 				{
4779 					for(int y = 0; y < height; y++)
4780 					{
4781 						for(int x = 0; x < 2 * width; x++)
4782 						{
4783 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4784 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4785 
4786 							c0 = AVERAGE(c0, c1);
4787 
4788 							*(unsigned int*)(source0 + 4 * x) = c0;
4789 						}
4790 
4791 						source0 += pitch;
4792 						source1 += pitch;
4793 					}
4794 				}
4795 				else if(internal.samples == 4)
4796 				{
4797 					for(int y = 0; y < height; y++)
4798 					{
4799 						for(int x = 0; x < 2 * width; x++)
4800 						{
4801 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4802 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4803 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4804 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4805 
4806 							c0 = AVERAGE(c0, c1);
4807 							c2 = AVERAGE(c2, c3);
4808 							c0 = AVERAGE(c0, c2);
4809 
4810 							*(unsigned int*)(source0 + 4 * x) = c0;
4811 						}
4812 
4813 						source0 += pitch;
4814 						source1 += pitch;
4815 						source2 += pitch;
4816 						source3 += pitch;
4817 					}
4818 				}
4819 				else if(internal.samples == 8)
4820 				{
4821 					for(int y = 0; y < height; y++)
4822 					{
4823 						for(int x = 0; x < 2 * width; x++)
4824 						{
4825 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4826 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4827 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4828 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4829 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4830 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4831 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4832 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4833 
4834 							c0 = AVERAGE(c0, c1);
4835 							c2 = AVERAGE(c2, c3);
4836 							c4 = AVERAGE(c4, c5);
4837 							c6 = AVERAGE(c6, c7);
4838 							c0 = AVERAGE(c0, c2);
4839 							c4 = AVERAGE(c4, c6);
4840 							c0 = AVERAGE(c0, c4);
4841 
4842 							*(unsigned int*)(source0 + 4 * x) = c0;
4843 						}
4844 
4845 						source0 += pitch;
4846 						source1 += pitch;
4847 						source2 += pitch;
4848 						source3 += pitch;
4849 						source4 += pitch;
4850 						source5 += pitch;
4851 						source6 += pitch;
4852 						source7 += pitch;
4853 					}
4854 				}
4855 				else if(internal.samples == 16)
4856 				{
4857 					for(int y = 0; y < height; y++)
4858 					{
4859 						for(int x = 0; x < 2 * width; x++)
4860 						{
4861 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4862 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4863 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4864 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4865 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4866 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4867 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4868 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4869 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4870 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4871 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4872 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4873 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4874 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4875 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4876 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4877 
4878 							c0 = AVERAGE(c0, c1);
4879 							c2 = AVERAGE(c2, c3);
4880 							c4 = AVERAGE(c4, c5);
4881 							c6 = AVERAGE(c6, c7);
4882 							c8 = AVERAGE(c8, c9);
4883 							cA = AVERAGE(cA, cB);
4884 							cC = AVERAGE(cC, cD);
4885 							cE = AVERAGE(cE, cF);
4886 							c0 = AVERAGE(c0, c2);
4887 							c4 = AVERAGE(c4, c6);
4888 							c8 = AVERAGE(c8, cA);
4889 							cC = AVERAGE(cC, cE);
4890 							c0 = AVERAGE(c0, c4);
4891 							c8 = AVERAGE(c8, cC);
4892 							c0 = AVERAGE(c0, c8);
4893 
4894 							*(unsigned int*)(source0 + 4 * x) = c0;
4895 						}
4896 
4897 						source0 += pitch;
4898 						source1 += pitch;
4899 						source2 += pitch;
4900 						source3 += pitch;
4901 						source4 += pitch;
4902 						source5 += pitch;
4903 						source6 += pitch;
4904 						source7 += pitch;
4905 						source8 += pitch;
4906 						source9 += pitch;
4907 						sourceA += pitch;
4908 						sourceB += pitch;
4909 						sourceC += pitch;
4910 						sourceD += pitch;
4911 						sourceE += pitch;
4912 						sourceF += pitch;
4913 					}
4914 				}
4915 				else ASSERT(false);
4916 
4917 				#undef AVERAGE
4918 			}
4919 		}
4920 		else if(internal.format == FORMAT_R32F)
4921 		{
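			// Single-channel float resolve. The SSE path below processes four pixels per
			// 16-byte vector, so it requires a multiple-of-4 width and assumes 16-byte-aligned
			// rows; otherwise the plain scalar loop is used.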
4922 			#if defined(__i386__) || defined(__x86_64__)
4923 				if(CPUID::supportsSSE() && (width % 4) == 0)
4924 				{
4925 					if(internal.samples == 2)
4926 					{
4927 						for(int y = 0; y < height; y++)
4928 						{
4929 							for(int x = 0; x < width; x += 4)
4930 							{
4931 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4932 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4933 
4934 								c0 = _mm_add_ps(c0, c1);
4935 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4936 
4937 								_mm_store_ps((float*)(source0 + 4 * x), c0);
4938 							}
4939 
4940 							source0 += pitch;
4941 							source1 += pitch;
4942 						}
4943 					}
4944 					else if(internal.samples == 4)
4945 					{
4946 						for(int y = 0; y < height; y++)
4947 						{
4948 							for(int x = 0; x < width; x += 4)
4949 							{
4950 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4951 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4952 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4953 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4954 
4955 								c0 = _mm_add_ps(c0, c1);
4956 								c2 = _mm_add_ps(c2, c3);
4957 								c0 = _mm_add_ps(c0, c2);
4958 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4959 
4960 								_mm_store_ps((float*)(source0 + 4 * x), c0);
4961 							}
4962 
4963 							source0 += pitch;
4964 							source1 += pitch;
4965 							source2 += pitch;
4966 							source3 += pitch;
4967 						}
4968 					}
4969 					else if(internal.samples == 8)
4970 					{
4971 						for(int y = 0; y < height; y++)
4972 						{
4973 							for(int x = 0; x < width; x += 4)
4974 							{
4975 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4976 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4977 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4978 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4979 								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
4980 								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
4981 								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
4982 								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
4983 
4984 								c0 = _mm_add_ps(c0, c1);
4985 								c2 = _mm_add_ps(c2, c3);
4986 								c4 = _mm_add_ps(c4, c5);
4987 								c6 = _mm_add_ps(c6, c7);
4988 								c0 = _mm_add_ps(c0, c2);
4989 								c4 = _mm_add_ps(c4, c6);
4990 								c0 = _mm_add_ps(c0, c4);
4991 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4992 
4993 								_mm_store_ps((float*)(source0 + 4 * x), c0);
4994 							}
4995 
4996 							source0 += pitch;
4997 							source1 += pitch;
4998 							source2 += pitch;
4999 							source3 += pitch;
5000 							source4 += pitch;
5001 							source5 += pitch;
5002 							source6 += pitch;
5003 							source7 += pitch;
5004 						}
5005 					}
5006 					else if(internal.samples == 16)
5007 					{
5008 						for(int y = 0; y < height; y++)
5009 						{
5010 							for(int x = 0; x < width; x += 4)
5011 							{
5012 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
5013 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
5014 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
5015 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
5016 								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
5017 								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
5018 								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
5019 								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
5020 								__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
5021 								__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
5022 								__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
5023 								__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
5024 								__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
5025 								__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
5026 								__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
5027 								__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
5028 
5029 								c0 = _mm_add_ps(c0, c1);
5030 								c2 = _mm_add_ps(c2, c3);
5031 								c4 = _mm_add_ps(c4, c5);
5032 								c6 = _mm_add_ps(c6, c7);
5033 								c8 = _mm_add_ps(c8, c9);
5034 								cA = _mm_add_ps(cA, cB);
5035 								cC = _mm_add_ps(cC, cD);
5036 								cE = _mm_add_ps(cE, cF);
5037 								c0 = _mm_add_ps(c0, c2);
5038 								c4 = _mm_add_ps(c4, c6);
5039 								c8 = _mm_add_ps(c8, cA);
5040 								cC = _mm_add_ps(cC, cE);
5041 								c0 = _mm_add_ps(c0, c4);
5042 								c8 = _mm_add_ps(c8, cC);
5043 								c0 = _mm_add_ps(c0, c8);
5044 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5045 
5046 								_mm_store_ps((float*)(source0 + 4 * x), c0);
5047 							}
5048 
5049 							source0 += pitch;
5050 							source1 += pitch;
5051 							source2 += pitch;
5052 							source3 += pitch;
5053 							source4 += pitch;
5054 							source5 += pitch;
5055 							source6 += pitch;
5056 							source7 += pitch;
5057 							source8 += pitch;
5058 							source9 += pitch;
5059 							sourceA += pitch;
5060 							sourceB += pitch;
5061 							sourceC += pitch;
5062 							sourceD += pitch;
5063 							sourceE += pitch;
5064 							sourceF += pitch;
5065 						}
5066 					}
5067 					else ASSERT(false);
5068 				}
5069 				else
5070 			#endif
5071 			{
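				// Scalar fallback: sum the samples for each pixel and scale by 1/N.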
5072 				if(internal.samples == 2)
5073 				{
5074 					for(int y = 0; y < height; y++)
5075 					{
5076 						for(int x = 0; x < width; x++)
5077 						{
5078 							float c0 = *(float*)(source0 + 4 * x);
5079 							float c1 = *(float*)(source1 + 4 * x);
5080 
5081 							c0 = c0 + c1;
5082 							c0 *= 1.0f / 2.0f;
5083 
5084 							*(float*)(source0 + 4 * x) = c0;
5085 						}
5086 
5087 						source0 += pitch;
5088 						source1 += pitch;
5089 					}
5090 				}
5091 				else if(internal.samples == 4)
5092 				{
5093 					for(int y = 0; y < height; y++)
5094 					{
5095 						for(int x = 0; x < width; x++)
5096 						{
5097 							float c0 = *(float*)(source0 + 4 * x);
5098 							float c1 = *(float*)(source1 + 4 * x);
5099 							float c2 = *(float*)(source2 + 4 * x);
5100 							float c3 = *(float*)(source3 + 4 * x);
5101 
5102 							c0 = c0 + c1;
5103 							c2 = c2 + c3;
5104 							c0 = c0 + c2;
5105 							c0 *= 1.0f / 4.0f;
5106 
5107 							*(float*)(source0 + 4 * x) = c0;
5108 						}
5109 
5110 						source0 += pitch;
5111 						source1 += pitch;
5112 						source2 += pitch;
5113 						source3 += pitch;
5114 					}
5115 				}
5116 				else if(internal.samples == 8)
5117 				{
5118 					for(int y = 0; y < height; y++)
5119 					{
5120 						for(int x = 0; x < width; x++)
5121 						{
5122 							float c0 = *(float*)(source0 + 4 * x);
5123 							float c1 = *(float*)(source1 + 4 * x);
5124 							float c2 = *(float*)(source2 + 4 * x);
5125 							float c3 = *(float*)(source3 + 4 * x);
5126 							float c4 = *(float*)(source4 + 4 * x);
5127 							float c5 = *(float*)(source5 + 4 * x);
5128 							float c6 = *(float*)(source6 + 4 * x);
5129 							float c7 = *(float*)(source7 + 4 * x);
5130 
5131 							c0 = c0 + c1;
5132 							c2 = c2 + c3;
5133 							c4 = c4 + c5;
5134 							c6 = c6 + c7;
5135 							c0 = c0 + c2;
5136 							c4 = c4 + c6;
5137 							c0 = c0 + c4;
5138 							c0 *= 1.0f / 8.0f;
5139 
5140 							*(float*)(source0 + 4 * x) = c0;
5141 						}
5142 
5143 						source0 += pitch;
5144 						source1 += pitch;
5145 						source2 += pitch;
5146 						source3 += pitch;
5147 						source4 += pitch;
5148 						source5 += pitch;
5149 						source6 += pitch;
5150 						source7 += pitch;
5151 					}
5152 				}
5153 				else if(internal.samples == 16)
5154 				{
5155 					for(int y = 0; y < height; y++)
5156 					{
5157 						for(int x = 0; x < width; x++)
5158 						{
5159 							float c0 = *(float*)(source0 + 4 * x);
5160 							float c1 = *(float*)(source1 + 4 * x);
5161 							float c2 = *(float*)(source2 + 4 * x);
5162 							float c3 = *(float*)(source3 + 4 * x);
5163 							float c4 = *(float*)(source4 + 4 * x);
5164 							float c5 = *(float*)(source5 + 4 * x);
5165 							float c6 = *(float*)(source6 + 4 * x);
5166 							float c7 = *(float*)(source7 + 4 * x);
5167 							float c8 = *(float*)(source8 + 4 * x);
5168 							float c9 = *(float*)(source9 + 4 * x);
5169 							float cA = *(float*)(sourceA + 4 * x);
5170 							float cB = *(float*)(sourceB + 4 * x);
5171 							float cC = *(float*)(sourceC + 4 * x);
5172 							float cD = *(float*)(sourceD + 4 * x);
5173 							float cE = *(float*)(sourceE + 4 * x);
5174 							float cF = *(float*)(sourceF + 4 * x);
5175 
5176 							c0 = c0 + c1;
5177 							c2 = c2 + c3;
5178 							c4 = c4 + c5;
5179 							c6 = c6 + c7;
5180 							c8 = c8 + c9;
5181 							cA = cA + cB;
5182 							cC = cC + cD;
5183 							cE = cE + cF;
5184 							c0 = c0 + c2;
5185 							c4 = c4 + c6;
5186 							c8 = c8 + cA;
5187 							cC = cC + cE;
5188 							c0 = c0 + c4;
5189 							c8 = c8 + cC;
5190 							c0 = c0 + c8;
5191 							c0 *= 1.0f / 16.0f;
5192 
5193 							*(float*)(source0 + 4 * x) = c0;
5194 						}
5195 
5196 						source0 += pitch;
5197 						source1 += pitch;
5198 						source2 += pitch;
5199 						source3 += pitch;
5200 						source4 += pitch;
5201 						source5 += pitch;
5202 						source6 += pitch;
5203 						source7 += pitch;
5204 						source8 += pitch;
5205 						source9 += pitch;
5206 						sourceA += pitch;
5207 						sourceB += pitch;
5208 						sourceC += pitch;
5209 						sourceD += pitch;
5210 						sourceE += pitch;
5211 						sourceF += pitch;
5212 					}
5213 				}
5214 				else ASSERT(false);
5215 			}
5216 		}
5217 		else if(internal.format == FORMAT_G32R32F)
5218 		{
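			// Two-channel float resolve. Each 16-byte vector holds two R/G pixels, so the SSE
			// path steps x by 2 and requires an even width.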
5219 			#if defined(__i386__) || defined(__x86_64__)
5220 				if(CPUID::supportsSSE() && (width % 2) == 0)
5221 				{
5222 					if(internal.samples == 2)
5223 					{
5224 						for(int y = 0; y < height; y++)
5225 						{
5226 							for(int x = 0; x < width; x += 2)
5227 							{
5228 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5229 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5230 
5231 								c0 = _mm_add_ps(c0, c1);
5232 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
5233 
5234 								_mm_store_ps((float*)(source0 + 8 * x), c0);
5235 							}
5236 
5237 							source0 += pitch;
5238 							source1 += pitch;
5239 						}
5240 					}
5241 					else if(internal.samples == 4)
5242 					{
5243 						for(int y = 0; y < height; y++)
5244 						{
5245 							for(int x = 0; x < width; x += 2)
5246 							{
5247 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5248 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5249 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
5250 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
5251 
5252 								c0 = _mm_add_ps(c0, c1);
5253 								c2 = _mm_add_ps(c2, c3);
5254 								c0 = _mm_add_ps(c0, c2);
5255 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
5256 
5257 								_mm_store_ps((float*)(source0 + 8 * x), c0);
5258 							}
5259 
5260 							source0 += pitch;
5261 							source1 += pitch;
5262 							source2 += pitch;
5263 							source3 += pitch;
5264 						}
5265 					}
5266 					else if(internal.samples == 8)
5267 					{
5268 						for(int y = 0; y < height; y++)
5269 						{
5270 							for(int x = 0; x < width; x += 2)
5271 							{
5272 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5273 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5274 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
5275 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
5276 								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
5277 								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
5278 								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
5279 								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
5280 
5281 								c0 = _mm_add_ps(c0, c1);
5282 								c2 = _mm_add_ps(c2, c3);
5283 								c4 = _mm_add_ps(c4, c5);
5284 								c6 = _mm_add_ps(c6, c7);
5285 								c0 = _mm_add_ps(c0, c2);
5286 								c4 = _mm_add_ps(c4, c6);
5287 								c0 = _mm_add_ps(c0, c4);
5288 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
5289 
5290 								_mm_store_ps((float*)(source0 + 8 * x), c0);
5291 							}
5292 
5293 							source0 += pitch;
5294 							source1 += pitch;
5295 							source2 += pitch;
5296 							source3 += pitch;
5297 							source4 += pitch;
5298 							source5 += pitch;
5299 							source6 += pitch;
5300 							source7 += pitch;
5301 						}
5302 					}
5303 					else if(internal.samples == 16)
5304 					{
5305 						for(int y = 0; y < height; y++)
5306 						{
5307 							for(int x = 0; x < width; x += 2)
5308 							{
5309 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5310 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5311 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
5312 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
5313 								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
5314 								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
5315 								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
5316 								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
5317 								__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
5318 								__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
5319 								__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
5320 								__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
5321 								__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
5322 								__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
5323 								__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
5324 								__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
5325 
5326 								c0 = _mm_add_ps(c0, c1);
5327 								c2 = _mm_add_ps(c2, c3);
5328 								c4 = _mm_add_ps(c4, c5);
5329 								c6 = _mm_add_ps(c6, c7);
5330 								c8 = _mm_add_ps(c8, c9);
5331 								cA = _mm_add_ps(cA, cB);
5332 								cC = _mm_add_ps(cC, cD);
5333 								cE = _mm_add_ps(cE, cF);
5334 								c0 = _mm_add_ps(c0, c2);
5335 								c4 = _mm_add_ps(c4, c6);
5336 								c8 = _mm_add_ps(c8, cA);
5337 								cC = _mm_add_ps(cC, cE);
5338 								c0 = _mm_add_ps(c0, c4);
5339 								c8 = _mm_add_ps(c8, cC);
5340 								c0 = _mm_add_ps(c0, c8);
5341 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5342 
5343 								_mm_store_ps((float*)(source0 + 8 * x), c0);
5344 							}
5345 
5346 							source0 += pitch;
5347 							source1 += pitch;
5348 							source2 += pitch;
5349 							source3 += pitch;
5350 							source4 += pitch;
5351 							source5 += pitch;
5352 							source6 += pitch;
5353 							source7 += pitch;
5354 							source8 += pitch;
5355 							source9 += pitch;
5356 							sourceA += pitch;
5357 							sourceB += pitch;
5358 							sourceC += pitch;
5359 							sourceD += pitch;
5360 							sourceE += pitch;
5361 							sourceF += pitch;
5362 						}
5363 					}
5364 					else ASSERT(false);
5365 				}
5366 				else
5367 			#endif
5368 			{
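				// Scalar fallback: each row is treated as 2 * width independent floats
				// (two channels per pixel).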
5369 				if(internal.samples == 2)
5370 				{
5371 					for(int y = 0; y < height; y++)
5372 					{
5373 						for(int x = 0; x < 2 * width; x++)
5374 						{
5375 							float c0 = *(float*)(source0 + 4 * x);
5376 							float c1 = *(float*)(source1 + 4 * x);
5377 
5378 							c0 = c0 + c1;
5379 							c0 *= 1.0f / 2.0f;
5380 
5381 							*(float*)(source0 + 4 * x) = c0;
5382 						}
5383 
5384 						source0 += pitch;
5385 						source1 += pitch;
5386 					}
5387 				}
5388 				else if(internal.samples == 4)
5389 				{
5390 					for(int y = 0; y < height; y++)
5391 					{
5392 						for(int x = 0; x < 2 * width; x++)
5393 						{
5394 							float c0 = *(float*)(source0 + 4 * x);
5395 							float c1 = *(float*)(source1 + 4 * x);
5396 							float c2 = *(float*)(source2 + 4 * x);
5397 							float c3 = *(float*)(source3 + 4 * x);
5398 
5399 							c0 = c0 + c1;
5400 							c2 = c2 + c3;
5401 							c0 = c0 + c2;
5402 							c0 *= 1.0f / 4.0f;
5403 
5404 							*(float*)(source0 + 4 * x) = c0;
5405 						}
5406 
5407 						source0 += pitch;
5408 						source1 += pitch;
5409 						source2 += pitch;
5410 						source3 += pitch;
5411 					}
5412 				}
5413 				else if(internal.samples == 8)
5414 				{
5415 					for(int y = 0; y < height; y++)
5416 					{
5417 						for(int x = 0; x < 2 * width; x++)
5418 						{
5419 							float c0 = *(float*)(source0 + 4 * x);
5420 							float c1 = *(float*)(source1 + 4 * x);
5421 							float c2 = *(float*)(source2 + 4 * x);
5422 							float c3 = *(float*)(source3 + 4 * x);
5423 							float c4 = *(float*)(source4 + 4 * x);
5424 							float c5 = *(float*)(source5 + 4 * x);
5425 							float c6 = *(float*)(source6 + 4 * x);
5426 							float c7 = *(float*)(source7 + 4 * x);
5427 
5428 							c0 = c0 + c1;
5429 							c2 = c2 + c3;
5430 							c4 = c4 + c5;
5431 							c6 = c6 + c7;
5432 							c0 = c0 + c2;
5433 							c4 = c4 + c6;
5434 							c0 = c0 + c4;
5435 							c0 *= 1.0f / 8.0f;
5436 
5437 							*(float*)(source0 + 4 * x) = c0;
5438 						}
5439 
5440 						source0 += pitch;
5441 						source1 += pitch;
5442 						source2 += pitch;
5443 						source3 += pitch;
5444 						source4 += pitch;
5445 						source5 += pitch;
5446 						source6 += pitch;
5447 						source7 += pitch;
5448 					}
5449 				}
5450 				else if(internal.samples == 16)
5451 				{
5452 					for(int y = 0; y < height; y++)
5453 					{
5454 						for(int x = 0; x < 2 * width; x++)
5455 						{
5456 							float c0 = *(float*)(source0 + 4 * x);
5457 							float c1 = *(float*)(source1 + 4 * x);
5458 							float c2 = *(float*)(source2 + 4 * x);
5459 							float c3 = *(float*)(source3 + 4 * x);
5460 							float c4 = *(float*)(source4 + 4 * x);
5461 							float c5 = *(float*)(source5 + 4 * x);
5462 							float c6 = *(float*)(source6 + 4 * x);
5463 							float c7 = *(float*)(source7 + 4 * x);
5464 							float c8 = *(float*)(source8 + 4 * x);
5465 							float c9 = *(float*)(source9 + 4 * x);
5466 							float cA = *(float*)(sourceA + 4 * x);
5467 							float cB = *(float*)(sourceB + 4 * x);
5468 							float cC = *(float*)(sourceC + 4 * x);
5469 							float cD = *(float*)(sourceD + 4 * x);
5470 							float cE = *(float*)(sourceE + 4 * x);
5471 							float cF = *(float*)(sourceF + 4 * x);
5472 
5473 							c0 = c0 + c1;
5474 							c2 = c2 + c3;
5475 							c4 = c4 + c5;
5476 							c6 = c6 + c7;
5477 							c8 = c8 + c9;
5478 							cA = cA + cB;
5479 							cC = cC + cD;
5480 							cE = cE + cF;
5481 							c0 = c0 + c2;
5482 							c4 = c4 + c6;
5483 							c8 = c8 + cA;
5484 							cC = cC + cE;
5485 							c0 = c0 + c4;
5486 							c8 = c8 + cC;
5487 							c0 = c0 + c8;
5488 							c0 *= 1.0f / 16.0f;
5489 
5490 							*(float*)(source0 + 4 * x) = c0;
5491 						}
5492 
5493 						source0 += pitch;
5494 						source1 += pitch;
5495 						source2 += pitch;
5496 						source3 += pitch;
5497 						source4 += pitch;
5498 						source5 += pitch;
5499 						source6 += pitch;
5500 						source7 += pitch;
5501 						source8 += pitch;
5502 						source9 += pitch;
5503 						sourceA += pitch;
5504 						sourceB += pitch;
5505 						sourceC += pitch;
5506 						sourceD += pitch;
5507 						sourceE += pitch;
5508 						sourceF += pitch;
5509 					}
5510 				}
5511 				else ASSERT(false);
5512 			}
5513 		}
5514 		else if(internal.format == FORMAT_A32B32G32R32F ||
5515 		        internal.format == FORMAT_X32B32G32R32F ||
5516 		        internal.format == FORMAT_X32B32G32R32F_UNSIGNED)
5517 		{
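			// Four-channel float resolve. One pixel fills a whole 16-byte vector, so the SSE
			// path only assumes 16-byte-aligned rows and needs no width restriction.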
5518 			#if defined(__i386__) || defined(__x86_64__)
5519 				if(CPUID::supportsSSE())
5520 				{
5521 					if(internal.samples == 2)
5522 					{
5523 						for(int y = 0; y < height; y++)
5524 						{
5525 							for(int x = 0; x < width; x++)
5526 							{
5527 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5528 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5529 
5530 								c0 = _mm_add_ps(c0, c1);
5531 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
5532 
5533 								_mm_store_ps((float*)(source0 + 16 * x), c0);
5534 							}
5535 
5536 							source0 += pitch;
5537 							source1 += pitch;
5538 						}
5539 					}
5540 					else if(internal.samples == 4)
5541 					{
5542 						for(int y = 0; y < height; y++)
5543 						{
5544 							for(int x = 0; x < width; x++)
5545 							{
5546 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5547 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5548 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5549 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5550 
5551 								c0 = _mm_add_ps(c0, c1);
5552 								c2 = _mm_add_ps(c2, c3);
5553 								c0 = _mm_add_ps(c0, c2);
5554 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
5555 
5556 								_mm_store_ps((float*)(source0 + 16 * x), c0);
5557 							}
5558 
5559 							source0 += pitch;
5560 							source1 += pitch;
5561 							source2 += pitch;
5562 							source3 += pitch;
5563 						}
5564 					}
5565 					else if(internal.samples == 8)
5566 					{
5567 						for(int y = 0; y < height; y++)
5568 						{
5569 							for(int x = 0; x < width; x++)
5570 							{
5571 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5572 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5573 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5574 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5575 								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5576 								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5577 								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5578 								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5579 
5580 								c0 = _mm_add_ps(c0, c1);
5581 								c2 = _mm_add_ps(c2, c3);
5582 								c4 = _mm_add_ps(c4, c5);
5583 								c6 = _mm_add_ps(c6, c7);
5584 								c0 = _mm_add_ps(c0, c2);
5585 								c4 = _mm_add_ps(c4, c6);
5586 								c0 = _mm_add_ps(c0, c4);
5587 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
5588 
5589 								_mm_store_ps((float*)(source0 + 16 * x), c0);
5590 							}
5591 
5592 							source0 += pitch;
5593 							source1 += pitch;
5594 							source2 += pitch;
5595 							source3 += pitch;
5596 							source4 += pitch;
5597 							source5 += pitch;
5598 							source6 += pitch;
5599 							source7 += pitch;
5600 						}
5601 					}
5602 					else if(internal.samples == 16)
5603 					{
5604 						for(int y = 0; y < height; y++)
5605 						{
5606 							for(int x = 0; x < width; x++)
5607 							{
5608 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5609 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5610 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5611 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5612 								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5613 								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5614 								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5615 								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5616 								__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
5617 								__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
5618 								__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
5619 								__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
5620 								__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
5621 								__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
5622 								__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
5623 								__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
5624 
5625 								c0 = _mm_add_ps(c0, c1);
5626 								c2 = _mm_add_ps(c2, c3);
5627 								c4 = _mm_add_ps(c4, c5);
5628 								c6 = _mm_add_ps(c6, c7);
5629 								c8 = _mm_add_ps(c8, c9);
5630 								cA = _mm_add_ps(cA, cB);
5631 								cC = _mm_add_ps(cC, cD);
5632 								cE = _mm_add_ps(cE, cF);
5633 								c0 = _mm_add_ps(c0, c2);
5634 								c4 = _mm_add_ps(c4, c6);
5635 								c8 = _mm_add_ps(c8, cA);
5636 								cC = _mm_add_ps(cC, cE);
5637 								c0 = _mm_add_ps(c0, c4);
5638 								c8 = _mm_add_ps(c8, cC);
5639 								c0 = _mm_add_ps(c0, c8);
5640 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5641 
5642 								_mm_store_ps((float*)(source0 + 16 * x), c0);
5643 							}
5644 
5645 							source0 += pitch;
5646 							source1 += pitch;
5647 							source2 += pitch;
5648 							source3 += pitch;
5649 							source4 += pitch;
5650 							source5 += pitch;
5651 							source6 += pitch;
5652 							source7 += pitch;
5653 							source8 += pitch;
5654 							source9 += pitch;
5655 							sourceA += pitch;
5656 							sourceB += pitch;
5657 							sourceC += pitch;
5658 							sourceD += pitch;
5659 							sourceE += pitch;
5660 							sourceF += pitch;
5661 						}
5662 					}
5663 					else ASSERT(false);
5664 				}
5665 				else
5666 			#endif
5667 			{
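				// Scalar fallback: each row is treated as 4 * width independent floats
				// (four channels per pixel).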
5668 				if(internal.samples == 2)
5669 				{
5670 					for(int y = 0; y < height; y++)
5671 					{
5672 						for(int x = 0; x < 4 * width; x++)
5673 						{
5674 							float c0 = *(float*)(source0 + 4 * x);
5675 							float c1 = *(float*)(source1 + 4 * x);
5676 
5677 							c0 = c0 + c1;
5678 							c0 *= 1.0f / 2.0f;
5679 
5680 							*(float*)(source0 + 4 * x) = c0;
5681 						}
5682 
5683 						source0 += pitch;
5684 						source1 += pitch;
5685 					}
5686 				}
5687 				else if(internal.samples == 4)
5688 				{
5689 					for(int y = 0; y < height; y++)
5690 					{
5691 						for(int x = 0; x < 4 * width; x++)
5692 						{
5693 							float c0 = *(float*)(source0 + 4 * x);
5694 							float c1 = *(float*)(source1 + 4 * x);
5695 							float c2 = *(float*)(source2 + 4 * x);
5696 							float c3 = *(float*)(source3 + 4 * x);
5697 
5698 							c0 = c0 + c1;
5699 							c2 = c2 + c3;
5700 							c0 = c0 + c2;
5701 							c0 *= 1.0f / 4.0f;
5702 
5703 							*(float*)(source0 + 4 * x) = c0;
5704 						}
5705 
5706 						source0 += pitch;
5707 						source1 += pitch;
5708 						source2 += pitch;
5709 						source3 += pitch;
5710 					}
5711 				}
5712 				else if(internal.samples == 8)
5713 				{
5714 					for(int y = 0; y < height; y++)
5715 					{
5716 						for(int x = 0; x < 4 * width; x++)
5717 						{
5718 							float c0 = *(float*)(source0 + 4 * x);
5719 							float c1 = *(float*)(source1 + 4 * x);
5720 							float c2 = *(float*)(source2 + 4 * x);
5721 							float c3 = *(float*)(source3 + 4 * x);
5722 							float c4 = *(float*)(source4 + 4 * x);
5723 							float c5 = *(float*)(source5 + 4 * x);
5724 							float c6 = *(float*)(source6 + 4 * x);
5725 							float c7 = *(float*)(source7 + 4 * x);
5726 
5727 							c0 = c0 + c1;
5728 							c2 = c2 + c3;
5729 							c4 = c4 + c5;
5730 							c6 = c6 + c7;
5731 							c0 = c0 + c2;
5732 							c4 = c4 + c6;
5733 							c0 = c0 + c4;
5734 							c0 *= 1.0f / 8.0f;
5735 
5736 							*(float*)(source0 + 4 * x) = c0;
5737 						}
5738 
5739 						source0 += pitch;
5740 						source1 += pitch;
5741 						source2 += pitch;
5742 						source3 += pitch;
5743 						source4 += pitch;
5744 						source5 += pitch;
5745 						source6 += pitch;
5746 						source7 += pitch;
5747 					}
5748 				}
5749 				else if(internal.samples == 16)
5750 				{
5751 					for(int y = 0; y < height; y++)
5752 					{
5753 						for(int x = 0; x < 4 * width; x++)
5754 						{
5755 							float c0 = *(float*)(source0 + 4 * x);
5756 							float c1 = *(float*)(source1 + 4 * x);
5757 							float c2 = *(float*)(source2 + 4 * x);
5758 							float c3 = *(float*)(source3 + 4 * x);
5759 							float c4 = *(float*)(source4 + 4 * x);
5760 							float c5 = *(float*)(source5 + 4 * x);
5761 							float c6 = *(float*)(source6 + 4 * x);
5762 							float c7 = *(float*)(source7 + 4 * x);
5763 							float c8 = *(float*)(source8 + 4 * x);
5764 							float c9 = *(float*)(source9 + 4 * x);
5765 							float cA = *(float*)(sourceA + 4 * x);
5766 							float cB = *(float*)(sourceB + 4 * x);
5767 							float cC = *(float*)(sourceC + 4 * x);
5768 							float cD = *(float*)(sourceD + 4 * x);
5769 							float cE = *(float*)(sourceE + 4 * x);
5770 							float cF = *(float*)(sourceF + 4 * x);
5771 
5772 							c0 = c0 + c1;
5773 							c2 = c2 + c3;
5774 							c4 = c4 + c5;
5775 							c6 = c6 + c7;
5776 							c8 = c8 + c9;
5777 							cA = cA + cB;
5778 							cC = cC + cD;
5779 							cE = cE + cF;
5780 							c0 = c0 + c2;
5781 							c4 = c4 + c6;
5782 							c8 = c8 + cA;
5783 							cC = cC + cE;
5784 							c0 = c0 + c4;
5785 							c8 = c8 + cC;
5786 							c0 = c0 + c8;
5787 							c0 *= 1.0f / 16.0f;
5788 
5789 							*(float*)(source0 + 4 * x) = c0;
5790 						}
5791 
5792 						source0 += pitch;
5793 						source1 += pitch;
5794 						source2 += pitch;
5795 						source3 += pitch;
5796 						source4 += pitch;
5797 						source5 += pitch;
5798 						source6 += pitch;
5799 						source7 += pitch;
5800 						source8 += pitch;
5801 						source9 += pitch;
5802 						sourceA += pitch;
5803 						sourceB += pitch;
5804 						sourceC += pitch;
5805 						sourceD += pitch;
5806 						sourceE += pitch;
5807 						sourceF += pitch;
5808 					}
5809 				}
5810 				else ASSERT(false);
5811 			}
5812 		}
5813 		else if(internal.format == FORMAT_R5G6B5)
5814 		{
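			// 5:6:5 resolve. The SSE2 path splits each pixel into its red/blue bits (0xF81F)
			// and its green bits (0x07E0) so the fields can be averaged without carries
			// crossing field boundaries: red and blue each sit in a single byte and use
			// _mm_avg_epu8, while green straddles the byte boundary and uses _mm_avg_epu16.
			// The averaged halves are re-masked and OR'd back together.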
5815 			#if defined(__i386__) || defined(__x86_64__)
5816 				if(CPUID::supportsSSE2() && (width % 8) == 0)
5817 				{
5818 					if(internal.samples == 2)
5819 					{
5820 						for(int y = 0; y < height; y++)
5821 						{
5822 							for(int x = 0; x < width; x += 8)
5823 							{
5824 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5825 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5826 
5827 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5828 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5829 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5830 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5831 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5832 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5833 
5834 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5835 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5836 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5837 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5838 								c0 = _mm_or_si128(c0, c1);
5839 
5840 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5841 							}
5842 
5843 							source0 += pitch;
5844 							source1 += pitch;
5845 						}
5846 					}
5847 					else if(internal.samples == 4)
5848 					{
5849 						for(int y = 0; y < height; y++)
5850 						{
5851 							for(int x = 0; x < width; x += 8)
5852 							{
5853 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5854 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5855 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5856 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5857 
5858 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5859 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5860 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5861 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5862 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5863 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5864 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5865 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5866 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5867 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5868 
5869 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5870 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5871 								c0 = _mm_avg_epu8(c0, c2);
5872 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5873 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5874 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
5875 								c1 = _mm_avg_epu16(c1, c3);
5876 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5877 								c0 = _mm_or_si128(c0, c1);
5878 
5879 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5880 							}
5881 
5882 							source0 += pitch;
5883 							source1 += pitch;
5884 							source2 += pitch;
5885 							source3 += pitch;
5886 						}
5887 					}
5888 					else if(internal.samples == 8)
5889 					{
5890 						for(int y = 0; y < height; y++)
5891 						{
5892 							for(int x = 0; x < width; x += 8)
5893 							{
5894 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5895 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5896 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5897 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5898 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5899 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5900 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5901 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5902 
5903 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5904 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5905 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5906 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5907 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5908 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5909 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5910 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5911 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5912 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5913 								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5914 								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5915 								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5916 								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5917 								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5918 								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5919 								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5920 								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5921 
5922 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5923 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5924 								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5925 								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5926 								c0 = _mm_avg_epu8(c0, c2);
5927 								c4 = _mm_avg_epu8(c4, c6);
5928 								c0 = _mm_avg_epu8(c0, c4);
5929 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5930 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5931 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
5932 								c5 = _mm_avg_epu16(c4__g_, c5__g_);
5933 								c7 = _mm_avg_epu16(c6__g_, c7__g_);
5934 								c1 = _mm_avg_epu16(c1, c3);
5935 								c5 = _mm_avg_epu16(c5, c7);
5936 								c1 = _mm_avg_epu16(c1, c5);
5937 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5938 								c0 = _mm_or_si128(c0, c1);
5939 
5940 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5941 							}
5942 
5943 							source0 += pitch;
5944 							source1 += pitch;
5945 							source2 += pitch;
5946 							source3 += pitch;
5947 							source4 += pitch;
5948 							source5 += pitch;
5949 							source6 += pitch;
5950 							source7 += pitch;
5951 						}
5952 					}
5953 					else if(internal.samples == 16)
5954 					{
5955 						for(int y = 0; y < height; y++)
5956 						{
5957 							for(int x = 0; x < width; x += 8)
5958 							{
5959 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5960 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5961 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5962 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5963 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5964 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5965 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5966 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5967 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
5968 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
5969 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
5970 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
5971 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
5972 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
5973 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
5974 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
5975 
5976 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5977 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5978 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5979 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5980 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5981 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5982 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5983 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5984 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5985 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5986 								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5987 								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5988 								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5989 								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5990 								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5991 								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5992 								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5993 								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5994 								__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
5995 								__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
5996 								__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
5997 								__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
5998 								__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
5999 								__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
6000 								__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
6001 								__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
6002 								__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
6003 								__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
6004 								__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
6005 								__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
6006 								__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
6007 								__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
6008 								__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
6009 								__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
6010 
6011 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
6012 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
6013 								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
6014 								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
6015 								c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
6016 								cA = _mm_avg_epu8(cA_r_b, cB_r_b);
6017 								cC = _mm_avg_epu8(cC_r_b, cD_r_b);
6018 								cE = _mm_avg_epu8(cE_r_b, cF_r_b);
6019 								c0 = _mm_avg_epu8(c0, c2);
6020 								c4 = _mm_avg_epu8(c4, c6);
6021 								c8 = _mm_avg_epu8(c8, cA);
6022 								cC = _mm_avg_epu8(cC, cE);
6023 								c0 = _mm_avg_epu8(c0, c4);
6024 								c8 = _mm_avg_epu8(c8, cC);
6025 								c0 = _mm_avg_epu8(c0, c8);
6026 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
6027 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
6028 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
6029 								c5 = _mm_avg_epu16(c4__g_, c5__g_);
6030 								c7 = _mm_avg_epu16(c6__g_, c7__g_);
6031 								c9 = _mm_avg_epu16(c8__g_, c9__g_);
6032 								cB = _mm_avg_epu16(cA__g_, cB__g_);
6033 								cD = _mm_avg_epu16(cC__g_, cD__g_);
6034 								cF = _mm_avg_epu16(cE__g_, cF__g_);
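								// Green straddles the byte boundary, so the remaining reduction
								// steps use 16-bit averages as well, like the 2-, 4- and 8-sample
								// paths above.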
6035 								c1 = _mm_avg_epu16(c1, c3);
6036 								c5 = _mm_avg_epu16(c5, c7);
6037 								c9 = _mm_avg_epu16(c9, cB);
6038 								cD = _mm_avg_epu16(cD, cF);
6039 								c1 = _mm_avg_epu16(c1, c5);
6040 								c9 = _mm_avg_epu16(c9, cD);
6041 								c1 = _mm_avg_epu16(c1, c9);
6042 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
6043 								c0 = _mm_or_si128(c0, c1);
6044 
6045 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
6046 							}
6047 
6048 							source0 += pitch;
6049 							source1 += pitch;
6050 							source2 += pitch;
6051 							source3 += pitch;
6052 							source4 += pitch;
6053 							source5 += pitch;
6054 							source6 += pitch;
6055 							source7 += pitch;
6056 							source8 += pitch;
6057 							source9 += pitch;
6058 							sourceA += pitch;
6059 							sourceB += pitch;
6060 							sourceC += pitch;
6061 							sourceD += pitch;
6062 							sourceE += pitch;
6063 							sourceF += pitch;
6064 						}
6065 					}
6066 					else ASSERT(false);
6067 				}
6068 				else
6069 			#endif
6070 			{
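				// Scalar fallback: AVERAGE(x, y) is a carry-free, round-up average of a 5:6:5
				// pixel. The 0x7BEF mask clears the bit shifted down into each neighbouring
				// field, and 0x0821 adds the round-up bit to the least significant bit of each
				// of the red, green and blue fields.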
6071 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
6072 
6073 				if(internal.samples == 2)
6074 				{
6075 					for(int y = 0; y < height; y++)
6076 					{
6077 						for(int x = 0; x < width; x++)
6078 						{
6079 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
6080 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
6081 
6082 							c0 = AVERAGE(c0, c1);
6083 
6084 							*(unsigned short*)(source0 + 2 * x) = c0;
6085 						}
6086 
6087 						source0 += pitch;
6088 						source1 += pitch;
6089 					}
6090 				}
6091 				else if(internal.samples == 4)
6092 				{
6093 					for(int y = 0; y < height; y++)
6094 					{
6095 						for(int x = 0; x < width; x++)
6096 						{
6097 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
6098 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
6099 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
6100 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
6101 
6102 							c0 = AVERAGE(c0, c1);
6103 							c2 = AVERAGE(c2, c3);
6104 							c0 = AVERAGE(c0, c2);
6105 
6106 							*(unsigned short*)(source0 + 2 * x) = c0;
6107 						}
6108 
6109 						source0 += pitch;
6110 						source1 += pitch;
6111 						source2 += pitch;
6112 						source3 += pitch;
6113 					}
6114 				}
6115 				else if(internal.samples == 8)
6116 				{
6117 					for(int y = 0; y < height; y++)
6118 					{
6119 						for(int x = 0; x < width; x++)
6120 						{
6121 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
6122 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
6123 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
6124 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
6125 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
6126 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
6127 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
6128 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
6129 
6130 							c0 = AVERAGE(c0, c1);
6131 							c2 = AVERAGE(c2, c3);
6132 							c4 = AVERAGE(c4, c5);
6133 							c6 = AVERAGE(c6, c7);
6134 							c0 = AVERAGE(c0, c2);
6135 							c4 = AVERAGE(c4, c6);
6136 							c0 = AVERAGE(c0, c4);
6137 
6138 							*(unsigned short*)(source0 + 2 * x) = c0;
6139 						}
6140 
6141 						source0 += pitch;
6142 						source1 += pitch;
6143 						source2 += pitch;
6144 						source3 += pitch;
6145 						source4 += pitch;
6146 						source5 += pitch;
6147 						source6 += pitch;
6148 						source7 += pitch;
6149 					}
6150 				}
6151 				else if(internal.samples == 16)
6152 				{
6153 					for(int y = 0; y < height; y++)
6154 					{
6155 						for(int x = 0; x < width; x++)
6156 						{
6157 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
6158 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
6159 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
6160 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
6161 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
6162 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
6163 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
6164 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
6165 							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
6166 							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
6167 							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
6168 							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
6169 							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
6170 							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
6171 							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
6172 							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
6173 
6174 							c0 = AVERAGE(c0, c1);
6175 							c2 = AVERAGE(c2, c3);
6176 							c4 = AVERAGE(c4, c5);
6177 							c6 = AVERAGE(c6, c7);
6178 							c8 = AVERAGE(c8, c9);
6179 							cA = AVERAGE(cA, cB);
6180 							cC = AVERAGE(cC, cD);
6181 							cE = AVERAGE(cE, cF);
6182 							c0 = AVERAGE(c0, c2);
6183 							c4 = AVERAGE(c4, c6);
6184 							c8 = AVERAGE(c8, cA);
6185 							cC = AVERAGE(cC, cE);
6186 							c0 = AVERAGE(c0, c4);
6187 							c8 = AVERAGE(c8, cC);
6188 							c0 = AVERAGE(c0, c8);
6189 
6190 							*(unsigned short*)(source0 + 2 * x) = c0;
6191 						}
6192 
6193 						source0 += pitch;
6194 						source1 += pitch;
6195 						source2 += pitch;
6196 						source3 += pitch;
6197 						source4 += pitch;
6198 						source5 += pitch;
6199 						source6 += pitch;
6200 						source7 += pitch;
6201 						source8 += pitch;
6202 						source9 += pitch;
6203 						sourceA += pitch;
6204 						sourceB += pitch;
6205 						sourceC += pitch;
6206 						sourceD += pitch;
6207 						sourceE += pitch;
6208 						sourceF += pitch;
6209 					}
6210 				}
6211 				else ASSERT(false);
6212 
6213 				#undef AVERAGE
6214 			}
6215 		}
6216 		else
6217 		{
6218 		//	UNIMPLEMENTED();
6219 		}
6220 	}
6221 }
6222