1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "Surface.hpp"
16 
17 #include "Color.hpp"
18 #include "Context.hpp"
19 #include "ETC_Decoder.hpp"
20 #include "Renderer.hpp"
21 #include "Common/Half.hpp"
22 #include "Common/Memory.hpp"
23 #include "Common/CPUID.hpp"
24 #include "Common/Resource.hpp"
25 #include "Common/Debug.hpp"
26 #include "Reactor/Reactor.hpp"
27 
28 #if defined(__i386__) || defined(__x86_64__)
29 	#include <xmmintrin.h>
30 	#include <emmintrin.h>
31 #endif
32 
33 #undef min
34 #undef max
35 
36 namespace sw
37 {
38 	extern bool quadLayoutEnabled;
39 	extern bool complementaryDepthBuffer;
40 	extern TranscendentalPrecision logPrecision;
41 
42 	unsigned int *Surface::palette = 0;
43 	unsigned int Surface::paletteID = 0;
44 
45 	void Rect::clip(int minX, int minY, int maxX, int maxY)
46 	{
47 		x0 = clamp(x0, minX, maxX);
48 		y0 = clamp(y0, minY, maxY);
49 		x1 = clamp(x1, minX, maxX);
50 		y1 = clamp(y1, minY, maxY);
51 	}
52 
53 	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
54 	{
55 		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
56 
57 		write(element, color);
58 	}
59 
60 	void Surface::Buffer::write(int x, int y, const Color<float> &color)
61 	{
62 		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
63 
64 		write(element, color);
65 	}
66 
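	// Packs a floating-point color into this buffer's format at the given address.
	// Channels the format does not store are dropped; fixed-point and integer formats
	// are converted with the unorm<n>/snorm<n>/ucast<n>/scast<n> helpers.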
67 	inline void Surface::Buffer::write(void *element, const Color<float> &color)
68 	{
69 		switch(format)
70 		{
71 		case FORMAT_A8:
72 			*(unsigned char*)element = unorm<8>(color.a);
73 			break;
74 		case FORMAT_R8I_SNORM:
75 			*(char*)element = snorm<8>(color.r);
76 			break;
77 		case FORMAT_R8:
78 			*(unsigned char*)element = unorm<8>(color.r);
79 			break;
80 		case FORMAT_R8I:
81 			*(char*)element = scast<8>(color.r);
82 			break;
83 		case FORMAT_R8UI:
84 			*(unsigned char*)element = ucast<8>(color.r);
85 			break;
86 		case FORMAT_R16I:
87 			*(short*)element = scast<16>(color.r);
88 			break;
89 		case FORMAT_R16UI:
90 			*(unsigned short*)element = ucast<16>(color.r);
91 			break;
92 		case FORMAT_R32I:
93 			*(int*)element = static_cast<int>(color.r);
94 			break;
95 		case FORMAT_R32UI:
96 			*(unsigned int*)element = static_cast<unsigned int>(color.r);
97 			break;
98 		case FORMAT_R3G3B2:
99 			*(unsigned char*)element = (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
100 			break;
101 		case FORMAT_A8R3G3B2:
102 			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
103 			break;
104 		case FORMAT_X4R4G4B4:
105 			*(unsigned short*)element = 0xF000 | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
106 			break;
107 		case FORMAT_A4R4G4B4:
108 			*(unsigned short*)element = (unorm<4>(color.a) << 12) | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
109 			break;
110 		case FORMAT_R4G4B4A4:
111 			*(unsigned short*)element = (unorm<4>(color.r) << 12) | (unorm<4>(color.g) << 8) | (unorm<4>(color.b) << 4) | (unorm<4>(color.a) << 0);
112 			break;
113 		case FORMAT_R5G6B5:
114 			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<6>(color.g) << 5) | (unorm<5>(color.b) << 0);
115 			break;
116 		case FORMAT_A1R5G5B5:
117 			*(unsigned short*)element = (unorm<1>(color.a) << 15) | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
118 			break;
119 		case FORMAT_R5G5B5A1:
120 			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<5>(color.g) << 6) | (unorm<5>(color.b) << 1) | (unorm<5>(color.a) << 0);
121 			break;
122 		case FORMAT_X1R5G5B5:
123 			*(unsigned short*)element = 0x8000 | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
124 			break;
125 		case FORMAT_A8R8G8B8:
126 			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
127 			break;
128 		case FORMAT_X8R8G8B8:
129 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
130 			break;
131 		case FORMAT_A8B8G8R8I_SNORM:
132 			*(unsigned int*)element = (static_cast<unsigned int>(snorm<8>(color.a)) << 24) |
133 			                          (static_cast<unsigned int>(snorm<8>(color.b)) << 16) |
134 			                          (static_cast<unsigned int>(snorm<8>(color.g)) << 8) |
135 			                          (static_cast<unsigned int>(snorm<8>(color.r)) << 0);
136 			break;
137 		case FORMAT_A8B8G8R8:
138 		case FORMAT_SRGB8_A8:
139 			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
140 			break;
141 		case FORMAT_A8B8G8R8I:
142 			*(unsigned int*)element = (static_cast<unsigned int>(scast<8>(color.a)) << 24) |
143 			                          (static_cast<unsigned int>(scast<8>(color.b)) << 16) |
144 			                          (static_cast<unsigned int>(scast<8>(color.g)) << 8) |
145 			                          (static_cast<unsigned int>(scast<8>(color.r)) << 0);
146 			break;
147 		case FORMAT_A8B8G8R8UI:
148 			*(unsigned int*)element = (ucast<8>(color.a) << 24) | (ucast<8>(color.b) << 16) | (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
149 			break;
150 		case FORMAT_X8B8G8R8I_SNORM:
151 			*(unsigned int*)element = 0x7F000000 |
152 			                          (static_cast<unsigned int>(snorm<8>(color.b)) << 16) |
153 			                          (static_cast<unsigned int>(snorm<8>(color.g)) << 8) |
154 			                          (static_cast<unsigned int>(snorm<8>(color.r)) << 0);
155 			break;
156 		case FORMAT_X8B8G8R8:
157 		case FORMAT_SRGB8_X8:
158 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
159 			break;
160 		case FORMAT_X8B8G8R8I:
161 			*(unsigned int*)element = 0x7F000000 |
162 			                          (static_cast<unsigned int>(scast<8>(color.b)) << 16) |
163 			                          (static_cast<unsigned int>(scast<8>(color.g)) << 8) |
164 			                          (static_cast<unsigned int>(scast<8>(color.r)) << 0);
			break;
165 		case FORMAT_X8B8G8R8UI:
166 			*(unsigned int*)element = 0xFF000000 | (ucast<8>(color.b) << 16) | (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
167 			break;
168 		case FORMAT_A2R10G10B10:
169 			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.r) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.b) << 0);
170 			break;
171 		case FORMAT_A2B10G10R10:
172 			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.b) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.r) << 0);
173 			break;
174 		case FORMAT_G8R8I_SNORM:
175 			*(unsigned short*)element = (static_cast<unsigned short>(snorm<8>(color.g)) << 8) |
176 			                            (static_cast<unsigned short>(snorm<8>(color.r)) << 0);
177 			break;
178 		case FORMAT_G8R8:
179 			*(unsigned short*)element = (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
180 			break;
181 		case FORMAT_G8R8I:
182 			*(unsigned short*)element = (static_cast<unsigned short>(scast<8>(color.g)) << 8) |
183 			                            (static_cast<unsigned short>(scast<8>(color.r)) << 0);
184 			break;
185 		case FORMAT_G8R8UI:
186 			*(unsigned short*)element = (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
187 			break;
188 		case FORMAT_G16R16:
189 			*(unsigned int*)element = (unorm<16>(color.g) << 16) | (unorm<16>(color.r) << 0);
190 			break;
191 		case FORMAT_G16R16I:
192 			*(unsigned int*)element = (static_cast<unsigned int>(scast<16>(color.g)) << 16) |
193 			                          (static_cast<unsigned int>(scast<16>(color.r)) << 0);
194 			break;
195 		case FORMAT_G16R16UI:
196 			*(unsigned int*)element = (ucast<16>(color.g) << 16) | (ucast<16>(color.r) << 0);
197 			break;
198 		case FORMAT_G32R32I:
199 		case FORMAT_G32R32UI:
200 			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
201 			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
202 			break;
203 		case FORMAT_A16B16G16R16:
204 			((unsigned short*)element)[0] = unorm<16>(color.r);
205 			((unsigned short*)element)[1] = unorm<16>(color.g);
206 			((unsigned short*)element)[2] = unorm<16>(color.b);
207 			((unsigned short*)element)[3] = unorm<16>(color.a);
208 			break;
209 		case FORMAT_A16B16G16R16I:
210 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(color.r));
211 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(color.g));
212 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(color.b));
213 			((unsigned short*)element)[3] = static_cast<unsigned short>(scast<16>(color.a));
214 			break;
215 		case FORMAT_A16B16G16R16UI:
216 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(color.r));
217 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(color.g));
218 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(color.b));
219 			((unsigned short*)element)[3] = static_cast<unsigned short>(ucast<16>(color.a));
220 			break;
221 		case FORMAT_X16B16G16R16I:
222 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(color.r));
223 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(color.g));
224 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(color.b));
225 			break;
226 		case FORMAT_X16B16G16R16UI:
227 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(color.r));
228 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(color.g));
229 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(color.b));
230 			break;
231 		case FORMAT_A32B32G32R32I:
232 		case FORMAT_A32B32G32R32UI:
233 			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
234 			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
235 			((unsigned int*)element)[2] = static_cast<unsigned int>(color.b);
236 			((unsigned int*)element)[3] = static_cast<unsigned int>(color.a);
237 			break;
238 		case FORMAT_X32B32G32R32I:
239 		case FORMAT_X32B32G32R32UI:
240 			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
241 			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
242 			((unsigned int*)element)[2] = static_cast<unsigned int>(color.b);
243 			break;
244 		case FORMAT_V8U8:
245 			*(unsigned short*)element = (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
246 			break;
247 		case FORMAT_L6V5U5:
248 			*(unsigned short*)element = (unorm<6>(color.b) << 10) | (snorm<5>(color.g) << 5) | (snorm<5>(color.r) << 0);
249 			break;
250 		case FORMAT_Q8W8V8U8:
251 			*(unsigned int*)element = (snorm<8>(color.a) << 24) | (snorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
252 			break;
253 		case FORMAT_X8L8V8U8:
254 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
255 			break;
256 		case FORMAT_V16U16:
257 			*(unsigned int*)element = (snorm<16>(color.g) << 16) | (snorm<16>(color.r) << 0);
258 			break;
259 		case FORMAT_A2W10V10U10:
260 			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (snorm<10>(color.b) << 20) | (snorm<10>(color.g) << 10) | (snorm<10>(color.r) << 0);
261 			break;
262 		case FORMAT_A16W16V16U16:
263 			((unsigned short*)element)[0] = snorm<16>(color.r);
264 			((unsigned short*)element)[1] = snorm<16>(color.g);
265 			((unsigned short*)element)[2] = snorm<16>(color.b);
266 			((unsigned short*)element)[3] = unorm<16>(color.a);
267 			break;
268 		case FORMAT_Q16W16V16U16:
269 			((unsigned short*)element)[0] = snorm<16>(color.r);
270 			((unsigned short*)element)[1] = snorm<16>(color.g);
271 			((unsigned short*)element)[2] = snorm<16>(color.b);
272 			((unsigned short*)element)[3] = snorm<16>(color.a);
273 			break;
274 		case FORMAT_R8G8B8:
275 			((unsigned char*)element)[0] = unorm<8>(color.b);
276 			((unsigned char*)element)[1] = unorm<8>(color.g);
277 			((unsigned char*)element)[2] = unorm<8>(color.r);
278 			break;
279 		case FORMAT_B8G8R8:
280 			((unsigned char*)element)[0] = unorm<8>(color.r);
281 			((unsigned char*)element)[1] = unorm<8>(color.g);
282 			((unsigned char*)element)[2] = unorm<8>(color.b);
283 			break;
284 		case FORMAT_R16F:
285 			*(half*)element = (half)color.r;
286 			break;
287 		case FORMAT_A16F:
288 			*(half*)element = (half)color.a;
289 			break;
290 		case FORMAT_G16R16F:
291 			((half*)element)[0] = (half)color.r;
292 			((half*)element)[1] = (half)color.g;
293 			break;
294 		case FORMAT_B16G16R16F:
295 			((half*)element)[0] = (half)color.r;
296 			((half*)element)[1] = (half)color.g;
297 			((half*)element)[2] = (half)color.b;
298 			break;
299 		case FORMAT_A16B16G16R16F:
300 			((half*)element)[0] = (half)color.r;
301 			((half*)element)[1] = (half)color.g;
302 			((half*)element)[2] = (half)color.b;
303 			((half*)element)[3] = (half)color.a;
304 			break;
305 		case FORMAT_A32F:
306 			*(float*)element = color.a;
307 			break;
308 		case FORMAT_R32F:
309 			*(float*)element = color.r;
310 			break;
311 		case FORMAT_G32R32F:
312 			((float*)element)[0] = color.r;
313 			((float*)element)[1] = color.g;
314 			break;
315 		case FORMAT_X32B32G32R32F:
316 			((float*)element)[3] = 1.0f;
317 		case FORMAT_B32G32R32F:
318 			((float*)element)[0] = color.r;
319 			((float*)element)[1] = color.g;
320 			((float*)element)[2] = color.b;
321 			break;
322 		case FORMAT_A32B32G32R32F:
323 			((float*)element)[0] = color.r;
324 			((float*)element)[1] = color.g;
325 			((float*)element)[2] = color.b;
326 			((float*)element)[3] = color.a;
327 			break;
328 		case FORMAT_D32F:
329 		case FORMAT_D32F_LOCKABLE:
330 		case FORMAT_D32FS8_TEXTURE:
331 		case FORMAT_D32FS8_SHADOW:
332 			*((float*)element) = color.r;
333 			break;
334 		case FORMAT_D32F_COMPLEMENTARY:
335 			*((float*)element) = 1 - color.r;
336 			break;
337 		case FORMAT_S8:
338 			*((unsigned char*)element) = unorm<8>(color.r);
339 			break;
340 		case FORMAT_L8:
341 			*(unsigned char*)element = unorm<8>(color.r);
342 			break;
343 		case FORMAT_A4L4:
344 			*(unsigned char*)element = (unorm<4>(color.a) << 4) | (unorm<4>(color.r) << 0);
345 			break;
346 		case FORMAT_L16:
347 			*(unsigned short*)element = unorm<16>(color.r);
348 			break;
349 		case FORMAT_A8L8:
350 			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<8>(color.r) << 0);
351 			break;
352 		case FORMAT_L16F:
353 			*(half*)element = (half)color.r;
354 			break;
355 		case FORMAT_A16L16F:
356 			((half*)element)[0] = (half)color.r;
357 			((half*)element)[1] = (half)color.a;
358 			break;
359 		case FORMAT_L32F:
360 			*(float*)element = color.r;
361 			break;
362 		case FORMAT_A32L32F:
363 			((float*)element)[0] = color.r;
364 			((float*)element)[1] = color.a;
365 			break;
366 		default:
367 			ASSERT(false);
368 		}
369 	}
370 
371 	Color<float> Surface::Buffer::read(int x, int y, int z) const
372 	{
373 		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
374 
375 		return read(element);
376 	}
377 
378 	Color<float> Surface::Buffer::read(int x, int y) const
379 	{
380 		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
381 
382 		return read(element);
383 	}
384 
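	// Unpacks a single texel into a normalized floating-point color.
	// Channels the format does not provide default to r = g = b = 0, a = 1.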
385 	inline Color<float> Surface::Buffer::read(void *element) const
386 	{
387 		float r = 0.0f;
388 		float g = 0.0f;
389 		float b = 0.0f;
390 		float a = 1.0f;
391 
392 		switch(format)
393 		{
394 		case FORMAT_P8:
395 			{
396 				ASSERT(palette);
397 
398 				unsigned int abgr = palette[*(unsigned char*)element];
399 
400 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
401 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
402 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
403 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
404 			}
405 			break;
406 		case FORMAT_A8P8:
407 			{
408 				ASSERT(palette);
409 
410 				unsigned int bgr = palette[((unsigned char*)element)[0]];
411 
412 				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
413 				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
414 				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
415 				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
416 			}
417 			break;
418 		case FORMAT_A8:
419 			r = 0;
420 			g = 0;
421 			b = 0;
422 			a = *(unsigned char*)element * (1.0f / 0xFF);
423 			break;
424 		case FORMAT_R8I_SNORM:
425 			r = max((*(signed char*)element) * (1.0f / 0x7F), -1.0f);
426 			break;
427 		case FORMAT_R8:
428 			r = *(unsigned char*)element * (1.0f / 0xFF);
429 			break;
430 		case FORMAT_R8I:
431 			r = *(signed char*)element;
432 			break;
433 		case FORMAT_R8UI:
434 			r = *(unsigned char*)element;
435 			break;
436 		case FORMAT_R3G3B2:
437 			{
438 				unsigned char rgb = *(unsigned char*)element;
439 
440 				r = (rgb & 0xE0) * (1.0f / 0xE0);
441 				g = (rgb & 0x1C) * (1.0f / 0x1C);
442 				b = (rgb & 0x03) * (1.0f / 0x03);
443 			}
444 			break;
445 		case FORMAT_A8R3G3B2:
446 			{
447 				unsigned short argb = *(unsigned short*)element;
448 
449 				a = (argb & 0xFF00) * (1.0f / 0xFF00);
450 				r = (argb & 0x00E0) * (1.0f / 0x00E0);
451 				g = (argb & 0x001C) * (1.0f / 0x001C);
452 				b = (argb & 0x0003) * (1.0f / 0x0003);
453 			}
454 			break;
455 		case FORMAT_X4R4G4B4:
456 			{
457 				unsigned short rgb = *(unsigned short*)element;
458 
459 				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
460 				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
461 				b = (rgb & 0x000F) * (1.0f / 0x000F);
462 			}
463 			break;
464 		case FORMAT_A4R4G4B4:
465 			{
466 				unsigned short argb = *(unsigned short*)element;
467 
468 				a = (argb & 0xF000) * (1.0f / 0xF000);
469 				r = (argb & 0x0F00) * (1.0f / 0x0F00);
470 				g = (argb & 0x00F0) * (1.0f / 0x00F0);
471 				b = (argb & 0x000F) * (1.0f / 0x000F);
472 			}
473 			break;
474 		case FORMAT_R4G4B4A4:
475 			{
476 				unsigned short rgba = *(unsigned short*)element;
477 
478 				r = (rgba & 0xF000) * (1.0f / 0xF000);
479 				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
480 				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
481 				a = (rgba & 0x000F) * (1.0f / 0x000F);
482 			}
483 			break;
484 		case FORMAT_R5G6B5:
485 			{
486 				unsigned short rgb = *(unsigned short*)element;
487 
488 				r = (rgb & 0xF800) * (1.0f / 0xF800);
489 				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
490 				b = (rgb & 0x001F) * (1.0f / 0x001F);
491 			}
492 			break;
493 		case FORMAT_A1R5G5B5:
494 			{
495 				unsigned short argb = *(unsigned short*)element;
496 
497 				a = (argb & 0x8000) * (1.0f / 0x8000);
498 				r = (argb & 0x7C00) * (1.0f / 0x7C00);
499 				g = (argb & 0x03E0) * (1.0f / 0x03E0);
500 				b = (argb & 0x001F) * (1.0f / 0x001F);
501 			}
502 			break;
503 		case FORMAT_R5G5B5A1:
504 			{
505 				unsigned short rgba = *(unsigned short*)element;
506 
507 				r = (rgba & 0xF800) * (1.0f / 0xF800);
508 				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
509 				b = (rgba & 0x003E) * (1.0f / 0x003E);
510 				a = (rgba & 0x0001) * (1.0f / 0x0001);
511 			}
512 			break;
513 		case FORMAT_X1R5G5B5:
514 			{
515 				unsigned short xrgb = *(unsigned short*)element;
516 
517 				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
518 				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
519 				b = (xrgb & 0x001F) * (1.0f / 0x001F);
520 			}
521 			break;
522 		case FORMAT_A8R8G8B8:
523 			{
524 				unsigned int argb = *(unsigned int*)element;
525 
526 				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
527 				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
528 				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
529 				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
530 			}
531 			break;
532 		case FORMAT_X8R8G8B8:
533 			{
534 				unsigned int xrgb = *(unsigned int*)element;
535 
536 				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
537 				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
538 				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
539 			}
540 			break;
541 		case FORMAT_A8B8G8R8I_SNORM:
542 			{
543 				signed char* abgr = (signed char*)element;
544 
545 				r = max(abgr[0] * (1.0f / 0x7F), -1.0f);
546 				g = max(abgr[1] * (1.0f / 0x7F), -1.0f);
547 				b = max(abgr[2] * (1.0f / 0x7F), -1.0f);
548 				a = max(abgr[3] * (1.0f / 0x7F), -1.0f);
549 			}
550 			break;
551 		case FORMAT_A8B8G8R8:
552 		case FORMAT_SRGB8_A8:
553 			{
554 				unsigned int abgr = *(unsigned int*)element;
555 
556 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
557 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
558 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
559 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
560 			}
561 			break;
562 		case FORMAT_A8B8G8R8I:
563 			{
564 				signed char* abgr = (signed char*)element;
565 
566 				r = abgr[0];
567 				g = abgr[1];
568 				b = abgr[2];
569 				a = abgr[3];
570 			}
571 			break;
572 		case FORMAT_A8B8G8R8UI:
573 			{
574 				unsigned char* abgr = (unsigned char*)element;
575 
576 				r = abgr[0];
577 				g = abgr[1];
578 				b = abgr[2];
579 				a = abgr[3];
580 			}
581 			break;
582 		case FORMAT_X8B8G8R8I_SNORM:
583 			{
584 				signed char* bgr = (signed char*)element;
585 
586 				r = max(bgr[0] * (1.0f / 0x7F), -1.0f);
587 				g = max(bgr[1] * (1.0f / 0x7F), -1.0f);
588 				b = max(bgr[2] * (1.0f / 0x7F), -1.0f);
589 			}
590 			break;
591 		case FORMAT_X8B8G8R8:
592 		case FORMAT_SRGB8_X8:
593 			{
594 				unsigned int xbgr = *(unsigned int*)element;
595 
596 				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
597 				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
598 				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
599 			}
600 			break;
601 		case FORMAT_X8B8G8R8I:
602 			{
603 				signed char* bgr = (signed char*)element;
604 
605 				r = bgr[0];
606 				g = bgr[1];
607 				b = bgr[2];
608 			}
609 			break;
610 		case FORMAT_X8B8G8R8UI:
611 			{
612 				unsigned char* bgr = (unsigned char*)element;
613 
614 				r = bgr[0];
615 				g = bgr[1];
616 				b = bgr[2];
617 			}
618 			break;
619 		case FORMAT_G8R8I_SNORM:
620 			{
621 				signed char* gr = (signed char*)element;
622 
623 				r = max(gr[0] * (1.0f / 0x7F), -1.0f);
624 				g = max(gr[1] * (1.0f / 0x7F), -1.0f);
625 			}
626 			break;
627 		case FORMAT_G8R8:
628 			{
629 				unsigned short gr = *(unsigned short*)element;
630 
631 				g = (gr & 0xFF00) * (1.0f / 0xFF00);
632 				r = (gr & 0x00FF) * (1.0f / 0x00FF);
633 			}
634 			break;
635 		case FORMAT_G8R8I:
636 			{
637 				signed char* gr = (signed char*)element;
638 
639 				r = gr[0];
640 				g = gr[1];
641 			}
642 			break;
643 		case FORMAT_G8R8UI:
644 			{
645 				unsigned char* gr = (unsigned char*)element;
646 
647 				r = gr[0];
648 				g = gr[1];
649 			}
650 			break;
651 		case FORMAT_R16I:
652 			r = *((short*)element);
653 			break;
654 		case FORMAT_R16UI:
655 			r = *((unsigned short*)element);
656 			break;
657 		case FORMAT_G16R16I:
658 			{
659 				short* gr = (short*)element;
660 
661 				r = gr[0];
662 				g = gr[1];
663 			}
664 			break;
665 		case FORMAT_G16R16:
666 			{
667 				unsigned int gr = *(unsigned int*)element;
668 
669 				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
670 				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
671 			}
672 			break;
673 		case FORMAT_G16R16UI:
674 			{
675 				unsigned short* gr = (unsigned short*)element;
676 
677 				r = gr[0];
678 				g = gr[1];
679 			}
680 			break;
681 		case FORMAT_A2R10G10B10:
682 			{
683 				unsigned int argb = *(unsigned int*)element;
684 
685 				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
686 				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
687 				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
688 				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
689 			}
690 			break;
691 		case FORMAT_A2B10G10R10:
692 			{
693 				unsigned int abgr = *(unsigned int*)element;
694 
695 				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
696 				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
697 				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
698 				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
699 			}
700 			break;
701 		case FORMAT_A16B16G16R16I:
702 			{
703 				short* abgr = (short*)element;
704 
705 				r = abgr[0];
706 				g = abgr[1];
707 				b = abgr[2];
708 				a = abgr[3];
709 			}
710 			break;
711 		case FORMAT_A16B16G16R16:
712 			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
713 			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
714 			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
715 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
716 			break;
717 		case FORMAT_A16B16G16R16UI:
718 			{
719 				unsigned short* abgr = (unsigned short*)element;
720 
721 				r = abgr[0];
722 				g = abgr[1];
723 				b = abgr[2];
724 				a = abgr[3];
725 			}
726 			break;
727 		case FORMAT_X16B16G16R16I:
728 			{
729 				short* bgr = (short*)element;
730 
731 				r = bgr[0];
732 				g = bgr[1];
733 				b = bgr[2];
734 			}
735 			break;
736 		case FORMAT_X16B16G16R16UI:
737 			{
738 				unsigned short* bgr = (unsigned short*)element;
739 
740 				r = bgr[0];
741 				g = bgr[1];
742 				b = bgr[2];
743 			}
744 			break;
745 		case FORMAT_A32B32G32R32I:
746 			{
747 				int* abgr = (int*)element;
748 
749 				r = static_cast<float>(abgr[0]);
750 				g = static_cast<float>(abgr[1]);
751 				b = static_cast<float>(abgr[2]);
752 				a = static_cast<float>(abgr[3]);
753 			}
754 			break;
755 		case FORMAT_A32B32G32R32UI:
756 			{
757 				unsigned int* abgr = (unsigned int*)element;
758 
759 				r = static_cast<float>(abgr[0]);
760 				g = static_cast<float>(abgr[1]);
761 				b = static_cast<float>(abgr[2]);
762 				a = static_cast<float>(abgr[3]);
763 			}
764 			break;
765 		case FORMAT_X32B32G32R32I:
766 			{
767 				int* bgr = (int*)element;
768 
769 				r = static_cast<float>(bgr[0]);
770 				g = static_cast<float>(bgr[1]);
771 				b = static_cast<float>(bgr[2]);
772 			}
773 			break;
774 		case FORMAT_X32B32G32R32UI:
775 			{
776 				unsigned int* bgr = (unsigned int*)element;
777 
778 				r = static_cast<float>(bgr[0]);
779 				g = static_cast<float>(bgr[1]);
780 				b = static_cast<float>(bgr[2]);
781 			}
782 			break;
783 		case FORMAT_G32R32I:
784 			{
785 				int* gr = (int*)element;
786 
787 				r = static_cast<float>(gr[0]);
788 				g = static_cast<float>(gr[1]);
789 			}
790 			break;
791 		case FORMAT_G32R32UI:
792 			{
793 				unsigned int* gr = (unsigned int*)element;
794 
795 				r = static_cast<float>(gr[0]);
796 				g = static_cast<float>(gr[1]);
797 			}
798 			break;
799 		case FORMAT_R32I:
800 			r = static_cast<float>(*((int*)element));
801 			break;
802 		case FORMAT_R32UI:
803 			r = static_cast<float>(*((unsigned int*)element));
804 			break;
805 		case FORMAT_V8U8:
806 			{
807 				unsigned short vu = *(unsigned short*)element;
808 
809 				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
810 				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
811 			}
812 			break;
813 		case FORMAT_L6V5U5:
814 			{
815 				unsigned short lvu = *(unsigned short*)element;
816 
817 				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
818 				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
819 				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
820 			}
821 			break;
822 		case FORMAT_Q8W8V8U8:
823 			{
824 				unsigned int qwvu = *(unsigned int*)element;
825 
826 				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
827 				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
828 				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
829 				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
830 			}
831 			break;
832 		case FORMAT_X8L8V8U8:
833 			{
834 				unsigned int xlvu = *(unsigned int*)element;
835 
836 				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
837 				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
838 				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
839 			}
840 			break;
841 		case FORMAT_R8G8B8:
842 			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
843 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
844 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
845 			break;
846 		case FORMAT_B8G8R8:
847 			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
848 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
849 			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
850 			break;
851 		case FORMAT_V16U16:
852 			{
853 				unsigned int vu = *(unsigned int*)element;
854 
855 				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
856 				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
857 			}
858 			break;
859 		case FORMAT_A2W10V10U10:
860 			{
861 				unsigned int awvu = *(unsigned int*)element;
862 
863 				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
864 				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
865 				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
866 				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
867 			}
868 			break;
869 		case FORMAT_A16W16V16U16:
870 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
871 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
872 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
873 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
874 			break;
875 		case FORMAT_Q16W16V16U16:
876 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
877 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
878 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
879 			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
880 			break;
881 		case FORMAT_L8:
882 			r =
883 			g =
884 			b = *(unsigned char*)element * (1.0f / 0xFF);
885 			break;
886 		case FORMAT_A4L4:
887 			{
888 				unsigned char al = *(unsigned char*)element;
889 
890 				r =
891 				g =
892 				b = (al & 0x0F) * (1.0f / 0x0F);
893 				a = (al & 0xF0) * (1.0f / 0xF0);
894 			}
895 			break;
896 		case FORMAT_L16:
897 			r =
898 			g =
899 			b = *(unsigned short*)element * (1.0f / 0xFFFF);
900 			break;
901 		case FORMAT_A8L8:
902 			r =
903 			g =
904 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
905 			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
906 			break;
907 		case FORMAT_L16F:
908 			r =
909 			g =
910 			b = *(half*)element;
911 			break;
912 		case FORMAT_A16L16F:
913 			r =
914 			g =
915 			b = ((half*)element)[0];
916 			a = ((half*)element)[1];
917 			break;
918 		case FORMAT_L32F:
919 			r =
920 			g =
921 			b = *(float*)element;
922 			break;
923 		case FORMAT_A32L32F:
924 			r =
925 			g =
926 			b = ((float*)element)[0];
927 			a = ((float*)element)[1];
928 			break;
929 		case FORMAT_A16F:
930 			a = *(half*)element;
931 			break;
932 		case FORMAT_R16F:
933 			r = *(half*)element;
934 			break;
935 		case FORMAT_G16R16F:
936 			r = ((half*)element)[0];
937 			g = ((half*)element)[1];
938 			break;
939 		case FORMAT_B16G16R16F:
940 			r = ((half*)element)[0];
941 			g = ((half*)element)[1];
942 			b = ((half*)element)[2];
943 			break;
944 		case FORMAT_A16B16G16R16F:
945 			r = ((half*)element)[0];
946 			g = ((half*)element)[1];
947 			b = ((half*)element)[2];
948 			a = ((half*)element)[3];
949 			break;
950 		case FORMAT_A32F:
951 			a = *(float*)element;
952 			break;
953 		case FORMAT_R32F:
954 			r = *(float*)element;
955 			break;
956 		case FORMAT_G32R32F:
957 			r = ((float*)element)[0];
958 			g = ((float*)element)[1];
959 			break;
960 		case FORMAT_X32B32G32R32F:
961 		case FORMAT_B32G32R32F:
962 			r = ((float*)element)[0];
963 			g = ((float*)element)[1];
964 			b = ((float*)element)[2];
965 			break;
966 		case FORMAT_A32B32G32R32F:
967 			r = ((float*)element)[0];
968 			g = ((float*)element)[1];
969 			b = ((float*)element)[2];
970 			a = ((float*)element)[3];
971 			break;
972 		case FORMAT_D32F:
973 		case FORMAT_D32F_LOCKABLE:
974 		case FORMAT_D32FS8_TEXTURE:
975 		case FORMAT_D32FS8_SHADOW:
976 			r = *(float*)element;
977 			g = r;
978 			b = r;
979 			a = r;
980 			break;
981 		case FORMAT_D32F_COMPLEMENTARY:
982 			r = 1.0f - *(float*)element;
983 			g = r;
984 			b = r;
985 			a = r;
986 			break;
987 		case FORMAT_S8:
988 			r = *(unsigned char*)element * (1.0f / 0xFF);
989 			break;
990 		default:
991 			ASSERT(false);
992 		}
993 
994 	//	if(sRGB)
995 	//	{
996 	//		r = sRGBtoLinear(r);
997 	//		g = sRGBtoLinear(g);
998 	//		b = sRGBtoLinear(b);
999 	//	}
1000 
1001 		return Color<float>(r, g, b, a);
1002 	}
1003 
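	// Trilinear filtering: reads the eight texels surrounding (x, y, z) and blends them
	// by the fractional coordinates. The 0.5 offset places integer coordinates at texel
	// centers; coordinates are clamped to the buffer dimensions.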
1004 	Color<float> Surface::Buffer::sample(float x, float y, float z) const
1005 	{
1006 		x -= 0.5f;
1007 		y -= 0.5f;
1008 		z -= 0.5f;
1009 
1010 		int x0 = clamp((int)x, 0, width - 1);
1011 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1012 
1013 		int y0 = clamp((int)y, 0, height - 1);
1014 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1015 
1016 		int z0 = clamp((int)z, 0, depth - 1);
1017 		int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
1018 
1019 		Color<float> c000 = read(x0, y0, z0);
1020 		Color<float> c100 = read(x1, y0, z0);
1021 		Color<float> c010 = read(x0, y1, z0);
1022 		Color<float> c110 = read(x1, y1, z0);
1023 		Color<float> c001 = read(x0, y0, z1);
1024 		Color<float> c101 = read(x1, y0, z1);
1025 		Color<float> c011 = read(x0, y1, z1);
1026 		Color<float> c111 = read(x1, y1, z1);
1027 
1028 		float fx = x - x0;
1029 		float fy = y - y0;
1030 		float fz = z - z0;
1031 
1032 		c000 *= (1 - fx) * (1 - fy) * (1 - fz);
1033 		c100 *= fx * (1 - fy) * (1 - fz);
1034 		c010 *= (1 - fx) * fy * (1 - fz);
1035 		c110 *= fx * fy * (1 - fz);
1036 		c001 *= (1 - fx) * (1 - fy) * fz;
1037 		c101 *= fx * (1 - fy) * fz;
1038 		c011 *= (1 - fx) * fy * fz;
1039 		c111 *= fx * fy * fz;
1040 
1041 		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
1042 	}
1043 
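	// Bilinear filtering: same scheme as above, restricted to a single slice.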
1044 	Color<float> Surface::Buffer::sample(float x, float y) const
1045 	{
1046 		x -= 0.5f;
1047 		y -= 0.5f;
1048 
1049 		int x0 = clamp((int)x, 0, width - 1);
1050 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1051 
1052 		int y0 = clamp((int)y, 0, height - 1);
1053 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1054 
1055 		Color<float> c00 = read(x0, y0);
1056 		Color<float> c10 = read(x1, y0);
1057 		Color<float> c01 = read(x0, y1);
1058 		Color<float> c11 = read(x1, y1);
1059 
1060 		float fx = x - x0;
1061 		float fy = y - y0;
1062 
1063 		c00 *= (1 - fx) * (1 - fy);
1064 		c10 *= fx * (1 - fy);
1065 		c01 *= (1 - fx) * fy;
1066 		c11 *= fx * fy;
1067 
1068 		return c00 + c10 + c01 + c11;
1069 	}
1070 
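	// Returns the address of the texel at (x, y, z). Compressed formats are stored as
	// blocks, so x and y are first divided by the block dimensions (4x4 for DXT/ETC/EAC,
	// the footprint encoded in the format name for ASTC).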
1071 	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
1072 	{
1073 		this->lock = lock;
1074 
1075 		switch(lock)
1076 		{
1077 		case LOCK_UNLOCKED:
1078 		case LOCK_READONLY:
1079 			break;
1080 		case LOCK_WRITEONLY:
1081 		case LOCK_READWRITE:
1082 		case LOCK_DISCARD:
1083 			dirty = true;
1084 			break;
1085 		default:
1086 			ASSERT(false);
1087 		}
1088 
1089 		if(buffer)
1090 		{
1091 			switch(format)
1092 			{
1093 			#if S3TC_SUPPORT
1094 			case FORMAT_DXT1:
1095 			#endif
1096 			case FORMAT_ATI1:
1097 			case FORMAT_ETC1:
1098 			case FORMAT_R11_EAC:
1099 			case FORMAT_SIGNED_R11_EAC:
1100 			case FORMAT_RGB8_ETC2:
1101 			case FORMAT_SRGB8_ETC2:
1102 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1103 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1104 				return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1105 			case FORMAT_RG11_EAC:
1106 			case FORMAT_SIGNED_RG11_EAC:
1107 			case FORMAT_RGBA8_ETC2_EAC:
1108 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1109 			case FORMAT_RGBA_ASTC_4x4_KHR:
1110 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1111 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1112 			case FORMAT_RGBA_ASTC_5x4_KHR:
1113 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1114 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 4) * pitchB + z * sliceB;
1115 			case FORMAT_RGBA_ASTC_5x5_KHR:
1116 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1117 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 5) * pitchB + z * sliceB;
1118 			case FORMAT_RGBA_ASTC_6x5_KHR:
1119 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1120 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 5) * pitchB + z * sliceB;
1121 			case FORMAT_RGBA_ASTC_6x6_KHR:
1122 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1123 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 6) * pitchB + z * sliceB;
1124 			case FORMAT_RGBA_ASTC_8x5_KHR:
1125 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1126 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 5) * pitchB + z * sliceB;
1127 			case FORMAT_RGBA_ASTC_8x6_KHR:
1128 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1129 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 6) * pitchB + z * sliceB;
1130 			case FORMAT_RGBA_ASTC_8x8_KHR:
1131 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1132 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 8) * pitchB + z * sliceB;
1133 			case FORMAT_RGBA_ASTC_10x5_KHR:
1134 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1135 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 5) * pitchB + z * sliceB;
1136 			case FORMAT_RGBA_ASTC_10x6_KHR:
1137 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1138 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 6) * pitchB + z * sliceB;
1139 			case FORMAT_RGBA_ASTC_10x8_KHR:
1140 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1141 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 8) * pitchB + z * sliceB;
1142 			case FORMAT_RGBA_ASTC_10x10_KHR:
1143 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1144 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 10) * pitchB + z * sliceB;
1145 			case FORMAT_RGBA_ASTC_12x10_KHR:
1146 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1147 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 10) * pitchB + z * sliceB;
1148 			case FORMAT_RGBA_ASTC_12x12_KHR:
1149 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1150 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 12) * pitchB + z * sliceB;
1151 			#if S3TC_SUPPORT
1152 			case FORMAT_DXT3:
1153 			case FORMAT_DXT5:
1154 			#endif
1155 			case FORMAT_ATI2:
1156 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1157 			default:
1158 				return (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
1159 			}
1160 		}
1161 
1162 		return 0;
1163 	}
1164 
1165 	void Surface::Buffer::unlockRect()
1166 	{
1167 		lock = LOCK_UNLOCKED;
1168 	}
1169 
1170 	class SurfaceImplementation : public Surface
1171 	{
1172 	public:
1173 		SurfaceImplementation(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
1174 			: Surface(width, height, depth, format, pixels, pitch, slice) {}
1175 		SurfaceImplementation(Resource *texture, int width, int height, int depth, Format format, bool lockable, bool renderTarget, int pitchP = 0)
1176 			: Surface(texture, width, height, depth, format, lockable, renderTarget, pitchP) {}
1177 		~SurfaceImplementation() override {};
1178 
1179 		void *lockInternal(int x, int y, int z, Lock lock, Accessor client) override
1180 		{
1181 			return Surface::lockInternal(x, y, z, lock, client);
1182 		}
1183 
1184 		void unlockInternal() override
1185 		{
1186 			Surface::unlockInternal();
1187 		}
1188 	};
1189 
1190 	Surface *Surface::create(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
1191 	{
1192 		return new SurfaceImplementation(width, height, depth, format, pixels, pitch, slice);
1193 	}
1194 
1195 	Surface *Surface::create(Resource *texture, int width, int height, int depth, Format format, bool lockable, bool renderTarget, int pitchPprovided)
1196 	{
1197 		return new SurfaceImplementation(texture, width, height, depth, format, lockable, renderTarget, pitchPprovided);
1198 	}
1199 
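	// Illustrative usage sketch; identifiers other than the Surface, Format, Lock and
	// Accessor names defined in this file are hypothetical placeholders.
	//
	//   sw::Surface *surface = sw::Surface::create(width, height, 1, sw::FORMAT_A8R8G8B8,
	//                                              pixels, width * 4, height * width * 4);
	//   void *data = surface->lockExternal(0, 0, 0, sw::LOCK_WRITEONLY, sw::PUBLIC);
	//   // ... fill the locked pixels ...
	//   surface->unlockExternal();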
1200 	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
1201 	{
1202 		resource = new Resource(0);
1203 		hasParent = false;
1204 		ownExternal = false;
1205 		depth = max(1, depth);
1206 
1207 		external.buffer = pixels;
1208 		external.width = width;
1209 		external.height = height;
1210 		external.depth = depth;
1211 		external.format = format;
1212 		external.bytes = bytes(external.format);
1213 		external.pitchB = pitch;
1214 		external.pitchP = external.bytes ? pitch / external.bytes : 0;
1215 		external.sliceB = slice;
1216 		external.sliceP = external.bytes ? slice / external.bytes : 0;
1217 		external.lock = LOCK_UNLOCKED;
1218 		external.dirty = true;
1219 
1220 		internal.buffer = 0;
1221 		internal.width = width;
1222 		internal.height = height;
1223 		internal.depth = depth;
1224 		internal.format = selectInternalFormat(format);
1225 		internal.bytes = bytes(internal.format);
1226 		internal.pitchB = pitchB(internal.width, internal.format, false);
1227 		internal.pitchP = pitchP(internal.width, internal.format, false);
1228 		internal.sliceB = sliceB(internal.width, internal.height, internal.format, false);
1229 		internal.sliceP = sliceP(internal.width, internal.height, internal.format, false);
1230 		internal.lock = LOCK_UNLOCKED;
1231 		internal.dirty = false;
1232 
1233 		stencil.buffer = 0;
1234 		stencil.width = width;
1235 		stencil.height = height;
1236 		stencil.depth = depth;
1237 		stencil.format = FORMAT_S8;
1238 		stencil.bytes = bytes(stencil.format);
1239 		stencil.pitchB = pitchB(stencil.width, stencil.format, false);
1240 		stencil.pitchP = pitchP(stencil.width, stencil.format, false);
1241 		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, false);
1242 		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, false);
1243 		stencil.lock = LOCK_UNLOCKED;
1244 		stencil.dirty = false;
1245 
1246 		dirtyMipmaps = true;
1247 		paletteUsed = 0;
1248 	}
1249 
1250 	Surface::Surface(Resource *texture, int width, int height, int depth, Format format, bool lockable, bool renderTarget, int pitchPprovided) : lockable(lockable), renderTarget(renderTarget)
1251 	{
1252 		resource = texture ? texture : new Resource(0);
1253 		hasParent = texture != 0;
1254 		ownExternal = true;
1255 		depth = max(1, depth);
1256 
1257 		external.buffer = 0;
1258 		external.width = width;
1259 		external.height = height;
1260 		external.depth = depth;
1261 		external.format = format;
1262 		external.bytes = bytes(external.format);
1263 		external.pitchB = pitchB(external.width, external.format, renderTarget && !texture);
1264 		external.pitchP = pitchP(external.width, external.format, renderTarget && !texture);
1265 		external.sliceB = sliceB(external.width, external.height, external.format, renderTarget && !texture);
1266 		external.sliceP = sliceP(external.width, external.height, external.format, renderTarget && !texture);
1267 		external.lock = LOCK_UNLOCKED;
1268 		external.dirty = false;
1269 
1270 		internal.buffer = 0;
1271 		internal.width = width;
1272 		internal.height = height;
1273 		internal.depth = depth;
1274 		internal.format = selectInternalFormat(format);
1275 		internal.bytes = bytes(internal.format);
1276 		internal.pitchB = !pitchPprovided ? pitchB(internal.width, internal.format, renderTarget) : pitchPprovided * internal.bytes;
1277 		internal.pitchP = !pitchPprovided ? pitchP(internal.width, internal.format, renderTarget) : pitchPprovided;
1278 		internal.sliceB = sliceB(internal.width, internal.height, internal.format, renderTarget);
1279 		internal.sliceP = sliceP(internal.width, internal.height, internal.format, renderTarget);
1280 		internal.lock = LOCK_UNLOCKED;
1281 		internal.dirty = false;
1282 
1283 		stencil.buffer = 0;
1284 		stencil.width = width;
1285 		stencil.height = height;
1286 		stencil.depth = depth;
1287 		stencil.format = FORMAT_S8;
1288 		stencil.bytes = bytes(stencil.format);
1289 		stencil.pitchB = pitchB(stencil.width, stencil.format, renderTarget);
1290 		stencil.pitchP = pitchP(stencil.width, stencil.format, renderTarget);
1291 		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, renderTarget);
1292 		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, renderTarget);
1293 		stencil.lock = LOCK_UNLOCKED;
1294 		stencil.dirty = false;
1295 
1296 		dirtyMipmaps = true;
1297 		paletteUsed = 0;
1298 	}
1299 
1300 	Surface::~Surface()
1301 	{
1302 		// sync() must be called before this destructor to ensure all locks have been released.
1303 		// We can't call it here because the parent resource may already have been destroyed.
1304 		ASSERT(isUnlocked());
1305 
1306 		if(!hasParent)
1307 		{
1308 			resource->destruct();
1309 		}
1310 
1311 		if(ownExternal)
1312 		{
1313 			deallocate(external.buffer);
1314 		}
1315 
1316 		if(internal.buffer != external.buffer)
1317 		{
1318 			deallocate(internal.buffer);
1319 		}
1320 
1321 		deallocate(stencil.buffer);
1322 
1323 		external.buffer = 0;
1324 		internal.buffer = 0;
1325 		stencil.buffer = 0;
1326 	}
1327 
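	// Locks the client-visible ("external") buffer. If the internal copy holds newer
	// data it is converted back to the external format first, unless the lock discards
	// the contents.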
1328 	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
1329 	{
1330 		resource->lock(client);
1331 
1332 		if(!external.buffer)
1333 		{
1334 			if(internal.buffer && identicalFormats())
1335 			{
1336 				external.buffer = internal.buffer;
1337 			}
1338 			else
1339 			{
1340 				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.format);
1341 			}
1342 		}
1343 
1344 		if(internal.dirty)
1345 		{
1346 			if(lock != LOCK_DISCARD)
1347 			{
1348 				update(external, internal);
1349 			}
1350 
1351 			internal.dirty = false;
1352 		}
1353 
1354 		switch(lock)
1355 		{
1356 		case LOCK_READONLY:
1357 			break;
1358 		case LOCK_WRITEONLY:
1359 		case LOCK_READWRITE:
1360 		case LOCK_DISCARD:
1361 			dirtyMipmaps = true;
1362 			break;
1363 		default:
1364 			ASSERT(false);
1365 		}
1366 
1367 		return external.lockRect(x, y, z, lock);
1368 	}
1369 
1370 	void Surface::unlockExternal()
1371 	{
1372 		external.unlockRect();
1373 
1374 		resource->unlock();
1375 	}
1376 
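	// Locks the device-format ("internal") buffer, converting from the external copy when
	// it is dirty or, for palettized formats, when the global palette has changed.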
1377 	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
1378 	{
1379 		if(lock != LOCK_UNLOCKED)
1380 		{
1381 			resource->lock(client);
1382 		}
1383 
1384 		if(!internal.buffer)
1385 		{
1386 			if(external.buffer && identicalFormats())
1387 			{
1388 				internal.buffer = external.buffer;
1389 			}
1390 			else
1391 			{
1392 				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.format);
1393 			}
1394 		}
1395 
1396 		// FIXME: WHQL requires conversion to lower external precision and back
1397 		if(logPrecision >= WHQL)
1398 		{
1399 			if(internal.dirty && renderTarget && internal.format != external.format)
1400 			{
1401 				if(lock != LOCK_DISCARD)
1402 				{
1403 					switch(external.format)
1404 					{
1405 					case FORMAT_R3G3B2:
1406 					case FORMAT_A8R3G3B2:
1407 					case FORMAT_A1R5G5B5:
1408 					case FORMAT_A2R10G10B10:
1409 					case FORMAT_A2B10G10R10:
1410 						lockExternal(0, 0, 0, LOCK_READWRITE, client);
1411 						unlockExternal();
1412 						break;
1413 					default:
1414 						// Difference passes WHQL
1415 						break;
1416 					}
1417 				}
1418 			}
1419 		}
1420 
1421 		if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
1422 		{
1423 			if(lock != LOCK_DISCARD)
1424 			{
1425 				update(internal, external);
1426 			}
1427 
1428 			external.dirty = false;
1429 			paletteUsed = Surface::paletteID;
1430 		}
1431 
1432 		switch(lock)
1433 		{
1434 		case LOCK_UNLOCKED:
1435 		case LOCK_READONLY:
1436 			break;
1437 		case LOCK_WRITEONLY:
1438 		case LOCK_READWRITE:
1439 		case LOCK_DISCARD:
1440 			dirtyMipmaps = true;
1441 			break;
1442 		default:
1443 			ASSERT(false);
1444 		}
1445 
1446 		if(lock == LOCK_READONLY && client == PUBLIC)
1447 		{
1448 			resolve();
1449 		}
1450 
1451 		return internal.lockRect(x, y, z, lock);
1452 	}
1453 
1454 	void Surface::unlockInternal()
1455 	{
1456 		internal.unlockRect();
1457 
1458 		resource->unlock();
1459 	}
1460 
1461 	void *Surface::lockStencil(int x, int y, int front, Accessor client)
1462 	{
1463 		resource->lock(client);
1464 
1465 		if(!stencil.buffer)
1466 		{
1467 			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.format);
1468 		}
1469 
1470 		return stencil.lockRect(x, y, front, LOCK_READWRITE);   // FIXME
1471 	}
1472 
1473 	void Surface::unlockStencil()
1474 	{
1475 		stencil.unlockRect();
1476 
1477 		resource->unlock();
1478 	}
1479 
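	// Bytes per pixel for uncompressed formats. For block-compressed formats the value is
	// the size of one column of a 4x4 block (see the per-case comments); ASTC sizes are
	// not yet implemented and return 0.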
1480 	int Surface::bytes(Format format)
1481 	{
1482 		switch(format)
1483 		{
1484 		case FORMAT_NULL:				return 0;
1485 		case FORMAT_P8:					return 1;
1486 		case FORMAT_A8P8:				return 2;
1487 		case FORMAT_A8:					return 1;
1488 		case FORMAT_R8I:				return 1;
1489 		case FORMAT_R8:					return 1;
1490 		case FORMAT_R3G3B2:				return 1;
1491 		case FORMAT_R16I:				return 2;
1492 		case FORMAT_R16UI:				return 2;
1493 		case FORMAT_A8R3G3B2:			return 2;
1494 		case FORMAT_R5G6B5:				return 2;
1495 		case FORMAT_A1R5G5B5:			return 2;
1496 		case FORMAT_X1R5G5B5:			return 2;
1497 		case FORMAT_R5G5B5A1:           return 2;
1498 		case FORMAT_X4R4G4B4:			return 2;
1499 		case FORMAT_A4R4G4B4:			return 2;
1500 		case FORMAT_R4G4B4A4:           return 2;
1501 		case FORMAT_R8G8B8:				return 3;
1502 		case FORMAT_B8G8R8:             return 3;
1503 		case FORMAT_R32I:				return 4;
1504 		case FORMAT_R32UI:				return 4;
1505 		case FORMAT_X8R8G8B8:			return 4;
1506 	//	case FORMAT_X8G8R8B8Q:			return 4;
1507 		case FORMAT_A8R8G8B8:			return 4;
1508 	//	case FORMAT_A8G8R8B8Q:			return 4;
1509 		case FORMAT_X8B8G8R8I:			return 4;
1510 		case FORMAT_X8B8G8R8:			return 4;
1511 		case FORMAT_SRGB8_X8:			return 4;
1512 		case FORMAT_SRGB8_A8:			return 4;
1513 		case FORMAT_A8B8G8R8I:			return 4;
1514 		case FORMAT_R8UI:				return 1;
1515 		case FORMAT_G8R8UI:				return 2;
1516 		case FORMAT_X8B8G8R8UI:			return 4;
1517 		case FORMAT_A8B8G8R8UI:			return 4;
1518 		case FORMAT_A8B8G8R8:			return 4;
1519 		case FORMAT_R8I_SNORM:			return 1;
1520 		case FORMAT_G8R8I_SNORM:		return 2;
1521 		case FORMAT_X8B8G8R8I_SNORM:	return 4;
1522 		case FORMAT_A8B8G8R8I_SNORM:	return 4;
1523 		case FORMAT_A2R10G10B10:		return 4;
1524 		case FORMAT_A2B10G10R10:		return 4;
1525 		case FORMAT_G8R8I:				return 2;
1526 		case FORMAT_G8R8:				return 2;
1527 		case FORMAT_G16R16I:			return 4;
1528 		case FORMAT_G16R16UI:			return 4;
1529 		case FORMAT_G16R16:				return 4;
1530 		case FORMAT_G32R32I:			return 8;
1531 		case FORMAT_G32R32UI:			return 8;
1532 		case FORMAT_X16B16G16R16I:		return 8;
1533 		case FORMAT_X16B16G16R16UI:		return 8;
1534 		case FORMAT_A16B16G16R16I:		return 8;
1535 		case FORMAT_A16B16G16R16UI:		return 8;
1536 		case FORMAT_A16B16G16R16:		return 8;
1537 		case FORMAT_X32B32G32R32I:		return 16;
1538 		case FORMAT_X32B32G32R32UI:		return 16;
1539 		case FORMAT_A32B32G32R32I:		return 16;
1540 		case FORMAT_A32B32G32R32UI:		return 16;
1541 		// Compressed formats
1542 		#if S3TC_SUPPORT
1543 		case FORMAT_DXT1:				return 2;   // Column of four pixels
1544 		case FORMAT_DXT3:				return 4;   // Column of four pixels
1545 		case FORMAT_DXT5:				return 4;   // Column of four pixels
1546 		#endif
1547 		case FORMAT_ATI1:				return 2;   // Column of four pixels
1548 		case FORMAT_ATI2:				return 4;   // Column of four pixels
1549 		case FORMAT_ETC1:				return 2;   // Column of four pixels
1550 		case FORMAT_R11_EAC:			return 2;
1551 		case FORMAT_SIGNED_R11_EAC:		return 2;
1552 		case FORMAT_RG11_EAC:			return 4;
1553 		case FORMAT_SIGNED_RG11_EAC:	return 4;
1554 		case FORMAT_RGB8_ETC2:			return 2;
1555 		case FORMAT_SRGB8_ETC2:			return 2;
1556 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1557 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1558 		case FORMAT_RGBA8_ETC2_EAC:			return 4;
1559 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:	return 4;
1560 		case FORMAT_RGBA_ASTC_4x4_KHR:
1561 		case FORMAT_RGBA_ASTC_5x4_KHR:
1562 		case FORMAT_RGBA_ASTC_5x5_KHR:
1563 		case FORMAT_RGBA_ASTC_6x5_KHR:
1564 		case FORMAT_RGBA_ASTC_6x6_KHR:
1565 		case FORMAT_RGBA_ASTC_8x5_KHR:
1566 		case FORMAT_RGBA_ASTC_8x6_KHR:
1567 		case FORMAT_RGBA_ASTC_8x8_KHR:
1568 		case FORMAT_RGBA_ASTC_10x5_KHR:
1569 		case FORMAT_RGBA_ASTC_10x6_KHR:
1570 		case FORMAT_RGBA_ASTC_10x8_KHR:
1571 		case FORMAT_RGBA_ASTC_10x10_KHR:
1572 		case FORMAT_RGBA_ASTC_12x10_KHR:
1573 		case FORMAT_RGBA_ASTC_12x12_KHR:
1574 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1575 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1576 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1577 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1578 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1579 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1580 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1581 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1582 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1583 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1584 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1585 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1586 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1587 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: return 0; // FIXME
1588 		// Bumpmap formats
1589 		case FORMAT_V8U8:				return 2;
1590 		case FORMAT_L6V5U5:				return 2;
1591 		case FORMAT_Q8W8V8U8:			return 4;
1592 		case FORMAT_X8L8V8U8:			return 4;
1593 		case FORMAT_A2W10V10U10:		return 4;
1594 		case FORMAT_V16U16:				return 4;
1595 		case FORMAT_A16W16V16U16:		return 8;
1596 		case FORMAT_Q16W16V16U16:		return 8;
1597 		// Luminance formats
1598 		case FORMAT_L8:					return 1;
1599 		case FORMAT_A4L4:				return 1;
1600 		case FORMAT_L16:				return 2;
1601 		case FORMAT_A8L8:				return 2;
1602 		case FORMAT_L16F:               return 2;
1603 		case FORMAT_A16L16F:            return 4;
1604 		case FORMAT_L32F:               return 4;
1605 		case FORMAT_A32L32F:            return 8;
1606 		// Floating-point formats
1607 		case FORMAT_A16F:				return 2;
1608 		case FORMAT_R16F:				return 2;
1609 		case FORMAT_G16R16F:			return 4;
1610 		case FORMAT_B16G16R16F:			return 6;
1611 		case FORMAT_A16B16G16R16F:		return 8;
1612 		case FORMAT_A32F:				return 4;
1613 		case FORMAT_R32F:				return 4;
1614 		case FORMAT_G32R32F:			return 8;
1615 		case FORMAT_B32G32R32F:			return 12;
1616 		case FORMAT_X32B32G32R32F:		return 16;
1617 		case FORMAT_A32B32G32R32F:		return 16;
1618 		// Depth/stencil formats
1619 		case FORMAT_D16:				return 2;
1620 		case FORMAT_D32:				return 4;
1621 		case FORMAT_D24X8:				return 4;
1622 		case FORMAT_D24S8:				return 4;
1623 		case FORMAT_D24FS8:				return 4;
1624 		case FORMAT_D32F:				return 4;
1625 		case FORMAT_D32F_COMPLEMENTARY:	return 4;
1626 		case FORMAT_D32F_LOCKABLE:		return 4;
1627 		case FORMAT_D32FS8_TEXTURE:		return 4;
1628 		case FORMAT_D32FS8_SHADOW:		return 4;
1629 		case FORMAT_DF24S8:				return 4;
1630 		case FORMAT_DF16S8:				return 2;
1631 		case FORMAT_INTZ:				return 4;
1632 		case FORMAT_S8:					return 1;
1633 		case FORMAT_YV12_BT601:         return 1;   // Y plane only
1634 		case FORMAT_YV12_BT709:         return 1;   // Y plane only
1635 		case FORMAT_YV12_JFIF:          return 1;   // Y plane only
1636 		default:
1637 			ASSERT(false);
1638 		}
1639 
1640 		return 0;
1641 	}
1642 
1643 	int Surface::pitchB(int width, Format format, bool target)
1644 	{
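		// Render targets and depth/stencil buffers are stored as 2x2 quads, so round the width up to an even number of pixels.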
1645 		if(target || isDepth(format) || isStencil(format))
1646 		{
1647 			width = align(width, 2);
1648 		}
1649 
1650 		switch(format)
1651 		{
1652 		#if S3TC_SUPPORT
1653 		case FORMAT_DXT1:
1654 		#endif
1655 		case FORMAT_ETC1:
1656 		case FORMAT_R11_EAC:
1657 		case FORMAT_SIGNED_R11_EAC:
1658 		case FORMAT_RGB8_ETC2:
1659 		case FORMAT_SRGB8_ETC2:
1660 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1661 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1662 			return 8 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per 4 rows
1663 		case FORMAT_RG11_EAC:
1664 		case FORMAT_SIGNED_RG11_EAC:
1665 		case FORMAT_RGBA8_ETC2_EAC:
1666 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1667 		case FORMAT_RGBA_ASTC_4x4_KHR:
1668 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1669 			return 16 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per 4 rows
1670 		case FORMAT_RGBA_ASTC_5x4_KHR:
1671 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1672 		case FORMAT_RGBA_ASTC_5x5_KHR:
1673 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1674 			return 16 * ((width + 4) / 5);
1675 		case FORMAT_RGBA_ASTC_6x5_KHR:
1676 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1677 		case FORMAT_RGBA_ASTC_6x6_KHR:
1678 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1679 			return 16 * ((width + 5) / 6);
1680 		case FORMAT_RGBA_ASTC_8x5_KHR:
1681 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1682 		case FORMAT_RGBA_ASTC_8x6_KHR:
1683 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1684 		case FORMAT_RGBA_ASTC_8x8_KHR:
1685 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1686 			return 16 * ((width + 7) / 8);
1687 		case FORMAT_RGBA_ASTC_10x5_KHR:
1688 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1689 		case FORMAT_RGBA_ASTC_10x6_KHR:
1690 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1691 		case FORMAT_RGBA_ASTC_10x8_KHR:
1692 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1693 		case FORMAT_RGBA_ASTC_10x10_KHR:
1694 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1695 			return 16 * ((width + 9) / 10);
1696 		case FORMAT_RGBA_ASTC_12x10_KHR:
1697 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1698 		case FORMAT_RGBA_ASTC_12x12_KHR:
1699 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1700 			return 16 * ((width + 11) / 12);
1701 		#if S3TC_SUPPORT
1702 		case FORMAT_DXT3:
1703 		case FORMAT_DXT5:
1704 			return 16 * ((width + 3) / 4);   // 128 bit per 4x4 block, computed per 4 rows
1705 		#endif
1706 		case FORMAT_ATI1:
1707 			return 2 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per row
1708 		case FORMAT_ATI2:
1709 			return 4 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per row
1710 		case FORMAT_YV12_BT601:
1711 		case FORMAT_YV12_BT709:
1712 		case FORMAT_YV12_JFIF:
1713 			return align(width, 16);
1714 		default:
1715 			return bytes(format) * width;
1716 		}
1717 	}
1718 
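	// Pitch expressed in elements rather than bytes (pitchB divided by the per-element byte count).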
1719 	int Surface::pitchP(int width, Format format, bool target)
1720 	{
1721 		int B = bytes(format);
1722 
1723 		return B > 0 ? pitchB(width, format, target) / B : 0;
1724 	}
1725 
1726 	int Surface::sliceB(int width, int height, Format format, bool target)
1727 	{
1728 		if(target || isDepth(format) || isStencil(format))
1729 		{
1730 			height = ((height + 1) & ~1);
1731 		}
1732 
1733 		switch(format)
1734 		{
1735 		#if S3TC_SUPPORT
1736 		case FORMAT_DXT1:
1737 		case FORMAT_DXT3:
1738 		case FORMAT_DXT5:
1739 		#endif
1740 		case FORMAT_ETC1:
1741 		case FORMAT_R11_EAC:
1742 		case FORMAT_SIGNED_R11_EAC:
1743 		case FORMAT_RG11_EAC:
1744 		case FORMAT_SIGNED_RG11_EAC:
1745 		case FORMAT_RGB8_ETC2:
1746 		case FORMAT_SRGB8_ETC2:
1747 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1748 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1749 		case FORMAT_RGBA8_ETC2_EAC:
1750 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1751 		case FORMAT_RGBA_ASTC_4x4_KHR:
1752 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1753 		case FORMAT_RGBA_ASTC_5x4_KHR:
1754 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1755 			return pitchB(width, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
1756 		case FORMAT_RGBA_ASTC_5x5_KHR:
1757 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1758 		case FORMAT_RGBA_ASTC_6x5_KHR:
1759 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1760 		case FORMAT_RGBA_ASTC_8x5_KHR:
1761 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1762 		case FORMAT_RGBA_ASTC_10x5_KHR:
1763 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1764 			return pitchB(width, format, target) * ((height + 4) / 5);   // Pitch computed per 5 rows
1765 		case FORMAT_RGBA_ASTC_6x6_KHR:
1766 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1767 		case FORMAT_RGBA_ASTC_8x6_KHR:
1768 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1769 		case FORMAT_RGBA_ASTC_10x6_KHR:
1770 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1771 			return pitchB(width, format, target) * ((height + 5) / 6);   // Pitch computed per 6 rows
1772 		case FORMAT_RGBA_ASTC_8x8_KHR:
1773 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1774 		case FORMAT_RGBA_ASTC_10x8_KHR:
1775 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1776 			return pitchB(width, format, target) * ((height + 7) / 8);   // Pitch computed per 8 rows
1777 		case FORMAT_RGBA_ASTC_10x10_KHR:
1778 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1779 		case FORMAT_RGBA_ASTC_12x10_KHR:
1780 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1781 			return pitchB(width, format, target) * ((height + 9) / 10);   // Pitch computed per 10 rows
1782 		case FORMAT_RGBA_ASTC_12x12_KHR:
1783 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1784 			return pitchB(width, format, target) * ((height + 11) / 12);   // Pitch computed per 12 rows
1785 		case FORMAT_ATI1:
1786 		case FORMAT_ATI2:
1787 		default:
1788 			return pitchB(width, format, target) * height;   // Pitch computed per row
1789 		}
1790 	}
1791 
1792 	int Surface::sliceP(int width, int height, Format format, bool target)
1793 	{
1794 		int B = bytes(format);
1795 
1796 		return B > 0 ? sliceB(width, height, format, target) / B : 0;
1797 	}
1798 
1799 	void Surface::update(Buffer &destination, Buffer &source)
1800 	{
1801 	//	ASSERT(source.lock != LOCK_UNLOCKED);
1802 	//	ASSERT(destination.lock != LOCK_UNLOCKED);
1803 
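		// When the two buffers differ, convert the source data into the destination's layout: known packed and compressed source formats use dedicated decoders, everything else falls back to genericUpdate.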
1804 		if(destination.buffer != source.buffer)
1805 		{
1806 			ASSERT(source.dirty && !destination.dirty);
1807 
1808 			switch(source.format)
1809 			{
1810 			case FORMAT_R8G8B8:		decodeR8G8B8(destination, source);		break;   // FIXME: Check destination format
1811 			case FORMAT_X1R5G5B5:	decodeX1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1812 			case FORMAT_A1R5G5B5:	decodeA1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1813 			case FORMAT_X4R4G4B4:	decodeX4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1814 			case FORMAT_A4R4G4B4:	decodeA4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1815 			case FORMAT_P8:			decodeP8(destination, source);			break;   // FIXME: Check destination format
1816 			#if S3TC_SUPPORT
1817 			case FORMAT_DXT1:		decodeDXT1(destination, source);		break;   // FIXME: Check destination format
1818 			case FORMAT_DXT3:		decodeDXT3(destination, source);		break;   // FIXME: Check destination format
1819 			case FORMAT_DXT5:		decodeDXT5(destination, source);		break;   // FIXME: Check destination format
1820 			#endif
1821 			case FORMAT_ATI1:		decodeATI1(destination, source);		break;   // FIXME: Check destination format
1822 			case FORMAT_ATI2:		decodeATI2(destination, source);		break;   // FIXME: Check destination format
1823 			case FORMAT_R11_EAC:         decodeEAC(destination, source, 1, false); break; // FIXME: Check destination format
1824 			case FORMAT_SIGNED_R11_EAC:  decodeEAC(destination, source, 1, true);  break; // FIXME: Check destination format
1825 			case FORMAT_RG11_EAC:        decodeEAC(destination, source, 2, false); break; // FIXME: Check destination format
1826 			case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true);  break; // FIXME: Check destination format
1827 			case FORMAT_ETC1:
1828 			case FORMAT_RGB8_ETC2:                      decodeETC2(destination, source, 0, false); break; // FIXME: Check destination format
1829 			case FORMAT_SRGB8_ETC2:                     decodeETC2(destination, source, 0, true);  break; // FIXME: Check destination format
1830 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:  decodeETC2(destination, source, 1, false); break; // FIXME: Check destination format
1831 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, true);  break; // FIXME: Check destination format
1832 			case FORMAT_RGBA8_ETC2_EAC:                 decodeETC2(destination, source, 8, false); break; // FIXME: Check destination format
1833 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:          decodeETC2(destination, source, 8, true);  break; // FIXME: Check destination format
1834 			case FORMAT_RGBA_ASTC_4x4_KHR:           decodeASTC(destination, source, 4,  4,  1, false); break; // FIXME: Check destination format
1835 			case FORMAT_RGBA_ASTC_5x4_KHR:           decodeASTC(destination, source, 5,  4,  1, false); break; // FIXME: Check destination format
1836 			case FORMAT_RGBA_ASTC_5x5_KHR:           decodeASTC(destination, source, 5,  5,  1, false); break; // FIXME: Check destination format
1837 			case FORMAT_RGBA_ASTC_6x5_KHR:           decodeASTC(destination, source, 6,  5,  1, false); break; // FIXME: Check destination format
1838 			case FORMAT_RGBA_ASTC_6x6_KHR:           decodeASTC(destination, source, 6,  6,  1, false); break; // FIXME: Check destination format
1839 			case FORMAT_RGBA_ASTC_8x5_KHR:           decodeASTC(destination, source, 8,  5,  1, false); break; // FIXME: Check destination format
1840 			case FORMAT_RGBA_ASTC_8x6_KHR:           decodeASTC(destination, source, 8,  6,  1, false); break; // FIXME: Check destination format
1841 			case FORMAT_RGBA_ASTC_8x8_KHR:           decodeASTC(destination, source, 8,  8,  1, false); break; // FIXME: Check destination format
1842 			case FORMAT_RGBA_ASTC_10x5_KHR:          decodeASTC(destination, source, 10, 5,  1, false); break; // FIXME: Check destination format
1843 			case FORMAT_RGBA_ASTC_10x6_KHR:          decodeASTC(destination, source, 10, 6,  1, false); break; // FIXME: Check destination format
1844 			case FORMAT_RGBA_ASTC_10x8_KHR:          decodeASTC(destination, source, 10, 8,  1, false); break; // FIXME: Check destination format
1845 			case FORMAT_RGBA_ASTC_10x10_KHR:         decodeASTC(destination, source, 10, 10, 1, false); break; // FIXME: Check destination format
1846 			case FORMAT_RGBA_ASTC_12x10_KHR:         decodeASTC(destination, source, 12, 10, 1, false); break; // FIXME: Check destination format
1847 			case FORMAT_RGBA_ASTC_12x12_KHR:         decodeASTC(destination, source, 12, 12, 1, false); break; // FIXME: Check destination format
1848 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:   decodeASTC(destination, source, 4,  4,  1, true);  break; // FIXME: Check destination format
1849 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:   decodeASTC(destination, source, 5,  4,  1, true);  break; // FIXME: Check destination format
1850 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:   decodeASTC(destination, source, 5,  5,  1, true);  break; // FIXME: Check destination format
1851 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:   decodeASTC(destination, source, 6,  5,  1, true);  break; // FIXME: Check destination format
1852 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:   decodeASTC(destination, source, 6,  6,  1, true);  break; // FIXME: Check destination format
1853 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:   decodeASTC(destination, source, 8,  5,  1, true);  break; // FIXME: Check destination format
1854 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:   decodeASTC(destination, source, 8,  6,  1, true);  break; // FIXME: Check destination format
1855 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:   decodeASTC(destination, source, 8,  8,  1, true);  break; // FIXME: Check destination format
1856 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:  decodeASTC(destination, source, 10, 5,  1, true);  break; // FIXME: Check destination format
1857 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:  decodeASTC(destination, source, 10, 6,  1, true);  break; // FIXME: Check destination format
1858 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:  decodeASTC(destination, source, 10, 8,  1, true);  break; // FIXME: Check destination format
1859 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, true);  break; // FIXME: Check destination format
1860 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, true);  break; // FIXME: Check destination format
1861 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, true);  break; // FIXME: Check destination format
1862 			default:				genericUpdate(destination, source);		break;
1863 			}
1864 		}
1865 	}
1866 
1867 	void Surface::genericUpdate(Buffer &destination, Buffer &source)
1868 	{
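		// Copy the overlapping region slice by slice and row by row: matching formats are copied with memcpy, otherwise each pixel is converted through the generic float read/write path.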
1869 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1870 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1871 
1872 		int depth = min(destination.depth, source.depth);
1873 		int height = min(destination.height, source.height);
1874 		int width = min(destination.width, source.width);
1875 		int rowBytes = width * source.bytes;
1876 
1877 		for(int z = 0; z < depth; z++)
1878 		{
1879 			unsigned char *sourceRow = sourceSlice;
1880 			unsigned char *destinationRow = destinationSlice;
1881 
1882 			for(int y = 0; y < height; y++)
1883 			{
1884 				if(source.format == destination.format)
1885 				{
1886 					memcpy(destinationRow, sourceRow, rowBytes);
1887 				}
1888 				else
1889 				{
1890 					unsigned char *sourceElement = sourceRow;
1891 					unsigned char *destinationElement = destinationRow;
1892 
1893 					for(int x = 0; x < width; x++)
1894 					{
1895 						Color<float> color = source.read(sourceElement);
1896 						destination.write(destinationElement, color);
1897 
1898 						sourceElement += source.bytes;
1899 						destinationElement += destination.bytes;
1900 					}
1901 				}
1902 
1903 				sourceRow += source.pitchB;
1904 				destinationRow += destination.pitchB;
1905 			}
1906 
1907 			sourceSlice += source.sliceB;
1908 			destinationSlice += destination.sliceB;
1909 		}
1910 	}
1911 
1912 	void Surface::decodeR8G8B8(Buffer &destination, const Buffer &source)
1913 	{
1914 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1915 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1916 
1917 		for(int z = 0; z < destination.depth && z < source.depth; z++)
1918 		{
1919 			unsigned char *sourceRow = sourceSlice;
1920 			unsigned char *destinationRow = destinationSlice;
1921 
1922 			for(int y = 0; y < destination.height && y < source.height; y++)
1923 			{
1924 				unsigned char *sourceElement = sourceRow;
1925 				unsigned char *destinationElement = destinationRow;
1926 
1927 				for(int x = 0; x < destination.width && x < source.width; x++)
1928 				{
1929 					unsigned int b = sourceElement[0];
1930 					unsigned int g = sourceElement[1];
1931 					unsigned int r = sourceElement[2];
1932 
1933 					*(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
1934 
1935 					sourceElement += source.bytes;
1936 					destinationElement += destination.bytes;
1937 				}
1938 
1939 				sourceRow += source.pitchB;
1940 				destinationRow += destination.pitchB;
1941 			}
1942 
1943 			sourceSlice += source.sliceB;
1944 			destinationSlice += destination.sliceB;
1945 		}
1946 	}
1947 
1948 	void Surface::decodeX1R5G5B5(Buffer &destination, const Buffer &source)
1949 	{
1950 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1951 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1952 
1953 		for(int z = 0; z < destination.depth && z < source.depth; z++)
1954 		{
1955 			unsigned char *sourceRow = sourceSlice;
1956 			unsigned char *destinationRow = destinationSlice;
1957 
1958 			for(int y = 0; y < destination.height && y < source.height; y++)
1959 			{
1960 				unsigned char *sourceElement = sourceRow;
1961 				unsigned char *destinationElement = destinationRow;
1962 
1963 				for(int x = 0; x < destination.width && x < source.width; x++)
1964 				{
1965 					unsigned int xrgb = *(unsigned short*)sourceElement;
1966 
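					// Expand each 5-bit channel to 8 bits and shift it into place; the constants approximate 255/31 in fixed point (e.g. 134771 / 2^14 ≈ 8.226 for red), with a rounding term added before the shift.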
1967 					unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1968 					unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
1969 					unsigned int b = (((xrgb & 0x001F) * 2106  + 0x80) >> 8);
1970 
1971 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1972 
1973 					sourceElement += source.bytes;
1974 					destinationElement += destination.bytes;
1975 				}
1976 
1977 				sourceRow += source.pitchB;
1978 				destinationRow += destination.pitchB;
1979 			}
1980 
1981 			sourceSlice += source.sliceB;
1982 			destinationSlice += destination.sliceB;
1983 		}
1984 	}
1985 
1986 	void Surface::decodeA1R5G5B5(Buffer &destination, const Buffer &source)
1987 	{
1988 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1989 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1990 
1991 		for(int z = 0; z < destination.depth && z < source.depth; z++)
1992 		{
1993 			unsigned char *sourceRow = sourceSlice;
1994 			unsigned char *destinationRow = destinationSlice;
1995 
1996 			for(int y = 0; y < destination.height && y < source.height; y++)
1997 			{
1998 				unsigned char *sourceElement = sourceRow;
1999 				unsigned char *destinationElement = destinationRow;
2000 
2001 				for(int x = 0; x < destination.width && x < source.width; x++)
2002 				{
2003 					unsigned int argb = *(unsigned short*)sourceElement;
2004 
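					// 0x8000 * 130560 == 0xFF000000, so the single alpha bit expands to 0x00 or 0xFF in the top byte; the color channels expand as in decodeX1R5G5B5.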
2005 					unsigned int a =   (argb & 0x8000) * 130560;
2006 					unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
2007 					unsigned int g = (((argb & 0x03E0) * 16846  + 0x8000) >> 8) & 0x0000FF00;
2008 					unsigned int b = (((argb & 0x001F) * 2106   + 0x80) >> 8);
2009 
2010 					*(unsigned int*)destinationElement = a | r | g | b;
2011 
2012 					sourceElement += source.bytes;
2013 					destinationElement += destination.bytes;
2014 				}
2015 
2016 				sourceRow += source.pitchB;
2017 				destinationRow += destination.pitchB;
2018 			}
2019 
2020 			sourceSlice += source.sliceB;
2021 			destinationSlice += destination.sliceB;
2022 		}
2023 	}
2024 
2025 	void Surface::decodeX4R4G4B4(Buffer &destination, const Buffer &source)
2026 	{
2027 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
2028 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
2029 
2030 		for(int z = 0; z < destination.depth && z < source.depth; z++)
2031 		{
2032 			unsigned char *sourceRow = sourceSlice;
2033 			unsigned char *destinationRow = destinationSlice;
2034 
2035 			for(int y = 0; y < destination.height && y < source.height; y++)
2036 			{
2037 				unsigned char *sourceElement = sourceRow;
2038 				unsigned char *destinationElement = destinationRow;
2039 
2040 				for(int x = 0; x < destination.width && x < source.width; x++)
2041 				{
2042 					unsigned int xrgb = *(unsigned short*)sourceElement;
2043 
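					// Multiplying a 4-bit value by 0x11 replicates the nibble (0xN -> 0xNN); the larger constants do the same while also shifting the channel into position.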
2044 					unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
2045 					unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
2046 					unsigned int b =  (xrgb & 0x000F) * 0x00000011;
2047 
2048 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
2049 
2050 					sourceElement += source.bytes;
2051 					destinationElement += destination.bytes;
2052 				}
2053 
2054 				sourceRow += source.pitchB;
2055 				destinationRow += destination.pitchB;
2056 			}
2057 
2058 			sourceSlice += source.sliceB;
2059 			destinationSlice += destination.sliceB;
2060 		}
2061 	}
2062 
2063 	void Surface::decodeA4R4G4B4(Buffer &destination, const Buffer &source)
2064 	{
2065 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
2066 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
2067 
2068 		for(int z = 0; z < destination.depth && z < source.depth; z++)
2069 		{
2070 			unsigned char *sourceRow = sourceSlice;
2071 			unsigned char *destinationRow = destinationSlice;
2072 
2073 			for(int y = 0; y < destination.height && y < source.height; y++)
2074 			{
2075 				unsigned char *sourceElement = sourceRow;
2076 				unsigned char *destinationElement = destinationRow;
2077 
2078 				for(int x = 0; x < destination.width && x < source.width; x++)
2079 				{
2080 					unsigned int argb = *(unsigned short*)sourceElement;
2081 
2082 					unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
2083 					unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
2084 					unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
2085 					unsigned int b =  (argb & 0x000F) * 0x00000011;
2086 
2087 					*(unsigned int*)destinationElement = a | r | g | b;
2088 
2089 					sourceElement += source.bytes;
2090 					destinationElement += destination.bytes;
2091 				}
2092 
2093 				sourceRow += source.pitchB;
2094 				destinationRow += destination.pitchB;
2095 			}
2096 
2097 			sourceSlice += source.sliceB;
2098 			destinationSlice += destination.sliceB;
2099 		}
2100 	}
2101 
2102 	void Surface::decodeP8(Buffer &destination, const Buffer &source)
2103 	{
2104 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
2105 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
2106 
2107 		for(int z = 0; z < destination.depth && z < source.depth; z++)
2108 		{
2109 			unsigned char *sourceRow = sourceSlice;
2110 			unsigned char *destinationRow = destinationSlice;
2111 
2112 			for(int y = 0; y < destination.height && y < source.height; y++)
2113 			{
2114 				unsigned char *sourceElement = sourceRow;
2115 				unsigned char *destinationElement = destinationRow;
2116 
2117 				for(int x = 0; x < destination.width && x < source.width; x++)
2118 				{
2119 					unsigned int abgr = palette[*(unsigned char*)sourceElement];
2120 
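					// The palette entry is laid out as A8B8G8R8 (abgr); swap red and blue to write out an A8R8G8B8 value.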
2121 					unsigned int r = (abgr & 0x000000FF) << 16;
2122 					unsigned int g = (abgr & 0x0000FF00) << 0;
2123 					unsigned int b = (abgr & 0x00FF0000) >> 16;
2124 					unsigned int a = (abgr & 0xFF000000) >> 0;
2125 
2126 					*(unsigned int*)destinationElement = a | r | g | b;
2127 
2128 					sourceElement += source.bytes;
2129 					destinationElement += destination.bytes;
2130 				}
2131 
2132 				sourceRow += source.pitchB;
2133 				destinationRow += destination.pitchB;
2134 			}
2135 
2136 			sourceSlice += source.sliceB;
2137 			destinationSlice += destination.sliceB;
2138 		}
2139 	}
2140 
2141 #if S3TC_SUPPORT
2142 	void Surface::decodeDXT1(Buffer &internal, const Buffer &external)
2143 	{
2144 		unsigned int *destSlice = (unsigned int*)internal.buffer;
2145 		const DXT1 *source = (const DXT1*)external.buffer;
2146 
2147 		for(int z = 0; z < external.depth; z++)
2148 		{
2149 			unsigned int *dest = destSlice;
2150 
2151 			for(int y = 0; y < external.height; y += 4)
2152 			{
2153 				for(int x = 0; x < external.width; x += 4)
2154 				{
2155 					Color<byte> c[4];
2156 
2157 					c[0] = source->c0;
2158 					c[1] = source->c1;
2159 
2160 					if(source->c0 > source->c1)   // No transparency
2161 					{
2162 						// c2 = 2 / 3 * c0 + 1 / 3 * c1
2163 						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2164 						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2165 						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2166 						c[2].a = 0xFF;
2167 
2168 						// c3 = 1 / 3 * c0 + 2 / 3 * c1
2169 						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2170 						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2171 						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2172 						c[3].a = 0xFF;
2173 					}
2174 					else   // c3 transparent
2175 					{
2176 						// c2 = 1 / 2 * c0 + 1 / 2 * c1
2177 						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
2178 						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
2179 						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
2180 						c[2].a = 0xFF;
2181 
2182 						c[3].r = 0;
2183 						c[3].g = 0;
2184 						c[3].b = 0;
2185 						c[3].a = 0;
2186 					}
2187 
2188 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2189 					{
2190 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2191 						{
2192 							dest[(x + i) + (y + j) * internal.width] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
2193 						}
2194 					}
2195 
2196 					source++;
2197 				}
2198 			}
2199 
2200 			(byte*&)destSlice += internal.sliceB;
2201 		}
2202 	}
2203 
2204 	void Surface::decodeDXT3(Buffer &internal, const Buffer &external)
2205 	{
2206 		unsigned int *destSlice = (unsigned int*)internal.buffer;
2207 		const DXT3 *source = (const DXT3*)external.buffer;
2208 
2209 		for(int z = 0; z < external.depth; z++)
2210 		{
2211 			unsigned int *dest = destSlice;
2212 
2213 			for(int y = 0; y < external.height; y += 4)
2214 			{
2215 				for(int x = 0; x < external.width; x += 4)
2216 				{
2217 					Color<byte> c[4];
2218 
2219 					c[0] = source->c0;
2220 					c[1] = source->c1;
2221 
2222 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2223 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2224 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2225 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2226 
2227 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2228 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2229 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2230 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2231 
2232 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2233 					{
2234 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2235 						{
2236 							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
2237 							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
2238 
2239 							dest[(x + i) + (y + j) * internal.width] = color;
2240 						}
2241 					}
2242 
2243 					source++;
2244 				}
2245 			}
2246 
2247 			(byte*&)destSlice += internal.sliceB;
2248 		}
2249 	}
2250 
2251 	void Surface::decodeDXT5(Buffer &internal, const Buffer &external)
2252 	{
2253 		unsigned int *destSlice = (unsigned int*)internal.buffer;
2254 		const DXT5 *source = (const DXT5*)external.buffer;
2255 
2256 		for(int z = 0; z < external.depth; z++)
2257 		{
2258 			unsigned int *dest = destSlice;
2259 
2260 			for(int y = 0; y < external.height; y += 4)
2261 			{
2262 				for(int x = 0; x < external.width; x += 4)
2263 				{
2264 					Color<byte> c[4];
2265 
2266 					c[0] = source->c0;
2267 					c[1] = source->c1;
2268 
2269 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2270 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2271 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2272 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2273 
2274 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2275 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2276 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2277 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2278 
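					// Derive the eight alpha values from the two endpoints: an eight-step ramp when a0 > a1, otherwise a six-step ramp with explicit 0 and 0xFF entries.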
2279 					byte a[8];
2280 
2281 					a[0] = source->a0;
2282 					a[1] = source->a1;
2283 
2284 					if(a[0] > a[1])
2285 					{
2286 						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
2287 						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
2288 						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
2289 						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
2290 						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
2291 						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
2292 					}
2293 					else
2294 					{
2295 						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
2296 						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
2297 						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
2298 						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
2299 						a[6] = 0;
2300 						a[7] = 0xFF;
2301 					}
2302 
2303 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2304 					{
2305 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2306 						{
2307 							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
2308 							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
2309 
2310 							dest[(x + i) + (y + j) * internal.width] = color;
2311 						}
2312 					}
2313 
2314 					source++;
2315 				}
2316 			}
2317 
2318 			(byte*&)destSlice += internal.sliceB;
2319 		}
2320 	}
2321 #endif
2322 
2323 	void Surface::decodeATI1(Buffer &internal, const Buffer &external)
2324 	{
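		// ATI1 (BC4): a single channel stored as two 8-bit endpoints plus 3-bit per-texel indices, decoded here into an 8-bit single-channel surface.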
2325 		byte *destSlice = (byte*)internal.buffer;
2326 		const ATI1 *source = (const ATI1*)external.buffer;
2327 
2328 		for(int z = 0; z < external.depth; z++)
2329 		{
2330 			byte *dest = destSlice;
2331 
2332 			for(int y = 0; y < external.height; y += 4)
2333 			{
2334 				for(int x = 0; x < external.width; x += 4)
2335 				{
2336 					byte r[8];
2337 
2338 					r[0] = source->r0;
2339 					r[1] = source->r1;
2340 
2341 					if(r[0] > r[1])
2342 					{
2343 						r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
2344 						r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
2345 						r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
2346 						r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
2347 						r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
2348 						r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
2349 					}
2350 					else
2351 					{
2352 						r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
2353 						r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
2354 						r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
2355 						r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
2356 						r[6] = 0;
2357 						r[7] = 0xFF;
2358 					}
2359 
2360 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2361 					{
2362 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2363 						{
2364 							dest[(x + i) + (y + j) * internal.width] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
2365 						}
2366 					}
2367 
2368 					source++;
2369 				}
2370 			}
2371 
2372 			destSlice += internal.sliceB;
2373 		}
2374 	}
2375 
2376 	void Surface::decodeATI2(Buffer &internal, const Buffer &external)
2377 	{
2378 		word *destSlice = (word*)internal.buffer;
2379 		const ATI2 *source = (const ATI2*)external.buffer;
2380 
2381 		for(int z = 0; z < external.depth; z++)
2382 		{
2383 			word *dest = destSlice;
2384 
2385 			for(int y = 0; y < external.height; y += 4)
2386 			{
2387 				for(int x = 0; x < external.width; x += 4)
2388 				{
2389 					byte X[8];
2390 
2391 					X[0] = source->x0;
2392 					X[1] = source->x1;
2393 
2394 					if(X[0] > X[1])
2395 					{
2396 						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
2397 						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
2398 						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
2399 						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
2400 						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
2401 						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
2402 					}
2403 					else
2404 					{
2405 						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
2406 						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
2407 						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
2408 						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
2409 						X[6] = 0;
2410 						X[7] = 0xFF;
2411 					}
2412 
2413 					byte Y[8];
2414 
2415 					Y[0] = source->y0;
2416 					Y[1] = source->y1;
2417 
2418 					if(Y[0] > Y[1])
2419 					{
2420 						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
2421 						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
2422 						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
2423 						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
2424 						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
2425 						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
2426 					}
2427 					else
2428 					{
2429 						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
2430 						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
2431 						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
2432 						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
2433 						Y[6] = 0;
2434 						Y[7] = 0xFF;
2435 					}
2436 
2437 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2438 					{
2439 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2440 						{
2441 							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
2442 							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
2443 
2444 							dest[(x + i) + (y + j) * internal.width] = (g << 8) + r;
2445 						}
2446 					}
2447 
2448 					source++;
2449 				}
2450 			}
2451 
2452 			(byte*&)destSlice += internal.sliceB;
2453 		}
2454 	}
2455 
2456 	void Surface::decodeETC2(Buffer &internal, const Buffer &external, int nbAlphaBits, bool isSRGB)
2457 	{
2458 		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2459 		                    (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));
2460 
2461 		if(isSRGB)
2462 		{
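			// Build a 256-entry sRGB-to-linear table on first use, then convert the decoded RGB channels in place (alpha is left untouched).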
2463 			static byte sRGBtoLinearTable[256];
2464 			static bool sRGBtoLinearTableDirty = true;
2465 			if(sRGBtoLinearTableDirty)
2466 			{
2467 				for(int i = 0; i < 256; i++)
2468 				{
2469 					sRGBtoLinearTable[i] = static_cast<byte>(sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
2470 				}
2471 				sRGBtoLinearTableDirty = false;
2472 			}
2473 
2474 			// Perform sRGB conversion in place after decoding
2475 			byte* src = (byte*)internal.buffer;
2476 			for(int y = 0; y < internal.height; y++)
2477 			{
2478 				byte* srcRow = src + y * internal.pitchB;
2479 				for(int x = 0; x < internal.width; x++)
2480 				{
2481 					byte* srcPix = srcRow + x * internal.bytes;
2482 					for(int i = 0; i < 3; i++)
2483 					{
2484 						srcPix[i] = sRGBtoLinearTable[srcPix[i]];
2485 					}
2486 				}
2487 			}
2488 		}
2489 	}
2490 
2491 	void Surface::decodeEAC(Buffer &internal, const Buffer &external, int nbChannels, bool isSigned)
2492 	{
2493 		ASSERT(nbChannels == 1 || nbChannels == 2);
2494 
2495 		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2496 		                    (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));
2497 
2498 		// FIXME: We convert signed data to float, until signed integer internal formats are supported
2499 		//        This code can be removed if signed ETC2 images are decoded to internal 8 bit signed R/RG formats
2500 		if(isSigned)
2501 		{
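			// Convert in place, iterating from the last pixel of each row backwards so the packed 8-bit source values are read before the wider float results overwrite them.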
2502 			sbyte* src = (sbyte*)internal.buffer;
2503 
2504 			for(int y = 0; y < internal.height; y++)
2505 			{
2506 				sbyte* srcRow = src + y * internal.pitchB;
2507 				for(int x = internal.width - 1; x >= 0; x--)
2508 				{
2509 					int dx = x & 0xFFFFFFFC;
2510 					int mx = x - dx;
2511 					sbyte* srcPix = srcRow + dx * internal.bytes + mx * nbChannels;
2512 					float* dstPix = (float*)(srcRow + x * internal.bytes);
2513 					for(int c = nbChannels - 1; c >= 0; c--)
2514 					{
2515 						static const float normalization = 1.0f / 127.875f;
2516 						dstPix[c] = clamp(static_cast<float>(srcPix[c]) * normalization, -1.0f, 1.0f);
2517 					}
2518 				}
2519 			}
2520 		}
2521 	}
2522 
2523 	void Surface::decodeASTC(Buffer &internal, const Buffer &external, int xBlockSize, int yBlockSize, int zBlockSize, bool isSRGB)
2524 	{
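		// ASTC decoding is not implemented yet (see the FIXME where ASTC byte sizes return 0).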
2525 	}
2526 
2527 	unsigned int Surface::size(int width, int height, int depth, Format format)
2528 	{
2529 		// Dimensions rounded up to multiples of 4, used for compressed formats
2530 		int width4 = align(width, 4);
2531 		int height4 = align(height, 4);
2532 
2533 		switch(format)
2534 		{
2535 		#if S3TC_SUPPORT
2536 		case FORMAT_DXT1:
2537 		#endif
2538 		case FORMAT_ATI1:
2539 		case FORMAT_ETC1:
2540 		case FORMAT_R11_EAC:
2541 		case FORMAT_SIGNED_R11_EAC:
2542 		case FORMAT_RGB8_ETC2:
2543 		case FORMAT_SRGB8_ETC2:
2544 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2545 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2546 			return width4 * height4 * depth / 2;
2547 		#if S3TC_SUPPORT
2548 		case FORMAT_DXT3:
2549 		case FORMAT_DXT5:
2550 		#endif
2551 		case FORMAT_ATI2:
2552 		case FORMAT_RG11_EAC:
2553 		case FORMAT_SIGNED_RG11_EAC:
2554 		case FORMAT_RGBA8_ETC2_EAC:
2555 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
2556 		case FORMAT_RGBA_ASTC_4x4_KHR:
2557 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
2558 			return width4 * height4 * depth;
2559 		case FORMAT_RGBA_ASTC_5x4_KHR:
2560 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
2561 			return align(width, 5) * height4 * depth;
2562 		case FORMAT_RGBA_ASTC_5x5_KHR:
2563 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
2564 			return align(width, 5) * align(height, 5) * depth;
2565 		case FORMAT_RGBA_ASTC_6x5_KHR:
2566 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
2567 			return align(width, 6) * align(height, 5) * depth;
2568 		case FORMAT_RGBA_ASTC_6x6_KHR:
2569 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
2570 			return align(width, 6) * align(height, 6) * depth;
2571 		case FORMAT_RGBA_ASTC_8x5_KHR:
2572 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
2573 			return align(width, 8) * align(height, 5) * depth;
2574 		case FORMAT_RGBA_ASTC_8x6_KHR:
2575 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
2576 			return align(width, 8) * align(height, 6) * depth;
2577 		case FORMAT_RGBA_ASTC_8x8_KHR:
2578 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
2579 			return align(width, 8) * align(height, 8) * depth;
2580 		case FORMAT_RGBA_ASTC_10x5_KHR:
2581 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
2582 			return align(width, 10) * align(height, 5) * depth;
2583 		case FORMAT_RGBA_ASTC_10x6_KHR:
2584 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
2585 			return align(width, 10) * align(height, 6) * depth;
2586 		case FORMAT_RGBA_ASTC_10x8_KHR:
2587 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
2588 			return align(width, 10) * align(height, 8) * depth;
2589 		case FORMAT_RGBA_ASTC_10x10_KHR:
2590 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
2591 			return align(width, 10) * align(height, 10) * depth;
2592 		case FORMAT_RGBA_ASTC_12x10_KHR:
2593 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
2594 			return align(width, 12) * align(height, 10) * depth;
2595 		case FORMAT_RGBA_ASTC_12x12_KHR:
2596 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
2597 			return align(width, 12) * align(height, 12) * depth;
2598 		case FORMAT_YV12_BT601:
2599 		case FORMAT_YV12_BT709:
2600 		case FORMAT_YV12_JFIF:
2601 			{
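				// YV12 is planar: a full-resolution Y plane followed by two quarter-resolution chroma planes, each with a 16-byte-aligned stride.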
2602 				unsigned int YStride = align(width, 16);
2603 				unsigned int YSize = YStride * height;
2604 				unsigned int CStride = align(YStride / 2, 16);
2605 				unsigned int CSize = CStride * height / 2;
2606 
2607 				return YSize + 2 * CSize;
2608 			}
2609 		default:
2610 			return bytes(format) * width * height * depth;
2611 		}
2612 	}
2613 
2614 	bool Surface::isStencil(Format format)
2615 	{
2616 		switch(format)
2617 		{
2618 		case FORMAT_D32:
2619 		case FORMAT_D16:
2620 		case FORMAT_D24X8:
2621 		case FORMAT_D32F:
2622 		case FORMAT_D32F_COMPLEMENTARY:
2623 		case FORMAT_D32F_LOCKABLE:
2624 			return false;
2625 		case FORMAT_D24S8:
2626 		case FORMAT_D24FS8:
2627 		case FORMAT_S8:
2628 		case FORMAT_DF24S8:
2629 		case FORMAT_DF16S8:
2630 		case FORMAT_D32FS8_TEXTURE:
2631 		case FORMAT_D32FS8_SHADOW:
2632 		case FORMAT_INTZ:
2633 			return true;
2634 		default:
2635 			return false;
2636 		}
2637 	}
2638 
2639 	bool Surface::isDepth(Format format)
2640 	{
2641 		switch(format)
2642 		{
2643 		case FORMAT_D32:
2644 		case FORMAT_D16:
2645 		case FORMAT_D24X8:
2646 		case FORMAT_D24S8:
2647 		case FORMAT_D24FS8:
2648 		case FORMAT_D32F:
2649 		case FORMAT_D32F_COMPLEMENTARY:
2650 		case FORMAT_D32F_LOCKABLE:
2651 		case FORMAT_DF24S8:
2652 		case FORMAT_DF16S8:
2653 		case FORMAT_D32FS8_TEXTURE:
2654 		case FORMAT_D32FS8_SHADOW:
2655 		case FORMAT_INTZ:
2656 			return true;
2657 		case FORMAT_S8:
2658 			return false;
2659 		default:
2660 			return false;
2661 		}
2662 	}
2663 
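	// True for formats stored in the 2x2 quad layout used for render targets and depth/stencil surfaces, rather than linearly.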
2664 	bool Surface::hasQuadLayout(Format format)
2665 	{
2666 		switch(format)
2667 		{
2668 		case FORMAT_D32:
2669 		case FORMAT_D16:
2670 		case FORMAT_D24X8:
2671 		case FORMAT_D24S8:
2672 		case FORMAT_D24FS8:
2673 		case FORMAT_D32F:
2674 		case FORMAT_D32F_COMPLEMENTARY:
2675 		case FORMAT_DF24S8:
2676 		case FORMAT_DF16S8:
2677 		case FORMAT_INTZ:
2678 		case FORMAT_S8:
2679 		case FORMAT_A8G8R8B8Q:
2680 		case FORMAT_X8G8R8B8Q:
2681 			return true;
2682 		case FORMAT_D32F_LOCKABLE:
2683 		case FORMAT_D32FS8_TEXTURE:
2684 		case FORMAT_D32FS8_SHADOW:
2685 		default:
2686 			break;
2687 		}
2688 
2689 		return false;
2690 	}
2691 
2692 	bool Surface::isPalette(Format format)
2693 	{
2694 		switch(format)
2695 		{
2696 		case FORMAT_P8:
2697 		case FORMAT_A8P8:
2698 			return true;
2699 		default:
2700 			return false;
2701 		}
2702 	}
2703 
2704 	bool Surface::isFloatFormat(Format format)
2705 	{
2706 		switch(format)
2707 		{
2708 		case FORMAT_R5G6B5:
2709 		case FORMAT_R8G8B8:
2710 		case FORMAT_B8G8R8:
2711 		case FORMAT_X8R8G8B8:
2712 		case FORMAT_X8B8G8R8I:
2713 		case FORMAT_X8B8G8R8:
2714 		case FORMAT_A8R8G8B8:
2715 		case FORMAT_SRGB8_X8:
2716 		case FORMAT_SRGB8_A8:
2717 		case FORMAT_A8B8G8R8I:
2718 		case FORMAT_R8UI:
2719 		case FORMAT_G8R8UI:
2720 		case FORMAT_X8B8G8R8UI:
2721 		case FORMAT_A8B8G8R8UI:
2722 		case FORMAT_A8B8G8R8:
2723 		case FORMAT_G8R8I:
2724 		case FORMAT_G8R8:
2725 		case FORMAT_A2B10G10R10:
2726 		case FORMAT_R8I_SNORM:
2727 		case FORMAT_G8R8I_SNORM:
2728 		case FORMAT_X8B8G8R8I_SNORM:
2729 		case FORMAT_A8B8G8R8I_SNORM:
2730 		case FORMAT_R16I:
2731 		case FORMAT_R16UI:
2732 		case FORMAT_G16R16I:
2733 		case FORMAT_G16R16UI:
2734 		case FORMAT_G16R16:
2735 		case FORMAT_X16B16G16R16I:
2736 		case FORMAT_X16B16G16R16UI:
2737 		case FORMAT_A16B16G16R16I:
2738 		case FORMAT_A16B16G16R16UI:
2739 		case FORMAT_A16B16G16R16:
2740 		case FORMAT_V8U8:
2741 		case FORMAT_Q8W8V8U8:
2742 		case FORMAT_X8L8V8U8:
2743 		case FORMAT_V16U16:
2744 		case FORMAT_A16W16V16U16:
2745 		case FORMAT_Q16W16V16U16:
2746 		case FORMAT_A8:
2747 		case FORMAT_R8I:
2748 		case FORMAT_R8:
2749 		case FORMAT_S8:
2750 		case FORMAT_L8:
2751 		case FORMAT_L16:
2752 		case FORMAT_A8L8:
2753 		case FORMAT_YV12_BT601:
2754 		case FORMAT_YV12_BT709:
2755 		case FORMAT_YV12_JFIF:
2756 		case FORMAT_R32I:
2757 		case FORMAT_R32UI:
2758 		case FORMAT_G32R32I:
2759 		case FORMAT_G32R32UI:
2760 		case FORMAT_X32B32G32R32I:
2761 		case FORMAT_X32B32G32R32UI:
2762 		case FORMAT_A32B32G32R32I:
2763 		case FORMAT_A32B32G32R32UI:
2764 			return false;
2765 		case FORMAT_R16F:
2766 		case FORMAT_G16R16F:
2767 		case FORMAT_B16G16R16F:
2768 		case FORMAT_A16B16G16R16F:
2769 		case FORMAT_R32F:
2770 		case FORMAT_G32R32F:
2771 		case FORMAT_B32G32R32F:
2772 		case FORMAT_X32B32G32R32F:
2773 		case FORMAT_A32B32G32R32F:
2774 		case FORMAT_D32F:
2775 		case FORMAT_D32F_COMPLEMENTARY:
2776 		case FORMAT_D32F_LOCKABLE:
2777 		case FORMAT_D32FS8_TEXTURE:
2778 		case FORMAT_D32FS8_SHADOW:
2779 		case FORMAT_L16F:
2780 		case FORMAT_A16L16F:
2781 		case FORMAT_L32F:
2782 		case FORMAT_A32L32F:
2783 			return true;
2784 		default:
2785 			ASSERT(false);
2786 		}
2787 
2788 		return false;
2789 	}
2790 
2791 	bool Surface::isUnsignedComponent(Format format, int component)
2792 	{
2793 		switch(format)
2794 		{
2795 		case FORMAT_NULL:
2796 		case FORMAT_R5G6B5:
2797 		case FORMAT_R8G8B8:
2798 		case FORMAT_B8G8R8:
2799 		case FORMAT_X8R8G8B8:
2800 		case FORMAT_X8B8G8R8:
2801 		case FORMAT_A8R8G8B8:
2802 		case FORMAT_A8B8G8R8:
2803 		case FORMAT_SRGB8_X8:
2804 		case FORMAT_SRGB8_A8:
2805 		case FORMAT_G8R8:
2806 		case FORMAT_A2B10G10R10:
2807 		case FORMAT_R16UI:
2808 		case FORMAT_G16R16:
2809 		case FORMAT_G16R16UI:
2810 		case FORMAT_X16B16G16R16UI:
2811 		case FORMAT_A16B16G16R16:
2812 		case FORMAT_A16B16G16R16UI:
2813 		case FORMAT_R32UI:
2814 		case FORMAT_G32R32UI:
2815 		case FORMAT_X32B32G32R32UI:
2816 		case FORMAT_A32B32G32R32UI:
2817 		case FORMAT_R8UI:
2818 		case FORMAT_G8R8UI:
2819 		case FORMAT_X8B8G8R8UI:
2820 		case FORMAT_A8B8G8R8UI:
2821 		case FORMAT_D32F:
2822 		case FORMAT_D32F_COMPLEMENTARY:
2823 		case FORMAT_D32F_LOCKABLE:
2824 		case FORMAT_D32FS8_TEXTURE:
2825 		case FORMAT_D32FS8_SHADOW:
2826 		case FORMAT_A8:
2827 		case FORMAT_R8:
2828 		case FORMAT_L8:
2829 		case FORMAT_L16:
2830 		case FORMAT_A8L8:
2831 		case FORMAT_YV12_BT601:
2832 		case FORMAT_YV12_BT709:
2833 		case FORMAT_YV12_JFIF:
2834 			return true;
2835 		case FORMAT_A8B8G8R8I:
2836 		case FORMAT_A16B16G16R16I:
2837 		case FORMAT_A32B32G32R32I:
2838 		case FORMAT_A8B8G8R8I_SNORM:
2839 		case FORMAT_Q8W8V8U8:
2840 		case FORMAT_Q16W16V16U16:
2841 		case FORMAT_A32B32G32R32F:
2842 			return false;
2843 		case FORMAT_R32F:
2844 		case FORMAT_R8I:
2845 		case FORMAT_R16I:
2846 		case FORMAT_R32I:
2847 		case FORMAT_R8I_SNORM:
2848 			return component >= 1;
2849 		case FORMAT_V8U8:
2850 		case FORMAT_X8L8V8U8:
2851 		case FORMAT_V16U16:
2852 		case FORMAT_G32R32F:
2853 		case FORMAT_G8R8I:
2854 		case FORMAT_G16R16I:
2855 		case FORMAT_G32R32I:
2856 		case FORMAT_G8R8I_SNORM:
2857 			return component >= 2;
2858 		case FORMAT_A16W16V16U16:
2859 		case FORMAT_B32G32R32F:
2860 		case FORMAT_X32B32G32R32F:
2861 		case FORMAT_X8B8G8R8I:
2862 		case FORMAT_X16B16G16R16I:
2863 		case FORMAT_X32B32G32R32I:
2864 		case FORMAT_X8B8G8R8I_SNORM:
2865 			return component >= 3;
2866 		default:
2867 			ASSERT(false);
2868 		}
2869 
2870 		return false;
2871 	}
2872 
2873 	bool Surface::isSRGBreadable(Format format)
2874 	{
2875 		// Keep in sync with Capabilities::isSRGBreadable
2876 		switch(format)
2877 		{
2878 		case FORMAT_L8:
2879 		case FORMAT_A8L8:
2880 		case FORMAT_R8G8B8:
2881 		case FORMAT_A8R8G8B8:
2882 		case FORMAT_X8R8G8B8:
2883 		case FORMAT_A8B8G8R8:
2884 		case FORMAT_X8B8G8R8:
2885 		case FORMAT_SRGB8_X8:
2886 		case FORMAT_SRGB8_A8:
2887 		case FORMAT_R5G6B5:
2888 		case FORMAT_X1R5G5B5:
2889 		case FORMAT_A1R5G5B5:
2890 		case FORMAT_A4R4G4B4:
2891 		#if S3TC_SUPPORT
2892 		case FORMAT_DXT1:
2893 		case FORMAT_DXT3:
2894 		case FORMAT_DXT5:
2895 		#endif
2896 		case FORMAT_ATI1:
2897 		case FORMAT_ATI2:
2898 			return true;
2899 		default:
2900 			return false;
2901 		}
2902 	}
2903 
2904 	bool Surface::isSRGBwritable(Format format)
2905 	{
2906 		// Keep in sync with Capabilities::isSRGBwritable
2907 		switch(format)
2908 		{
2909 		case FORMAT_NULL:
2910 		case FORMAT_A8R8G8B8:
2911 		case FORMAT_X8R8G8B8:
2912 		case FORMAT_A8B8G8R8:
2913 		case FORMAT_X8B8G8R8:
2914 		case FORMAT_SRGB8_X8:
2915 		case FORMAT_SRGB8_A8:
2916 		case FORMAT_R5G6B5:
2917 			return true;
2918 		default:
2919 			return false;
2920 		}
2921 	}
2922 
2923 	bool Surface::isCompressed(Format format)
2924 	{
2925 		switch(format)
2926 		{
2927 		#if S3TC_SUPPORT
2928 		case FORMAT_DXT1:
2929 		case FORMAT_DXT3:
2930 		case FORMAT_DXT5:
2931 		#endif
2932 		case FORMAT_ATI1:
2933 		case FORMAT_ATI2:
2934 		case FORMAT_ETC1:
2935 		case FORMAT_R11_EAC:
2936 		case FORMAT_SIGNED_R11_EAC:
2937 		case FORMAT_RG11_EAC:
2938 		case FORMAT_SIGNED_RG11_EAC:
2939 		case FORMAT_RGB8_ETC2:
2940 		case FORMAT_SRGB8_ETC2:
2941 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2942 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2943 		case FORMAT_RGBA8_ETC2_EAC:
2944 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
2945 		case FORMAT_RGBA_ASTC_4x4_KHR:
2946 		case FORMAT_RGBA_ASTC_5x4_KHR:
2947 		case FORMAT_RGBA_ASTC_5x5_KHR:
2948 		case FORMAT_RGBA_ASTC_6x5_KHR:
2949 		case FORMAT_RGBA_ASTC_6x6_KHR:
2950 		case FORMAT_RGBA_ASTC_8x5_KHR:
2951 		case FORMAT_RGBA_ASTC_8x6_KHR:
2952 		case FORMAT_RGBA_ASTC_8x8_KHR:
2953 		case FORMAT_RGBA_ASTC_10x5_KHR:
2954 		case FORMAT_RGBA_ASTC_10x6_KHR:
2955 		case FORMAT_RGBA_ASTC_10x8_KHR:
2956 		case FORMAT_RGBA_ASTC_10x10_KHR:
2957 		case FORMAT_RGBA_ASTC_12x10_KHR:
2958 		case FORMAT_RGBA_ASTC_12x12_KHR:
2959 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
2960 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
2961 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
2962 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
2963 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
2964 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
2965 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
2966 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
2967 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
2968 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
2969 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
2970 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
2971 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
2972 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
2973 			return true;
2974 		default:
2975 			return false;
2976 		}
2977 	}
2978 
2979 	bool Surface::isSignedNonNormalizedInteger(Format format)
2980 	{
2981 		switch(format)
2982 		{
2983 		case FORMAT_A8B8G8R8I:
2984 		case FORMAT_X8B8G8R8I:
2985 		case FORMAT_G8R8I:
2986 		case FORMAT_R8I:
2987 		case FORMAT_A16B16G16R16I:
2988 		case FORMAT_X16B16G16R16I:
2989 		case FORMAT_G16R16I:
2990 		case FORMAT_R16I:
2991 		case FORMAT_A32B32G32R32I:
2992 		case FORMAT_X32B32G32R32I:
2993 		case FORMAT_G32R32I:
2994 		case FORMAT_R32I:
2995 			return true;
2996 		default:
2997 			return false;
2998 		}
2999 	}
3000 
3001 	bool Surface::isUnsignedNonNormalizedInteger(Format format)
3002 	{
3003 		switch(format)
3004 		{
3005 		case FORMAT_A8B8G8R8UI:
3006 		case FORMAT_X8B8G8R8UI:
3007 		case FORMAT_G8R8UI:
3008 		case FORMAT_R8UI:
3009 		case FORMAT_A16B16G16R16UI:
3010 		case FORMAT_X16B16G16R16UI:
3011 		case FORMAT_G16R16UI:
3012 		case FORMAT_R16UI:
3013 		case FORMAT_A32B32G32R32UI:
3014 		case FORMAT_X32B32G32R32UI:
3015 		case FORMAT_G32R32UI:
3016 		case FORMAT_R32UI:
3017 			return true;
3018 		default:
3019 			return false;
3020 		}
3021 	}
3022 
3023 	bool Surface::isNonNormalizedInteger(Format format)
3024 	{
3025 		return isSignedNonNormalizedInteger(format) ||
3026 		       isUnsignedNonNormalizedInteger(format);
3027 	}
3028 
3029 	bool Surface::isNormalizedInteger(Format format)
3030 	{
3031 		return !isFloatFormat(format) &&
3032 		       !isNonNormalizedInteger(format) &&
3033 		       !isCompressed(format) &&
3034 		       !isDepth(format) &&
3035 		       !isStencil(format);
3036 	}
3037 
3038 	int Surface::componentCount(Format format)
3039 	{
3040 		switch(format)
3041 		{
3042 		case FORMAT_R5G6B5:         return 3;
3043 		case FORMAT_X8R8G8B8:       return 3;
3044 		case FORMAT_X8B8G8R8I:      return 3;
3045 		case FORMAT_X8B8G8R8:       return 3;
3046 		case FORMAT_A8R8G8B8:       return 4;
3047 		case FORMAT_SRGB8_X8:       return 3;
3048 		case FORMAT_SRGB8_A8:       return 4;
3049 		case FORMAT_A8B8G8R8I:      return 4;
3050 		case FORMAT_A8B8G8R8:       return 4;
3051 		case FORMAT_G8R8I:          return 2;
3052 		case FORMAT_G8R8:           return 2;
3053 		case FORMAT_R8I_SNORM:      return 1;
3054 		case FORMAT_G8R8I_SNORM:    return 2;
3055 		case FORMAT_X8B8G8R8I_SNORM:return 3;
3056 		case FORMAT_A8B8G8R8I_SNORM:return 4;
3057 		case FORMAT_R8UI:           return 1;
3058 		case FORMAT_G8R8UI:         return 2;
3059 		case FORMAT_X8B8G8R8UI:     return 3;
3060 		case FORMAT_A8B8G8R8UI:     return 4;
3061 		case FORMAT_A2B10G10R10:    return 4;
3062 		case FORMAT_G16R16I:        return 2;
3063 		case FORMAT_G16R16UI:       return 2;
3064 		case FORMAT_G16R16:         return 2;
3065 		case FORMAT_G32R32I:        return 2;
3066 		case FORMAT_G32R32UI:       return 2;
3067 		case FORMAT_X16B16G16R16I:  return 3;
3068 		case FORMAT_X16B16G16R16UI: return 3;
3069 		case FORMAT_A16B16G16R16I:  return 4;
3070 		case FORMAT_A16B16G16R16UI: return 4;
3071 		case FORMAT_A16B16G16R16:   return 4;
3072 		case FORMAT_X32B32G32R32I:  return 3;
3073 		case FORMAT_X32B32G32R32UI: return 3;
3074 		case FORMAT_A32B32G32R32I:  return 4;
3075 		case FORMAT_A32B32G32R32UI: return 4;
3076 		case FORMAT_V8U8:           return 2;
3077 		case FORMAT_Q8W8V8U8:       return 4;
3078 		case FORMAT_X8L8V8U8:       return 3;
3079 		case FORMAT_V16U16:         return 2;
3080 		case FORMAT_A16W16V16U16:   return 4;
3081 		case FORMAT_Q16W16V16U16:   return 4;
3082 		case FORMAT_R32F:           return 1;
3083 		case FORMAT_G32R32F:        return 2;
3084 		case FORMAT_X32B32G32R32F:  return 3;
3085 		case FORMAT_A32B32G32R32F:  return 4;
3086 		case FORMAT_D32F:           return 1;
3087 		case FORMAT_D32F_LOCKABLE:  return 1;
3088 		case FORMAT_D32FS8_TEXTURE: return 1;
3089 		case FORMAT_D32FS8_SHADOW:  return 1;
3090 		case FORMAT_A8:             return 1;
3091 		case FORMAT_R8I:            return 1;
3092 		case FORMAT_R8:             return 1;
3093 		case FORMAT_R16I:           return 1;
3094 		case FORMAT_R16UI:          return 1;
3095 		case FORMAT_R32I:           return 1;
3096 		case FORMAT_R32UI:          return 1;
3097 		case FORMAT_L8:             return 1;
3098 		case FORMAT_L16:            return 1;
3099 		case FORMAT_A8L8:           return 2;
3100 		case FORMAT_YV12_BT601:     return 3;
3101 		case FORMAT_YV12_BT709:     return 3;
3102 		case FORMAT_YV12_JFIF:      return 3;
3103 		default:
3104 			ASSERT(false);
3105 		}
3106 
3107 		return 1;
3108 	}
3109 
3110 	void *Surface::allocateBuffer(int width, int height, int depth, Format format)
3111 	{
3112 		// Render targets require 2x2 quads
3113 		int width2 = (width + 1) & ~1;
3114 		int height2 = (height + 1) & ~1;
3115 
3116 		// FIXME: Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
3117 		// and stencil operations also read 8 bytes per four 8-bit stencil values,
3118 		// so we have to allocate 4 extra bytes to avoid buffer overruns.
3119 		return allocate(size(width2, height2, depth, format) + 4);
3120 	}
3121 
3122 	void Surface::memfill4(void *buffer, int pattern, int bytes)
3123 	{
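		// Fills 'bytes' bytes with a repeating 32-bit pattern: the pointer is first
		// aligned to 2-, 4- and (on x86) 16-byte boundaries with narrow stores, the
		// bulk is written 64 bytes at a time with non-temporal SSE stores when
		// available, and the remaining tail is written with scalar stores.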
3124 		while((size_t)buffer & 0x1 && bytes >= 1)
3125 		{
3126 			*(char*)buffer = (char)pattern;
3127 			(char*&)buffer += 1;
3128 			bytes -= 1;
3129 		}
3130 
3131 		while((size_t)buffer & 0x3 && bytes >= 2)
3132 		{
3133 			*(short*)buffer = (short)pattern;
3134 			(short*&)buffer += 1;
3135 			bytes -= 2;
3136 		}
3137 
3138 		#if defined(__i386__) || defined(__x86_64__)
3139 			if(CPUID::supportsSSE())
3140 			{
3141 				while((size_t)buffer & 0xF && bytes >= 4)
3142 				{
3143 					*(int*)buffer = pattern;
3144 					(int*&)buffer += 1;
3145 					bytes -= 4;
3146 				}
3147 
3148 				__m128 quad = _mm_set_ps1((float&)pattern);
3149 
3150 				float *pointer = (float*)buffer;
3151 				int qxwords = bytes / 64;
3152 				bytes -= qxwords * 64;
3153 
3154 				while(qxwords--)
3155 				{
3156 					_mm_stream_ps(pointer + 0, quad);
3157 					_mm_stream_ps(pointer + 4, quad);
3158 					_mm_stream_ps(pointer + 8, quad);
3159 					_mm_stream_ps(pointer + 12, quad);
3160 
3161 					pointer += 16;
3162 				}
3163 
3164 				buffer = pointer;
3165 			}
3166 		#endif
3167 
3168 		while(bytes >= 4)
3169 		{
3170 			*(int*)buffer = (int)pattern;
3171 			(int*&)buffer += 1;
3172 			bytes -= 4;
3173 		}
3174 
3175 		while(bytes >= 2)
3176 		{
3177 			*(short*)buffer = (short)pattern;
3178 			(short*&)buffer += 1;
3179 			bytes -= 2;
3180 		}
3181 
3182 		while(bytes >= 1)
3183 		{
3184 			*(char*)buffer = (char)pattern;
3185 			(char*&)buffer += 1;
3186 			bytes -= 1;
3187 		}
3188 	}
3189 
3190 	void Surface::sync()
3191 	{
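		// Taking and immediately releasing an exclusive lock blocks until all other
		// accessors (e.g. the renderer) have finished using this surface's resource.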
3192 		resource->lock(EXCLUSIVE);
3193 		resource->unlock();
3194 	}
3195 
3196 	bool Surface::isEntire(const Rect& rect) const
3197 	{
3198 		return (rect.x0 == 0 && rect.y0 == 0 && rect.x1 == internal.width && rect.y1 == internal.height && internal.depth == 1);
3199 	}
3200 
3201 	Rect Surface::getRect() const
3202 	{
3203 		return Rect(0, 0, internal.width, internal.height);
3204 	}
3205 
3206 	void Surface::clearDepth(float depth, int x0, int y0, int width, int height)
3207 	{
3208 		if(width == 0 || height == 0) return;
3209 
3210 		// Not overlapping
3211 		if(x0 > internal.width) return;
3212 		if(y0 > internal.height) return;
3213 		if(x0 + width < 0) return;
3214 		if(y0 + height < 0) return;
3215 
3216 		// Clip against dimensions
3217 		if(x0 < 0) {width += x0; x0 = 0;}
3218 		if(x0 + width > internal.width) width = internal.width - x0;
3219 		if(y0 < 0) {height += y0; y0 = 0;}
3220 		if(y0 + height > internal.height) height = internal.height - y0;
3221 
3222 		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
3223 		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
3224 
3225 		int width2 = (internal.width + 1) & ~1;
3226 
3227 		int x1 = x0 + width;
3228 		int y1 = y0 + height;
3229 
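		// Lockable/texture depth formats use a linear layout; other depth buffers use
		// the 2x2 quad layout (and, when complementaryDepthBuffer is set, store 1 - z),
		// so the two cases are cleared differently below.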
3230 		if(internal.format == FORMAT_D32F_LOCKABLE ||
3231 		   internal.format == FORMAT_D32FS8_TEXTURE ||
3232 		   internal.format == FORMAT_D32FS8_SHADOW)
3233 		{
3234 			float *target = (float*)lockInternal(0, 0, 0, lock, PUBLIC) + x0 + width2 * y0;
3235 
3236 			for(int z = 0; z < internal.depth; z++)
3237 			{
3238 				for(int y = y0; y < y1; y++)
3239 				{
3240 					memfill4(target, (int&)depth, 4 * width);
3241 					target += width2;
3242 				}
3243 			}
3244 
3245 			unlockInternal();
3246 		}
3247 		else   // Quad layout
3248 		{
3249 			if(complementaryDepthBuffer)
3250 			{
3251 				depth = 1 - depth;
3252 			}
3253 
3254 			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
3255 
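			// Quad layout: each 2x2 block of pixels is stored as four consecutive elements,
			// so pixel (x, y) lives at offset (y & ~1) * width2 + (x & ~1) * 2 + (y & 1) * 2 + (x & 1).
			// oddX0/oddX1 address a leading/trailing odd column, while [evenX0, oddX1) covers
			// the complete quads, which can be filled with memfill4 two rows at a time.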
3256 			int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3257 			int oddX1 = (x1 & ~1) * 2;
3258 			int evenX0 = ((x0 + 1) & ~1) * 2;
3259 			int evenBytes = (oddX1 - evenX0) * sizeof(float);
3260 
3261 			for(int z = 0; z < internal.depth; z++)
3262 			{
3263 				for(int y = y0; y < y1; y++)
3264 				{
3265 					float *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
3266 
3267 					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
3268 					{
3269 						if((x0 & 1) != 0)
3270 						{
3271 							target[oddX0 + 0] = depth;
3272 							target[oddX0 + 2] = depth;
3273 						}
3274 
3275 					//	for(int x2 = evenX0; x2 < x1 * 2; x2 += 4)
3276 					//	{
3277 					//		target[x2 + 0] = depth;
3278 					//		target[x2 + 1] = depth;
3279 					//		target[x2 + 2] = depth;
3280 					//		target[x2 + 3] = depth;
3281 					//	}
3282 
3283 					//	__asm
3284 					//	{
3285 					//		movss xmm0, depth
3286 					//		shufps xmm0, xmm0, 0x00
3287 					//
3288 					//		mov eax, x0
3289 					//		add eax, 1
3290 					//		and eax, 0xFFFFFFFE
3291 					//		cmp eax, x1
3292 					//		jge qEnd
3293 					//
3294 					//		mov edi, target
3295 					//
3296 					//	qLoop:
3297 					//		movntps [edi+8*eax], xmm0
3298 					//
3299 					//		add eax, 2
3300 					//		cmp eax, x1
3301 					//		jl qLoop
3302 					//	qEnd:
3303 					//	}
3304 
3305 						memfill4(&target[evenX0], (int&)depth, evenBytes);
3306 
3307 						if((x1 & 1) != 0)
3308 						{
3309 							target[oddX1 + 0] = depth;
3310 							target[oddX1 + 2] = depth;
3311 						}
3312 
3313 						y++;
3314 					}
3315 					else
3316 					{
3317 						for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
3318 						{
3319 							target[i] = depth;
3320 						}
3321 					}
3322 				}
3323 
3324 				buffer += internal.sliceP;
3325 			}
3326 
3327 			unlockInternal();
3328 		}
3329 	}
3330 
3331 	void Surface::clearStencil(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
3332 	{
3333 		if(mask == 0 || width == 0 || height == 0) return;
3334 
3335 		// Not overlapping
3336 		if(x0 > internal.width) return;
3337 		if(y0 > internal.height) return;
3338 		if(x0 + width < 0) return;
3339 		if(y0 + height < 0) return;
3340 
3341 		// Clip against dimensions
3342 		if(x0 < 0) {width += x0; x0 = 0;}
3343 		if(x0 + width > internal.width) width = internal.width - x0;
3344 		if(y0 < 0) {height += y0; y0 = 0;}
3345 		if(y0 + height > internal.height) height = internal.height - y0;
3346 
3347 		int width2 = (internal.width + 1) & ~1;
3348 
3349 		int x1 = x0 + width;
3350 		int y1 = y0 + height;
3351 
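		// Same 2x2 quad addressing as clearDepth, but with one byte per stencil value;
		// 'fill' below replicates the masked stencil value into all four bytes so that
		// memfill4 can write whole words when the mask covers the entire byte.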
3352 		int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3353 		int oddX1 = (x1 & ~1) * 2;
3354 		int evenX0 = ((x0 + 1) & ~1) * 2;
3355 		int evenBytes = oddX1 - evenX0;
3356 
3357 		unsigned char maskedS = s & mask;
3358 		unsigned char invMask = ~mask;
3359 		unsigned int fill = maskedS;
3360 		fill = fill | (fill << 8) | (fill << 16) | (fill << 24);
3361 
3362 		char *buffer = (char*)lockStencil(0, 0, 0, PUBLIC);
3363 
3364 		// Stencil buffers are assumed to use quad layout
3365 		for(int z = 0; z < stencil.depth; z++)
3366 		{
3367 			for(int y = y0; y < y1; y++)
3368 			{
3369 				char *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
3370 
3371 				if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
3372 				{
3373 					if((x0 & 1) != 0)
3374 					{
3375 						target[oddX0 + 0] = fill;
3376 						target[oddX0 + 2] = fill;
3377 					}
3378 
3379 					memfill4(&target[evenX0], fill, evenBytes);
3380 
3381 					if((x1 & 1) != 0)
3382 					{
3383 						target[oddX1 + 0] = fill;
3384 						target[oddX1 + 2] = fill;
3385 					}
3386 
3387 					y++;
3388 				}
3389 				else
3390 				{
3391 					for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
3392 					{
3393 						target[i] = maskedS | (target[i] & invMask);
3394 					}
3395 				}
3396 			}
3397 
3398 			buffer += stencil.sliceP;
3399 		}
3400 
3401 		unlockStencil();
3402 	}
3403 
3404 	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
3405 	{
3406 		unsigned char *row;
3407 		Buffer *buffer;
3408 
3409 		if(internal.dirty)
3410 		{
3411 			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3412 			buffer = &internal;
3413 		}
3414 		else
3415 		{
3416 			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3417 			buffer = &external;
3418 		}
3419 
3420 		if(buffer->bytes <= 4)
3421 		{
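			// For formats of up to 4 bytes per pixel, write one pixel and replicate it
			// into a full 32-bit pattern so each row can be filled with memfill4.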
3422 			int c;
3423 			buffer->write(&c, color);
3424 
3425 			if(buffer->bytes <= 1) c = (c << 8)  | c;
3426 			if(buffer->bytes <= 2) c = (c << 16) | c;
3427 
3428 			for(int y = 0; y < height; y++)
3429 			{
3430 				memfill4(row, c, width * buffer->bytes);
3431 
3432 				row += buffer->pitchB;
3433 			}
3434 		}
3435 		else   // Generic
3436 		{
3437 			for(int y = 0; y < height; y++)
3438 			{
3439 				unsigned char *element = row;
3440 
3441 				for(int x = 0; x < width; x++)
3442 				{
3443 					buffer->write(element, color);
3444 
3445 					element += buffer->bytes;
3446 				}
3447 
3448 				row += buffer->pitchB;
3449 			}
3450 		}
3451 
3452 		if(buffer == &internal)
3453 		{
3454 			unlockInternal();
3455 		}
3456 		else
3457 		{
3458 			unlockExternal();
3459 		}
3460 	}
3461 
3462 	void Surface::copyInternal(const Surface* source, int x, int y, float srcX, float srcY, bool filter)
3463 	{
3464 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3465 
3466 		sw::Color<float> color;
3467 
3468 		if(!filter)
3469 		{
3470 			color = source->internal.read((int)srcX, (int)srcY);
3471 		}
3472 		else   // Bilinear filtering
3473 		{
3474 			color = source->internal.sample(srcX, srcY);
3475 		}
3476 
3477 		internal.write(x, y, color);
3478 	}
3479 
3480 	void Surface::copyInternal(const Surface* source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
3481 	{
3482 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3483 
3484 		sw::Color<float> color;
3485 
3486 		if(!filter)
3487 		{
3488 			color = source->internal.read((int)srcX, (int)srcY, int(srcZ));
3489 		}
3490 		else   // Bilinear filtering
3491 		{
3492 			color = source->internal.sample(srcX, srcY, srcZ);
3493 		}
3494 
3495 		internal.write(x, y, z, color);
3496 	}
3497 
3498 	bool Surface::hasStencil() const
3499 	{
3500 		return isStencil(external.format);
3501 	}
3502 
3503 	bool Surface::hasDepth() const
3504 	{
3505 		return isDepth(external.format);
3506 	}
3507 
3508 	bool Surface::hasPalette() const
3509 	{
3510 		return isPalette(external.format);
3511 	}
3512 
3513 	bool Surface::isRenderTarget() const
3514 	{
3515 		return renderTarget;
3516 	}
3517 
3518 	bool Surface::hasDirtyMipmaps() const
3519 	{
3520 		return dirtyMipmaps;
3521 	}
3522 
3523 	void Surface::cleanMipmaps()
3524 	{
3525 		dirtyMipmaps = false;
3526 	}
3527 
3528 	Resource *Surface::getResource()
3529 	{
3530 		return resource;
3531 	}
3532 
3533 	bool Surface::identicalFormats() const
3534 	{
3535 		return external.format == internal.format &&
3536 		       external.width  == internal.width &&
3537 		       external.height == internal.height &&
3538 		       external.depth  == internal.depth &&
3539 		       external.pitchB == internal.pitchB &&
3540 		       external.sliceB == internal.sliceB;
3541 	}
3542 
3543 	Format Surface::selectInternalFormat(Format format) const
3544 	{
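		// Maps an external (API-visible) format to the format used for internal storage:
		// low-bit-depth color formats are promoted to 8-bit-per-channel formats, compressed
		// formats decompress to an uncompressed equivalent, and half-float formats are
		// promoted to 32-bit float.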
3545 		switch(format)
3546 		{
3547 		case FORMAT_NULL:
3548 			return FORMAT_NULL;
3549 		case FORMAT_P8:
3550 		case FORMAT_A8P8:
3551 		case FORMAT_A4R4G4B4:
3552 		case FORMAT_A1R5G5B5:
3553 		case FORMAT_A8R3G3B2:
3554 			return FORMAT_A8R8G8B8;
3555 		case FORMAT_A8:
3556 			return FORMAT_A8;
3557 		case FORMAT_R8I:
3558 			return FORMAT_R8I;
3559 		case FORMAT_R8UI:
3560 			return FORMAT_R8UI;
3561 		case FORMAT_R8I_SNORM:
3562 			return FORMAT_R8I_SNORM;
3563 		case FORMAT_R8:
3564 			return FORMAT_R8;
3565 		case FORMAT_R16I:
3566 			return FORMAT_R16I;
3567 		case FORMAT_R16UI:
3568 			return FORMAT_R16UI;
3569 		case FORMAT_R32I:
3570 			return FORMAT_R32I;
3571 		case FORMAT_R32UI:
3572 			return FORMAT_R32UI;
3573 		case FORMAT_X16B16G16R16I:
3574 		case FORMAT_A16B16G16R16I:
3575 			return FORMAT_A16B16G16R16I;
3576 		case FORMAT_X16B16G16R16UI:
3577 		case FORMAT_A16B16G16R16UI:
3578 			return FORMAT_A16B16G16R16UI;
3579 		case FORMAT_A2R10G10B10:
3580 		case FORMAT_A2B10G10R10:
3581 		case FORMAT_A16B16G16R16:
3582 			return FORMAT_A16B16G16R16;
3583 		case FORMAT_X32B32G32R32I:
3584 		case FORMAT_A32B32G32R32I:
3585 			return FORMAT_A32B32G32R32I;
3586 		case FORMAT_X32B32G32R32UI:
3587 		case FORMAT_A32B32G32R32UI:
3588 			return FORMAT_A32B32G32R32UI;
3589 		case FORMAT_G8R8I:
3590 			return FORMAT_G8R8I;
3591 		case FORMAT_G8R8UI:
3592 			return FORMAT_G8R8UI;
3593 		case FORMAT_G8R8I_SNORM:
3594 			return FORMAT_G8R8I_SNORM;
3595 		case FORMAT_G8R8:
3596 			return FORMAT_G8R8;
3597 		case FORMAT_G16R16I:
3598 			return FORMAT_G16R16I;
3599 		case FORMAT_G16R16UI:
3600 			return FORMAT_G16R16UI;
3601 		case FORMAT_G16R16:
3602 			return FORMAT_G16R16;
3603 		case FORMAT_G32R32I:
3604 			return FORMAT_G32R32I;
3605 		case FORMAT_G32R32UI:
3606 			return FORMAT_G32R32UI;
3607 		case FORMAT_A8R8G8B8:
3608 			if(lockable || !quadLayoutEnabled)
3609 			{
3610 				return FORMAT_A8R8G8B8;
3611 			}
3612 			else
3613 			{
3614 				return FORMAT_A8G8R8B8Q;
3615 			}
3616 		case FORMAT_A8B8G8R8I:
3617 			return FORMAT_A8B8G8R8I;
3618 		case FORMAT_A8B8G8R8UI:
3619 			return FORMAT_A8B8G8R8UI;
3620 		case FORMAT_A8B8G8R8I_SNORM:
3621 			return FORMAT_A8B8G8R8I_SNORM;
3622 		case FORMAT_R5G5B5A1:
3623 		case FORMAT_R4G4B4A4:
3624 		case FORMAT_A8B8G8R8:
3625 			return FORMAT_A8B8G8R8;
3626 		case FORMAT_R5G6B5:
3627 			return FORMAT_R5G6B5;
3628 		case FORMAT_R3G3B2:
3629 		case FORMAT_R8G8B8:
3630 		case FORMAT_X4R4G4B4:
3631 		case FORMAT_X1R5G5B5:
3632 		case FORMAT_X8R8G8B8:
3633 			if(lockable || !quadLayoutEnabled)
3634 			{
3635 				return FORMAT_X8R8G8B8;
3636 			}
3637 			else
3638 			{
3639 				return FORMAT_X8G8R8B8Q;
3640 			}
3641 		case FORMAT_X8B8G8R8I:
3642 			return FORMAT_X8B8G8R8I;
3643 		case FORMAT_X8B8G8R8UI:
3644 			return FORMAT_X8B8G8R8UI;
3645 		case FORMAT_X8B8G8R8I_SNORM:
3646 			return FORMAT_X8B8G8R8I_SNORM;
3647 		case FORMAT_B8G8R8:
3648 		case FORMAT_X8B8G8R8:
3649 			return FORMAT_X8B8G8R8;
3650 		case FORMAT_SRGB8_X8:
3651 			return FORMAT_SRGB8_X8;
3652 		case FORMAT_SRGB8_A8:
3653 			return FORMAT_SRGB8_A8;
3654 		// Compressed formats
3655 		#if S3TC_SUPPORT
3656 		case FORMAT_DXT1:
3657 		case FORMAT_DXT3:
3658 		case FORMAT_DXT5:
3659 		#endif
3660 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3661 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3662 		case FORMAT_RGBA8_ETC2_EAC:
3663 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
3664 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
3665 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
3666 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
3667 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
3668 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
3669 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
3670 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
3671 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
3672 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
3673 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
3674 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
3675 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
3676 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
3677 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
3678 			return FORMAT_A8R8G8B8;
3679 		case FORMAT_RGBA_ASTC_4x4_KHR:
3680 		case FORMAT_RGBA_ASTC_5x4_KHR:
3681 		case FORMAT_RGBA_ASTC_5x5_KHR:
3682 		case FORMAT_RGBA_ASTC_6x5_KHR:
3683 		case FORMAT_RGBA_ASTC_6x6_KHR:
3684 		case FORMAT_RGBA_ASTC_8x5_KHR:
3685 		case FORMAT_RGBA_ASTC_8x6_KHR:
3686 		case FORMAT_RGBA_ASTC_8x8_KHR:
3687 		case FORMAT_RGBA_ASTC_10x5_KHR:
3688 		case FORMAT_RGBA_ASTC_10x6_KHR:
3689 		case FORMAT_RGBA_ASTC_10x8_KHR:
3690 		case FORMAT_RGBA_ASTC_10x10_KHR:
3691 		case FORMAT_RGBA_ASTC_12x10_KHR:
3692 		case FORMAT_RGBA_ASTC_12x12_KHR:
3693 			// ASTC supports HDR, so a floating point format is required to represent it properly
3694 			return FORMAT_A32B32G32R32F; // FIXME: 16FP is probably sufficient, but it's currently unsupported
3695 		case FORMAT_ATI1:
3696 		case FORMAT_R11_EAC:
3697 			return FORMAT_R8;
3698 		case FORMAT_SIGNED_R11_EAC:
3699 			return FORMAT_R32F; // FIXME: Signed 8bit format would be sufficient
3700 		case FORMAT_ATI2:
3701 		case FORMAT_RG11_EAC:
3702 			return FORMAT_G8R8;
3703 		case FORMAT_SIGNED_RG11_EAC:
3704 			return FORMAT_G32R32F; // FIXME: Signed 8bit format would be sufficient
3705 		case FORMAT_ETC1:
3706 		case FORMAT_RGB8_ETC2:
3707 		case FORMAT_SRGB8_ETC2:
3708 			return FORMAT_X8R8G8B8;
3709 		// Bumpmap formats
3710 		case FORMAT_V8U8:			return FORMAT_V8U8;
3711 		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
3712 		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
3713 		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
3714 		case FORMAT_V16U16:			return FORMAT_V16U16;
3715 		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
3716 		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
3717 		// Floating-point formats
3718 		case FORMAT_A16F:			return FORMAT_A32B32G32R32F;
3719 		case FORMAT_R16F:			return FORMAT_R32F;
3720 		case FORMAT_G16R16F:		return FORMAT_G32R32F;
3721 		case FORMAT_B16G16R16F:     return FORMAT_X32B32G32R32F;
3722 		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
3723 		case FORMAT_A32F:			return FORMAT_A32B32G32R32F;
3724 		case FORMAT_R32F:			return FORMAT_R32F;
3725 		case FORMAT_G32R32F:		return FORMAT_G32R32F;
3726 		case FORMAT_B32G32R32F:     return FORMAT_X32B32G32R32F;
3727 		case FORMAT_X32B32G32R32F:  return FORMAT_X32B32G32R32F;
3728 		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
3729 		// Luminance formats
3730 		case FORMAT_L8:				return FORMAT_L8;
3731 		case FORMAT_A4L4:			return FORMAT_A8L8;
3732 		case FORMAT_L16:			return FORMAT_L16;
3733 		case FORMAT_A8L8:			return FORMAT_A8L8;
3734 		case FORMAT_L16F:           return FORMAT_X32B32G32R32F;
3735 		case FORMAT_A16L16F:        return FORMAT_A32B32G32R32F;
3736 		case FORMAT_L32F:           return FORMAT_X32B32G32R32F;
3737 		case FORMAT_A32L32F:        return FORMAT_A32B32G32R32F;
3738 		// Depth/stencil formats
3739 		case FORMAT_D16:
3740 		case FORMAT_D32:
3741 		case FORMAT_D24X8:
3742 		case FORMAT_D24S8:
3743 		case FORMAT_D24FS8:
3744 			if(hasParent)   // Texture
3745 			{
3746 				return FORMAT_D32FS8_SHADOW;
3747 			}
3748 			else if(complementaryDepthBuffer)
3749 			{
3750 				return FORMAT_D32F_COMPLEMENTARY;
3751 			}
3752 			else
3753 			{
3754 				return FORMAT_D32F;
3755 			}
3756 		case FORMAT_D32F:           return FORMAT_D32F;
3757 		case FORMAT_D32F_LOCKABLE:  return FORMAT_D32F_LOCKABLE;
3758 		case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
3759 		case FORMAT_INTZ:           return FORMAT_D32FS8_TEXTURE;
3760 		case FORMAT_DF24S8:         return FORMAT_D32FS8_SHADOW;
3761 		case FORMAT_DF16S8:         return FORMAT_D32FS8_SHADOW;
3762 		case FORMAT_YV12_BT601:     return FORMAT_YV12_BT601;
3763 		case FORMAT_YV12_BT709:     return FORMAT_YV12_BT709;
3764 		case FORMAT_YV12_JFIF:      return FORMAT_YV12_JFIF;
3765 		default:
3766 			ASSERT(false);
3767 		}
3768 
3769 		return FORMAT_NULL;
3770 	}
3771 
3772 	void Surface::setTexturePalette(unsigned int *palette)
3773 	{
3774 		Surface::palette = palette;
3775 		Surface::paletteID++;
3776 	}
3777 
3778 	void Surface::resolve()
3779 	{
3780 		if(internal.depth <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
3781 		{
3782 			return;
3783 		}
3784 
3785 		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
3786 
3787 		int width = internal.width;
3788 		int height = internal.height;
3789 		int pitch = internal.pitchB;
3790 		int slice = internal.sliceB;
3791 
3792 		unsigned char *source0 = (unsigned char*)source;
3793 		unsigned char *source1 = source0 + slice;
3794 		unsigned char *source2 = source1 + slice;
3795 		unsigned char *source3 = source2 + slice;
3796 		unsigned char *source4 = source3 + slice;
3797 		unsigned char *source5 = source4 + slice;
3798 		unsigned char *source6 = source5 + slice;
3799 		unsigned char *source7 = source6 + slice;
3800 		unsigned char *source8 = source7 + slice;
3801 		unsigned char *source9 = source8 + slice;
3802 		unsigned char *sourceA = source9 + slice;
3803 		unsigned char *sourceB = sourceA + slice;
3804 		unsigned char *sourceC = sourceB + slice;
3805 		unsigned char *sourceD = sourceC + slice;
3806 		unsigned char *sourceE = sourceD + slice;
3807 		unsigned char *sourceF = sourceE + slice;
3808 
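		// Multisample resolve: internal.depth holds the sample count (2, 4, 8 or 16) and
		// the samples are stored as consecutive slices. They are averaged pairwise in a
		// balanced tree and the result is written back into the first slice.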
3809 		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 ||
3810 		   internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8 ||
3811 		   internal.format == FORMAT_SRGB8_X8 || internal.format == FORMAT_SRGB8_A8)
3812 		{
3813 			#if defined(__i386__) || defined(__x86_64__)
3814 				if(CPUID::supportsSSE2() && (width % 4) == 0)
3815 				{
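					// _mm_avg_epu8 averages 16 bytes (four 32-bit pixels) at once, rounding up,
					// matching the scalar AVERAGE fallback below.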
3816 					if(internal.depth == 2)
3817 					{
3818 						for(int y = 0; y < height; y++)
3819 						{
3820 							for(int x = 0; x < width; x += 4)
3821 							{
3822 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3823 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3824 
3825 								c0 = _mm_avg_epu8(c0, c1);
3826 
3827 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3828 							}
3829 
3830 							source0 += pitch;
3831 							source1 += pitch;
3832 						}
3833 					}
3834 					else if(internal.depth == 4)
3835 					{
3836 						for(int y = 0; y < height; y++)
3837 						{
3838 							for(int x = 0; x < width; x += 4)
3839 							{
3840 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3841 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3842 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3843 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3844 
3845 								c0 = _mm_avg_epu8(c0, c1);
3846 								c2 = _mm_avg_epu8(c2, c3);
3847 								c0 = _mm_avg_epu8(c0, c2);
3848 
3849 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3850 							}
3851 
3852 							source0 += pitch;
3853 							source1 += pitch;
3854 							source2 += pitch;
3855 							source3 += pitch;
3856 						}
3857 					}
3858 					else if(internal.depth == 8)
3859 					{
3860 						for(int y = 0; y < height; y++)
3861 						{
3862 							for(int x = 0; x < width; x += 4)
3863 							{
3864 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3865 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3866 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3867 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3868 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3869 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3870 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3871 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3872 
3873 								c0 = _mm_avg_epu8(c0, c1);
3874 								c2 = _mm_avg_epu8(c2, c3);
3875 								c4 = _mm_avg_epu8(c4, c5);
3876 								c6 = _mm_avg_epu8(c6, c7);
3877 								c0 = _mm_avg_epu8(c0, c2);
3878 								c4 = _mm_avg_epu8(c4, c6);
3879 								c0 = _mm_avg_epu8(c0, c4);
3880 
3881 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3882 							}
3883 
3884 							source0 += pitch;
3885 							source1 += pitch;
3886 							source2 += pitch;
3887 							source3 += pitch;
3888 							source4 += pitch;
3889 							source5 += pitch;
3890 							source6 += pitch;
3891 							source7 += pitch;
3892 						}
3893 					}
3894 					else if(internal.depth == 16)
3895 					{
3896 						for(int y = 0; y < height; y++)
3897 						{
3898 							for(int x = 0; x < width; x += 4)
3899 							{
3900 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3901 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3902 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3903 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3904 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3905 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3906 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3907 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3908 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
3909 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
3910 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
3911 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
3912 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
3913 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
3914 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
3915 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
3916 
3917 								c0 = _mm_avg_epu8(c0, c1);
3918 								c2 = _mm_avg_epu8(c2, c3);
3919 								c4 = _mm_avg_epu8(c4, c5);
3920 								c6 = _mm_avg_epu8(c6, c7);
3921 								c8 = _mm_avg_epu8(c8, c9);
3922 								cA = _mm_avg_epu8(cA, cB);
3923 								cC = _mm_avg_epu8(cC, cD);
3924 								cE = _mm_avg_epu8(cE, cF);
3925 								c0 = _mm_avg_epu8(c0, c2);
3926 								c4 = _mm_avg_epu8(c4, c6);
3927 								c8 = _mm_avg_epu8(c8, cA);
3928 								cC = _mm_avg_epu8(cC, cE);
3929 								c0 = _mm_avg_epu8(c0, c4);
3930 								c8 = _mm_avg_epu8(c8, cC);
3931 								c0 = _mm_avg_epu8(c0, c8);
3932 
3933 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3934 							}
3935 
3936 							source0 += pitch;
3937 							source1 += pitch;
3938 							source2 += pitch;
3939 							source3 += pitch;
3940 							source4 += pitch;
3941 							source5 += pitch;
3942 							source6 += pitch;
3943 							source7 += pitch;
3944 							source8 += pitch;
3945 							source9 += pitch;
3946 							sourceA += pitch;
3947 							sourceB += pitch;
3948 							sourceC += pitch;
3949 							sourceD += pitch;
3950 							sourceE += pitch;
3951 							sourceF += pitch;
3952 						}
3953 					}
3954 					else ASSERT(false);
3955 				}
3956 				else
3957 			#endif
3958 			{
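				// Branch-free average of four packed bytes, rounding up like _mm_avg_epu8:
				// (x & y) keeps the common bits, ((x ^ y) >> 1) adds half of the differing bits
				// (masked so the shift cannot cross byte lanes), and (x ^ y) & 0x01010101
				// rounds the result up in each byte.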
3959 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
3960 
3961 				if(internal.depth == 2)
3962 				{
3963 					for(int y = 0; y < height; y++)
3964 					{
3965 						for(int x = 0; x < width; x++)
3966 						{
3967 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3968 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3969 
3970 							c0 = AVERAGE(c0, c1);
3971 
3972 							*(unsigned int*)(source0 + 4 * x) = c0;
3973 						}
3974 
3975 						source0 += pitch;
3976 						source1 += pitch;
3977 					}
3978 				}
3979 				else if(internal.depth == 4)
3980 				{
3981 					for(int y = 0; y < height; y++)
3982 					{
3983 						for(int x = 0; x < width; x++)
3984 						{
3985 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3986 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3987 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3988 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3989 
3990 							c0 = AVERAGE(c0, c1);
3991 							c2 = AVERAGE(c2, c3);
3992 							c0 = AVERAGE(c0, c2);
3993 
3994 							*(unsigned int*)(source0 + 4 * x) = c0;
3995 						}
3996 
3997 						source0 += pitch;
3998 						source1 += pitch;
3999 						source2 += pitch;
4000 						source3 += pitch;
4001 					}
4002 				}
4003 				else if(internal.depth == 8)
4004 				{
4005 					for(int y = 0; y < height; y++)
4006 					{
4007 						for(int x = 0; x < width; x++)
4008 						{
4009 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4010 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4011 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4012 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4013 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4014 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4015 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4016 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4017 
4018 							c0 = AVERAGE(c0, c1);
4019 							c2 = AVERAGE(c2, c3);
4020 							c4 = AVERAGE(c4, c5);
4021 							c6 = AVERAGE(c6, c7);
4022 							c0 = AVERAGE(c0, c2);
4023 							c4 = AVERAGE(c4, c6);
4024 							c0 = AVERAGE(c0, c4);
4025 
4026 							*(unsigned int*)(source0 + 4 * x) = c0;
4027 						}
4028 
4029 						source0 += pitch;
4030 						source1 += pitch;
4031 						source2 += pitch;
4032 						source3 += pitch;
4033 						source4 += pitch;
4034 						source5 += pitch;
4035 						source6 += pitch;
4036 						source7 += pitch;
4037 					}
4038 				}
4039 				else if(internal.depth == 16)
4040 				{
4041 					for(int y = 0; y < height; y++)
4042 					{
4043 						for(int x = 0; x < width; x++)
4044 						{
4045 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4046 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4047 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4048 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4049 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4050 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4051 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4052 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4053 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4054 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4055 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4056 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4057 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4058 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4059 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4060 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4061 
4062 							c0 = AVERAGE(c0, c1);
4063 							c2 = AVERAGE(c2, c3);
4064 							c4 = AVERAGE(c4, c5);
4065 							c6 = AVERAGE(c6, c7);
4066 							c8 = AVERAGE(c8, c9);
4067 							cA = AVERAGE(cA, cB);
4068 							cC = AVERAGE(cC, cD);
4069 							cE = AVERAGE(cE, cF);
4070 							c0 = AVERAGE(c0, c2);
4071 							c4 = AVERAGE(c4, c6);
4072 							c8 = AVERAGE(c8, cA);
4073 							cC = AVERAGE(cC, cE);
4074 							c0 = AVERAGE(c0, c4);
4075 							c8 = AVERAGE(c8, cC);
4076 							c0 = AVERAGE(c0, c8);
4077 
4078 							*(unsigned int*)(source0 + 4 * x) = c0;
4079 						}
4080 
4081 						source0 += pitch;
4082 						source1 += pitch;
4083 						source2 += pitch;
4084 						source3 += pitch;
4085 						source4 += pitch;
4086 						source5 += pitch;
4087 						source6 += pitch;
4088 						source7 += pitch;
4089 						source8 += pitch;
4090 						source9 += pitch;
4091 						sourceA += pitch;
4092 						sourceB += pitch;
4093 						sourceC += pitch;
4094 						sourceD += pitch;
4095 						sourceE += pitch;
4096 						sourceF += pitch;
4097 					}
4098 				}
4099 				else ASSERT(false);
4100 
4101 				#undef AVERAGE
4102 			}
4103 		}
4104 		else if(internal.format == FORMAT_G16R16)
4105 		{
4106 
4107 			#if defined(__i386__) || defined(__x86_64__)
4108 				if(CPUID::supportsSSE2() && (width % 4) == 0)
4109 				{
4110 					if(internal.depth == 2)
4111 					{
4112 						for(int y = 0; y < height; y++)
4113 						{
4114 							for(int x = 0; x < width; x += 4)
4115 							{
4116 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4117 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4118 
4119 								c0 = _mm_avg_epu16(c0, c1);
4120 
4121 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4122 							}
4123 
4124 							source0 += pitch;
4125 							source1 += pitch;
4126 						}
4127 					}
4128 					else if(internal.depth == 4)
4129 					{
4130 						for(int y = 0; y < height; y++)
4131 						{
4132 							for(int x = 0; x < width; x += 4)
4133 							{
4134 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4135 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4136 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4137 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4138 
4139 								c0 = _mm_avg_epu16(c0, c1);
4140 								c2 = _mm_avg_epu16(c2, c3);
4141 								c0 = _mm_avg_epu16(c0, c2);
4142 
4143 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4144 							}
4145 
4146 							source0 += pitch;
4147 							source1 += pitch;
4148 							source2 += pitch;
4149 							source3 += pitch;
4150 						}
4151 					}
4152 					else if(internal.depth == 8)
4153 					{
4154 						for(int y = 0; y < height; y++)
4155 						{
4156 							for(int x = 0; x < width; x += 4)
4157 							{
4158 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4159 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4160 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4161 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4162 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4163 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4164 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4165 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4166 
4167 								c0 = _mm_avg_epu16(c0, c1);
4168 								c2 = _mm_avg_epu16(c2, c3);
4169 								c4 = _mm_avg_epu16(c4, c5);
4170 								c6 = _mm_avg_epu16(c6, c7);
4171 								c0 = _mm_avg_epu16(c0, c2);
4172 								c4 = _mm_avg_epu16(c4, c6);
4173 								c0 = _mm_avg_epu16(c0, c4);
4174 
4175 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4176 							}
4177 
4178 							source0 += pitch;
4179 							source1 += pitch;
4180 							source2 += pitch;
4181 							source3 += pitch;
4182 							source4 += pitch;
4183 							source5 += pitch;
4184 							source6 += pitch;
4185 							source7 += pitch;
4186 						}
4187 					}
4188 					else if(internal.depth == 16)
4189 					{
4190 						for(int y = 0; y < height; y++)
4191 						{
4192 							for(int x = 0; x < width; x += 4)
4193 							{
4194 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4195 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4196 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4197 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4198 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4199 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4200 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4201 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4202 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
4203 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
4204 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
4205 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
4206 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
4207 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
4208 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
4209 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
4210 
4211 								c0 = _mm_avg_epu16(c0, c1);
4212 								c2 = _mm_avg_epu16(c2, c3);
4213 								c4 = _mm_avg_epu16(c4, c5);
4214 								c6 = _mm_avg_epu16(c6, c7);
4215 								c8 = _mm_avg_epu16(c8, c9);
4216 								cA = _mm_avg_epu16(cA, cB);
4217 								cC = _mm_avg_epu16(cC, cD);
4218 								cE = _mm_avg_epu16(cE, cF);
4219 								c0 = _mm_avg_epu16(c0, c2);
4220 								c4 = _mm_avg_epu16(c4, c6);
4221 								c8 = _mm_avg_epu16(c8, cA);
4222 								cC = _mm_avg_epu16(cC, cE);
4223 								c0 = _mm_avg_epu16(c0, c4);
4224 								c8 = _mm_avg_epu16(c8, cC);
4225 								c0 = _mm_avg_epu16(c0, c8);
4226 
4227 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4228 							}
4229 
4230 							source0 += pitch;
4231 							source1 += pitch;
4232 							source2 += pitch;
4233 							source3 += pitch;
4234 							source4 += pitch;
4235 							source5 += pitch;
4236 							source6 += pitch;
4237 							source7 += pitch;
4238 							source8 += pitch;
4239 							source9 += pitch;
4240 							sourceA += pitch;
4241 							sourceB += pitch;
4242 							sourceC += pitch;
4243 							sourceD += pitch;
4244 							sourceE += pitch;
4245 							sourceF += pitch;
4246 						}
4247 					}
4248 					else ASSERT(false);
4249 				}
4250 				else
4251 			#endif
4252 			{
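				// Same branch-free average as the byte version above, but operating on the
				// two packed 16-bit channels of each G16R16 pixel.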
4253 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4254 
4255 				if(internal.depth == 2)
4256 				{
4257 					for(int y = 0; y < height; y++)
4258 					{
4259 						for(int x = 0; x < width; x++)
4260 						{
4261 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4262 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4263 
4264 							c0 = AVERAGE(c0, c1);
4265 
4266 							*(unsigned int*)(source0 + 4 * x) = c0;
4267 						}
4268 
4269 						source0 += pitch;
4270 						source1 += pitch;
4271 					}
4272 				}
4273 				else if(internal.depth == 4)
4274 				{
4275 					for(int y = 0; y < height; y++)
4276 					{
4277 						for(int x = 0; x < width; x++)
4278 						{
4279 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4280 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4281 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4282 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4283 
4284 							c0 = AVERAGE(c0, c1);
4285 							c2 = AVERAGE(c2, c3);
4286 							c0 = AVERAGE(c0, c2);
4287 
4288 							*(unsigned int*)(source0 + 4 * x) = c0;
4289 						}
4290 
4291 						source0 += pitch;
4292 						source1 += pitch;
4293 						source2 += pitch;
4294 						source3 += pitch;
4295 					}
4296 				}
4297 				else if(internal.depth == 8)
4298 				{
4299 					for(int y = 0; y < height; y++)
4300 					{
4301 						for(int x = 0; x < width; x++)
4302 						{
4303 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4304 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4305 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4306 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4307 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4308 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4309 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4310 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4311 
4312 							c0 = AVERAGE(c0, c1);
4313 							c2 = AVERAGE(c2, c3);
4314 							c4 = AVERAGE(c4, c5);
4315 							c6 = AVERAGE(c6, c7);
4316 							c0 = AVERAGE(c0, c2);
4317 							c4 = AVERAGE(c4, c6);
4318 							c0 = AVERAGE(c0, c4);
4319 
4320 							*(unsigned int*)(source0 + 4 * x) = c0;
4321 						}
4322 
4323 						source0 += pitch;
4324 						source1 += pitch;
4325 						source2 += pitch;
4326 						source3 += pitch;
4327 						source4 += pitch;
4328 						source5 += pitch;
4329 						source6 += pitch;
4330 						source7 += pitch;
4331 					}
4332 				}
4333 				else if(internal.depth == 16)
4334 				{
4335 					for(int y = 0; y < height; y++)
4336 					{
4337 						for(int x = 0; x < width; x++)
4338 						{
4339 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4340 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4341 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4342 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4343 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4344 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4345 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4346 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4347 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4348 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4349 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4350 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4351 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4352 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4353 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4354 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4355 
4356 							c0 = AVERAGE(c0, c1);
4357 							c2 = AVERAGE(c2, c3);
4358 							c4 = AVERAGE(c4, c5);
4359 							c6 = AVERAGE(c6, c7);
4360 							c8 = AVERAGE(c8, c9);
4361 							cA = AVERAGE(cA, cB);
4362 							cC = AVERAGE(cC, cD);
4363 							cE = AVERAGE(cE, cF);
4364 							c0 = AVERAGE(c0, c2);
4365 							c4 = AVERAGE(c4, c6);
4366 							c8 = AVERAGE(c8, cA);
4367 							cC = AVERAGE(cC, cE);
4368 							c0 = AVERAGE(c0, c4);
4369 							c8 = AVERAGE(c8, cC);
4370 							c0 = AVERAGE(c0, c8);
4371 
4372 							*(unsigned int*)(source0 + 4 * x) = c0;
4373 						}
4374 
4375 						source0 += pitch;
4376 						source1 += pitch;
4377 						source2 += pitch;
4378 						source3 += pitch;
4379 						source4 += pitch;
4380 						source5 += pitch;
4381 						source6 += pitch;
4382 						source7 += pitch;
4383 						source8 += pitch;
4384 						source9 += pitch;
4385 						sourceA += pitch;
4386 						sourceB += pitch;
4387 						sourceC += pitch;
4388 						sourceD += pitch;
4389 						sourceE += pitch;
4390 						sourceF += pitch;
4391 					}
4392 				}
4393 				else ASSERT(false);
4394 
4395 				#undef AVERAGE
4396 			}
4397 		}
4398 		else if(internal.format == FORMAT_A16B16G16R16)
4399 		{
4400 			#if defined(__i386__) || defined(__x86_64__)
4401 				if(CPUID::supportsSSE2() && (width % 2) == 0)
4402 				{
4403 					if(internal.depth == 2)
4404 					{
4405 						for(int y = 0; y < height; y++)
4406 						{
4407 							for(int x = 0; x < width; x += 2)
4408 							{
4409 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4410 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4411 
4412 								c0 = _mm_avg_epu16(c0, c1);
4413 
4414 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4415 							}
4416 
4417 							source0 += pitch;
4418 							source1 += pitch;
4419 						}
4420 					}
4421 					else if(internal.depth == 4)
4422 					{
4423 						for(int y = 0; y < height; y++)
4424 						{
4425 							for(int x = 0; x < width; x += 2)
4426 							{
4427 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4428 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4429 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4430 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4431 
4432 								c0 = _mm_avg_epu16(c0, c1);
4433 								c2 = _mm_avg_epu16(c2, c3);
4434 								c0 = _mm_avg_epu16(c0, c2);
4435 
4436 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4437 							}
4438 
4439 							source0 += pitch;
4440 							source1 += pitch;
4441 							source2 += pitch;
4442 							source3 += pitch;
4443 						}
4444 					}
4445 					else if(internal.depth == 8)
4446 					{
4447 						for(int y = 0; y < height; y++)
4448 						{
4449 							for(int x = 0; x < width; x += 2)
4450 							{
4451 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4452 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4453 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4454 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4455 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4456 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4457 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4458 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4459 
4460 								c0 = _mm_avg_epu16(c0, c1);
4461 								c2 = _mm_avg_epu16(c2, c3);
4462 								c4 = _mm_avg_epu16(c4, c5);
4463 								c6 = _mm_avg_epu16(c6, c7);
4464 								c0 = _mm_avg_epu16(c0, c2);
4465 								c4 = _mm_avg_epu16(c4, c6);
4466 								c0 = _mm_avg_epu16(c0, c4);
4467 
4468 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4469 							}
4470 
4471 							source0 += pitch;
4472 							source1 += pitch;
4473 							source2 += pitch;
4474 							source3 += pitch;
4475 							source4 += pitch;
4476 							source5 += pitch;
4477 							source6 += pitch;
4478 							source7 += pitch;
4479 						}
4480 					}
4481 					else if(internal.depth == 16)
4482 					{
4483 						for(int y = 0; y < height; y++)
4484 						{
4485 							for(int x = 0; x < width; x += 2)
4486 							{
4487 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4488 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4489 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4490 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4491 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4492 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4493 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4494 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4495 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
4496 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
4497 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
4498 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
4499 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
4500 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
4501 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
4502 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
4503 
4504 								c0 = _mm_avg_epu16(c0, c1);
4505 								c2 = _mm_avg_epu16(c2, c3);
4506 								c4 = _mm_avg_epu16(c4, c5);
4507 								c6 = _mm_avg_epu16(c6, c7);
4508 								c8 = _mm_avg_epu16(c8, c9);
4509 								cA = _mm_avg_epu16(cA, cB);
4510 								cC = _mm_avg_epu16(cC, cD);
4511 								cE = _mm_avg_epu16(cE, cF);
4512 								c0 = _mm_avg_epu16(c0, c2);
4513 								c4 = _mm_avg_epu16(c4, c6);
4514 								c8 = _mm_avg_epu16(c8, cA);
4515 								cC = _mm_avg_epu16(cC, cE);
4516 								c0 = _mm_avg_epu16(c0, c4);
4517 								c8 = _mm_avg_epu16(c8, cC);
4518 								c0 = _mm_avg_epu16(c0, c8);
4519 
4520 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4521 							}
4522 
4523 							source0 += pitch;
4524 							source1 += pitch;
4525 							source2 += pitch;
4526 							source3 += pitch;
4527 							source4 += pitch;
4528 							source5 += pitch;
4529 							source6 += pitch;
4530 							source7 += pitch;
4531 							source8 += pitch;
4532 							source9 += pitch;
4533 							sourceA += pitch;
4534 							sourceB += pitch;
4535 							sourceC += pitch;
4536 							sourceD += pitch;
4537 							sourceE += pitch;
4538 							sourceF += pitch;
4539 						}
4540 					}
4541 					else ASSERT(false);
4542 				}
4543 				else
4544 			#endif
4545 			{
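				// Same 16-bit lane average; each 64-bit A16B16G16R16 pixel is processed as
				// two dwords, hence the 2 * width loop bound.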
4546 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4547 
4548 				if(internal.depth == 2)
4549 				{
4550 					for(int y = 0; y < height; y++)
4551 					{
4552 						for(int x = 0; x < 2 * width; x++)
4553 						{
4554 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4555 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4556 
4557 							c0 = AVERAGE(c0, c1);
4558 
4559 							*(unsigned int*)(source0 + 4 * x) = c0;
4560 						}
4561 
4562 						source0 += pitch;
4563 						source1 += pitch;
4564 					}
4565 				}
4566 				else if(internal.depth == 4)
4567 				{
4568 					for(int y = 0; y < height; y++)
4569 					{
4570 						for(int x = 0; x < 2 * width; x++)
4571 						{
4572 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4573 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4574 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4575 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4576 
4577 							c0 = AVERAGE(c0, c1);
4578 							c2 = AVERAGE(c2, c3);
4579 							c0 = AVERAGE(c0, c2);
4580 
4581 							*(unsigned int*)(source0 + 4 * x) = c0;
4582 						}
4583 
4584 						source0 += pitch;
4585 						source1 += pitch;
4586 						source2 += pitch;
4587 						source3 += pitch;
4588 					}
4589 				}
4590 				else if(internal.depth == 8)
4591 				{
4592 					for(int y = 0; y < height; y++)
4593 					{
4594 						for(int x = 0; x < 2 * width; x++)
4595 						{
4596 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4597 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4598 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4599 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4600 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4601 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4602 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4603 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4604 
4605 							c0 = AVERAGE(c0, c1);
4606 							c2 = AVERAGE(c2, c3);
4607 							c4 = AVERAGE(c4, c5);
4608 							c6 = AVERAGE(c6, c7);
4609 							c0 = AVERAGE(c0, c2);
4610 							c4 = AVERAGE(c4, c6);
4611 							c0 = AVERAGE(c0, c4);
4612 
4613 							*(unsigned int*)(source0 + 4 * x) = c0;
4614 						}
4615 
4616 						source0 += pitch;
4617 						source1 += pitch;
4618 						source2 += pitch;
4619 						source3 += pitch;
4620 						source4 += pitch;
4621 						source5 += pitch;
4622 						source6 += pitch;
4623 						source7 += pitch;
4624 					}
4625 				}
4626 				else if(internal.depth == 16)
4627 				{
4628 					for(int y = 0; y < height; y++)
4629 					{
4630 						for(int x = 0; x < 2 * width; x++)
4631 						{
4632 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4633 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4634 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4635 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4636 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4637 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4638 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4639 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4640 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4641 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4642 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4643 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4644 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4645 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4646 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4647 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4648 
4649 							c0 = AVERAGE(c0, c1);
4650 							c2 = AVERAGE(c2, c3);
4651 							c4 = AVERAGE(c4, c5);
4652 							c6 = AVERAGE(c6, c7);
4653 							c8 = AVERAGE(c8, c9);
4654 							cA = AVERAGE(cA, cB);
4655 							cC = AVERAGE(cC, cD);
4656 							cE = AVERAGE(cE, cF);
4657 							c0 = AVERAGE(c0, c2);
4658 							c4 = AVERAGE(c4, c6);
4659 							c8 = AVERAGE(c8, cA);
4660 							cC = AVERAGE(cC, cE);
4661 							c0 = AVERAGE(c0, c4);
4662 							c8 = AVERAGE(c8, cC);
4663 							c0 = AVERAGE(c0, c8);
4664 
4665 							*(unsigned int*)(source0 + 4 * x) = c0;
4666 						}
4667 
4668 						source0 += pitch;
4669 						source1 += pitch;
4670 						source2 += pitch;
4671 						source3 += pitch;
4672 						source4 += pitch;
4673 						source5 += pitch;
4674 						source6 += pitch;
4675 						source7 += pitch;
4676 						source8 += pitch;
4677 						source9 += pitch;
4678 						sourceA += pitch;
4679 						sourceB += pitch;
4680 						sourceC += pitch;
4681 						sourceD += pitch;
4682 						sourceE += pitch;
4683 						sourceF += pitch;
4684 					}
4685 				}
4686 				else ASSERT(false);
4687 
4688 				#undef AVERAGE
4689 			}
4690 		}
4691 		else if(internal.format == FORMAT_R32F)
4692 		{
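			// Floating-point samples are summed pairwise and scaled by 1/N at the end,
			// rather than using the rounded packed-integer average of the formats above.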
4693 			#if defined(__i386__) || defined(__x86_64__)
4694 				if(CPUID::supportsSSE() && (width % 4) == 0)
4695 				{
4696 					if(internal.depth == 2)
4697 					{
4698 						for(int y = 0; y < height; y++)
4699 						{
4700 							for(int x = 0; x < width; x += 4)
4701 							{
4702 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4703 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4704 
4705 								c0 = _mm_add_ps(c0, c1);
4706 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4707 
4708 								_mm_store_ps((float*)(source0 + 4 * x), c0);
4709 							}
4710 
4711 							source0 += pitch;
4712 							source1 += pitch;
4713 						}
4714 					}
4715 					else if(internal.depth == 4)
4716 					{
4717 						for(int y = 0; y < height; y++)
4718 						{
4719 							for(int x = 0; x < width; x += 4)
4720 							{
4721 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4722 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4723 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4724 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4725 
4726 								c0 = _mm_add_ps(c0, c1);
4727 								c2 = _mm_add_ps(c2, c3);
4728 								c0 = _mm_add_ps(c0, c2);
4729 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4730 
4731 								_mm_store_ps((float*)(source0 + 4 * x), c0);
4732 							}
4733 
4734 							source0 += pitch;
4735 							source1 += pitch;
4736 							source2 += pitch;
4737 							source3 += pitch;
4738 						}
4739 					}
4740 					else if(internal.depth == 8)
4741 					{
4742 						for(int y = 0; y < height; y++)
4743 						{
4744 							for(int x = 0; x < width; x += 4)
4745 							{
4746 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4747 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4748 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4749 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4750 								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
4751 								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
4752 								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
4753 								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
4754 
4755 								c0 = _mm_add_ps(c0, c1);
4756 								c2 = _mm_add_ps(c2, c3);
4757 								c4 = _mm_add_ps(c4, c5);
4758 								c6 = _mm_add_ps(c6, c7);
4759 								c0 = _mm_add_ps(c0, c2);
4760 								c4 = _mm_add_ps(c4, c6);
4761 								c0 = _mm_add_ps(c0, c4);
4762 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4763 
4764 								_mm_store_ps((float*)(source0 + 4 * x), c0);
4765 							}
4766 
4767 							source0 += pitch;
4768 							source1 += pitch;
4769 							source2 += pitch;
4770 							source3 += pitch;
4771 							source4 += pitch;
4772 							source5 += pitch;
4773 							source6 += pitch;
4774 							source7 += pitch;
4775 						}
4776 					}
4777 					else if(internal.depth == 16)
4778 					{
4779 						for(int y = 0; y < height; y++)
4780 						{
4781 							for(int x = 0; x < width; x += 4)
4782 							{
4783 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4784 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4785 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4786 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4787 								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
4788 								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
4789 								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
4790 								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
4791 								__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
4792 								__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
4793 								__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
4794 								__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
4795 								__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
4796 								__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
4797 								__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
4798 								__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
4799 
4800 								c0 = _mm_add_ps(c0, c1);
4801 								c2 = _mm_add_ps(c2, c3);
4802 								c4 = _mm_add_ps(c4, c5);
4803 								c6 = _mm_add_ps(c6, c7);
4804 								c8 = _mm_add_ps(c8, c9);
4805 								cA = _mm_add_ps(cA, cB);
4806 								cC = _mm_add_ps(cC, cD);
4807 								cE = _mm_add_ps(cE, cF);
4808 								c0 = _mm_add_ps(c0, c2);
4809 								c4 = _mm_add_ps(c4, c6);
4810 								c8 = _mm_add_ps(c8, cA);
4811 								cC = _mm_add_ps(cC, cE);
4812 								c0 = _mm_add_ps(c0, c4);
4813 								c8 = _mm_add_ps(c8, cC);
4814 								c0 = _mm_add_ps(c0, c8);
4815 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
4816 
4817 								_mm_store_ps((float*)(source0 + 4 * x), c0);
4818 							}
4819 
4820 							source0 += pitch;
4821 							source1 += pitch;
4822 							source2 += pitch;
4823 							source3 += pitch;
4824 							source4 += pitch;
4825 							source5 += pitch;
4826 							source6 += pitch;
4827 							source7 += pitch;
4828 							source8 += pitch;
4829 							source9 += pitch;
4830 							sourceA += pitch;
4831 							sourceB += pitch;
4832 							sourceC += pitch;
4833 							sourceD += pitch;
4834 							sourceE += pitch;
4835 							sourceF += pitch;
4836 						}
4837 					}
4838 					else ASSERT(false);
4839 				}
4840 				else
4841 			#endif
4842 			{
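				// Scalar fallback, used when the SSE path above is not taken: the same pairwise sums,
				// one float per pixel.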
4843 				if(internal.depth == 2)
4844 				{
4845 					for(int y = 0; y < height; y++)
4846 					{
4847 						for(int x = 0; x < width; x++)
4848 						{
4849 							float c0 = *(float*)(source0 + 4 * x);
4850 							float c1 = *(float*)(source1 + 4 * x);
4851 
4852 							c0 = c0 + c1;
4853 							c0 *= 1.0f / 2.0f;
4854 
4855 							*(float*)(source0 + 4 * x) = c0;
4856 						}
4857 
4858 						source0 += pitch;
4859 						source1 += pitch;
4860 					}
4861 				}
4862 				else if(internal.depth == 4)
4863 				{
4864 					for(int y = 0; y < height; y++)
4865 					{
4866 						for(int x = 0; x < width; x++)
4867 						{
4868 							float c0 = *(float*)(source0 + 4 * x);
4869 							float c1 = *(float*)(source1 + 4 * x);
4870 							float c2 = *(float*)(source2 + 4 * x);
4871 							float c3 = *(float*)(source3 + 4 * x);
4872 
4873 							c0 = c0 + c1;
4874 							c2 = c2 + c3;
4875 							c0 = c0 + c2;
4876 							c0 *= 1.0f / 4.0f;
4877 
4878 							*(float*)(source0 + 4 * x) = c0;
4879 						}
4880 
4881 						source0 += pitch;
4882 						source1 += pitch;
4883 						source2 += pitch;
4884 						source3 += pitch;
4885 					}
4886 				}
4887 				else if(internal.depth == 8)
4888 				{
4889 					for(int y = 0; y < height; y++)
4890 					{
4891 						for(int x = 0; x < width; x++)
4892 						{
4893 							float c0 = *(float*)(source0 + 4 * x);
4894 							float c1 = *(float*)(source1 + 4 * x);
4895 							float c2 = *(float*)(source2 + 4 * x);
4896 							float c3 = *(float*)(source3 + 4 * x);
4897 							float c4 = *(float*)(source4 + 4 * x);
4898 							float c5 = *(float*)(source5 + 4 * x);
4899 							float c6 = *(float*)(source6 + 4 * x);
4900 							float c7 = *(float*)(source7 + 4 * x);
4901 
4902 							c0 = c0 + c1;
4903 							c2 = c2 + c3;
4904 							c4 = c4 + c5;
4905 							c6 = c6 + c7;
4906 							c0 = c0 + c2;
4907 							c4 = c4 + c6;
4908 							c0 = c0 + c4;
4909 							c0 *= 1.0f / 8.0f;
4910 
4911 							*(float*)(source0 + 4 * x) = c0;
4912 						}
4913 
4914 						source0 += pitch;
4915 						source1 += pitch;
4916 						source2 += pitch;
4917 						source3 += pitch;
4918 						source4 += pitch;
4919 						source5 += pitch;
4920 						source6 += pitch;
4921 						source7 += pitch;
4922 					}
4923 				}
4924 				else if(internal.depth == 16)
4925 				{
4926 					for(int y = 0; y < height; y++)
4927 					{
4928 						for(int x = 0; x < width; x++)
4929 						{
4930 							float c0 = *(float*)(source0 + 4 * x);
4931 							float c1 = *(float*)(source1 + 4 * x);
4932 							float c2 = *(float*)(source2 + 4 * x);
4933 							float c3 = *(float*)(source3 + 4 * x);
4934 							float c4 = *(float*)(source4 + 4 * x);
4935 							float c5 = *(float*)(source5 + 4 * x);
4936 							float c6 = *(float*)(source6 + 4 * x);
4937 							float c7 = *(float*)(source7 + 4 * x);
4938 							float c8 = *(float*)(source8 + 4 * x);
4939 							float c9 = *(float*)(source9 + 4 * x);
4940 							float cA = *(float*)(sourceA + 4 * x);
4941 							float cB = *(float*)(sourceB + 4 * x);
4942 							float cC = *(float*)(sourceC + 4 * x);
4943 							float cD = *(float*)(sourceD + 4 * x);
4944 							float cE = *(float*)(sourceE + 4 * x);
4945 							float cF = *(float*)(sourceF + 4 * x);
4946 
4947 							c0 = c0 + c1;
4948 							c2 = c2 + c3;
4949 							c4 = c4 + c5;
4950 							c6 = c6 + c7;
4951 							c8 = c8 + c9;
4952 							cA = cA + cB;
4953 							cC = cC + cD;
4954 							cE = cE + cF;
4955 							c0 = c0 + c2;
4956 							c4 = c4 + c6;
4957 							c8 = c8 + cA;
4958 							cC = cC + cE;
4959 							c0 = c0 + c4;
4960 							c8 = c8 + cC;
4961 							c0 = c0 + c8;
4962 							c0 *= 1.0f / 16.0f;
4963 
4964 							*(float*)(source0 + 4 * x) = c0;
4965 						}
4966 
4967 						source0 += pitch;
4968 						source1 += pitch;
4969 						source2 += pitch;
4970 						source3 += pitch;
4971 						source4 += pitch;
4972 						source5 += pitch;
4973 						source6 += pitch;
4974 						source7 += pitch;
4975 						source8 += pitch;
4976 						source9 += pitch;
4977 						sourceA += pitch;
4978 						sourceB += pitch;
4979 						sourceC += pitch;
4980 						sourceD += pitch;
4981 						sourceE += pitch;
4982 						sourceF += pitch;
4983 					}
4984 				}
4985 				else ASSERT(false);
4986 			}
4987 		}
4988 		else if(internal.format == FORMAT_G32R32F)
4989 		{
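			// FORMAT_G32R32F: two 32-bit floats per pixel (8 bytes), so one 16-byte SSE register
			// covers two pixels; this is why the SSE path requires an even width.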
4990 			#if defined(__i386__) || defined(__x86_64__)
4991 				if(CPUID::supportsSSE() && (width % 2) == 0)
4992 				{
4993 					if(internal.depth == 2)
4994 					{
4995 						for(int y = 0; y < height; y++)
4996 						{
4997 							for(int x = 0; x < width; x += 2)
4998 							{
4999 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5000 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5001 
5002 								c0 = _mm_add_ps(c0, c1);
5003 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
5004 
5005 								_mm_store_ps((float*)(source0 + 8 * x), c0);
5006 							}
5007 
5008 							source0 += pitch;
5009 							source1 += pitch;
5010 						}
5011 					}
5012 					else if(internal.depth == 4)
5013 					{
5014 						for(int y = 0; y < height; y++)
5015 						{
5016 							for(int x = 0; x < width; x += 2)
5017 							{
5018 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5019 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5020 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
5021 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
5022 
5023 								c0 = _mm_add_ps(c0, c1);
5024 								c2 = _mm_add_ps(c2, c3);
5025 								c0 = _mm_add_ps(c0, c2);
5026 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
5027 
5028 								_mm_store_ps((float*)(source0 + 8 * x), c0);
5029 							}
5030 
5031 							source0 += pitch;
5032 							source1 += pitch;
5033 							source2 += pitch;
5034 							source3 += pitch;
5035 						}
5036 					}
5037 					else if(internal.depth == 8)
5038 					{
5039 						for(int y = 0; y < height; y++)
5040 						{
5041 							for(int x = 0; x < width; x += 2)
5042 							{
5043 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5044 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5045 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
5046 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
5047 								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
5048 								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
5049 								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
5050 								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
5051 
5052 								c0 = _mm_add_ps(c0, c1);
5053 								c2 = _mm_add_ps(c2, c3);
5054 								c4 = _mm_add_ps(c4, c5);
5055 								c6 = _mm_add_ps(c6, c7);
5056 								c0 = _mm_add_ps(c0, c2);
5057 								c4 = _mm_add_ps(c4, c6);
5058 								c0 = _mm_add_ps(c0, c4);
5059 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
5060 
5061 								_mm_store_ps((float*)(source0 + 8 * x), c0);
5062 							}
5063 
5064 							source0 += pitch;
5065 							source1 += pitch;
5066 							source2 += pitch;
5067 							source3 += pitch;
5068 							source4 += pitch;
5069 							source5 += pitch;
5070 							source6 += pitch;
5071 							source7 += pitch;
5072 						}
5073 					}
5074 					else if(internal.depth == 16)
5075 					{
5076 						for(int y = 0; y < height; y++)
5077 						{
5078 							for(int x = 0; x < width; x += 2)
5079 							{
5080 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5081 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5082 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
5083 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
5084 								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
5085 								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
5086 								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
5087 								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
5088 								__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
5089 								__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
5090 								__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
5091 								__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
5092 								__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
5093 								__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
5094 								__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
5095 								__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
5096 
5097 								c0 = _mm_add_ps(c0, c1);
5098 								c2 = _mm_add_ps(c2, c3);
5099 								c4 = _mm_add_ps(c4, c5);
5100 								c6 = _mm_add_ps(c6, c7);
5101 								c8 = _mm_add_ps(c8, c9);
5102 								cA = _mm_add_ps(cA, cB);
5103 								cC = _mm_add_ps(cC, cD);
5104 								cE = _mm_add_ps(cE, cF);
5105 								c0 = _mm_add_ps(c0, c2);
5106 								c4 = _mm_add_ps(c4, c6);
5107 								c8 = _mm_add_ps(c8, cA);
5108 								cC = _mm_add_ps(cC, cE);
5109 								c0 = _mm_add_ps(c0, c4);
5110 								c8 = _mm_add_ps(c8, cC);
5111 								c0 = _mm_add_ps(c0, c8);
5112 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5113 
5114 								_mm_store_ps((float*)(source0 + 8 * x), c0);
5115 							}
5116 
5117 							source0 += pitch;
5118 							source1 += pitch;
5119 							source2 += pitch;
5120 							source3 += pitch;
5121 							source4 += pitch;
5122 							source5 += pitch;
5123 							source6 += pitch;
5124 							source7 += pitch;
5125 							source8 += pitch;
5126 							source9 += pitch;
5127 							sourceA += pitch;
5128 							sourceB += pitch;
5129 							sourceC += pitch;
5130 							sourceD += pitch;
5131 							sourceE += pitch;
5132 							sourceF += pitch;
5133 						}
5134 					}
5135 					else ASSERT(false);
5136 				}
5137 				else
5138 			#endif
5139 			{
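				// Scalar fallback: each pixel has two float channels, hence the loop runs over
				// 2 * width floats per row.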
5140 				if(internal.depth == 2)
5141 				{
5142 					for(int y = 0; y < height; y++)
5143 					{
5144 						for(int x = 0; x < 2 * width; x++)
5145 						{
5146 							float c0 = *(float*)(source0 + 4 * x);
5147 							float c1 = *(float*)(source1 + 4 * x);
5148 
5149 							c0 = c0 + c1;
5150 							c0 *= 1.0f / 2.0f;
5151 
5152 							*(float*)(source0 + 4 * x) = c0;
5153 						}
5154 
5155 						source0 += pitch;
5156 						source1 += pitch;
5157 					}
5158 				}
5159 				else if(internal.depth == 4)
5160 				{
5161 					for(int y = 0; y < height; y++)
5162 					{
5163 						for(int x = 0; x < 2 * width; x++)
5164 						{
5165 							float c0 = *(float*)(source0 + 4 * x);
5166 							float c1 = *(float*)(source1 + 4 * x);
5167 							float c2 = *(float*)(source2 + 4 * x);
5168 							float c3 = *(float*)(source3 + 4 * x);
5169 
5170 							c0 = c0 + c1;
5171 							c2 = c2 + c3;
5172 							c0 = c0 + c2;
5173 							c0 *= 1.0f / 4.0f;
5174 
5175 							*(float*)(source0 + 4 * x) = c0;
5176 						}
5177 
5178 						source0 += pitch;
5179 						source1 += pitch;
5180 						source2 += pitch;
5181 						source3 += pitch;
5182 					}
5183 				}
5184 				else if(internal.depth == 8)
5185 				{
5186 					for(int y = 0; y < height; y++)
5187 					{
5188 						for(int x = 0; x < 2 * width; x++)
5189 						{
5190 							float c0 = *(float*)(source0 + 4 * x);
5191 							float c1 = *(float*)(source1 + 4 * x);
5192 							float c2 = *(float*)(source2 + 4 * x);
5193 							float c3 = *(float*)(source3 + 4 * x);
5194 							float c4 = *(float*)(source4 + 4 * x);
5195 							float c5 = *(float*)(source5 + 4 * x);
5196 							float c6 = *(float*)(source6 + 4 * x);
5197 							float c7 = *(float*)(source7 + 4 * x);
5198 
5199 							c0 = c0 + c1;
5200 							c2 = c2 + c3;
5201 							c4 = c4 + c5;
5202 							c6 = c6 + c7;
5203 							c0 = c0 + c2;
5204 							c4 = c4 + c6;
5205 							c0 = c0 + c4;
5206 							c0 *= 1.0f / 8.0f;
5207 
5208 							*(float*)(source0 + 4 * x) = c0;
5209 						}
5210 
5211 						source0 += pitch;
5212 						source1 += pitch;
5213 						source2 += pitch;
5214 						source3 += pitch;
5215 						source4 += pitch;
5216 						source5 += pitch;
5217 						source6 += pitch;
5218 						source7 += pitch;
5219 					}
5220 				}
5221 				else if(internal.depth == 16)
5222 				{
5223 					for(int y = 0; y < height; y++)
5224 					{
5225 						for(int x = 0; x < 2 * width; x++)
5226 						{
5227 							float c0 = *(float*)(source0 + 4 * x);
5228 							float c1 = *(float*)(source1 + 4 * x);
5229 							float c2 = *(float*)(source2 + 4 * x);
5230 							float c3 = *(float*)(source3 + 4 * x);
5231 							float c4 = *(float*)(source4 + 4 * x);
5232 							float c5 = *(float*)(source5 + 4 * x);
5233 							float c6 = *(float*)(source6 + 4 * x);
5234 							float c7 = *(float*)(source7 + 4 * x);
5235 							float c8 = *(float*)(source8 + 4 * x);
5236 							float c9 = *(float*)(source9 + 4 * x);
5237 							float cA = *(float*)(sourceA + 4 * x);
5238 							float cB = *(float*)(sourceB + 4 * x);
5239 							float cC = *(float*)(sourceC + 4 * x);
5240 							float cD = *(float*)(sourceD + 4 * x);
5241 							float cE = *(float*)(sourceE + 4 * x);
5242 							float cF = *(float*)(sourceF + 4 * x);
5243 
5244 							c0 = c0 + c1;
5245 							c2 = c2 + c3;
5246 							c4 = c4 + c5;
5247 							c6 = c6 + c7;
5248 							c8 = c8 + c9;
5249 							cA = cA + cB;
5250 							cC = cC + cD;
5251 							cE = cE + cF;
5252 							c0 = c0 + c2;
5253 							c4 = c4 + c6;
5254 							c8 = c8 + cA;
5255 							cC = cC + cE;
5256 							c0 = c0 + c4;
5257 							c8 = c8 + cC;
5258 							c0 = c0 + c8;
5259 							c0 *= 1.0f / 16.0f;
5260 
5261 							*(float*)(source0 + 4 * x) = c0;
5262 						}
5263 
5264 						source0 += pitch;
5265 						source1 += pitch;
5266 						source2 += pitch;
5267 						source3 += pitch;
5268 						source4 += pitch;
5269 						source5 += pitch;
5270 						source6 += pitch;
5271 						source7 += pitch;
5272 						source8 += pitch;
5273 						source9 += pitch;
5274 						sourceA += pitch;
5275 						sourceB += pitch;
5276 						sourceC += pitch;
5277 						sourceD += pitch;
5278 						sourceE += pitch;
5279 						sourceF += pitch;
5280 					}
5281 				}
5282 				else ASSERT(false);
5283 			}
5284 		}
5285 		else if(internal.format == FORMAT_A32B32G32R32F || internal.format == FORMAT_X32B32G32R32F)
5286 		{
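			// FORMAT_A32B32G32R32F / FORMAT_X32B32G32R32F: 16 bytes per pixel, so one __m128
			// holds exactly one pixel and the SSE path needs no extra width restriction.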
5287 			#if defined(__i386__) || defined(__x86_64__)
5288 				if(CPUID::supportsSSE())
5289 				{
5290 					if(internal.depth == 2)
5291 					{
5292 						for(int y = 0; y < height; y++)
5293 						{
5294 							for(int x = 0; x < width; x++)
5295 							{
5296 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5297 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5298 
5299 								c0 = _mm_add_ps(c0, c1);
5300 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
5301 
5302 								_mm_store_ps((float*)(source0 + 16 * x), c0);
5303 							}
5304 
5305 							source0 += pitch;
5306 							source1 += pitch;
5307 						}
5308 					}
5309 					else if(internal.depth == 4)
5310 					{
5311 						for(int y = 0; y < height; y++)
5312 						{
5313 							for(int x = 0; x < width; x++)
5314 							{
5315 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5316 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5317 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5318 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5319 
5320 								c0 = _mm_add_ps(c0, c1);
5321 								c2 = _mm_add_ps(c2, c3);
5322 								c0 = _mm_add_ps(c0, c2);
5323 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
5324 
5325 								_mm_store_ps((float*)(source0 + 16 * x), c0);
5326 							}
5327 
5328 							source0 += pitch;
5329 							source1 += pitch;
5330 							source2 += pitch;
5331 							source3 += pitch;
5332 						}
5333 					}
5334 					else if(internal.depth == 8)
5335 					{
5336 						for(int y = 0; y < height; y++)
5337 						{
5338 							for(int x = 0; x < width; x++)
5339 							{
5340 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5341 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5342 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5343 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5344 								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5345 								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5346 								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5347 								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5348 
5349 								c0 = _mm_add_ps(c0, c1);
5350 								c2 = _mm_add_ps(c2, c3);
5351 								c4 = _mm_add_ps(c4, c5);
5352 								c6 = _mm_add_ps(c6, c7);
5353 								c0 = _mm_add_ps(c0, c2);
5354 								c4 = _mm_add_ps(c4, c6);
5355 								c0 = _mm_add_ps(c0, c4);
5356 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
5357 
5358 								_mm_store_ps((float*)(source0 + 16 * x), c0);
5359 							}
5360 
5361 							source0 += pitch;
5362 							source1 += pitch;
5363 							source2 += pitch;
5364 							source3 += pitch;
5365 							source4 += pitch;
5366 							source5 += pitch;
5367 							source6 += pitch;
5368 							source7 += pitch;
5369 						}
5370 					}
5371 					else if(internal.depth == 16)
5372 					{
5373 						for(int y = 0; y < height; y++)
5374 						{
5375 							for(int x = 0; x < width; x++)
5376 							{
5377 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5378 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5379 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5380 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5381 								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5382 								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5383 								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5384 								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5385 								__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
5386 								__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
5387 								__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
5388 								__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
5389 								__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
5390 								__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
5391 								__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
5392 								__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
5393 
5394 								c0 = _mm_add_ps(c0, c1);
5395 								c2 = _mm_add_ps(c2, c3);
5396 								c4 = _mm_add_ps(c4, c5);
5397 								c6 = _mm_add_ps(c6, c7);
5398 								c8 = _mm_add_ps(c8, c9);
5399 								cA = _mm_add_ps(cA, cB);
5400 								cC = _mm_add_ps(cC, cD);
5401 								cE = _mm_add_ps(cE, cF);
5402 								c0 = _mm_add_ps(c0, c2);
5403 								c4 = _mm_add_ps(c4, c6);
5404 								c8 = _mm_add_ps(c8, cA);
5405 								cC = _mm_add_ps(cC, cE);
5406 								c0 = _mm_add_ps(c0, c4);
5407 								c8 = _mm_add_ps(c8, cC);
5408 								c0 = _mm_add_ps(c0, c8);
5409 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5410 
5411 								_mm_store_ps((float*)(source0 + 16 * x), c0);
5412 							}
5413 
5414 							source0 += pitch;
5415 							source1 += pitch;
5416 							source2 += pitch;
5417 							source3 += pitch;
5418 							source4 += pitch;
5419 							source5 += pitch;
5420 							source6 += pitch;
5421 							source7 += pitch;
5422 							source8 += pitch;
5423 							source9 += pitch;
5424 							sourceA += pitch;
5425 							sourceB += pitch;
5426 							sourceC += pitch;
5427 							sourceD += pitch;
5428 							sourceE += pitch;
5429 							sourceF += pitch;
5430 						}
5431 					}
5432 					else ASSERT(false);
5433 				}
5434 				else
5435 			#endif
5436 			{
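				// Scalar fallback: four float channels per pixel, hence 4 * width iterations per row.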
5437 				if(internal.depth == 2)
5438 				{
5439 					for(int y = 0; y < height; y++)
5440 					{
5441 						for(int x = 0; x < 4 * width; x++)
5442 						{
5443 							float c0 = *(float*)(source0 + 4 * x);
5444 							float c1 = *(float*)(source1 + 4 * x);
5445 
5446 							c0 = c0 + c1;
5447 							c0 *= 1.0f / 2.0f;
5448 
5449 							*(float*)(source0 + 4 * x) = c0;
5450 						}
5451 
5452 						source0 += pitch;
5453 						source1 += pitch;
5454 					}
5455 				}
5456 				else if(internal.depth == 4)
5457 				{
5458 					for(int y = 0; y < height; y++)
5459 					{
5460 						for(int x = 0; x < 4 * width; x++)
5461 						{
5462 							float c0 = *(float*)(source0 + 4 * x);
5463 							float c1 = *(float*)(source1 + 4 * x);
5464 							float c2 = *(float*)(source2 + 4 * x);
5465 							float c3 = *(float*)(source3 + 4 * x);
5466 
5467 							c0 = c0 + c1;
5468 							c2 = c2 + c3;
5469 							c0 = c0 + c2;
5470 							c0 *= 1.0f / 4.0f;
5471 
5472 							*(float*)(source0 + 4 * x) = c0;
5473 						}
5474 
5475 						source0 += pitch;
5476 						source1 += pitch;
5477 						source2 += pitch;
5478 						source3 += pitch;
5479 					}
5480 				}
5481 				else if(internal.depth == 8)
5482 				{
5483 					for(int y = 0; y < height; y++)
5484 					{
5485 						for(int x = 0; x < 4 * width; x++)
5486 						{
5487 							float c0 = *(float*)(source0 + 4 * x);
5488 							float c1 = *(float*)(source1 + 4 * x);
5489 							float c2 = *(float*)(source2 + 4 * x);
5490 							float c3 = *(float*)(source3 + 4 * x);
5491 							float c4 = *(float*)(source4 + 4 * x);
5492 							float c5 = *(float*)(source5 + 4 * x);
5493 							float c6 = *(float*)(source6 + 4 * x);
5494 							float c7 = *(float*)(source7 + 4 * x);
5495 
5496 							c0 = c0 + c1;
5497 							c2 = c2 + c3;
5498 							c4 = c4 + c5;
5499 							c6 = c6 + c7;
5500 							c0 = c0 + c2;
5501 							c4 = c4 + c6;
5502 							c0 = c0 + c4;
5503 							c0 *= 1.0f / 8.0f;
5504 
5505 							*(float*)(source0 + 4 * x) = c0;
5506 						}
5507 
5508 						source0 += pitch;
5509 						source1 += pitch;
5510 						source2 += pitch;
5511 						source3 += pitch;
5512 						source4 += pitch;
5513 						source5 += pitch;
5514 						source6 += pitch;
5515 						source7 += pitch;
5516 					}
5517 				}
5518 				else if(internal.depth == 16)
5519 				{
5520 					for(int y = 0; y < height; y++)
5521 					{
5522 						for(int x = 0; x < 4 * width; x++)
5523 						{
5524 							float c0 = *(float*)(source0 + 4 * x);
5525 							float c1 = *(float*)(source1 + 4 * x);
5526 							float c2 = *(float*)(source2 + 4 * x);
5527 							float c3 = *(float*)(source3 + 4 * x);
5528 							float c4 = *(float*)(source4 + 4 * x);
5529 							float c5 = *(float*)(source5 + 4 * x);
5530 							float c6 = *(float*)(source6 + 4 * x);
5531 							float c7 = *(float*)(source7 + 4 * x);
5532 							float c8 = *(float*)(source8 + 4 * x);
5533 							float c9 = *(float*)(source9 + 4 * x);
5534 							float cA = *(float*)(sourceA + 4 * x);
5535 							float cB = *(float*)(sourceB + 4 * x);
5536 							float cC = *(float*)(sourceC + 4 * x);
5537 							float cD = *(float*)(sourceD + 4 * x);
5538 							float cE = *(float*)(sourceE + 4 * x);
5539 							float cF = *(float*)(sourceF + 4 * x);
5540 
5541 							c0 = c0 + c1;
5542 							c2 = c2 + c3;
5543 							c4 = c4 + c5;
5544 							c6 = c6 + c7;
5545 							c8 = c8 + c9;
5546 							cA = cA + cB;
5547 							cC = cC + cD;
5548 							cE = cE + cF;
5549 							c0 = c0 + c2;
5550 							c4 = c4 + c6;
5551 							c8 = c8 + cA;
5552 							cC = cC + cE;
5553 							c0 = c0 + c4;
5554 							c8 = c8 + cC;
5555 							c0 = c0 + c8;
5556 							c0 *= 1.0f / 16.0f;
5557 
5558 							*(float*)(source0 + 4 * x) = c0;
5559 						}
5560 
5561 						source0 += pitch;
5562 						source1 += pitch;
5563 						source2 += pitch;
5564 						source3 += pitch;
5565 						source4 += pitch;
5566 						source5 += pitch;
5567 						source6 += pitch;
5568 						source7 += pitch;
5569 						source8 += pitch;
5570 						source9 += pitch;
5571 						sourceA += pitch;
5572 						sourceB += pitch;
5573 						sourceC += pitch;
5574 						sourceD += pitch;
5575 						sourceE += pitch;
5576 						sourceF += pitch;
5577 					}
5578 				}
5579 				else ASSERT(false);
5580 			}
5581 		}
5582 		else if(internal.format == FORMAT_R5G6B5)
5583 		{
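			// FORMAT_R5G6B5: 16-bit pixels with 5-bit red, 6-bit green and 5-bit blue fields.
			// The resolve averages the packed fields directly instead of unpacking each channel.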
5584 			#if defined(__i386__) || defined(__x86_64__)
5585 				if(CPUID::supportsSSE2() && (width % 8) == 0)
5586 				{
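					// The red and blue fields (mask 0xF81F) and the green field (mask 0x07E0) are isolated
					// and averaged separately with _mm_avg_epu8 / _mm_avg_epu16, eight pixels per iteration,
					// then re-masked and recombined. Note that chaining rounded averages only approximates
					// the true n-sample mean.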
5587 					if(internal.depth == 2)
5588 					{
5589 						for(int y = 0; y < height; y++)
5590 						{
5591 							for(int x = 0; x < width; x += 8)
5592 							{
5593 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5594 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5595 
5596 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5597 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5598 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5599 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5600 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5601 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5602 
5603 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5604 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5605 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5606 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5607 								c0 = _mm_or_si128(c0, c1);
5608 
5609 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5610 							}
5611 
5612 							source0 += pitch;
5613 							source1 += pitch;
5614 						}
5615 					}
5616 					else if(internal.depth == 4)
5617 					{
5618 						for(int y = 0; y < height; y++)
5619 						{
5620 							for(int x = 0; x < width; x += 8)
5621 							{
5622 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5623 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5624 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5625 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5626 
5627 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5628 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5629 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5630 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5631 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5632 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5633 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5634 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5635 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5636 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5637 
5638 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5639 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5640 								c0 = _mm_avg_epu8(c0, c2);
5641 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5642 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5643 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
5644 								c1 = _mm_avg_epu16(c1, c3);
5645 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5646 								c0 = _mm_or_si128(c0, c1);
5647 
5648 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5649 							}
5650 
5651 							source0 += pitch;
5652 							source1 += pitch;
5653 							source2 += pitch;
5654 							source3 += pitch;
5655 						}
5656 					}
5657 					else if(internal.depth == 8)
5658 					{
5659 						for(int y = 0; y < height; y++)
5660 						{
5661 							for(int x = 0; x < width; x += 8)
5662 							{
5663 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5664 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5665 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5666 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5667 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5668 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5669 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5670 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5671 
5672 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5673 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5674 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5675 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5676 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5677 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5678 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5679 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5680 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5681 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5682 								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5683 								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5684 								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5685 								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5686 								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5687 								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5688 								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5689 								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5690 
5691 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5692 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5693 								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5694 								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5695 								c0 = _mm_avg_epu8(c0, c2);
5696 								c4 = _mm_avg_epu8(c4, c6);
5697 								c0 = _mm_avg_epu8(c0, c4);
5698 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5699 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5700 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
5701 								c5 = _mm_avg_epu16(c4__g_, c5__g_);
5702 								c7 = _mm_avg_epu16(c6__g_, c7__g_);
5703 								c1 = _mm_avg_epu16(c1, c3);
5704 								c5 = _mm_avg_epu16(c5, c7);
5705 								c1 = _mm_avg_epu16(c1, c5);
5706 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5707 								c0 = _mm_or_si128(c0, c1);
5708 
5709 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5710 							}
5711 
5712 							source0 += pitch;
5713 							source1 += pitch;
5714 							source2 += pitch;
5715 							source3 += pitch;
5716 							source4 += pitch;
5717 							source5 += pitch;
5718 							source6 += pitch;
5719 							source7 += pitch;
5720 						}
5721 					}
5722 					else if(internal.depth == 16)
5723 					{
5724 						for(int y = 0; y < height; y++)
5725 						{
5726 							for(int x = 0; x < width; x += 8)
5727 							{
5728 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5729 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5730 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5731 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5732 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5733 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5734 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5735 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5736 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
5737 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
5738 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
5739 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
5740 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
5741 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
5742 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
5743 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
5744 
5745 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5746 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5747 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5748 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5749 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5750 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5751 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5752 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5753 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5754 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5755 								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5756 								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5757 								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5758 								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5759 								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5760 								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5761 								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5762 								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5763 								__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
5764 								__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
5765 								__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
5766 								__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
5767 								__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
5768 								__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
5769 								__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
5770 								__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
5771 								__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
5772 								__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
5773 								__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
5774 								__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
5775 								__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
5776 								__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
5777 								__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
5778 								__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
5779 
5780 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5781 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5782 								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5783 								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5784 								c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
5785 								cA = _mm_avg_epu8(cA_r_b, cB_r_b);
5786 								cC = _mm_avg_epu8(cC_r_b, cD_r_b);
5787 								cE = _mm_avg_epu8(cE_r_b, cF_r_b);
5788 								c0 = _mm_avg_epu8(c0, c2);
5789 								c4 = _mm_avg_epu8(c4, c6);
5790 								c8 = _mm_avg_epu8(c8, cA);
5791 								cC = _mm_avg_epu8(cC, cE);
5792 								c0 = _mm_avg_epu8(c0, c4);
5793 								c8 = _mm_avg_epu8(c8, cC);
5794 								c0 = _mm_avg_epu8(c0, c8);
5795 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5796 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5797 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
5798 								c5 = _mm_avg_epu16(c4__g_, c5__g_);
5799 								c7 = _mm_avg_epu16(c6__g_, c7__g_);
5800 								c9 = _mm_avg_epu16(c8__g_, c9__g_);
5801 								cB = _mm_avg_epu16(cA__g_, cB__g_);
5802 								cD = _mm_avg_epu16(cC__g_, cD__g_);
5803 								cF = _mm_avg_epu16(cE__g_, cF__g_);
5804 								c1 = _mm_avg_epu16(c1, c3);   // 16-bit averages for green, as in the 4x and 8x cases,
5805 								c5 = _mm_avg_epu16(c5, c7);   // since the 0x07E0 field straddles the byte boundary
5806 								c9 = _mm_avg_epu16(c9, cB);
5807 								cD = _mm_avg_epu16(cD, cF);
5808 								c1 = _mm_avg_epu16(c1, c5);
5809 								c9 = _mm_avg_epu16(c9, cD);
5810 								c1 = _mm_avg_epu16(c1, c9);
5811 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5812 								c0 = _mm_or_si128(c0, c1);
5813 
5814 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5815 							}
5816 
5817 							source0 += pitch;
5818 							source1 += pitch;
5819 							source2 += pitch;
5820 							source3 += pitch;
5821 							source4 += pitch;
5822 							source5 += pitch;
5823 							source6 += pitch;
5824 							source7 += pitch;
5825 							source8 += pitch;
5826 							source9 += pitch;
5827 							sourceA += pitch;
5828 							sourceB += pitch;
5829 							sourceC += pitch;
5830 							sourceD += pitch;
5831 							sourceE += pitch;
5832 							sourceF += pitch;
5833 						}
5834 					}
5835 					else ASSERT(false);
5836 				}
5837 				else
5838 			#endif
5839 			{
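				// AVERAGE(x, y) averages two packed R5G6B5 pixels per channel without unpacking:
				// (x & y) + (((x ^ y) >> 1) & 0x7BEF) is the carry-free halving average, with 0x7BEF
				// clearing the bits that would otherwise spill across field boundaries, and
				// ((x ^ y) & 0x0821) adds the per-field rounding bit so each channel rounds up,
				// matching the rounding of the SSE2 averages above.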
5840 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
5841 
5842 				if(internal.depth == 2)
5843 				{
5844 					for(int y = 0; y < height; y++)
5845 					{
5846 						for(int x = 0; x < width; x++)
5847 						{
5848 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5849 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5850 
5851 							c0 = AVERAGE(c0, c1);
5852 
5853 							*(unsigned short*)(source0 + 2 * x) = c0;
5854 						}
5855 
5856 						source0 += pitch;
5857 						source1 += pitch;
5858 					}
5859 				}
5860 				else if(internal.depth == 4)
5861 				{
5862 					for(int y = 0; y < height; y++)
5863 					{
5864 						for(int x = 0; x < width; x++)
5865 						{
5866 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5867 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5868 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5869 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5870 
5871 							c0 = AVERAGE(c0, c1);
5872 							c2 = AVERAGE(c2, c3);
5873 							c0 = AVERAGE(c0, c2);
5874 
5875 							*(unsigned short*)(source0 + 2 * x) = c0;
5876 						}
5877 
5878 						source0 += pitch;
5879 						source1 += pitch;
5880 						source2 += pitch;
5881 						source3 += pitch;
5882 					}
5883 				}
5884 				else if(internal.depth == 8)
5885 				{
5886 					for(int y = 0; y < height; y++)
5887 					{
5888 						for(int x = 0; x < width; x++)
5889 						{
5890 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5891 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5892 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5893 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5894 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
5895 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
5896 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
5897 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
5898 
5899 							c0 = AVERAGE(c0, c1);
5900 							c2 = AVERAGE(c2, c3);
5901 							c4 = AVERAGE(c4, c5);
5902 							c6 = AVERAGE(c6, c7);
5903 							c0 = AVERAGE(c0, c2);
5904 							c4 = AVERAGE(c4, c6);
5905 							c0 = AVERAGE(c0, c4);
5906 
5907 							*(unsigned short*)(source0 + 2 * x) = c0;
5908 						}
5909 
5910 						source0 += pitch;
5911 						source1 += pitch;
5912 						source2 += pitch;
5913 						source3 += pitch;
5914 						source4 += pitch;
5915 						source5 += pitch;
5916 						source6 += pitch;
5917 						source7 += pitch;
5918 					}
5919 				}
5920 				else if(internal.depth == 16)
5921 				{
5922 					for(int y = 0; y < height; y++)
5923 					{
5924 						for(int x = 0; x < width; x++)
5925 						{
5926 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5927 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5928 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5929 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5930 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
5931 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
5932 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
5933 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
5934 							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
5935 							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
5936 							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
5937 							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
5938 							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
5939 							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
5940 							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
5941 							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
5942 
5943 							c0 = AVERAGE(c0, c1);
5944 							c2 = AVERAGE(c2, c3);
5945 							c4 = AVERAGE(c4, c5);
5946 							c6 = AVERAGE(c6, c7);
5947 							c8 = AVERAGE(c8, c9);
5948 							cA = AVERAGE(cA, cB);
5949 							cC = AVERAGE(cC, cD);
5950 							cE = AVERAGE(cE, cF);
5951 							c0 = AVERAGE(c0, c2);
5952 							c4 = AVERAGE(c4, c6);
5953 							c8 = AVERAGE(c8, cA);
5954 							cC = AVERAGE(cC, cE);
5955 							c0 = AVERAGE(c0, c4);
5956 							c8 = AVERAGE(c8, cC);
5957 							c0 = AVERAGE(c0, c8);
5958 
5959 							*(unsigned short*)(source0 + 2 * x) = c0;
5960 						}
5961 
5962 						source0 += pitch;
5963 						source1 += pitch;
5964 						source2 += pitch;
5965 						source3 += pitch;
5966 						source4 += pitch;
5967 						source5 += pitch;
5968 						source6 += pitch;
5969 						source7 += pitch;
5970 						source8 += pitch;
5971 						source9 += pitch;
5972 						sourceA += pitch;
5973 						sourceB += pitch;
5974 						sourceC += pitch;
5975 						sourceD += pitch;
5976 						sourceE += pitch;
5977 						sourceF += pitch;
5978 					}
5979 				}
5980 				else ASSERT(false);
5981 
5982 				#undef AVERAGE
5983 			}
5984 		}
5985 		else
5986 		{
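			// No resolve is performed for other formats; the UNIMPLEMENTED check below is commented
			// out, so unsupported formats are silently left unresolved.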
5987 		//	UNIMPLEMENTED();
5988 		}
5989 	}
5990 }
5991