Lines Matching +full:- +full:x
22 // clang-format off
23 // CpuSwizzleBlt.c - Surface swizzling definitions and BLT functionality.
29 …NT_SUPPORT // Support for Partial Element Transfer (e.g. separating/merging depth-stencil).
40 /* Pixel-based surfaces commonly stored in memory row-by-row. This convention
41 has simple "y * Pitch + x" addressing but has spatial locality only in
42 horizontal direction--i.e. horizontal pixel neighbors stored next to each other
45 Since many graphics operations involve multi-dimensional data access, to
47 alternative storage conventions which have multi-dimensional spatial locality--
51 "Tiling/Swizzling" is storage convention that increases multi-dimensional
53 laid out in row-major order across surface, with entire content of each tile
60 /* Tile sizes always powers of 2 and chosen to be architecturally convenient--
62 chosen to produce square tiles for targeted pixel size--e.g. 4KB = 128 bytes x
63 32 rows = 32 x 32 pixels @ 4 bytes-per-pixel.
65 Since tile size and dimensions all powers of two, the spatial-to-linear mapping
67 mapped to linear offset bits--e.g. for a 4KB, 128x32 tile...each byte within
68 tile can be referenced with a 7-bit X index and 5-bit Y index--and each of
69 those 12 index bits can be individually mapped to a bit in the 12-bit offset of
77 \-- Y[4:0] --/ \----- X[6:0] -----/
79 ...stores bytes of tile in row-major order, with horizontal neighbors stored
81 bits were mapped to the low-order...
84 \----- X[6:0] -----/ \-- Y[4:0] --/
86 ...bytes of tile would be stored in column-major order, with vertical neighbors
89 Individual X and Y bits can be separated and interspersed in mapping to
90 increase locality via sub-tiling--e.g...
93 \-- Sub-Tile ---/
95 ...subdivides tile into 16x4 sub-tiles laid out in row-major order across tile,
96 with sub-tile content further stored in row-major order, with horizontal byte
97 neighbors within sub-tile stored contiguously and vertical neighbors only 16
98 bytes away. This means single 64-byte cache line contains 4x4 group of 32bpp
99 pixels--which is powerful spatial locality for graphics processing.
103 indexes need not be explicitly denoted--e.g. the previous sub-tiling mapping
106 Linear[11:0] = Y Y Y X X X Y Y X X X X
108 ...where X and Y index bits are implied to be zero-based-counted in order they
111 In software, spatial-to-linear mapping conveniently described with bit mask for
113 index is mapped to that position in the linear offset--e.g....
115 Linear[11:0] = Y Y Y X X X Y Y X X X X
131 int x, y, z; member
154 #define X ,'x' macro
165 …x' ? 0x8000 : 0) + (b14 == 'x' ? 0x4000 : 0) + (b13 == 'x' ? 0x2000 : 0) + (b12 == 'x' ? 0x1000 : …
172 SWIZZLE(( INTEL_TILE_X o o o o Y Y Y X X X X X X X X X ));
173 SWIZZLE(( INTEL_TILE_Y o o o o X X X Y Y Y Y Y X X X X ));
176 SWIZZLE(( INTEL_TILE_W o o o o X X X Y Y Y Y X Y X Y X ));
179 SWIZZLE(( INTEL_TILE_YF_128 o o o o X Y X Y X X Y Y X X X X ));
180 SWIZZLE(( INTEL_TILE_YF_64 o o o o X Y X Y X X Y Y X X X X ));
181 SWIZZLE(( INTEL_TILE_YF_32 o o o o X Y X Y X Y Y Y X X X X ));
182 SWIZZLE(( INTEL_TILE_YF_16 o o o o X Y X Y X Y Y Y X X X X ));
183 SWIZZLE(( INTEL_TILE_YF_8 o o o o X Y X Y Y Y Y Y X X X X ));
185 SWIZZLE(( INTEL_TILE_YS_128 X Y X Y X Y X Y X X Y Y X X X X ));
186 SWIZZLE(( INTEL_TILE_YS_64 X Y X Y X Y X Y X X Y Y X X X X ));
187 SWIZZLE(( INTEL_TILE_YS_32 X Y X Y X Y X Y X Y Y Y X X X X ));
188 SWIZZLE(( INTEL_TILE_YS_16 X Y X Y X Y X Y X Y Y Y X X X X ));
189 SWIZZLE(( INTEL_TILE_YS_8 X Y X Y X Y X Y Y Y Y Y X X X X ));
191 SWIZZLE(( INTEL_TILE_YF_MSAA2_128 o o o o S Y X Y X X Y Y X X X X ));
192 SWIZZLE(( INTEL_TILE_YF_MSAA2_64 o o o o S Y X Y X X Y Y X X X X ));
193 SWIZZLE(( INTEL_TILE_YF_MSAA2_32 o o o o S Y X Y X Y Y Y X X X X ));
194 SWIZZLE(( INTEL_TILE_YF_MSAA2_16 o o o o S Y X Y X Y Y Y X X X X ));
195 SWIZZLE(( INTEL_TILE_YF_MSAA2_8 o o o o S Y X Y Y Y Y Y X X X X ));
197 SWIZZLE(( INTEL_TILE_YS_MSAA2_128 S Y X Y X Y X Y X X Y Y X X X X ));
198 SWIZZLE(( INTEL_TILE_YS_MSAA2_64 S Y X Y X Y X Y X X Y Y X X X X ));
199 SWIZZLE(( INTEL_TILE_YS_MSAA2_32 S Y X Y X Y X Y X Y Y Y X X X X ));
200 SWIZZLE(( INTEL_TILE_YS_MSAA2_16 S Y X Y X Y X Y X Y Y Y X X X X ));
201 SWIZZLE(( INTEL_TILE_YS_MSAA2_8 S Y X Y X Y X Y Y Y Y Y X X X X ));
203 SWIZZLE(( INTEL_TILE_YF_MSAA4_128 o o o o S S X Y X X Y Y X X X X ));
204 SWIZZLE(( INTEL_TILE_YF_MSAA4_64 o o o o S S X Y X X Y Y X X X X ));
205 SWIZZLE(( INTEL_TILE_YF_MSAA4_32 o o o o S S X Y X Y Y Y X X X X ));
206 SWIZZLE(( INTEL_TILE_YF_MSAA4_16 o o o o S S X Y X Y Y Y X X X X ));
207 SWIZZLE(( INTEL_TILE_YF_MSAA4_8 o o o o S S X Y Y Y Y Y X X X X ));
209 SWIZZLE(( INTEL_TILE_YS_MSAA4_128 S S X Y X Y X Y X X Y Y X X X X ));
210 SWIZZLE(( INTEL_TILE_YS_MSAA4_64 S S X Y X Y X Y X X Y Y X X X X ));
211 SWIZZLE(( INTEL_TILE_YS_MSAA4_32 S S X Y X Y X Y X Y Y Y X X X X ));
212 SWIZZLE(( INTEL_TILE_YS_MSAA4_16 S S X Y X Y X Y X Y Y Y X X X X ));
213 SWIZZLE(( INTEL_TILE_YS_MSAA4_8 S S X Y X Y X Y Y Y Y Y X X X X ));
215 SWIZZLE(( INTEL_TILE_YF_MSAA8_128 o o o o S S S Y X X Y Y X X X X ));
216 SWIZZLE(( INTEL_TILE_YF_MSAA8_64 o o o o S S S Y X X Y Y X X X X ));
217 SWIZZLE(( INTEL_TILE_YF_MSAA8_32 o o o o S S S Y X Y Y Y X X X X ));
218 SWIZZLE(( INTEL_TILE_YF_MSAA8_16 o o o o S S S Y X Y Y Y X X X X ));
219 SWIZZLE(( INTEL_TILE_YF_MSAA8_8 o o o o S S S Y Y Y Y Y X X X X ));
221 SWIZZLE(( INTEL_TILE_YS_MSAA8_128 S S S Y X Y X Y X X Y Y X X X X ));
222 SWIZZLE(( INTEL_TILE_YS_MSAA8_64 S S S Y X Y X Y X X Y Y X X X X ));
223 SWIZZLE(( INTEL_TILE_YS_MSAA8_32 S S S Y X Y X Y X Y Y Y X X X X ));
224 SWIZZLE(( INTEL_TILE_YS_MSAA8_16 S S S Y X Y X Y X Y Y Y X X X X ));
225 SWIZZLE(( INTEL_TILE_YS_MSAA8_8 S S S Y X Y X Y Y Y Y Y X X X X ));
227 SWIZZLE(( INTEL_TILE_YF_MSAA16_128 o o o o S S S S X X Y Y X X X X ));
228 SWIZZLE(( INTEL_TILE_YF_MSAA16_64 o o o o S S S S X X Y Y X X X X ));
229 SWIZZLE(( INTEL_TILE_YF_MSAA16_32 o o o o S S S S X Y Y Y X X X X ));
230 SWIZZLE(( INTEL_TILE_YF_MSAA16_16 o o o o S S S S X Y Y Y X X X X ));
231 SWIZZLE(( INTEL_TILE_YF_MSAA16_8 o o o o S S S S Y Y Y Y X X X X ));
233 SWIZZLE(( INTEL_TILE_YS_MSAA16_128 S S S S X Y X Y X X Y Y X X X X ));
234 SWIZZLE(( INTEL_TILE_YS_MSAA16_64 S S S S X Y X Y X X Y Y X X X X ));
235 SWIZZLE(( INTEL_TILE_YS_MSAA16_32 S S S S X Y X Y X Y Y Y X X X X ));
236 SWIZZLE(( INTEL_TILE_YS_MSAA16_16 S S S S X Y X Y X Y Y Y X X X X ));
237 SWIZZLE(( INTEL_TILE_YS_MSAA16_8 S S S S X Y X Y Y Y Y Y X X X X ));
239 SWIZZLE(( INTEL_TILE_YF_3D_128 o o o o Y Z X X Z Z Y Y X X X X ));
240 SWIZZLE(( INTEL_TILE_YF_3D_64 o o o o Y Z X X Z Z Y Y X X X X ));
241 SWIZZLE(( INTEL_TILE_YF_3D_32 o o o o Y Z X Y Z Z Y Y X X X X ));
242 SWIZZLE(( INTEL_TILE_YF_3D_16 o o o o Y Z Y Z Z Z Y Y X X X X ));
243 SWIZZLE(( INTEL_TILE_YF_3D_8 o o o o Y Z Y Z Z Z Y Y X X X X ));
245 SWIZZLE(( INTEL_TILE_YS_3D_128 X Y Z X Y Z X X Z Z Y Y X X X X ));
246 SWIZZLE(( INTEL_TILE_YS_3D_64 X Y Z X Y Z X X Z Z Y Y X X X X ));
247 SWIZZLE(( INTEL_TILE_YS_3D_32 X Y Z X Y Z X Y Z Z Y Y X X X X ));
248 SWIZZLE(( INTEL_TILE_YS_3D_16 X Y Z X Y Z Y Z Z Z Y Y X X X X ));
249 SWIZZLE(( INTEL_TILE_YS_3D_8 X Y Z X Y Z Y Z Z Z Y Y X X X X ));
252 SWIZZLE(( INTEL_TILE_4 o o o o Y Y X Y X X Y Y X X X X ));
254 SWIZZLE(( INTEL_TILE_64_128 Y X X X Y Y X Y X X Y Y X X X X ));
255 SWIZZLE(( INTEL_TILE_64_64 Y X X X Y Y X Y X X Y Y X X X X ));
256 SWIZZLE(( INTEL_TILE_64_32 Y Y X X Y Y X Y X X Y Y X X X X ));
257 SWIZZLE(( INTEL_TILE_64_16 Y Y X X Y Y X Y X X Y Y X X X X ));
258 SWIZZLE(( INTEL_TILE_64_8 Y Y Y X Y Y X Y X X Y Y X X X X ));
260 SWIZZLE(( INTEL_TILE_64_MSAA2_128 Y X X X Y Y X Y S X Y Y X X X X ));
261 SWIZZLE(( INTEL_TILE_64_MSAA2_64 Y X X X Y Y X Y S X Y Y X X X X ));
262 SWIZZLE(( INTEL_TILE_64_MSAA2_32 Y Y X X Y Y X Y S X Y Y X X X X ));
263 SWIZZLE(( INTEL_TILE_64_MSAA2_16 Y Y X X Y Y X Y S X Y Y X X X X ));
264 SWIZZLE(( INTEL_TILE_64_MSAA2_8 Y Y Y X Y Y X Y S X Y Y X X X X ));
266 SWIZZLE(( INTEL_TILE_64_MSAA_128 Y X X X Y Y X S S X Y Y X X X X ));
267 SWIZZLE(( INTEL_TILE_64_MSAA_64 Y X X X Y Y X S S X Y Y X X X X ));
268 SWIZZLE(( INTEL_TILE_64_MSAA_32 Y Y X X Y Y X S S X Y Y X X X X ));
269 SWIZZLE(( INTEL_TILE_64_MSAA_16 Y Y X X Y Y X S S X Y Y X X X X ));
270 SWIZZLE(( INTEL_TILE_64_MSAA_8 Y Y Y X Y Y X S S X Y Y X X X X ));
272 SWIZZLE(( INTEL_TILE_64_3D_128 Z Z Y X X X Z Y Z X Y Y X X X X ));
273 SWIZZLE(( INTEL_TILE_64_3D_64 Z Z Y X X X Z Y Z X Y Y X X X X ));
274 SWIZZLE(( INTEL_TILE_64_3D_32 Z Z Y X Y X Z Y Z X Y Y X X X X ));
275 SWIZZLE(( INTEL_TILE_64_3D_16 Z Z Z Y Y X Z Y Z X Y Y X X X X ));
276 SWIZZLE(( INTEL_TILE_64_3D_8 Z Z Z X Y Y Z Y Z X Y Y X X X X ));
280 SWIZZLE(( INTEL_TILE_64_V2_MSAA2_128 Y X X X Y Y X S X X Y Y X X X X ));
281 SWIZZLE(( INTEL_TILE_64_V2_MSAA2_64 Y Y X X Y Y X S X X Y Y X X X X ));
282 SWIZZLE(( INTEL_TILE_64_V2_MSAA2_32 Y Y Y X Y Y X S X X Y Y X X X X ));
283 SWIZZLE(( INTEL_TILE_64_V2_MSAA2_16 Y Y Y X Y Y X S X X Y Y X X X X ));
284 SWIZZLE(( INTEL_TILE_64_V2_MSAA2_8 Y Y Y Y Y Y X S X X Y Y X X X X ));
286 SWIZZLE(( INTEL_TILE_64_V2_MSAA4_128 Y X X X Y Y S S X X Y Y X X X X ));
287 SWIZZLE(( INTEL_TILE_64_V2_MSAA4_64 Y X X X Y Y S S X X Y Y X X X X ));
288 SWIZZLE(( INTEL_TILE_64_V2_MSAA4_32 Y Y X X Y Y S S X X Y Y X X X X ));
289 SWIZZLE(( INTEL_TILE_64_V2_MSAA4_16 Y Y X X Y Y S S X X Y Y X X X X ));
290 SWIZZLE(( INTEL_TILE_64_V2_MSAA4_8 Y Y Y X Y Y S S X X Y Y X X X X ));
292 SWIZZLE(( INTEL_TILE_64_V2_MSAA8_128 Y Y X X Y X S S S X Y Y X X X X ));
293 SWIZZLE(( INTEL_TILE_64_V2_MSAA8_64 Y Y X X Y X S S S X Y Y X X X X ));
294 SWIZZLE(( INTEL_TILE_64_V2_MSAA8_32 Y Y X X Y X S S S X Y Y X X X X ));
295 SWIZZLE(( INTEL_TILE_64_V2_MSAA8_16 Y Y Y X Y X S S S X Y Y X X X X ));
296 SWIZZLE(( INTEL_TILE_64_V2_MSAA8_8 Y Y Y X Y X S S S X Y Y X X X X ));
298 SWIZZLE(( INTEL_TILE_64_V2_MSAA16_128 Y X X X Y X S S S S Y Y X X X X ));
299 SWIZZLE(( INTEL_TILE_64_V2_MSAA16_64 Y Y X X Y X S S S S Y Y X X X X ));
300 SWIZZLE(( INTEL_TILE_64_V2_MSAA16_32 Y Y X X Y X S S S S Y Y X X X X ));
301 SWIZZLE(( INTEL_TILE_64_V2_MSAA16_16 Y Y X X Y X S S S S Y Y X X X X ));
302 SWIZZLE(( INTEL_TILE_64_V2_MSAA16_8 Y Y Y X Y X S S S S Y Y X X X X ));
304 SWIZZLE(( INTEL_TILE_64_V2_3D_128 Z Z Y X X Y Z Z X X Y Y X X X X ));
305 SWIZZLE(( INTEL_TILE_64_V2_3D_64 Z Z Y X X Y Z Z X X Y Y X X X X ));
306 SWIZZLE(( INTEL_TILE_64_V2_3D_32 Z Z Y X Y Y Z Z X X Y Y X X X X ));
307 SWIZZLE(( INTEL_TILE_64_V2_3D_16 Z Z Z Y Y Y Z Z X X Y Y X X X X ));
308 SWIZZLE(( INTEL_TILE_64_V2_3D_8 Z Z Z Y Y Y Z Z X X Y Y X X X X ));
311 #undef X
322 formats, logically accessing such surfaces with CPU-based software is non-
326 dimensionally-specified surface byte, and (2) CpuSwizzleBlt function to BLT
327 between linear ("y * pitch + x") and swizzled surfaces--with goal of providing
328 high-performance, swizzling BLT implementation to be used both in production
336 int Pitch, Height; // Row-pitch in bytes, and height, of surface.
345 …int Pitch, Size; // Zero if full-pixel BLT, or pitch and size, in bytes, of pi…
372 #include "assert.h" // Quoted to allow local-directory override.
388 #define POPCNT4(x) (PopCnt4[(x) & 0xf]) argument
389 #define POPCNT16(x) (POPCNT4((x) >> 12) + POPCNT4((x) >> 8) + POPCNT4((x) >> 4) + POPCNT4(x)) argument
394 /* Return swizzled offset of dimensionally-specified surface byte. */ in SwizzleOffset()
397 int Pitch, // Applicable surface row-pitch, in bytes. in SwizzleOffset()
402 /* Given logically-specified (x, y, z) byte within swizzled surface, in SwizzleOffset()
403 function returns byte's linear/memory offset from surface's base--i.e. it in SwizzleOffset()
404 performs the swizzled, spatial-to-linear mapping. in SwizzleOffset()
410 (probably single-dimension, intra-tile offsets) and uses a fast computation in SwizzleOffset()
411 (e.g. LUT's, hard-codings, PDEP). */ in SwizzleOffset()
415 char PDepSupported = -1; // AVX2/BMI2 PDEP (Parallel Deposit) Instruction in SwizzleOffset()
419 int TileWidthBits = POPCNT16(pSwizzle->Mask.x); // Log2(Tile Width in Bytes) in SwizzleOffset()
420 int TileHeightBits = POPCNT16(pSwizzle->Mask.y); // Log2(Tile Height) in SwizzleOffset()
421 int TileDepthBits = POPCNT16(pSwizzle->Mask.z); // Log2(Tile Depth or MSAA Samples) in SwizzleOffset()
426 int x, y, z; // Position of specified byte within tile that contains it. in SwizzleOffset() local
428 if(PDepSupported == -1) in SwizzleOffset()
447 (pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z) == in SwizzleOffset()
448 (pSwizzle->Mask.x + pSwizzle->Mask.y + pSwizzle->Mask.z)); in SwizzleOffset()
450 assert( // Swizzle Limited to 16-bit (else expand POPCNT'ing)... in SwizzleOffset()
451 (pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z) < (1 << 16)); in SwizzleOffset()
456 { // Break Positioning into Tile-Granular and Intra-Tile Components... in SwizzleOffset()
457 … TileDepthBits) == 0); // When dealing with 3D tiling, treat as separate single-tile-deep planes. in SwizzleOffset()
458 z = OffsetZ & ((1 << TileDepthBits) - 1); in SwizzleOffset()
461 y = OffsetY & ((1 << TileHeightBits) - 1); in SwizzleOffset()
464 x = OffsetX & ((1 << TileWidthBits) - 1); in SwizzleOffset()
468 … (Row * TilesPerRow + Col) << TileSizeBits; // <-- Tiles laid across surface in row-major order. in SwizzleOffset()
474 PDEP(x, pSwizzle->Mask.x) + in SwizzleOffset()
475 PDEP(y, pSwizzle->Mask.y) + in SwizzleOffset()
476 PDEP(z, pSwizzle->Mask.z); in SwizzleOffset()
481 int terminationMask = pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z; in SwizzleOffset()
486 MaskQ = bitMask & pSwizzle->Mask.Q; \ in SwizzleOffset()
490 PROCESS(x); in SwizzleOffset()
517 CopyWidthBytes in terms of unswizzled surface's element-pitches: in CpuSwizzleBlt()
529 assert((pDest->pSwizzle != NULL) ^ (pSrc->pSwizzle != NULL)); in CpuSwizzleBlt()
531 LinearToSwizzled = !pSrc->pSwizzle; in CpuSwizzleBlt()
537 else // Swizzled-to-Linear... in CpuSwizzleBlt()
547 (pDest->Element.Pitch != 0) == (pSrc->Element.Pitch != 0)); in CpuSwizzleBlt()
550 pDest->Element.Size == pSrc->Element.Size); in CpuSwizzleBlt()
553 !(pDest->Element.Pitch && !pDest->Element.Size)); in CpuSwizzleBlt()
556 (pDest->Element.Size <= pDest->Element.Pitch) && in CpuSwizzleBlt()
557 (pSrc->Element.Size <= pSrc->Element.Pitch)); in CpuSwizzleBlt()
559 assert( // Sub-element CopyWidthBytes in terms of LinearSurface pitch... in CpuSwizzleBlt()
560 (pLinearSurface->Element.Pitch == 0) || in CpuSwizzleBlt()
561 ((CopyWidthBytes % pLinearSurface->Element.Pitch) == 0)); in CpuSwizzleBlt()
569 // Sub-element transfer... in CpuSwizzleBlt()
570 ((pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) || in CpuSwizzleBlt()
571 (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch)) && in CpuSwizzleBlt()
573 ((pLinearSurface->OffsetX + CopyWidthBytes) <= in CpuSwizzleBlt()
574 (pLinearSurface->Pitch + in CpuSwizzleBlt()
576 (pLinearSurface->Element.Pitch - pLinearSurface->Element.Size))) && in CpuSwizzleBlt()
577 ((pLinearSurface->OffsetY + CopyHeight) <= pLinearSurface->Height) && in CpuSwizzleBlt()
578 ((pSwizzledSurface->OffsetX + in CpuSwizzleBlt()
580 … (CopyWidthBytes / pLinearSurface->Element.Pitch * pSwizzledSurface->Element.Pitch) in CpuSwizzleBlt()
582 (pSwizzledSurface->Pitch + in CpuSwizzleBlt()
584 (pSwizzledSurface->Element.Pitch - pSwizzledSurface->Element.Size))) && in CpuSwizzleBlt()
585 ((pSwizzledSurface->OffsetY + CopyHeight) <= pSwizzledSurface->Height) in CpuSwizzleBlt()
589 ((pDest->OffsetX + CopyWidthBytes) <= pDest->Pitch) && in CpuSwizzleBlt()
590 ((pDest->OffsetY + CopyHeight) <= pDest->Height) && in CpuSwizzleBlt()
591 ((pSrc->OffsetX + CopyWidthBytes) <= pSrc->Pitch) && in CpuSwizzleBlt()
592 ((pSrc->OffsetY + CopyHeight) <= pSrc->Height); in CpuSwizzleBlt()
598 char *pDest0 = (char *) pDest->pBase; in CpuSwizzleBlt()
599 char *pDest1 = (char *) pDest->pBase + pDest->Pitch * CopyHeight; in CpuSwizzleBlt()
600 char *pSrc0 = (char *) pSrc->pBase; in CpuSwizzleBlt()
601 char *pSrc1 = (char *) pSrc->pBase + pSrc->Pitch * CopyHeight; in CpuSwizzleBlt()
617 int x0 = pSwizzledSurface->OffsetX; in CpuSwizzleBlt()
619 int y0 = pSwizzledSurface->OffsetY; in CpuSwizzleBlt()
621 int x, y; in CpuSwizzleBlt() local
625 (char *) pLinearSurface->pBase + in CpuSwizzleBlt()
626 pLinearSurface->OffsetY * pLinearSurface->Pitch + in CpuSwizzleBlt()
627 pLinearSurface->OffsetX; in CpuSwizzleBlt()
632 assert( // No Sub-Element Transfer... in CpuSwizzleBlt()
633 (pLinearSurface->Element.Size == pLinearSurface->Element.Pitch) && in CpuSwizzleBlt()
634 (pSwizzledSurface->Element.Size == pSwizzledSurface->Element.Pitch)); in CpuSwizzleBlt()
639 for(x = x0; x < x1; x++) in CpuSwizzleBlt()
642 (char *) pSwizzledSurface->pBase + in CpuSwizzleBlt()
644 pSwizzledSurface->pSwizzle, in CpuSwizzleBlt()
645 pSwizzledSurface->Pitch, in CpuSwizzleBlt()
646 x, y, pSwizzledSurface->OffsetZ); in CpuSwizzleBlt()
660 pLinearAddress += pLinearSurface->Pitch - CopyWidthBytes; in CpuSwizzleBlt()
676 #define LOW_BIT(x) (_BitScanForward(&LOW_BIT_Index, (x)), LOW_BIT_Index) in CpuSwizzleBlt() argument
679 #define HIGH_BIT(x) (_BitScanReverse(&HIGH_BIT_Index, (x)), HIGH_BIT_Index) in CpuSwizzleBlt() argument
683 #define LOW_BIT(x) __builtin_ctz(x) in CpuSwizzleBlt()
684 #define HIGH_BIT(x) ((sizeof(x) * CHAR_BIT - 1) - __builtin_clz(x)) in CpuSwizzleBlt()
692 } __m24; // 24-bit/3-byte memory element. in CpuSwizzleBlt()
716 …#define MIN_CONTAINED_POW2_BELOW_CAP(x, Cap) (1 << LOW_BIT((1 << LOW_BIT(x)) | (1 << HIGH_BIT(Cap)… in CpuSwizzleBlt() argument
719 … SwizzleOffset(pSwizzledSurface->pSwizzle, pSwizzledSurface->Pitch, OffsetX, OffsetY, OffsetZ) in CpuSwizzleBlt()
724 char StreamingLoadSupported = -1; // SSE4.1: MOVNTDQA in CpuSwizzleBlt()
726 … int TileWidthBits = POPCNT16(pSwizzledSurface->pSwizzle->Mask.x); // Log2(Tile Width in Bytes) in CpuSwizzleBlt()
727 int TileHeightBits = POPCNT16(pSwizzledSurface->pSwizzle->Mask.y); // Log2(Tile Height) in CpuSwizzleBlt()
728 …int TileDepthBits = POPCNT16(pSwizzledSurface->pSwizzle->Mask.z); // Log2(Tile Depth or MSAA Sam… in CpuSwizzleBlt()
729 int BytesPerRowOfTiles = pSwizzledSurface->Pitch << (TileDepthBits + TileHeightBits); in CpuSwizzleBlt()
737 (char *) pSwizzledSurface->pBase + in CpuSwizzleBlt()
738 SWIZZLE_OFFSET(0, 0, pSwizzledSurface->OffsetZ); in CpuSwizzleBlt()
742 if(StreamingLoadSupported == -1) in CpuSwizzleBlt()
767 drastically different memory orderings--Moving linearly through in CpuSwizzleBlt()
771 convenient--especially when BLT rectangles not constrained to in CpuSwizzleBlt()
773 memory is often more performance-friendly--especially when that in CpuSwizzleBlt()
774 memory is CPU-mapped as WC (Write Combining), which is often in CpuSwizzleBlt()
783 swizzled memory are aligned, cache-line-sized memory chunks. If in CpuSwizzleBlt()
785 of partial WC buffer use (whether from WC memory use or non- in CpuSwizzleBlt()
788 The size of 2D chunks with cache-line-sized linearity in in CpuSwizzleBlt()
789 swizzled memory is determined by swizzle mapping's low-order in CpuSwizzleBlt()
790 six bits (for 64-byte cache lines). Most swizzles use in CpuSwizzleBlt()
791 "Y Y X X X X" in their low-order bits, which means their cache in CpuSwizzleBlt()
792 lines store 16x4 chunks--So our implementation will use those in CpuSwizzleBlt()
799 chunks stored in row-major order--i.e. those whose swizzle in CpuSwizzleBlt()
800 mapping bits have a series of X's in the low-order, followed by in CpuSwizzleBlt()
801 Y's in the higher-order. Where a swizzle mapping inflection in CpuSwizzleBlt()
802 from Y back to X occurs, contiguous row-ordering is lost, and in CpuSwizzleBlt()
803 we would use that smaller, row-ordered chunk size. */ in CpuSwizzleBlt()
807 // Narrow optimized transfer Width by looking for inflection from X's... in CpuSwizzleBlt()
809 while( (TargetMask = SwizzleMaxXfer.Width - 1) && in CpuSwizzleBlt()
810 ((pSwizzledSurface->pSwizzle->Mask.x & TargetMask) != TargetMask)) in CpuSwizzleBlt()
818 while( (TargetMask = (SwizzleMaxXfer.Height - 1) * SwizzleMaxXfer.Width) && in CpuSwizzleBlt()
819 ((pSwizzledSurface->pSwizzle->Mask.y & TargetMask) != TargetMask)) in CpuSwizzleBlt()
828 CopyWidth.LeftCrust = // i.e. "bytes to xfer-aligned boundary" in CpuSwizzleBlt()
829 …(MaxXferWidth - x0) & (MaxXferWidth - 1); // Simplification of ((MaxXferWidth - (x0 % MaxXferWidth… in CpuSwizzleBlt()
832 …(CopyWidthBytes - CopyWidth.LeftCrust) & ~(SwizzleMaxXfer.Width - 1); // MainRun is of SwizzleMaxX… in CpuSwizzleBlt()
834 CopyWidth.RightCrust = CopyWidthBytes - (CopyWidth.LeftCrust + CopyWidth.MainRun); in CpuSwizzleBlt()
838 … // For partial-pixel transfers, there is no crust and MainRun is done pixel-by-pixel... in CpuSwizzleBlt()
839 if( (pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) || in CpuSwizzleBlt()
840 (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch)) in CpuSwizzleBlt()
855 dimensional components--e.g. so Y-swizzling doesn't have to be in CpuSwizzleBlt()
856 recomputed in X-loop. in CpuSwizzleBlt()
861 keep them swizzled using swizzled incrementing inside the loops-- in CpuSwizzleBlt()
865 Intra-tile swizzled incrementing can be done by using the inverse in CpuSwizzleBlt()
866 of a spatial component's swizzle mask to ripple-carry a +1 to and in CpuSwizzleBlt()
867 across the bits of a currently swizzled value--e.g. with... in CpuSwizzleBlt()
869 SwizzledOffsetY: Y X Y X Y Y X X X X in CpuSwizzleBlt()
872 ----------------------- in CpuSwizzleBlt()
874 ...set low-order ~MaskY bits will always ripple-carry the in CpuSwizzleBlt()
885 mask--So the final intra-tile swizzled increment is... in CpuSwizzleBlt()
888 ...where Q is the applicable X/Y/Z dimensional component. in CpuSwizzleBlt()
890 Or since in two's complement, (~MaskQ + 1) = -MaskQ... in CpuSwizzleBlt()
892 SwizzledOffsetQ = (SwizzledOffsetQ - MaskQ) & MaskQ in CpuSwizzleBlt()
894 Since tile sizes are powers of two and tiles laid out in row-major in CpuSwizzleBlt()
896 additionally be used for inter-tile incrementing of X component by in CpuSwizzleBlt()
897 extending applicable mask to include offset bits beyond the tile-- in CpuSwizzleBlt()
898 so arithmetic carries out of intra-tile X component will ripple to in CpuSwizzleBlt()
899 advance swizzled inter-tile X offset to next tile. Same is not true in CpuSwizzleBlt()
900 of inter-tile Y incrementing since surface pitches not restricted in CpuSwizzleBlt()
904 … int ExtendedMaskX = // Bits beyond the tile (so X incrementing can operate inter-tile)... in CpuSwizzleBlt()
905 ~(pSwizzledSurface->pSwizzle->Mask.x | in CpuSwizzleBlt()
906 pSwizzledSurface->pSwizzle->Mask.y | in CpuSwizzleBlt()
907 pSwizzledSurface->pSwizzle->Mask.z); in CpuSwizzleBlt()
913 for(x = SwizzleMaxXfer.Width; x >= 1; x >>= 1) in CpuSwizzleBlt()
915 MaskX[x] = SWIZZLE_OFFSET((1 << TileWidthBits) - x, 0, 0) | ExtendedMaskX; in CpuSwizzleBlt()
920 MaskY[y] = SWIZZLE_OFFSET(0, (1 << TileHeightBits) - y, 0); in CpuSwizzleBlt()
925 int IntraTileY = y0 & ((1 << TileHeightBits) - 1); in CpuSwizzleBlt()
926 int TileAlignedY = y0 - IntraTileY; in CpuSwizzleBlt()
933 … TileAlignedY, // <-- Since SwizzledOffsetX will include "bits beyond the tile". in CpuSwizzleBlt()
939 /* Traverse BLT rectangle, transferring small, optimally-aligned 2D in CpuSwizzleBlt()
948 MIN_CONTAINED_POW2_BELOW_CAP(y | SwizzleMaxXfer.Height, y1 - y); in CpuSwizzleBlt()
957 /* We'll define "XFER" macro to contain BLT X-loop work. in CpuSwizzleBlt()
964 - Transfer Direction (Linear <--> Swizzled) in CpuSwizzleBlt()
965 - Optimal 2D Transfer Chunk Size in CpuSwizzleBlt()
966 - Available/Desired CPU Transfer Instructions in CpuSwizzleBlt()
967 - Unaligned Crust in CpuSwizzleBlt()
969 Don't want X-loop to have conditional logic to handle in CpuSwizzleBlt()
970 variations since would retard performance--but neither do we in CpuSwizzleBlt()
971 want messy multitude of slightly different, copy-pasted code in CpuSwizzleBlt()
973 allowing instantiation of multiple X-loop variations--i.e. XFER in CpuSwizzleBlt()
974 calls from conditional Y-loop code will expand into separate, in CpuSwizzleBlt()
975 conditional-free, "lean and mean" X-loops. in CpuSwizzleBlt()
977 Some conditional logic remains in XFER chain--but only outside in CpuSwizzleBlt()
978 X-loop. The two IF statements that remain in X-loop (i.e. those in CpuSwizzleBlt()
979 in XFER_LOAD/STORE) expand to compile-time constant conditional in CpuSwizzleBlt()
980 expressions, so with optimizing compiler, no runtime- in CpuSwizzleBlt()
981 conditional code will be generated--i.e. constant conditionals in CpuSwizzleBlt()
1031 SwizzledOffsetX = (SwizzledOffsetX - _MaskX) & _MaskX; \ in CpuSwizzleBlt()
1058 ((intptr_t) pSwizzledSurface->pBase % 16 == 0) && in CpuSwizzleBlt()
1059 (pSwizzledSurface->Pitch % 16 == 0)); in CpuSwizzleBlt()
1062 if( (pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) || in CpuSwizzleBlt()
1063 (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch)) in CpuSwizzleBlt()
1067 switch(pLinearSurface->Element.Size) in CpuSwizzleBlt()
1069 … pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.… in CpuSwizzleBlt()
1070 … pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.… in CpuSwizzleBlt()
1071 … pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.… in CpuSwizzleBlt()
1072 … pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.… in CpuSwizzleBlt()
1073 … pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.… in CpuSwizzleBlt()
1074 … pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.… in CpuSwizzleBlt()
1080 switch(pLinearSurface->Element.Size) in CpuSwizzleBlt()
1086 …OVDQU_M, MOVNTDQA_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddres… in CpuSwizzleBlt()
1090 …OVDQU_M, MOVDQ_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddres… in CpuSwizzleBlt()
1094 … MOVQ_M, MOVQ_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddres… in CpuSwizzleBlt()
1095 … MOVD_M, MOVD_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddres… in CpuSwizzleBlt()
1096 … MOV3_M, MOV3_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddres… in CpuSwizzleBlt()
1097 … MOVW_M, MOVW_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddres… in CpuSwizzleBlt()
1098 … MOVB_M, MOVB_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddres… in CpuSwizzleBlt()
1108 …OVNTDQ_M, MOVDQU_R, 16, 16, pSwizzledAddress, 16, pLinearAddress, pLinearSurface->Pitch, 1); break; in CpuSwizzleBlt()
1110 …ER(MOVW_M, MOVW_R, 2, 2, pSwizzledAddress, 2, pLinearAddress, pLinearSurface->Pitch, 1); break; in CpuSwizzleBlt()
1123 …XFER(MOVDQU_M, MOVNTDQA_R, 16, 16, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, 16, 1); in CpuSwizzleBlt()
1127 …XFER(MOVDQU_M, MOVDQ_R, 16, 16, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, 16, 1); in CpuSwizzleBlt()
1132 …case 2: XFER(MOVW_M, MOVW_R, 2, 2, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, 2… in CpuSwizzleBlt()
1140 SwizzledOffsetY = (SwizzledOffsetY - MaskY[xferHeight]) & MaskY[xferHeight]; in CpuSwizzleBlt()
1145 /* X-loop only advanced pLinearAddress by CopyWidthBytes--even in CpuSwizzleBlt()
1147 pLinearAddress += xferHeight * pLinearSurface->Pitch - CopyWidthBytes; in CpuSwizzleBlt()
1151 _mm_sfence(); // Flush Non-Temporal Writes in CpuSwizzleBlt()
1162 // clang-format on