/*
************************************************************************************************************************
*
*  Copyright (C) 2024 Advanced Micro Devices, Inc.  All rights reserved.
*
***********************************************************************************************************************/

/**
****************************************************************************************************
* @file  addrswizzler.cpp
* @brief Contains code for efficient CPU swizzling.
****************************************************************************************************
*/

#include "addrswizzler.h"

namespace Addr
{

/**
****************************************************************************************************
*   LutAddresser::LutAddresser
*
*   @brief
*       Constructor for the LutAddresser class.
****************************************************************************************************
*/
LutAddresser::LutAddresser()
    :
    m_pXLut(&m_lutData[0]),
    m_pYLut(&m_lutData[0]),
    m_pZLut(&m_lutData[0]),
    m_pSLut(&m_lutData[0]),
    m_xLutMask(0),
    m_yLutMask(0),
    m_zLutMask(0),
    m_sLutMask(0),
    m_blockBits(0),
    m_blockSize(),
    m_bpeLog2(0),
    m_bit(),
    m_lutData()
{
}

/**
****************************************************************************************************
*   LutAddresser::Init
*
*   @brief
*       Initializes the addresser from a swizzle equation and block dimensions, then derives the
*       swizzle properties and builds the per-channel lookup tables.
****************************************************************************************************
*/
void LutAddresser::Init(
    const ADDR_BIT_SETTING* pEq,
    UINT_32                 eqSize,
    ADDR_EXTENT3D           blockSize,
    UINT_8                  blockBits)
{
    ADDR_ASSERT(eqSize <= ADDR_MAX_EQUATION_BIT);
    memcpy(&m_bit[0], pEq, sizeof(ADDR_BIT_SETTING) * eqSize);
    m_blockSize = blockSize;
    m_blockBits = blockBits;

    InitSwizzleProps();
    InitLuts();
}

/**
****************************************************************************************************
*   LutAddresser::InitSwizzleProps
*
*   @brief
*       Calculates general properties about the swizzle
****************************************************************************************************
*/
void LutAddresser::InitSwizzleProps()
{
    // Calculate BPE from the swizzle. This can be derived from the number of invalid low bits.
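    // (e.g. a swizzle for a 4 bytes-per-element format leaves its two lowest equation bits unset, since
    // those bits only select bytes within an element, so the loop below yields m_bpeLog2 == 2.)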
    m_bpeLog2 = 0;
    for (UINT_32 i = 0; i < MaxElementBytesLog2; i++)
    {
        if (m_bit[i].value != 0)
        {
            break;
        }
        m_bpeLog2++;
    }

    // Generate a mask/size for each channel's LUT. This may be larger than the block size.
    // If a given 'source' bit (e.g. 'x0') is used for any part of the equation, fill that in the mask.
    for (UINT_32 i = 0; i < ADDR_MAX_EQUATION_BIT; i++)
    {
        m_xLutMask |= m_bit[i].x;
        m_yLutMask |= m_bit[i].y;
        m_zLutMask |= m_bit[i].z;
        m_sLutMask |= m_bit[i].s;
    }

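    // expandX is the number of horizontally-adjacent elements whose bytes land at consecutive addresses;
    // Copy2DSliceUnaligned uses it to memcpy that many pixels at a time instead of copying one by one.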
    // An expandX of 1 is a no-op
    m_maxExpandX = 1;
    if (m_sLutMask == 0)
    {
        // Calculate expandX from the swizzle. This can be derived from the number of consecutive,
        // increasing low x bits
        for (UINT_32 i = 0; i < 3; i++)
        {
            const auto& curBit = m_bit[m_bpeLog2 + i];
            ADDR_ASSERT(curBit.value != 0);
            if ((IsPow2(curBit.value) == false) || // More than one bit contributes
                (curBit.x == 0)                 || // Bit is from Y/Z/S channel
                (curBit.x != m_maxExpandX))        // X bits are out of order
            {
                break;
            }
            m_maxExpandX *= 2;
        }
    }
}

/**
****************************************************************************************************
*   LutAddresser::InitLuts
*
*   @brief
*       Creates lookup tables for each channel.
****************************************************************************************************
*/
void LutAddresser::InitLuts()
{
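    // Because the equation is a pure XOR of independent source bits, the address of (x, y, z, s)
    // decomposes into the XOR of four per-channel lookups, i.e.
    // m_pXLut[x] ^ m_pYLut[y] ^ m_pZLut[z] ^ m_pSLut[s] (each coordinate masked to its LUT size),
    // so each table below can be filled by evaluating one channel in isolation.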
    UINT_32 curOffset = 0;
    m_pXLut = &m_lutData[0];
    for (UINT_32 x = 0; x < (m_xLutMask + 1); x++)
    {
        m_pXLut[x] = EvalEquation(x, 0, 0, 0);
    }
    curOffset += m_xLutMask + 1;
    ADDR_ASSERT(curOffset <= MaxLutSize);

    if (m_yLutMask != 0)
    {
        m_pYLut = &m_lutData[curOffset];
        for (UINT_32 y = 0; y < (m_yLutMask + 1); y++)
        {
            m_pYLut[y] = EvalEquation(0, y, 0, 0);
        }
        curOffset += m_yLutMask + 1;
        ADDR_ASSERT(curOffset <= MaxLutSize);
    }
    else
    {
        m_pYLut = &m_lutData[0];
        ADDR_ASSERT(m_pYLut[0] == 0);
    }

    if (m_zLutMask != 0)
    {
        m_pZLut = &m_lutData[curOffset];
        for (UINT_32 z = 0; z < (m_zLutMask + 1); z++)
        {
            m_pZLut[z] = EvalEquation(0, 0, z, 0);
        }
        curOffset += m_zLutMask + 1;
        ADDR_ASSERT(curOffset <= MaxLutSize);
    }
    else
    {
        m_pZLut = &m_lutData[0];
        ADDR_ASSERT(m_pZLut[0] == 0);
    }

    if (m_sLutMask != 0)
    {
        m_pSLut = &m_lutData[curOffset];
        for (UINT_32 s = 0; s < (m_sLutMask + 1); s++)
        {
            m_pSLut[s] = EvalEquation(0, 0, 0, s);
        }
        curOffset += m_sLutMask + 1;
        ADDR_ASSERT(curOffset <= MaxLutSize);
    }
    else
    {
        m_pSLut = &m_lutData[0];
        ADDR_ASSERT(m_pSLut[0] == 0);
    }
}

/**
****************************************************************************************************
*   LutAddresser::EvalEquation
*
*   @brief
*       Evaluates the equation at a given coordinate manually.
****************************************************************************************************
*/
UINT_32 LutAddresser::EvalEquation(
    UINT_32 x,
    UINT_32 y,
    UINT_32 z,
    UINT_32 s)
{
    UINT_32 out = 0;

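    // Each output bit i is the XOR of the coordinate bits selected by m_bit[i]. For example, if
    // m_bit[i].x == 0x4 and m_bit[i].y == 0x1 (illustrative values), then bit i of 'out' is x[2] ^ y[0].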
    for (UINT_32 i = 0; i < ADDR_MAX_EQUATION_BIT; i++)
    {
        if (m_bit[i].value == 0)
        {
            if (out != 0)
            {
                // Invalid bits at the top of the equation
                break;
            }
            else
            {
                continue;
            }
        }

        if (x != 0)
        {
            UINT_32 xSrcs = m_bit[i].x;
            while (xSrcs != 0)
            {
                UINT_32 xIdx = BitScanForward(xSrcs);
                out ^= (((x >> xIdx) & 1) << i);
                xSrcs = UnsetLeastBit(xSrcs);
            }
        }

        if (y != 0)
        {
            UINT_32 ySrcs = m_bit[i].y;
            while (ySrcs != 0)
            {
                UINT_32 yIdx = BitScanForward(ySrcs);
                out ^= (((y >> yIdx) & 1) << i);
                ySrcs = UnsetLeastBit(ySrcs);
            }
        }

        if (z != 0)
        {
            UINT_32 zSrcs = m_bit[i].z;
            while (zSrcs != 0)
            {
                UINT_32 zIdx = BitScanForward(zSrcs);
                out ^= (((z >> zIdx) & 1) << i);
                zSrcs = UnsetLeastBit(zSrcs);
            }
        }

        if (s != 0)
        {
            UINT_32 sSrcs = m_bit[i].s;
            while (sSrcs != 0)
            {
                UINT_32 sIdx = BitScanForward(sSrcs);
                out ^= (((s >> sIdx) & 1) << i);
                sSrcs = UnsetLeastBit(sSrcs);
            }
        }
    }

    return out;
}

/**
****************************************************************************************************
*   Copy2DSliceUnaligned
*
*   @brief
*       Copies an arbitrary 2D pixel region to or from a surface.
****************************************************************************************************
*/
template <int BPELog2, int ExpandX, bool ImgIsDest>
void Copy2DSliceUnaligned(
    void*               pImgBlockSliceStart, // Block corresponding to beginning of slice
    void*               pBuf,                // Pointer to data starting from the copy origin.
    size_t              bufStrideY,          // Stride of each row in pBuf
    UINT_32             imageBlocksY,        // Width of the image slice, in blocks.
    ADDR_COORD2D        origin,              // Absolute origin, in elements
    ADDR_EXTENT2D       extent,              // Size to copy, in elements
    UINT_32             sliceXor,            // Includes pipeBankXor and z XOR
    const LutAddresser& addresser)
{
    UINT_32  xStart = origin.x;
    UINT_32  xEnd   = origin.x + extent.width;

    constexpr UINT_32  PixBytes = (1 << BPELog2);

    // Apply a negative offset now so later code can do e.g. pBuf[x] instead of pBuf[x - origin.x]
    pBuf = VoidPtrDec(pBuf, xStart * PixBytes);

    // Do things one row at a time for unaligned regions.
    for (UINT_32 y = origin.y; y < (origin.y + extent.height); y++)
    {
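        // Precompute the per-row terms: the index of the block row and the XOR of the slice and Y address
        // contributions. Only the X contribution varies within a row.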
        UINT_32 yBlk = (y >> addresser.GetBlockYBits()) * imageBlocksY;
        UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y);

        UINT_32 x = xStart;

        // Most swizzles pack 2-4 pixels horizontally. Take advantage of this even in non-microblock-aligned
        // regions to typically do 2-4x less work. This is still much slower than copying whole microblocks, though.
        if (ExpandX > 1)
        {
            // Unaligned left edge
            for (; x < Min(xEnd, PowTwoAlign(xStart, ExpandX)); x++)
            {
                UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits()));
                void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits());
                void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x));
                if (ImgIsDest)
                {
                    memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes);
                }
                else
                {
                    memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes);
                }
            }
            // Aligned middle
            for (; x < PowTwoAlignDown(xEnd, ExpandX); x += ExpandX)
            {
                UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits()));
                void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits());
                void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x));
                if (ImgIsDest)
                {
                    memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes * ExpandX);
                }
                else
                {
                    memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes * ExpandX);
                }
            }
        }
        // Unaligned end (or the whole thing when ExpandX == 1)
        for (; x < xEnd; x++)
        {
            // Get the index of the block within the slice
            UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits()));
            // Apply that index to get the base address of the current block.
            void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits());
            // XOR in the x contribution, then add the combined offset to the block base to get the final address.
            void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x));
            if (ImgIsDest)
            {
                memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes);
            }
            else
            {
                memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes);
            }
        }

        pBuf = VoidPtrInc(pBuf, bufStrideY);
    }
}

/**
****************************************************************************************************
*   LutAddresser::GetCopyMemImgFunc
*
*   @brief
*       Determines and returns which copy function to use for copying to images
****************************************************************************************************
*/
UnalignedCopyMemImgFunc LutAddresser::GetCopyMemImgFunc() const
{
    // While these are all the same function, the codegen gets really bad if the size of each pixel
    // is not known at compile time. Hence, templates.
    const UnalignedCopyMemImgFunc Funcs[MaxElementBytesLog2][3] =
    {
        // ExpandX =  1, 2, 4
        { Copy2DSliceUnaligned<0, 1, true>, Copy2DSliceUnaligned<0, 2, true>, Copy2DSliceUnaligned<0, 4, true> }, // 1BPE
        { Copy2DSliceUnaligned<1, 1, true>, Copy2DSliceUnaligned<1, 2, true>, Copy2DSliceUnaligned<1, 4, true> }, // 2BPE
        { Copy2DSliceUnaligned<2, 1, true>, Copy2DSliceUnaligned<2, 2, true>, Copy2DSliceUnaligned<2, 4, true> }, // 4BPE
        { Copy2DSliceUnaligned<3, 1, true>, Copy2DSliceUnaligned<3, 2, true>, Copy2DSliceUnaligned<3, 4, true> }, // 8BPE
        { Copy2DSliceUnaligned<4, 1, true>, Copy2DSliceUnaligned<4, 2, true>, Copy2DSliceUnaligned<4, 4, true> }, // 16BPE
    };

    UnalignedCopyMemImgFunc pfnRet = nullptr;
    ADDR_ASSERT(m_bpeLog2 < MaxElementBytesLog2);
    if (m_maxExpandX >= 4)
    {
        pfnRet = Funcs[m_bpeLog2][2];
    }
    else if (m_maxExpandX >= 2)
    {
        pfnRet = Funcs[m_bpeLog2][1];
    }
    else
    {
        pfnRet = Funcs[m_bpeLog2][0];
    }
    return pfnRet;
}

/**
****************************************************************************************************
*   LutAddresser::GetCopyImgMemFunc
*
*   @brief
*       Determines and returns which copy function to use for copying from images
****************************************************************************************************
*/
UnalignedCopyMemImgFunc LutAddresser::GetCopyImgMemFunc() const
{
    // While these are all the same function, the codegen gets really bad if the size of each pixel
    // is not known at compile time. Hence, templates.
    const UnalignedCopyMemImgFunc Funcs[MaxElementBytesLog2][3] =
    {
        // ExpandX =  1, 2, 4
        { Copy2DSliceUnaligned<0, 1, false>, Copy2DSliceUnaligned<0, 2, false>, Copy2DSliceUnaligned<0, 4, false> }, // 1BPE
        { Copy2DSliceUnaligned<1, 1, false>, Copy2DSliceUnaligned<1, 2, false>, Copy2DSliceUnaligned<1, 4, false> }, // 2BPE
        { Copy2DSliceUnaligned<2, 1, false>, Copy2DSliceUnaligned<2, 2, false>, Copy2DSliceUnaligned<2, 4, false> }, // 4BPE
        { Copy2DSliceUnaligned<3, 1, false>, Copy2DSliceUnaligned<3, 2, false>, Copy2DSliceUnaligned<3, 4, false> }, // 8BPE
        { Copy2DSliceUnaligned<4, 1, false>, Copy2DSliceUnaligned<4, 2, false>, Copy2DSliceUnaligned<4, 4, false> }, // 16BPE
    };

    UnalignedCopyMemImgFunc pfnRet = nullptr;
    ADDR_ASSERT(m_bpeLog2 < MaxElementBytesLog2);
    if (m_maxExpandX >= 4)
    {
        pfnRet = Funcs[m_bpeLog2][2];
    }
    else if (m_maxExpandX >= 2)
    {
        pfnRet = Funcs[m_bpeLog2][1];
    }
    else
    {
        pfnRet = Funcs[m_bpeLog2][0];
    }
    return pfnRet;
}

}