1
2 /*
3 ************************************************************************************************************************
4 *
5 * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
6 *
7 ***********************************************************************************************************************/
8
9 /**
10 ****************************************************************************************************
11 * @file addrswizzler.cpp
12 * @brief Contains code for efficient CPU swizzling.
13 ****************************************************************************************************
14 */
15
16 #include "addrswizzler.h"
17
18 namespace Addr
19 {
20
21 /**
22 ****************************************************************************************************
23 * LutAddresser::LutAddresser
24 *
25 * @brief
26 * Constructor for the LutAddresser class.
27 ****************************************************************************************************
28 */
LutAddresser()29 LutAddresser::LutAddresser()
30 :
31 m_pXLut(&m_lutData[0]),
32 m_pYLut(&m_lutData[0]),
33 m_pZLut(&m_lutData[0]),
34 m_pSLut(&m_lutData[0]),
35 m_xLutMask(0),
36 m_yLutMask(0),
37 m_zLutMask(0),
38 m_sLutMask(0),
39 m_blockBits(0),
40 m_blockSize(),
41 m_bpeLog2(0),
42 m_bit(),
43 m_lutData()
44 {
45 }
46
47 /**
48 ****************************************************************************************************
49 * LutAddresser::Init
50 *
51 * @brief
52 * Initializes the addresser from a swizzle equation and block dimensions, then builds the per-channel lookup tables.
53 ****************************************************************************************************
54 */
Init(const ADDR_BIT_SETTING * pEq,UINT_32 eqSize,ADDR_EXTENT3D blockSize,UINT_8 blockBits)55 void LutAddresser::Init(
56 const ADDR_BIT_SETTING* pEq,
57 UINT_32 eqSize,
58 ADDR_EXTENT3D blockSize,
59 UINT_8 blockBits)
60 {
61 ADDR_ASSERT(eqSize <= ADDR_MAX_EQUATION_BIT);
62 memcpy(&m_bit[0], pEq, sizeof(ADDR_BIT_SETTING) * eqSize);
63 m_blockSize = blockSize;
64 m_blockBits = blockBits;
65
66 InitSwizzleProps();
67 InitLuts();
68 }
69
70 /**
71 ****************************************************************************************************
72 * LutAddresser::InitSwizzleProps
73 *
74 * @brief
75 * Calculates general properties about the swizzle
76 ****************************************************************************************************
77 */
InitSwizzleProps()78 void LutAddresser::InitSwizzleProps()
79 {
80 // Calculate BPE from the swizzle. This can be derived from the number of invalid low bits.
81 m_bpeLog2 = 0;
82 for (UINT_32 i = 0; i < MaxElementBytesLog2; i++)
83 {
84 if (m_bit[i].value != 0)
85 {
86 break;
87 }
88 m_bpeLog2++;
89 }
90
91 // Generate a mask/size for each channel's LUT. This may be larger than the block size.
92 // If a given 'source' bit (eg. 'x0') is used for any part of the equation, fill that in the mask.
93 for (UINT_32 i = 0; i < ADDR_MAX_EQUATION_BIT; i++)
94 {
95 m_xLutMask |= m_bit[i].x;
96 m_yLutMask |= m_bit[i].y;
97 m_zLutMask |= m_bit[i].z;
98 m_sLutMask |= m_bit[i].s;
99 }
100
101 // An expandX of 1 is a no-op
102 m_maxExpandX = 1;
103 if (m_sLutMask == 0)
104 {
105 // Calculate expandX from the swizzle. This can be derived from the number of consecutive,
106 // increasing low x bits
107 for (UINT_32 i = 0; i < 3; i++)
108 {
109 const auto& curBit = m_bit[m_bpeLog2 + i];
110 ADDR_ASSERT(curBit.value != 0);
111 if ((IsPow2(curBit.value) == false) || // More than one bit contributes
112 (curBit.x == 0) || // Bit is from Y/Z/S channel
113 (curBit.x != m_maxExpandX)) // X bits are out of order
114 {
115 break;
116 }
117 m_maxExpandX *= 2;
118 }
119 }
120 }
121
122 /**
123 ****************************************************************************************************
124 * LutAddresser::InitLuts
125 *
126 * @brief
127 * Creates lookup tables for each channel.
128 ****************************************************************************************************
129 */
InitLuts()130 void LutAddresser::InitLuts()
131 {
132 UINT_32 curOffset = 0;
133 m_pXLut = &m_lutData[0];
134 for (UINT_32 x = 0; x < (m_xLutMask + 1); x++)
135 {
136 m_pXLut[x] = EvalEquation(x, 0, 0, 0);
137 }
138 curOffset += m_xLutMask + 1;
139 ADDR_ASSERT(curOffset <= MaxLutSize);
140
141 if (m_yLutMask != 0)
142 {
143 m_pYLut = &m_lutData[curOffset];
144 for (UINT_32 y = 0; y < (m_yLutMask + 1); y++)
145 {
146 m_pYLut[y] = EvalEquation(0, y, 0, 0);
147 }
148 curOffset += m_yLutMask + 1;
149 ADDR_ASSERT(curOffset <= MaxLutSize);
150 }
151 else
152 {
153 m_pYLut = &m_lutData[0];
154 ADDR_ASSERT(m_pYLut[0] == 0);
155 }
156
157 if (m_zLutMask != 0)
158 {
159 m_pZLut = &m_lutData[curOffset];
160 for (UINT_32 z = 0; z < (m_zLutMask + 1); z++)
161 {
162 m_pZLut[z] = EvalEquation(0, 0, z, 0);
163 }
164 curOffset += m_zLutMask + 1;
165 ADDR_ASSERT(curOffset <= MaxLutSize);
166 }
167 else
168 {
169 m_pZLut = &m_lutData[0];
170 ADDR_ASSERT(m_pZLut[0] == 0);
171 }
172
173 if (m_sLutMask != 0)
174 {
175 m_pSLut = &m_lutData[curOffset];
176 for (UINT_32 s = 0; s < (m_sLutMask + 1); s++)
177 {
178 m_pSLut[s] = EvalEquation(0, 0, 0, s);
179 }
180 curOffset += m_sLutMask + 1;
181 ADDR_ASSERT(curOffset <= MaxLutSize);
182 }
183 else
184 {
185 m_pSLut = &m_lutData[0];
186 ADDR_ASSERT(m_pSLut[0] == 0);
187 }
188 }
189
190 /**
191 ****************************************************************************************************
192 * LutAddresser::EvalEquation
193 *
194 * @brief
195 * Evaluates the equation at a given coordinate manually.
196 ****************************************************************************************************
197 */
EvalEquation(UINT_32 x,UINT_32 y,UINT_32 z,UINT_32 s)198 UINT_32 LutAddresser::EvalEquation(
199 UINT_32 x,
200 UINT_32 y,
201 UINT_32 z,
202 UINT_32 s)
203 {
204 UINT_32 out = 0;
205
206 for (UINT_32 i = 0; i < ADDR_MAX_EQUATION_BIT; i++)
207 {
208 if (m_bit[i].value == 0)
209 {
210 if (out != 0)
211 {
212 // Invalid bits at the top of the equation
213 break;
214 }
215 else
216 {
217 continue;
218 }
219 }
220
221 if (x != 0)
222 {
223 UINT_32 xSrcs = m_bit[i].x;
224 while (xSrcs != 0)
225 {
226 UINT_32 xIdx = BitScanForward(xSrcs);
227 out ^= (((x >> xIdx) & 1) << i);
228 xSrcs = UnsetLeastBit(xSrcs);
229 }
230 }
231
232 if (y != 0)
233 {
234 UINT_32 ySrcs = m_bit[i].y;
235 while (ySrcs != 0)
236 {
237 UINT_32 yIdx = BitScanForward(ySrcs);
238 out ^= (((y >> yIdx) & 1) << i);
239 ySrcs = UnsetLeastBit(ySrcs);
240 }
241 }
242
243 if (z != 0)
244 {
245 UINT_32 zSrcs = m_bit[i].z;
246 while (zSrcs != 0)
247 {
248 UINT_32 zIdx = BitScanForward(zSrcs);
249 out ^= (((z >> zIdx) & 1) << i);
250 zSrcs = UnsetLeastBit(zSrcs);
251 }
252 }
253
254 if (s != 0)
255 {
256 UINT_32 sSrcs = m_bit[i].s;
257 while (sSrcs != 0)
258 {
259 UINT_32 sIdx = BitScanForward(sSrcs);
260 out ^= (((s >> sIdx) & 1) << i);
261 sSrcs = UnsetLeastBit(sSrcs);
262 }
263 }
264 }
265
266 return out;
267 }
268
269
270 /**
271 ****************************************************************************************************
272 * Copy2DSliceUnaligned
273 *
274 * @brief
275 * Copies an arbitrary 2D pixel region to or from a surface.
276 ****************************************************************************************************
277 */
278 template <int BPELog2, int ExpandX, bool ImgIsDest>
Copy2DSliceUnaligned(void * pImgBlockSliceStart,void * pBuf,size_t bufStrideY,UINT_32 imageBlocksY,ADDR_COORD2D origin,ADDR_EXTENT2D extent,UINT_32 sliceXor,const LutAddresser & addresser)279 void Copy2DSliceUnaligned(
280 void* pImgBlockSliceStart, // Block corresponding to beginning of slice
281 void* pBuf, // Pointer to data starting from the copy origin.
282 size_t bufStrideY, // Stride of each row in pBuf
283 UINT_32 imageBlocksY, // Width of the image slice, in blocks.
284 ADDR_COORD2D origin, // Absolute origin, in elements
285 ADDR_EXTENT2D extent, // Size to copy, in elements
286 UINT_32 sliceXor, // Includes pipeBankXor and z XOR
287 const LutAddresser& addresser)
288 {
289 UINT_32 xStart = origin.x;
290 UINT_32 xEnd = origin.x + extent.width;
291
292 constexpr UINT_32 PixBytes = (1 << BPELog2);
293
294 // Apply a negative offset now so later code can do eg. pBuf[x] instead of pBuf[x - origin.x]
295 pBuf = VoidPtrDec(pBuf, xStart * PixBytes);
296
297 // Do things one row at a time for unaligned regions.
298 for (UINT_32 y = origin.y; y < (origin.y + extent.height); y++)
299 {
300 UINT_32 yBlk = (y >> addresser.GetBlockYBits()) * imageBlocksY;
301 UINT_32 rowXor = sliceXor ^ addresser.GetAddressY(y);
302
303 UINT_32 x = xStart;
304
305 // Most swizzles pack 2-4 pixels horizontally. Take advantage of this even in non-microblock-aligned
306 // regions to commonly do 2-4x less work. This is still way less good than copying by whole microblocks though.
307 if (ExpandX > 1)
308 {
309 // Unaligned left edge
310 for (; x < Min(xEnd, PowTwoAlign(xStart, ExpandX)); x++)
311 {
312 UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits()));
313 void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits());
314 void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x));
315 if (ImgIsDest)
316 {
317 memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes);
318 }
319 else
320 {
321 memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes);
322 }
323 }
324 // Aligned middle
325 for (; x < PowTwoAlignDown(xEnd, ExpandX); x += ExpandX)
326 {
327 UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits()));
328 void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits());
329 void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x));
330 if (ImgIsDest)
331 {
332 memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes * ExpandX);
333 }
334 else
335 {
336 memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes * ExpandX);
337 }
338 }
339 }
340 // Unaligned end (or the whole thing when ExpandX == 1)
341 for (; x < xEnd; x++)
342 {
343 // Get the index of the block within the slice
344 UINT_32 blk = (yBlk + (x >> addresser.GetBlockXBits()));
345 // Apply that index to get the base address of the current block.
346 void* pImgBlock = VoidPtrInc(pImgBlockSliceStart, blk << addresser.GetBlockBits());
347 // Grab the x-xor and XOR it all together, adding to get the final address
348 void* pPix = VoidPtrInc(pImgBlock, rowXor ^ addresser.GetAddressX(x));
349 if (ImgIsDest)
350 {
351 memcpy(pPix, VoidPtrInc(pBuf, x * PixBytes), PixBytes);
352 }
353 else
354 {
355 memcpy(VoidPtrInc(pBuf, x * PixBytes), pPix, PixBytes);
356 }
357 }
358
359 pBuf = VoidPtrInc(pBuf, bufStrideY);
360 }
361 }
362
363 /**
364 ****************************************************************************************************
365 * LutAddresser::GetCopyMemImgFunc
366 *
367 * @brief
368 * Determines and returns which copy function to use for copying to images
369 ****************************************************************************************************
370 */
GetCopyMemImgFunc() const371 UnalignedCopyMemImgFunc LutAddresser::GetCopyMemImgFunc() const
372 {
373 // While these are all the same function, the codegen gets really bad if the size of each pixel
374 // is not known at compile time. Hence, templates.
375 const UnalignedCopyMemImgFunc Funcs[MaxElementBytesLog2][3] =
376 {
377 // ExpandX = 1, 2, 4
378 { Copy2DSliceUnaligned<0, 1, true>, Copy2DSliceUnaligned<0, 2, true>, Copy2DSliceUnaligned<0, 4, true> }, // 1BPE
379 { Copy2DSliceUnaligned<1, 1, true>, Copy2DSliceUnaligned<1, 2, true>, Copy2DSliceUnaligned<1, 4, true> }, // 2BPE
380 { Copy2DSliceUnaligned<2, 1, true>, Copy2DSliceUnaligned<2, 2, true>, Copy2DSliceUnaligned<2, 4, true> }, // 4BPE
381 { Copy2DSliceUnaligned<3, 1, true>, Copy2DSliceUnaligned<3, 2, true>, Copy2DSliceUnaligned<3, 4, true> }, // 8BPE
382 { Copy2DSliceUnaligned<4, 1, true>, Copy2DSliceUnaligned<4, 2, true>, Copy2DSliceUnaligned<4, 4, true> }, // 16BPE
383 };
384
385 UnalignedCopyMemImgFunc pfnRet = nullptr;
386 ADDR_ASSERT(m_bpeLog2 < MaxElementBytesLog2);
387 if (m_maxExpandX >= 4)
388 {
389 pfnRet = Funcs[m_bpeLog2][2];
390 }
391 else if (m_maxExpandX >= 2)
392 {
393 pfnRet = Funcs[m_bpeLog2][1];
394 }
395 else
396 {
397 pfnRet = Funcs[m_bpeLog2][0];
398 }
399 return pfnRet;
400 }
401
402 /**
403 ****************************************************************************************************
404 * LutAddresser::GetCopyImgMemFunc
405 *
406 * @brief
407 * Determines and returns which copy function to use for copying from images
408 ****************************************************************************************************
409 */
GetCopyImgMemFunc() const410 UnalignedCopyMemImgFunc LutAddresser::GetCopyImgMemFunc() const
411 {
412 // While these are all the same function, the codegen gets really bad if the size of each pixel
413 // is not known at compile time. Hence, templates.
414 const UnalignedCopyMemImgFunc Funcs[MaxElementBytesLog2][3] =
415 {
416 // ExpandX = 1, 2, 4
417 { Copy2DSliceUnaligned<0, 1, false>, Copy2DSliceUnaligned<0, 2, false>, Copy2DSliceUnaligned<0, 4, false> }, // 1BPE
418 { Copy2DSliceUnaligned<1, 1, false>, Copy2DSliceUnaligned<1, 2, false>, Copy2DSliceUnaligned<1, 4, false> }, // 2BPE
419 { Copy2DSliceUnaligned<2, 1, false>, Copy2DSliceUnaligned<2, 2, false>, Copy2DSliceUnaligned<2, 4, false> }, // 4BPE
420 { Copy2DSliceUnaligned<3, 1, false>, Copy2DSliceUnaligned<3, 2, false>, Copy2DSliceUnaligned<3, 4, false> }, // 8BPE
421 { Copy2DSliceUnaligned<4, 1, false>, Copy2DSliceUnaligned<4, 2, false>, Copy2DSliceUnaligned<4, 4, false> }, // 16BPE
422 };
423
424 UnalignedCopyMemImgFunc pfnRet = nullptr;
425 ADDR_ASSERT(m_bpeLog2 < MaxElementBytesLog2);
426 if (m_maxExpandX >= 4)
427 {
428 pfnRet = Funcs[m_bpeLog2][2];
429 }
430 else if (m_maxExpandX >= 2)
431 {
432 pfnRet = Funcs[m_bpeLog2][1];
433 }
434 else
435 {
436 pfnRet = Funcs[m_bpeLog2][0];
437 }
438 return pfnRet;
439 }
440
441 }
442