• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ************************************************************************************************************************
3 *
4 *  Copyright (C) 2024 Advanced Micro Devices, Inc.  All rights reserved.
5 *
6 ***********************************************************************************************************************/
/**
****************************************************************************************************
* @file  addrswizzler.h
* @brief Contains code for efficient CPU swizzling.
****************************************************************************************************
*/
13 #ifndef __ADDR_SWIZZLER_H__
14 #define __ADDR_SWIZZLER_H__
15 
16 #include "addrlib.h"
17 #include "addrcommon.h"
18 
19 namespace Addr
20 {
21 
22 // Forward decl
23 class LutAddresser;
24 
25 typedef void (*UnalignedCopyMemImgFunc)(
26     void*               pImgBlockSliceStart,  // Block corresponding to beginning of slice
27     void*               pBuf,                 // Pointer to data starting from the copy origin.
28     size_t              bufStrideY,           // Stride of each row in pBuf
29     UINT_32             imageBlocksY,         // Width of the image slice, in blocks.
30     ADDR_COORD2D        origin,               // Absolute origin, in elements
31     ADDR_EXTENT2D       extent,               // Size to copy, in elements
32     UINT_32             sliceXor,             // Includes pipeBankXor and z XOR
33     const LutAddresser& addresser);
34 
35 // This class calculates and holds up to four lookup tables (x/y/z/s) which can be used to cheaply calculate the
36 // position of a pixel within a block at the cost of some precomputation and memory usage.
37 //
38 // This works for all equations and does something like this:
39 //    offset = blockAddr ^ XLut[x & xMask] ^ YLut[Y & ymask]...
40 class LutAddresser
41 {
42 public:
43     constexpr static UINT_32 MaxLutSize = 2100; // Sized to fit the largest non-VAR LUT size
44 
45     LutAddresser();
46 
47     void Init(const ADDR_BIT_SETTING* pEq, UINT_32 eqSize, ADDR_EXTENT3D blockSize, UINT_8 blkBits);
48 
49     // Does a full calculation to get the offset within a block. Takes an *absolute* coordinate,
50     // not the coordinate within the block.
51     UINT_32  GetBlockOffset(
52         UINT_32 x,
53         UINT_32 y,
54         UINT_32 z,
55         UINT_32 s = 0,
56         UINT_32 pipeBankXor = 0)
57     {
58         return GetAddressX(x) ^ GetAddressY(y) ^ GetAddressZ(z) ^ GetAddressS(s) ^ pipeBankXor;
59     }
60 
61     // Get the block size
GetBlockBits()62     UINT_32  GetBlockBits() const { return m_blockBits; }
GetBlockXBits()63     UINT_32  GetBlockXBits() const { return Log2(m_blockSize.width); }
GetBlockYBits()64     UINT_32  GetBlockYBits() const { return Log2(m_blockSize.height); }
GetBlockZBits()65     UINT_32  GetBlockZBits() const { return Log2(m_blockSize.depth); }
66 
67     // "Fast single channel" functions to get the part that each channel contributes to be XORd together.
GetAddressX(UINT_32 x)68     UINT_32  GetAddressX(UINT_32  x) const { return m_pXLut[x & m_xLutMask];}
GetAddressY(UINT_32 y)69     UINT_32  GetAddressY(UINT_32  y) const { return m_pYLut[y & m_yLutMask];}
GetAddressZ(UINT_32 z)70     UINT_32  GetAddressZ(UINT_32  z) const { return m_pZLut[z & m_zLutMask];}
GetAddressS(UINT_32 s)71     UINT_32  GetAddressS(UINT_32  s) const { return m_pSLut[s & m_sLutMask];}
72 
73     // Get a function that can copy a single 2D slice of an image with this swizzle.
74     UnalignedCopyMemImgFunc GetCopyMemImgFunc() const;
75     UnalignedCopyMemImgFunc GetCopyImgMemFunc() const;
76 private:
77     // Calculate general properties of the swizzle equations
78     void InitSwizzleProps();
79     // Fills a LUT for each channel.
80     void InitLuts();
81     // Evaluate coordinate without LUTs
82     UINT_32 EvalEquation(UINT_32 x, UINT_32 y, UINT_32 z, UINT_32 s);
83 
84     // Pointers within m_lutData corresponding to where each LUT starts
85     // m_lutData[0] always has a value of 0 and thus can be considered an empty 1-entry LUT for "don't care" channels
86     UINT_32* m_pXLut;
87     UINT_32* m_pYLut;
88     UINT_32* m_pZLut;
89     UINT_32* m_pSLut;
90 
91     // Size of each LUT, minus 1 to form a mask. A mask of 0 is valid for an empty LUT.
92     UINT_32 m_xLutMask;
93     UINT_32 m_yLutMask;
94     UINT_32 m_zLutMask;
95     UINT_32 m_sLutMask;
96 
97     // Number of bits in the block (aka Log2(blkSize))
98     UINT_32  m_blockBits;
99 
100     // The block size
101     ADDR_EXTENT3D m_blockSize;
102 
103     // Number of 'x' bits at the bottom of the equation. Must be a pow2 and at least 1.
104     // This will be used as a simple optimization to batch together operations on adjacent x pixels.
105     UINT_32  m_maxExpandX;
106 
107     // BPE for this equation.
108     UINT_32  m_bpeLog2;
109 
110     // The full equation
111     ADDR_BIT_SETTING m_bit[ADDR_MAX_EQUATION_BIT];
112 
113     // Backing store for the LUT tables.
114     UINT_32 m_lutData[MaxLutSize];
115 };
116 
117 }
118 
119 #endif // __ADDR_SWIZZLER_H__