1 /*
2 * Copyright 2018 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 *
7 */
8
9 //
10 //
11 //
12
13 #include "transpose.h"
14 #include "common/macros.h"
15
16 //
17 // Rows must be an even number. This is enforced elsewhere.
18 //
19 // The transpose requires (cols_log2 * rows/2) row-pair blends.
20 //
21 void
hsg_transpose(uint32_t const cols_log2,uint32_t const rows,void (* pfn_blend)(uint32_t const cols_log2,uint32_t const row_ll,uint32_t const row_ur,void * blend),void * blend,void (* pfn_remap)(uint32_t const row_from,uint32_t const row_to,void * remap),void * remap)22 hsg_transpose(uint32_t const cols_log2,
23 uint32_t const rows,
24 void (*pfn_blend)(uint32_t const cols_log2,
25 uint32_t const row_ll, // lower-left
26 uint32_t const row_ur, // upper-right
27 void * blend),
28 void * blend,
29 void (*pfn_remap)(uint32_t const row_from,
30 uint32_t const row_to,
31 void * remap),
32 void * remap)
33 {
34 // get mapping array
35 uint32_t * map_curr = ALLOCA_MACRO(rows * sizeof(*map_curr));
36 uint32_t * map_next = ALLOCA_MACRO(rows * sizeof(*map_next));
37
38 // init the mapping array
39 for (uint32_t ii=0; ii<rows; ii++)
40 map_curr[ii] = ii;
41
42 // successively transpose rows using blends
43 for (uint32_t cc=1; cc<=cols_log2; cc++)
44 {
45 uint32_t const mask = BITS_TO_MASK(cc);
46
47 for (uint32_t ii=0; ii<rows; ii++)
48 {
49 uint32_t const left = map_curr[ii];
50 uint32_t const stay = left & ~mask;
51
52 if (left != stay) // will be swapped away
53 {
54 for (uint32_t jj=0; jj<rows; jj++)
55 {
56 if (map_curr[jj] == stay)
57 {
58 map_next[jj] = stay;
59 map_next[ii] = stay + (rows << (cc-1));
60
61 pfn_blend(cc,ii,jj,blend); // log2,left,right,payload
62
63 break;
64 }
65 }
66 }
67 }
68
69 uint32_t * tmp = map_curr;
70
71 map_curr = map_next;
72 map_next = tmp;
73 }
74
75 // write out the remapping
76 for (uint32_t ii=0; ii<rows; ii++)
77 pfn_remap(ii,map_curr[ii] >> cols_log2,remap);
78 }
79
80 //
81 // test it!
82 //
83
84 #ifdef HS_TRANSPOSE_DEBUG
85
86 #include <stdio.h>
87
88 static uint32_t cols; // implicit on SIMD/GPU
89
90 static
91 void
hsg_debug_blend(uint32_t const cols_log2,uint32_t const row_ll,uint32_t const row_ur,uint32_t * b)92 hsg_debug_blend(uint32_t const cols_log2,
93 uint32_t const row_ll, // lower-left
94 uint32_t const row_ur, // upper-right
95 uint32_t * b)
96 {
97 fprintf(stdout,"BLEND( %u, %3u, %3u )\n",cols_log2,row_ll,row_ur);
98
99 uint32_t * const ll = ALLOCA_MACRO(cols * sizeof(*b));
100 uint32_t * const ur = ALLOCA_MACRO(cols * sizeof(*b));
101
102 memcpy(ll,b+row_ll*cols,cols * sizeof(*b));
103 memcpy(ur,b+row_ur*cols,cols * sizeof(*b));
104
105 for (uint32_t ii=0; ii<cols; ii++)
106 b[row_ll*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii] : ur[ii^(1<<cols_log2-1)];
107
108 for (uint32_t ii=0; ii<cols; ii++)
109 b[row_ur*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii^(1<<cols_log2-1)] : ur[ii];
110 }
111
112 static
113 void
hsg_debug_remap(uint32_t const row_from,uint32_t const row_to,uint32_t * const r)114 hsg_debug_remap(uint32_t const row_from,
115 uint32_t const row_to,
116 uint32_t * const r)
117 {
118 fprintf(stdout,"REMAP( %3u, %3u )\n",row_from,row_to);
119
120 r[row_to] = row_from;
121 }
122
123 static
124 void
hsg_debug_print(uint32_t const rows,uint32_t const * const m,uint32_t const * const r)125 hsg_debug_print(uint32_t const rows,
126 uint32_t const * const m,
127 uint32_t const * const r)
128 {
129 for (uint32_t rr=0; rr<rows; rr++) {
130 for (uint32_t cc=0; cc<cols; cc++)
131 fprintf(stdout,"%4u ",m[r[rr]*cols + cc]);
132 fprintf(stdout,"\n");
133 }
134 }
135
136 int
main(int argc,char * argv[])137 main(int argc, char * argv[])
138 {
139 uint32_t const cols_log2 = (argc <= 1) ? 3 : strtoul(argv[1],NULL,0);
140 uint32_t const rows = (argc <= 2) ? 6 : strtoul(argv[2],NULL,0);
141
142 if (rows & 1)
143 return;
144
145 cols = 1 << cols_log2;
146
147 uint32_t * const b = ALLOCA_MACRO(cols * rows * sizeof(*b));
148 uint32_t * const r = ALLOCA_MACRO( rows * sizeof(*r));
149
150 for (uint32_t rr=0; rr<rows; rr++) {
151 r[rr] = rr;
152 for (uint32_t cc=0; cc<cols; cc++)
153 b[rr*cols+cc] = cc*rows+rr;
154 }
155
156 hsg_debug_print(rows,b,r);
157
158 hsg_transpose(cols_log2,rows,
159 hsg_debug_blend,b,
160 hsg_debug_remap,r);
161
162 hsg_debug_print(rows,b,r);
163
164 return EXIT_SUCCESS;
165 }
166
167 #endif
168
169 //
170 //
171 //
172