//
// Copyright (c) 2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "subhelpers.h"
#include "subgroup_common_templates.h"
#include "harness/typeWrappers.h"

namespace {
std::string sub_group_clustered_reduce_source = R"(
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out,
                      uint cluster_size) {
    Type r;
    int gid = get_global_id(0);
    XY(xy,gid);
    xy[gid].w = 0;
    Type v = in[gid];
    if (sizeof(in[gid]) == sizeof(%s(v, 1))) {
        xy[gid].w = sizeof(in[gid]);
    }
    switch (cluster_size) {
        case 1: r = %s(v, 1); break;
        case 2: r = %s(v, 2); break;
        case 4: r = %s(v, 4); break;
        case 8: r = %s(v, 8); break;
        case 16: r = %s(v, 16); break;
        case 32: r = %s(v, 32); break;
        case 64: r = %s(v, 64); break;
        case 128: r = %s(v, 128); break;
    }
    out[gid] = r;
}
)";
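
// The %s placeholders above are substituted with the name of the clustered
// reduce built-in under test. The cluster size argument of these built-ins is
// required by cl_khr_subgroup_clustered_reduce to be a compile-time constant
// power of two, so the kernel dispatches through a switch on the runtime
// cluster_size instead of passing it straight through. The sizeof() comparison
// additionally records the operand size in xy[gid].w so the host side can
// confirm the kernel was specialized for the expected data type.
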
// DESCRIPTION:
// Test for the clustered reduce built-in functions.
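// For example, with cluster_size 4 and a sub-group carrying {1,2,3,4,5,6,7,8},
// sub_group_clustered_reduce_add is expected to return
// {10,10,10,10,26,26,26,26}: consecutive invocations are split into clusters
// of cluster_size elements and every invocation in a cluster receives the
// reduction over that cluster. The reference loop in chk() below reproduces
// exactly this.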
template <typename Ty, ArithmeticOp operation> struct RED_CLU
{
    static void log_test(const WorkGroupParams &test_params,
                         const char *extra_text)
    {
        log_info("  sub_group_clustered_reduce_%s(%s, %zu bytes) ...%s\n",
                 operation_names(operation), TypeManager<Ty>::name(),
                 sizeof(Ty), extra_text);
    }

    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
    {
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int ng = test_params.global_workgroup_size;
        ng = ng / nw;
        generate_inputs<Ty, operation>(x, t, m, ns, nw, ng);
    }

    static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
                           const WorkGroupParams &test_params)
    {
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int ng = test_params.global_workgroup_size;
        int nj = (nw + ns - 1) / ns;
        ng = ng / nw;

        for (int k = 0; k < ng; ++k)
        {
            std::vector<cl_int> data_type_sizes;
            // Map to arrays indexed by local ID and sub-group, and collect
            // the data type size each work-item reported through xy[gid].w.
            for (int j = 0; j < nw; ++j)
            {
                mx[j] = x[j];
                my[j] = y[j];
                data_type_sizes.push_back(m[4 * j + 3]);
            }

            for (cl_int dts : data_type_sizes)
            {
                if (dts != sizeof(Ty))
                {
                    log_error("ERROR: sub_group_clustered_reduce_%s(%s) "
                              "wrong data type size detected, expected %zu "
                              "but the device used %d, in group %d\n",
                              operation_names(operation),
                              TypeManager<Ty>::name(), sizeof(Ty), dts, k);
                    return TEST_FAIL;
                }
            }

            for (int j = 0; j < nj; ++j)
            {
                int ii = j * ns;
                int n = ii + ns > nw ? nw - ii : ns;
                std::vector<Ty> clusters_results;
                int clusters_counter = ns / test_params.cluster_size;
                clusters_results.resize(clusters_counter);

                // Compute the reference result for each cluster, restarting
                // the accumulation at every cluster boundary.
                Ty tr = mx[ii];
                for (int i = 0; i < n; ++i)
                {
                    if (i % test_params.cluster_size == 0)
                        tr = mx[ii + i];
                    else
                        tr = calculate<Ty>(tr, mx[ii + i], operation);
                    clusters_results[i / test_params.cluster_size] = tr;
                }

                // Check every work-item's device result against the
                // reference result of its cluster.
                for (int i = 0; i < n; ++i)
                {
                    Ty rr = my[ii + i];
                    tr = clusters_results[i / test_params.cluster_size];
                    if (!compare(rr, tr))
                    {
                        log_error(
                            "ERROR: sub_group_clustered_reduce_%s(%s, %u) "
                            "mismatch for local id %d in sub group %d in "
                            "group %d\n",
                            operation_names(operation), TypeManager<Ty>::name(),
                            test_params.cluster_size, i, j, k);
                        return TEST_FAIL;
                    }
                }
            }

            x += nw;
            y += nw;
            m += 4 * nw;
        }
        return TEST_PASS;
    }
};

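// Convenience wrappers: instantiate RED_CLU for each group of operations
// (arithmetic, bitwise, logical) defined by cl_khr_subgroup_clustered_reduce.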
template <typename T>
int run_cluster_red_add_max_min_mul_for_type(RunTestForType rft)
{
    int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::add_>>(
        "sub_group_clustered_reduce_add");
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::max_>>(
        "sub_group_clustered_reduce_max");
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::min_>>(
        "sub_group_clustered_reduce_min");
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::mul_>>(
        "sub_group_clustered_reduce_mul");
    return error;
}
template <typename T> int run_cluster_and_or_xor_for_type(RunTestForType rft)
{
    int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::and_>>(
        "sub_group_clustered_reduce_and");
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::or_>>(
        "sub_group_clustered_reduce_or");
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::xor_>>(
        "sub_group_clustered_reduce_xor");
    return error;
}
template <typename T>
int run_cluster_logical_and_or_xor_for_type(RunTestForType rft)
{
    int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_and>>(
        "sub_group_clustered_reduce_logical_and");
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_or>>(
        "sub_group_clustered_reduce_logical_or");
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_xor>>(
        "sub_group_clustered_reduce_logical_xor");

    return error;
}
} // anonymous namespace

int test_subgroup_functions_clustered_reduce(cl_device_id device,
                                             cl_context context,
                                             cl_command_queue queue,
                                             int num_elements)
{
    if (!is_extension_available(device, "cl_khr_subgroup_clustered_reduce"))
    {
        log_info("cl_khr_subgroup_clustered_reduce is not supported on this "
                 "device, skipping test.\n");
        return TEST_SKIPPED_ITSELF;
    }

    constexpr size_t global_work_size = 2000;
    constexpr size_t local_work_size = 200;
    WorkGroupParams test_params(global_work_size, local_work_size, -1, 3);
    test_params.save_kernel_source(sub_group_clustered_reduce_source);
    RunTestForType rft(device, context, queue, num_elements, test_params);

    int error = run_cluster_red_add_max_min_mul_for_type<cl_int>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_uint>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_long>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_ulong>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_short>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_ushort>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_char>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_uchar>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_float>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_double>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<subgroups::cl_half>(rft);

    error |= run_cluster_and_or_xor_for_type<cl_int>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_uint>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_long>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_ulong>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_short>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_ushort>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_char>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_uchar>(rft);

    error |= run_cluster_logical_and_or_xor_for_type<cl_int>(rft);
    return error;
}