// // Copyright (c) 2017 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #ifndef TEST_CONFORMANCE_CLCPP_WG_TEST_WG_REDUCE_HPP #define TEST_CONFORMANCE_CLCPP_WG_TEST_WG_REDUCE_HPP #include #include #include // Common for all OpenCL C++ tests #include "../common.hpp" // Common for tests of work-group functions #include "common.hpp" // ----------------------------------------------------------------------------------- // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------ // ----------------------------------------------------------------------------------- #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS) template std::string generate_wg_reduce_kernel_code() { return "__kernel void test_wg_reduce(global " + type_name() + " *input, global " + type_name() + " *output)\n" "{\n" " ulong tid = get_global_id(0);\n" "\n" " " + type_name() + " result = work_group_reduce_" + to_string(op) + "(input[tid]);\n" " output[tid] = result;\n" "}\n"; } #else template std::string generate_wg_reduce_kernel_code() { return "#include \n" "#include \n" "#include \n" "using namespace cl;\n" "__kernel void test_wg_reduce(global_ptr<" + type_name() + "[]> input, " "global_ptr<" + type_name() + "[]> output)\n" "{\n" " ulong tid = get_global_id(0);\n" " " + type_name() + " result = work_group_reduce(input[tid]);\n" " output[tid] = result;\n" "}\n"; } #endif template int verify_wg_reduce_add(const std::vector &in, const std::vector &out, size_t wg_size) { size_t i, j; for (i = 0; i < in.size(); i += wg_size) { CL_INT_TYPE sum = 0; // Work-group sum for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++) sum += in[i + j]; // Check if all work-items in work-group stored correct value for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++) { if (sum != out[i + j]) { log_info( "work_group_reduce_add %s: Error at %lu: expected = %lu, got = %lu\n", type_name().c_str(), i + j, static_cast(sum), static_cast(out[i + j])); return -1; } } } return 0; } template int verify_wg_reduce_min(const std::vector &in, const std::vector &out, size_t wg_size) { size_t i, j; for (i = 0; i < in.size(); i += wg_size) { CL_INT_TYPE min = (std::numeric_limits::max)(); // Work-group min for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++) min = std::min(min, in[i + j]); // Check if all work-items in work-group stored correct value for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++) { if (min != out[i + j]) { log_info( "work_group_reduce_min %s: Error at %lu: expected = %lu, got = %lu\n", type_name().c_str(), i + j, static_cast(min), static_cast(out[i + j])); return -1; } } } return 0; } template int verify_wg_reduce_max(const std::vector &in, const std::vector &out, size_t wg_size) { size_t i, j; for (i = 0; i < in.size(); i += wg_size) { CL_INT_TYPE max = (std::numeric_limits::min)(); // Work-group max for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++) max = std::max(max, in[i + j]); // Check if all work-items in work-group stored correct value for (j = 0; j < ((in.size() - i) > wg_size ? wg_size : (in.size() - i)); j++) { if (max != out[i + j]) { log_info( "work_group_reduce_max %s: Error at %lu: expected = %lu, got = %lu\n", type_name().c_str(), i + j, static_cast(max), static_cast(out[i + j])); return -1; } } } return 0; } template int verify_wg_reduce(const std::vector &in, const std::vector &out, size_t wg_size) { switch (op) { case work_group_op::add: return verify_wg_reduce_add(in, out, wg_size); case work_group_op::min: return verify_wg_reduce_min(in, out, wg_size); case work_group_op::max: return verify_wg_reduce_max(in, out, wg_size); } return -1; } template int work_group_reduce(cl_device_id device, cl_context context, cl_command_queue queue, size_t count) { // don't run test for unsupported types if(!type_supported(device)) { return CL_SUCCESS; } cl_mem buffers[2]; cl_program program; cl_kernel kernel; size_t wg_size; size_t work_size[1]; int err; std::string code_str = generate_wg_reduce_kernel_code(); // ----------------------------------------------------------------------------------- // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------ // ----------------------------------------------------------------------------------- // Only OpenCL C++ to SPIR-V compilation #if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION) err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_reduce"); RETURN_ON_ERROR(err) return err; // Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code) #elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS) err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_reduce", "-cl-std=CL2.0", false); RETURN_ON_ERROR(err) #else err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_reduce"); RETURN_ON_ERROR(err) #endif err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL); RETURN_ON_CL_ERROR(err, "clGetKernelWorkGroupInfo") // Calculate global work size size_t flat_work_size; size_t wg_number = static_cast( std::ceil(static_cast(count) / wg_size) ); flat_work_size = wg_number * wg_size; work_size[0] = flat_work_size; std::vector input = generate_input(flat_work_size, wg_size); std::vector output = generate_output(flat_work_size, wg_size); buffers[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), sizeof(CL_INT_TYPE) * input.size(), NULL, &err); RETURN_ON_CL_ERROR(err, "clCreateBuffer"); buffers[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), sizeof(CL_INT_TYPE) * output.size(), NULL, &err); RETURN_ON_CL_ERROR(err, "clCreateBuffer"); err = clEnqueueWriteBuffer( queue, buffers[0], CL_TRUE, 0, sizeof(CL_INT_TYPE) * input.size(), static_cast(input.data()), 0, NULL, NULL ); RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer"); err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]); err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]); RETURN_ON_CL_ERROR(err, "clSetKernelArg"); err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, &wg_size, 0, NULL, NULL); RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel"); err = clEnqueueReadBuffer( queue, buffers[1], CL_TRUE, 0, sizeof(CL_INT_TYPE) * output.size(), static_cast(output.data()), 0, NULL, NULL ); RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer"); if (verify_wg_reduce(input, output, wg_size) != CL_SUCCESS) { RETURN_ON_ERROR_MSG(-1, "work_group_reduce_%s %s failed", to_string(op).c_str(), type_name().c_str()); } log_info("work_group_reduce_%s %s passed\n", to_string(op).c_str(), type_name().c_str()); clReleaseMemObject(buffers[0]); clReleaseMemObject(buffers[1]); clReleaseKernel(kernel); clReleaseProgram(program); return err; } AUTO_TEST_CASE(test_work_group_reduce_add) (cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) { int error = CL_SUCCESS; int local_error = CL_SUCCESS; local_error = work_group_reduce(device, context, queue, n_elems); CHECK_ERROR(local_error) error |= local_error; local_error = work_group_reduce(device, context, queue, n_elems); CHECK_ERROR(local_error) error |= local_error; local_error = work_group_reduce(device, context, queue, n_elems); CHECK_ERROR(local_error) error |= local_error; local_error = work_group_reduce(device, context, queue, n_elems); CHECK_ERROR(local_error) error |= local_error; if(error != CL_SUCCESS) return -1; return CL_SUCCESS; } AUTO_TEST_CASE(test_work_group_reduce_min) (cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) { int error = CL_SUCCESS; int local_error = CL_SUCCESS; local_error = work_group_reduce(device, context, queue, n_elems); CHECK_ERROR(local_error) error |= local_error; local_error = work_group_reduce(device, context, queue, n_elems); CHECK_ERROR(local_error) error |= local_error; local_error = work_group_reduce(device, context, queue, n_elems); CHECK_ERROR(local_error) error |= local_error; local_error = work_group_reduce(device, context, queue, n_elems); CHECK_ERROR(local_error) error |= local_error; if(error != CL_SUCCESS) return -1; return CL_SUCCESS; } AUTO_TEST_CASE(test_work_group_reduce_max) (cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) { int error = CL_SUCCESS; int local_error = CL_SUCCESS; local_error = work_group_reduce(device, context, queue, n_elems); CHECK_ERROR(local_error) error |= local_error; local_error = work_group_reduce(device, context, queue, n_elems); CHECK_ERROR(local_error) error |= local_error; local_error = work_group_reduce(device, context, queue, n_elems); CHECK_ERROR(local_error) error |= local_error; local_error = work_group_reduce(device, context, queue, n_elems); CHECK_ERROR(local_error) error |= local_error; if(error != CL_SUCCESS) return -1; return CL_SUCCESS; } #endif // TEST_CONFORMANCE_CLCPP_WG_TEST_WG_REDUCE_HPP