// // Copyright (c) 2017 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #include "harness/compat.h" #include #include #include #include #include "procs.h" #include "harness/conversions.h" #include "harness/ThreadPool.h" #define NUM_TESTS 23 #define LONG_MATH_SHIFT_SIZE 26 #define QUICK_MATH_SHIFT_SIZE 16 static const char *kernel_code = "__kernel void test(__global %s%s *srcA, __global %s%s *srcB, __global %s%s *dst)\n" "{\n" " int tid = get_global_id(0);\n" "\n" " dst[tid] = srcA[tid] %s srcB[tid];\n" "}\n"; static const char *kernel_code_V3 = "__kernel void test(__global %s /*%s*/ *srcA, __global %s/*%s*/ *srcB, __global %s/*%s*/ *dst)\n" "{\n" " int tid = get_global_id(0);\n" "\n" " vstore3( vload3( tid, srcA ) %s vload3( tid, srcB), tid, dst );\n" "}\n"; static const char *kernel_code_V3_scalar_vector = "__kernel void test(__global %s /*%s*/ *srcA, __global %s/*%s*/ *srcB, __global %s/*%s*/ *dst)\n" "{\n" " int tid = get_global_id(0);\n" "\n" " vstore3( srcA[tid] %s vload3( tid, srcB), tid, dst );\n" "}\n"; static const char *kernel_code_V3_vector_scalar = "__kernel void test(__global %s /*%s*/ *srcA, __global %s/*%s*/ *srcB, __global %s/*%s*/ *dst)\n" "{\n" " int tid = get_global_id(0);\n" "\n" " vstore3( vload3( tid, srcA ) %s srcB[tid], tid, dst );\n" "}\n"; // Separate kernel here because it does not fit the pattern static const char *not_kernel_code = "__kernel void test(__global %s%s *srcA, __global %s%s *srcB, __global %s%s *dst)\n" "{\n" " int tid = get_global_id(0);\n" "\n" " dst[tid] = %ssrcA[tid];\n" "}\n"; static const char *not_kernel_code_V3 = "__kernel void test(__global %s /*%s*/ *srcA, __global %s/*%s*/ *srcB, __global %s/*%s*/ *dst)\n" "{\n" " int tid = get_global_id(0);\n" "\n" " vstore3( %s vload3( tid, srcA ), tid, dst );\n" "}\n"; static const char *kernel_code_scalar_shift = "__kernel void test(__global %s%s *srcA, __global %s%s *srcB, __global %s%s *dst)\n" "{\n" " int tid = get_global_id(0);\n" "\n" " dst[tid] = srcA[tid] %s srcB[tid]%s;\n" "}\n"; static const char *kernel_code_scalar_shift_V3 = "__kernel void test(__global %s/*%s*/ *srcA, __global %s/*%s*/ *srcB, __global %s/*%s*/ *dst)\n" "{\n" " int tid = get_global_id(0);\n" "\n" " vstore3( vload3( tid, srcA) %s vload3( tid, srcB )%s, tid, dst );\n" "}\n"; static const char *kernel_code_question_colon = "__kernel void test(__global %s%s *srcA, __global %s%s *srcB, __global %s%s *dst)\n" "{\n" " int tid = get_global_id(0);\n" "\n" " dst[tid] = (srcA[tid]%s < srcB[tid]%s) ? srcA[tid] : srcB[tid];\n" "}\n"; static const char *kernel_code_question_colon_V3 = "__kernel void test(__global %s/*%s*/ *srcA, __global %s/*%s*/ *srcB, __global %s/*%s*/ *dst)\n" "{\n" " int tid = get_global_id(0);\n" "\n" " vstore3( (vload3( tid, srcA)%s < vload3(tid, srcB)%s) ? vload3( tid, srcA) : vload3( tid, srcB), tid, dst );\n" "}\n"; // External verification and data generation functions extern const char *tests[]; extern const char *test_names[]; extern int verify_long(int test, size_t vector_size, cl_long *inptrA, cl_long *inptrB, cl_long *outptr, size_t n); extern void init_long_data(uint64_t indx, int num_elements, cl_long *input_ptr[], MTdata d) ; extern int verify_ulong(int test, size_t vector_size, cl_ulong *inptrA, cl_ulong *inptrB, cl_ulong *outptr, size_t n); extern void init_ulong_data(uint64_t indx, int num_elements, cl_ulong *input_ptr[], MTdata d) ; extern int verify_int(int test, size_t vector_size, cl_int *inptrA, cl_int *inptrB, cl_int *outptr, size_t n); extern void init_int_data(uint64_t indx, int num_elements, cl_int *input_ptr[], MTdata d) ; extern int verify_uint(int test, size_t vector_size, cl_uint *inptrA, cl_uint *inptrB, cl_uint *outptr, size_t n); extern void init_uint_data(uint64_t indx, int num_elements, cl_uint *input_ptr[], MTdata d) ; extern int verify_short(int test, size_t vector_size, cl_short *inptrA, cl_short *inptrB, cl_short *outptr, size_t n); extern void init_short_data(uint64_t indx, int num_elements, cl_short *input_ptr[], MTdata d) ; extern int verify_ushort(int test, size_t vector_size, cl_ushort *inptrA, cl_ushort *inptrB, cl_ushort *outptr, size_t n); extern void init_ushort_data(uint64_t indx, int num_elements, cl_ushort *input_ptr[], MTdata d) ; extern int verify_char(int test, size_t vector_size, cl_char *inptrA, cl_char *inptrB, cl_char *outptr, size_t n); extern void init_char_data(uint64_t indx, int num_elements, cl_char *input_ptr[], MTdata d) ; extern int verify_uchar(int test, size_t vector_size, cl_uchar *inptrA, cl_uchar *inptrB, cl_uchar *outptr, size_t n); extern void init_uchar_data(uint64_t indx, int num_elements, cl_uchar *input_ptr[], MTdata d) ; // Supported type list const ExplicitType types[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, }; enum TestStyle { kDontCare=0, kBothVectors, kInputAScalar, kInputBScalar, kVectorScalarScalar, // for the ?: operator only; indicates vector ? scalar : scalar. kInputCAlsoScalar = 0x80 // Or'ed flag to indicate that the selector for the ?: operator is also scalar }; typedef struct _perThreadData { cl_mem m_streams[3]; cl_int *m_input_ptr[2], *m_output_ptr; size_t m_type_size; cl_program m_program[NUM_TESTS]; cl_kernel m_kernel[NUM_TESTS]; } perThreadData; perThreadData * perThreadDataNew() { perThreadData * pThis = (perThreadData *)malloc(sizeof(perThreadData)); memset(pThis->m_program, 0, sizeof(cl_program)*NUM_TESTS); memset(pThis->m_kernel, 0, sizeof(cl_kernel)*NUM_TESTS); pThis->m_input_ptr[0] = pThis->m_input_ptr[1] = NULL; pThis->m_output_ptr = NULL; return pThis; } void perThreadDataDestroy(perThreadData * pThis) { int i; // cleanup clReleaseMemObject(pThis->m_streams[0]); clReleaseMemObject(pThis->m_streams[1]); clReleaseMemObject(pThis->m_streams[2]); for (i=0; im_kernel[i] != NULL) clReleaseKernel(pThis->m_kernel[i]); if (pThis->m_program[i] != NULL) clReleaseProgram(pThis->m_program[i]); } free(pThis->m_input_ptr[0]); free(pThis->m_input_ptr[1]); free(pThis->m_output_ptr); free(pThis); } cl_int perThreadDataInit(perThreadData * pThis, ExplicitType type, int num_elements, int vectorSize, int inputAVecSize, int inputBVecSize, cl_context context, int start_test_ID, int end_test_ID, int testID) { int i; const char * sizeNames[] = { "", "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" }; const char *type_name = get_explicit_type_name(type); pThis->m_type_size = get_explicit_type_size(type); int err; // Used for the && and || tests where the vector case returns a signed value const char *signed_type_name; switch (type) { case kChar: case kUChar: signed_type_name = get_explicit_type_name(kChar); break; case kShort: case kUShort: signed_type_name = get_explicit_type_name(kShort); break; case kInt: case kUInt: signed_type_name = get_explicit_type_name(kInt); break; case kLong: case kULong: signed_type_name = get_explicit_type_name(kLong); break; default: log_error("Invalid type.\n"); return -1; break; } pThis->m_input_ptr[0] = (cl_int*)malloc(pThis->m_type_size * num_elements * vectorSize); pThis->m_input_ptr[1] = (cl_int*)malloc(pThis->m_type_size * num_elements * vectorSize); pThis->m_output_ptr = (cl_int*)malloc(pThis->m_type_size * num_elements * vectorSize); pThis->m_streams[0] = clCreateBuffer( context, CL_MEM_READ_WRITE, pThis->m_type_size * num_elements * inputAVecSize, NULL, &err); test_error(err, "clCreateBuffer failed"); pThis->m_streams[1] = clCreateBuffer( context, CL_MEM_READ_WRITE, pThis->m_type_size * num_elements * inputBVecSize, NULL, &err); test_error(err, "clCreateBuffer failed"); pThis->m_streams[2] = clCreateBuffer( context, CL_MEM_READ_WRITE, pThis->m_type_size * num_elements * vectorSize, NULL, &err); test_error(err, "clCreateBuffer failed"); const char *vectorString = sizeNames[ vectorSize ]; const char *inputAVectorString = sizeNames[ inputAVecSize ]; const char *inputBVectorString = sizeNames[ inputBVecSize ]; if (testID == -1) { log_info("\tTesting %s%s (%d bytes)...\n", type_name, vectorString, (int)(pThis->m_type_size*vectorSize)); } char programString[4096]; const char *ptr; const char * kernel_code_base = ( vectorSize != 3 ) ? kernel_code : ( inputAVecSize == 1 ) ? kernel_code_V3_scalar_vector : ( inputBVecSize == 1 ) ? kernel_code_V3_vector_scalar : kernel_code_V3; for (i=start_test_ID; i 1 sprintf(programString, kernel_code_base, type_name, inputAVectorString, type_name, inputBVectorString, ((vectorSize == 1) ? type_name : signed_type_name), vectorString, tests[i]); break; case 22: // Need an unsigned result here for vector sizes > 1 sprintf(programString, vectorSize == 3 ? not_kernel_code_V3 : not_kernel_code, type_name, inputAVectorString, type_name, inputBVectorString, ((vectorSize == 1) ? type_name : signed_type_name), vectorString, tests[i]); break; default: sprintf(programString, kernel_code_base, type_name, inputAVectorString, type_name, inputBVectorString, type_name, vectorString, tests[i]); break; } //printf("kernel: %s\n", programString); ptr = programString; err = create_single_kernel_helper( context, &(pThis->m_program[ i ]), &(pThis->m_kernel[ i ]), 1, &ptr, "test" ); test_error( err, "Unable to create test kernel" ); err = clSetKernelArg(pThis->m_kernel[i], 0, sizeof pThis->m_streams[0], &(pThis->m_streams[0]) ); err |= clSetKernelArg(pThis->m_kernel[i], 1, sizeof pThis->m_streams[1], &(pThis->m_streams[1]) ); err |= clSetKernelArg(pThis->m_kernel[i], 2, sizeof pThis->m_streams[2], &(pThis->m_streams[2]) ); test_error(err, "clSetKernelArgs failed"); } return CL_SUCCESS; } typedef struct _globalThreadData { cl_device_id m_deviceID; cl_context m_context; // cl_command_queue m_queue; int m_num_elements; int m_threadcount; int m_vectorSize; int m_num_runs_shift; TestStyle m_style; ExplicitType m_type; MTdata * m_pRandData; uint64_t m_offset; int m_testID; perThreadData **m_arrPerThreadData; } globalThreadData; globalThreadData * globalThreadDataNew(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, int vectorSize, TestStyle style, int num_runs_shift, ExplicitType type, int testID, int threadcount) { int i; globalThreadData * pThis = (globalThreadData *)malloc(sizeof(globalThreadData)); pThis->m_deviceID = deviceID; pThis->m_context = context; // pThis->m_queue = queue; pThis->m_num_elements = num_elements; pThis->m_num_runs_shift = num_runs_shift; pThis->m_vectorSize = vectorSize; pThis->m_style = style; pThis->m_type = type; pThis->m_offset = (uint64_t)0; pThis->m_testID = testID; pThis->m_arrPerThreadData = NULL; pThis->m_threadcount = threadcount; pThis->m_pRandData = (MTdata *)malloc(threadcount*sizeof(MTdata)); pThis->m_arrPerThreadData = (perThreadData **) malloc(threadcount*sizeof(perThreadData *)); for(i=0; i < threadcount; ++i) { pThis->m_pRandData[i] = init_genrand(i+1); pThis->m_arrPerThreadData[i] = NULL; } return pThis; } void globalThreadDataDestroy(globalThreadData * pThis) { int i; for(i=0; i < pThis->m_threadcount; ++i) { free_mtdata(pThis->m_pRandData[i]); if(pThis->m_arrPerThreadData[i] != NULL) { perThreadDataDestroy(pThis->m_arrPerThreadData[i]); } } free(pThis->m_arrPerThreadData); free(pThis->m_pRandData); free(pThis); } int test_integer_ops(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, int vectorSize, TestStyle style, int num_runs_shift, ExplicitType type, int testID, MTdata randIn, uint64_t startIndx, uint64_t endIndx, perThreadData ** ppThreadData); cl_int test_integer_ops_do_thread( cl_uint job_id, cl_uint thread_id, void *userInfo ) { cl_int error; cl_int result; globalThreadData * threadInfoGlobal = (globalThreadData *)userInfo; cl_command_queue queue; #if THREAD_DEBUG log_error("Thread %x (job %x) about to create command queue\n", thread_id, job_id); #endif queue = clCreateCommandQueue (threadInfoGlobal->m_context, threadInfoGlobal->m_deviceID,0, &error); if(error != CL_SUCCESS) { log_error("Thread %x (job %x) could not create command queue\n", thread_id, job_id); return error; // should we clean up the queue too? } #if THREAD_DEBUG log_error("Thread %x (job %x) created command queue\n", thread_id, job_id); #endif result = test_integer_ops( threadInfoGlobal->m_deviceID, threadInfoGlobal->m_context, queue, threadInfoGlobal->m_num_elements, threadInfoGlobal->m_vectorSize, threadInfoGlobal->m_style, threadInfoGlobal->m_num_runs_shift, threadInfoGlobal->m_type, threadInfoGlobal->m_testID, threadInfoGlobal->m_pRandData[thread_id], threadInfoGlobal->m_offset + threadInfoGlobal->m_num_elements*job_id, threadInfoGlobal->m_offset + threadInfoGlobal->m_num_elements*(job_id+1), &(threadInfoGlobal->m_arrPerThreadData[thread_id]) ); if(result != 0) { log_error("Thread %x (job %x) failed test_integer_ops with result %x\n", thread_id, job_id, result); // return error; } error = clReleaseCommandQueue(queue); if(error != CL_SUCCESS) { log_error("Thread %x (job %x) could not release command queue\n", thread_id, job_id); return error; } return result; } int test_integer_ops_threaded(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, int vectorSize, TestStyle style, int num_runs_shift, ExplicitType type, int testID) { globalThreadData * pThreadInfo = NULL; cl_int result=0; cl_uint threadcount = GetThreadCount(); // Check to see if we are using single threaded mode on other than a 1.0 device if (getenv( "CL_TEST_SINGLE_THREADED" )) { char device_version[1024] = { 0 }; result = clGetDeviceInfo( deviceID, CL_DEVICE_VERSION, sizeof(device_version), device_version, NULL ); if(result != CL_SUCCESS) { log_error("clGetDeviceInfo(CL_DEVICE_GLOBAL_MEM_SIZE) failed: %d\n", result); return result; } if (strcmp("OpenCL 1.0 ",device_version)) { log_error("ERROR: CL_TEST_SINGLE_THREADED is set in the environment. Running single threaded.\n"); } } // This test will run threadcount threads concurrently; each thread will execute test_integer_ops() // which will allocate 2 OpenCL buffers on the device; each buffer has size num_elements * type_size * vectorSize. // We need to make sure that the total device memory allocated by all threads does not exceed the maximum // memory on the device. If it does, we decrease num_elements until all threads combined will not // over-subscribe device memory. cl_ulong maxDeviceGlobalMem; result = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(maxDeviceGlobalMem), &maxDeviceGlobalMem, NULL); if(result != CL_SUCCESS) { log_error("clGetDeviceInfo(CL_DEVICE_GLOBAL_MEM_SIZE) failed: %d\n", result); return result; } if (maxDeviceGlobalMem > (cl_ulong)SIZE_MAX) { maxDeviceGlobalMem = (cl_ulong)SIZE_MAX; } // Let's not take all device memory - reduce by 75% maxDeviceGlobalMem = (maxDeviceGlobalMem * 3) >> 2; // Now reduce num_elements so that the total device memory usage does not exceed 75% of global device memory. size_t type_size = get_explicit_type_size(type); while ((cl_ulong)threadcount * 4 * num_elements * type_size * vectorSize > maxDeviceGlobalMem) { num_elements >>= 1; } uint64_t startIndx = (uint64_t)0; uint64_t endIndx = (1ULL<m_offset = startIndx; #if THREAD_DEBUG log_error("Launching %llx jobs\n", jobcount); #endif result = ThreadPool_Do(test_integer_ops_do_thread, (cl_uint)jobcount, (void *)pThreadInfo); if(result != 0) { // cleanup ?? log_error("ThreadPool_Do return non-success value %d\n", result); } globalThreadDataDestroy(pThreadInfo); return result; } int test_integer_ops(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, int vectorSize, TestStyle style, int num_runs_shift, ExplicitType type, int testID, MTdata randDataIn, uint64_t startIndx, uint64_t endIndx, perThreadData ** ppThreadData) { size_t threads[1]; int err; int i; int inputAVecSize, inputBVecSize; inputAVecSize = inputBVecSize = vectorSize; if( style == kInputAScalar ) inputAVecSize = 1; else if( style == kInputBScalar ) inputBVecSize = 1; /* if( inputAVecSize != inputBVecSize ) log_info("Testing \"%s\" on %s%d (%s-%s inputs) (range %llx - %llx of 0-%llx)\n", test_names[testID], get_explicit_type_name(type), vectorSize, ( inputAVecSize == 1 ) ? "scalar" : "vector", ( inputBVecSize == 1 ) ? "scalar" : "vector", startIndx, endIndx, (1ULL< NUM_TESTS) { log_error("Invalid test ID: %d\n", testID); return -1; } if(*ppThreadData == NULL) { *ppThreadData = perThreadDataNew(); err = perThreadDataInit(*ppThreadData, type, num_elements, vectorSize, inputAVecSize, inputBVecSize, context, start_test_ID, end_test_ID, testID); test_error(err, "failed to init per thread data\n"); } perThreadData * pThreadData = *ppThreadData; threads[0] = (size_t)num_elements; int error_count = 0; for (i=start_test_ID; i= endIndx) { startIndx = (uint64_t)0; endIndx = (1ULL<m_input_ptr), randDataIn); break; case kUChar: init_uchar_data(indx, num_elements * vectorSize, (cl_uchar**)(pThreadData->m_input_ptr), randDataIn); break; case kShort: init_short_data(indx, num_elements * vectorSize, (cl_short**)(pThreadData->m_input_ptr), randDataIn); break; case kUShort: init_ushort_data(indx, num_elements * vectorSize, (cl_ushort**)(pThreadData->m_input_ptr), randDataIn); break; case kInt: init_int_data(indx, num_elements * vectorSize, (cl_int**)(pThreadData->m_input_ptr), randDataIn); break; case kUInt: init_uint_data(indx, num_elements * vectorSize, (cl_uint**)(pThreadData->m_input_ptr), randDataIn); break; case kLong: init_long_data(indx, num_elements * vectorSize, (cl_long**)(pThreadData->m_input_ptr), randDataIn); break; case kULong: init_ulong_data(indx, num_elements * vectorSize, (cl_ulong**)(pThreadData->m_input_ptr), randDataIn); break; default: err = 1; log_error("Invalid type.\n"); break; } err = clEnqueueWriteBuffer(queue, pThreadData->m_streams[0], CL_FALSE, 0, pThreadData->m_type_size*num_elements * inputAVecSize, (void *)pThreadData->m_input_ptr[0], 0, NULL, NULL); test_error(err, "clEnqueueWriteBuffer failed"); err = clEnqueueWriteBuffer( queue, pThreadData->m_streams[1], CL_FALSE, 0, pThreadData->m_type_size*num_elements * inputBVecSize, (void *)pThreadData->m_input_ptr[1], 0, NULL, NULL ); test_error(err, "clEnqueueWriteBuffer failed"); err = clEnqueueNDRangeKernel( queue, pThreadData->m_kernel[i], 1, NULL, threads, NULL, 0, NULL, NULL ); test_error(err, "clEnqueueNDRangeKernel failed"); err = clEnqueueReadBuffer( queue, pThreadData->m_streams[2], CL_TRUE, 0, pThreadData->m_type_size*num_elements * vectorSize, (void *)pThreadData->m_output_ptr, 0, NULL, NULL ); test_error(err, "clEnqueueReadBuffer failed"); // log_info("Performing verification\n"); // If one of the inputs are scalar, we need to extend the input values to vectors // to accommodate the verify functions if( vectorSize > 1 ) { char * p = NULL; if( style == kInputAScalar ) p = (char *)pThreadData->m_input_ptr[ 0 ]; else if( style == kInputBScalar ) p = (char *)pThreadData->m_input_ptr[ 1 ]; if( p != NULL ) { for( int element = num_elements - 1; element >= 0; element-- ) { for( int vec = ( element == 0 ) ? 1 : 0; vec < vectorSize; vec++ ) memcpy( p + ( element * vectorSize + vec ) * pThreadData->m_type_size, p + element * pThreadData->m_type_size, pThreadData->m_type_size ); } } } switch (type) { case kChar: err = verify_char(i, vectorSize, (cl_char*)pThreadData->m_input_ptr[0], (cl_char*)pThreadData->m_input_ptr[1], (cl_char*)pThreadData->m_output_ptr, num_elements * vectorSize); break; case kUChar: err = verify_uchar(i, vectorSize, (cl_uchar*)pThreadData->m_input_ptr[0], (cl_uchar*)pThreadData->m_input_ptr[1], (cl_uchar*)pThreadData->m_output_ptr, num_elements * vectorSize); break; case kShort: err = verify_short(i, vectorSize, (cl_short*)pThreadData->m_input_ptr[0], (cl_short*)pThreadData->m_input_ptr[1], (cl_short*)pThreadData->m_output_ptr, num_elements * vectorSize); break; case kUShort: err = verify_ushort(i, vectorSize, (cl_ushort*)pThreadData->m_input_ptr[0], (cl_ushort*)pThreadData->m_input_ptr[1], (cl_ushort*)pThreadData->m_output_ptr, num_elements * vectorSize); break; case kInt: err = verify_int(i, vectorSize, (cl_int*)pThreadData->m_input_ptr[0], (cl_int*)pThreadData->m_input_ptr[1], (cl_int*)pThreadData->m_output_ptr, num_elements * vectorSize); break; case kUInt: err = verify_uint(i, vectorSize, (cl_uint*)pThreadData->m_input_ptr[0], (cl_uint*)pThreadData->m_input_ptr[1], (cl_uint*)pThreadData->m_output_ptr, num_elements * vectorSize); break; case kLong: err = verify_long(i, vectorSize, (cl_long*)pThreadData->m_input_ptr[0], (cl_long*)pThreadData->m_input_ptr[1], (cl_long*)pThreadData->m_output_ptr, num_elements * vectorSize); break; case kULong: err = verify_ulong(i, vectorSize, (cl_ulong*)pThreadData->m_input_ptr[0], (cl_ulong*)pThreadData->m_input_ptr[1], (cl_ulong*)pThreadData->m_output_ptr, num_elements * vectorSize); break; default: err = 1; log_error("Invalid type.\n"); break; } if (err) { #if 0 log_error( "* inASize: %d inBSize: %d numElem: %d\n", inputAVecSize, inputBVecSize, num_elements ); cl_char *inP = (cl_char *)pThreadData->m_input_ptr[0]; log_error( "from 18:\n" ); for( int q = 18; q < 64; q++ ) { log_error( "%02x ", inP[ q ] ); } log_error( "\n" ); inP = (cl_char *)pThreadData->m_input_ptr[1]; for( int q = 18; q < 64; q++ ) { log_error( "%02x ", inP[ q ] ); } log_error( "\n" ); inP = (cl_char *)pThreadData->m_output_ptr; for( int q = 18; q < 64; q++ ) { log_error( "%02x ", inP[ q ] ); } log_error( "\n" ); log_error( "from 36:\n" ); inP = (cl_char *)pThreadData->m_input_ptr[0]; for( int q = 36; q < 64; q++ ) { log_error( "%02x ", inP[ q ] ); } log_error( "\n" ); inP = (cl_char *)pThreadData->m_input_ptr[1]; for( int q = 36; q < 64; q++ ) { log_error( "%02x ", inP[ q ] ); } log_error( "\n" ); inP = (cl_char *)pThreadData->m_output_ptr; for( int q = 36; q < 64; q++ ) { log_error( "%02x ", inP[ q ] ); } log_error( "\n" ); #endif error_count++; break; } } /* const char * sizeNames[] = { "", "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" }; if (err) { log_error("\t\t%s%s test %s failed (range %llx - %llx of 0-%llx)\n", get_explicit_type_name(type), sizeNames[vectorSize], test_names[i], startIndx, endIndx, (1ULL<> scalar case tested by tests 10 and 11 // so they get skipped entirely int testsToRun[] = { 0, 1, 2, 3, 4, 5, 6, 7, 13, 14, 15, 16, 17, 18, 19, 20, 21 }; for (i=0; i< sizeof(testsToRun)/sizeof(testsToRun[0]); i++) { errors += run_test_sizes(deviceID, context, queue, 2048, type, num, testsToRun[i]); } return errors; } int test_vector_scalar(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) { int errors = 0; int numTypes = sizeof( types ) / sizeof( types[ 0 ] ); for( int t = 0; t < numTypes; t++ ) { if ((types[ t ] == kLong || types[ t ] == kULong) && !gHasLong) continue; errors += run_vector_scalar_tests( deviceID, context, queue, num_elements, types[ t ], 1 ); break; } return errors; } void generate_random_bool_data( size_t count, MTdata d, cl_char *outData, size_t outDataSize ) { cl_uint bits = genrand_int32(d); cl_uint bitsLeft = 32; memset( outData, 0, outDataSize * count ); for( size_t i = 0; i < count; i++ ) { if( 0 == bitsLeft) { bits = genrand_int32(d); bitsLeft = 32; } // Note: we will be setting just any bit non-zero for the type, so we can easily skip past // and just write bytes (assuming the entire output buffer is already zeroed, which we did) *outData = ( bits & 1 ) ? 0xff : 0; bits >>= 1; bitsLeft -= 1; outData += outDataSize; } } static const char *kernel_question_colon_full = "__kernel void test(__global %s%s *srcA, __global %s%s *srcB, __global %s%s *srcC, __global %s%s *dst)\n" "{\n" " int tid = get_global_id(0);\n" "\n" " %s%s valA = %ssrcA%s" " %s%s valB = %ssrcB%s" " %s%s valC = %ssrcC%s" " %s%s destVal = valC ? valA : valB;\n" " %s" "}\n"; static const char *kernel_qc_load_plain_prefix = ""; static const char *kernel_qc_load_plain_suffix = "[ tid ];\n"; static const char *kernel_qc_load_vec3_prefix = "vload3( tid, "; static const char *kernel_qc_load_vec3_suffix = ");\n"; static const char *kernel_qc_store_plain = "dst[ tid ] = destVal;\n"; static const char *kernel_qc_store_vec3 = "vstore3( destVal, tid, dst );\n"; int test_question_colon_op(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, int vectorSize, TestStyle style, ExplicitType type ) { cl_mem streams[4]; cl_int *input_ptr[3], *output_ptr; cl_program program; cl_kernel kernel; size_t threads[1]; int err; int inputAVecSize, inputBVecSize, inputCVecSize; const char * sizeNames[] = { "", "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" }; // Identical to sizeNames but with a blank for 3, since we use vload/store there const char * paramSizeNames[] = { "", "", "2", "", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" }; MTdata s_randStates; inputAVecSize = inputBVecSize = inputCVecSize = vectorSize; if( style & kInputCAlsoScalar ) { style = (TestStyle)( style & ~kInputCAlsoScalar ); inputCVecSize = 1; } if( style == kInputAScalar ) inputAVecSize = 1; else if( style == kInputBScalar ) inputBVecSize = 1; else if( style == kVectorScalarScalar ) inputAVecSize = inputBVecSize = 1; log_info("Testing \"?:\" on %s%d (%s?%s:%s inputs)\n", get_explicit_type_name(type), vectorSize, ( inputCVecSize == 1 ) ? "scalar" : "vector", ( inputAVecSize == 1 ) ? "scalar" : "vector", ( inputBVecSize == 1 ) ? "scalar" : "vector" ); const char *type_name = get_explicit_type_name(type); size_t type_size = get_explicit_type_size(type); // Create and initialize I/O buffers input_ptr[0] = (cl_int*)malloc(type_size * num_elements * vectorSize); input_ptr[1] = (cl_int*)malloc(type_size * num_elements * vectorSize); input_ptr[2] = (cl_int*)malloc(type_size * num_elements * vectorSize); output_ptr = (cl_int*)malloc(type_size * num_elements * vectorSize); s_randStates = init_genrand( gRandomSeed ); generate_random_data( type, num_elements * inputAVecSize, s_randStates, input_ptr[ 0 ] ); generate_random_data( type, num_elements * inputBVecSize, s_randStates, input_ptr[ 1 ] ); generate_random_bool_data( num_elements * inputCVecSize, s_randStates, (cl_char *)input_ptr[ 2 ], type_size ); streams[0] = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, type_size * num_elements * inputAVecSize, input_ptr[0], &err); test_error(err, "clCreateBuffer failed"); streams[1] = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, type_size * num_elements * inputBVecSize, input_ptr[1], &err); test_error(err, "clCreateBuffer failed"); streams[2] = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, type_size * num_elements * inputCVecSize, input_ptr[2], &err); test_error(err, "clCreateBuffer failed"); streams[3] = clCreateBuffer(context, CL_MEM_WRITE_ONLY, type_size * num_elements * vectorSize, NULL, &err); test_error(err, "clCreateBuffer failed"); const char *vectorString = sizeNames[ vectorSize ]; const char *inputAVectorString = sizeNames[ inputAVecSize ]; const char *inputBVectorString = sizeNames[ inputBVecSize ]; const char *inputCVectorString = sizeNames[ inputCVecSize ]; char programString[4096]; const char *ptr; sprintf( programString, kernel_question_colon_full, type_name, paramSizeNames[ inputAVecSize ], type_name, paramSizeNames[ inputBVecSize ], type_name, paramSizeNames[ inputCVecSize ], type_name, paramSizeNames[ vectorSize ], // Loads type_name, inputAVectorString, ( inputAVecSize == 3 ) ? kernel_qc_load_vec3_prefix : kernel_qc_load_plain_prefix, ( inputAVecSize == 3 ) ? kernel_qc_load_vec3_suffix : kernel_qc_load_plain_suffix, type_name, inputBVectorString, ( inputBVecSize == 3 ) ? kernel_qc_load_vec3_prefix : kernel_qc_load_plain_prefix, ( inputBVecSize == 3 ) ? kernel_qc_load_vec3_suffix : kernel_qc_load_plain_suffix, type_name, inputCVectorString, ( inputCVecSize == 3 ) ? kernel_qc_load_vec3_prefix : kernel_qc_load_plain_prefix, ( inputCVecSize == 3 ) ? kernel_qc_load_vec3_suffix : kernel_qc_load_plain_suffix, // Dest type type_name, vectorString, // Store ( vectorSize == 3 ) ? kernel_qc_store_vec3 : kernel_qc_store_plain ); ptr = programString; err = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test" ); test_error( err, "Unable to create test kernel" ); err = clSetKernelArg( kernel, 0, sizeof streams[0], &streams[0] ); err |= clSetKernelArg( kernel, 1, sizeof streams[1], &streams[1] ); err |= clSetKernelArg( kernel, 2, sizeof streams[2], &streams[2] ); err |= clSetKernelArg( kernel, 3, sizeof streams[3], &streams[3] ); test_error(err, "clSetKernelArgs failed"); // Run threads[0] = (size_t)num_elements; err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, NULL, 0, NULL, NULL ); test_error(err, "clEnqueueNDRangeKernel failed"); // Read and verify results err = clEnqueueReadBuffer( queue, streams[3], CL_TRUE, 0, type_size*num_elements * vectorSize, (void *)output_ptr, 0, NULL, NULL ); test_error(err, "clEnqueueReadBuffer failed"); // log_info("Performing verification\n"); int error_count = 0; char *inputAPtr = (char *)input_ptr[ 0 ]; char *inputBPtr = (char *)input_ptr[ 1 ]; cl_char *inputCPtr = (cl_char *)input_ptr[ 2 ]; char *actualPtr = (char *)output_ptr; for( int i = 0; i < num_elements; i++ ) { for( int j = 0; j < vectorSize; j++ ) { char *expectedPtr = ( *inputCPtr ) ? inputAPtr : inputBPtr; if( memcmp( expectedPtr, actualPtr, type_size ) != 0 ) { #if 0 char expectedStr[ 128 ], actualStr[ 128 ], inputAStr[ 128 ], inputBStr[ 128 ]; print_type_to_string( type, inputAPtr, inputAStr ); print_type_to_string( type, inputBPtr, inputBStr ); print_type_to_string( type, expectedPtr, expectedStr ); print_type_to_string( type, actualPtr, actualStr ); log_error( "cl_%s verification failed at element %d:%d (expected %s, got %s, inputs: %s, %s, %s)\n", type_name, i, j, expectedStr, actualStr, inputAStr, inputBStr, ( *inputCPtr ) ? "true" : "false" ); #endif error_count++; } // Advance for each element member. Note if any of the vec sizes are 1, they don't advance here inputAPtr += ( inputAVecSize == 1 ) ? 0 : type_size; inputBPtr += ( inputBVecSize == 1 ) ? 0 : type_size; inputCPtr += ( inputCVecSize == 1 ) ? 0 : type_size; actualPtr += ( vectorSize == 1 ) ? 0 : type_size; } // Reverse for the member advance. If the vec sizes are 1, we need to advance, but otherwise they're already correct inputAPtr += ( inputAVecSize == 1 ) ? type_size : 0; inputBPtr += ( inputBVecSize == 1 ) ? type_size : 0; inputCPtr += ( inputCVecSize == 1 ) ? type_size : 0; actualPtr += ( vectorSize == 1 ) ? type_size : 0; } // cleanup clReleaseMemObject(streams[0]); clReleaseMemObject(streams[1]); clReleaseMemObject(streams[2]); clReleaseMemObject(streams[3]); clReleaseKernel(kernel); clReleaseProgram(program); free(input_ptr[0]); free(input_ptr[1]); free(input_ptr[2]); free(output_ptr); free_mtdata( s_randStates ); return error_count; }