// // Copyright (c) 2017 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #include "common.h" const char *find_targets_kernel[] = { "__kernel void find_targets(__global uint* image, uint target, volatile " "__global atomic_uint *numTargetsFound, volatile __global atomic_uint " "*targetLocations)\n" "{\n" " size_t i = get_global_id(0);\n" " uint index;\n" " if(image[i] == target) {\n" " index = atomic_fetch_add_explicit(numTargetsFound, 1u, " "memory_order_relaxed, memory_scope_device); \n" " atomic_exchange_explicit(&targetLocations[index], i, " "memory_order_relaxed, memory_scope_all_svm_devices); \n" " }\n" "}\n" }; void spawnAnalysisTask(int location) { // printf("found target at location %d\n", location); } #define MAX_TARGETS 1024 // Goals: demonstrate use of SVM's atomics to do fine grain synchronization between the device and host. // Concept: a device kernel is used to search an input image for regions that match a target pattern. // The device immediately notifies the host when it finds a target (via an atomic operation that works across host and devices). // The host is then able to spawn a task that further analyzes the target while the device continues searching for more targets. int test_svm_fine_grain_sync_buffers(cl_device_id deviceID, cl_context c, cl_command_queue queue, int num_elements) { clContextWrapper context = NULL; clProgramWrapper program = NULL; cl_uint num_devices = 0; cl_int err = CL_SUCCESS; clCommandQueueWrapper queues[MAXQ]; err = create_cl_objects(deviceID, &find_targets_kernel[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_BUFFER | CL_DEVICE_SVM_ATOMICS); if(err == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing. if(err < 0) return -1; // fail test. clKernelWrapper kernel = clCreateKernel(program, "find_targets", &err); test_error(err, "clCreateKernel failed"); size_t num_pixels = num_elements; //cl_uint num_pixels = 1024*1024*32; cl_uint *pInputImage = (cl_uint*) clSVMAlloc(context, CL_MEM_READ_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_uint) * num_pixels, 0); cl_uint *pNumTargetsFound = (cl_uint*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(cl_uint), 0); cl_int *pTargetLocations = (cl_int* ) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(cl_int) * MAX_TARGETS, 0); cl_uint targetDescriptor = 777; *pNumTargetsFound = 0; cl_uint i; for(i=0; i < MAX_TARGETS; i++) pTargetLocations[i] = -1; for(i=0; i < num_pixels; i++) pInputImage[i] = 0; pInputImage[0] = targetDescriptor; pInputImage[3] = targetDescriptor; pInputImage[num_pixels - 1] = targetDescriptor; err |= clSetKernelArgSVMPointer(kernel, 0, pInputImage); err |= clSetKernelArg(kernel, 1, sizeof(cl_uint), (void*) &targetDescriptor); err |= clSetKernelArgSVMPointer(kernel, 2, pNumTargetsFound); err |= clSetKernelArgSVMPointer(kernel, 3, pTargetLocations); test_error(err, "clSetKernelArg failed"); cl_event done; err = clEnqueueNDRangeKernel(queues[0], kernel, 1, NULL, &num_pixels, NULL, 0, NULL, &done); test_error(err,"clEnqueueNDRangeKernel failed"); clFlush(queues[0]); i=0; cl_int status; // check for new targets, if found spawn a task to analyze target. do { err = clGetEventInfo(done,CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, NULL); test_error(err,"clGetEventInfo failed"); if( AtomicLoadExplicit(&pTargetLocations[i], memory_order_relaxed) != -1) // -1 indicates slot not used yet. { spawnAnalysisTask(pTargetLocations[i]); i++; } } while (status != CL_COMPLETE || AtomicLoadExplicit(&pTargetLocations[i], memory_order_relaxed) != -1); clReleaseEvent(done); clSVMFree(context, pInputImage); clSVMFree(context, pNumTargetsFound); clSVMFree(context, pTargetLocations); if(i != 3) return -1; return 0; }