• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "common.h"
17 
// OpenCL C program source containing the single kernel "find_targets".
// The array holds exactly one string (the adjacent literals below are
// concatenated by the compiler).
//
// Kernel behaviour, one work-item per image element:
//   - reads image[get_global_id(0)] and compares it with `target`;
//   - on a match, atomically increments *numTargetsFound (relaxed ordering,
//     device scope) and uses the previous value as a slot index;
//   - atomically stores its own global id into targetLocations[index]
//     (relaxed ordering, all-SVM-devices scope).
// The kernel does not bound `index`; the size of targetLocations is the
// caller's responsibility.
const char *find_targets_kernel[] = {
    "__kernel void find_targets(__global uint* image, uint target, "
    "volatile __global atomic_uint *numTargetsFound, "
    "volatile __global atomic_uint *targetLocations)\n"
    "{\n"
    " size_t i = get_global_id(0);\n"
    " uint index;\n"
    " if(image[i] == target) {\n"
    "   index = atomic_fetch_add_explicit("
    "numTargetsFound, 1u, memory_order_relaxed, memory_scope_device); \n"
    "   atomic_exchange_explicit("
    "&targetLocations[index], i, memory_order_relaxed, "
    "memory_scope_all_svm_devices); \n"
    " }\n"
    "}\n"
};
34 
35 
// Stand-in for the host-side work that would be started for each target the
// device reports. It is intentionally a no-op: the test only counts how many
// times a target was observed.
//
// location: value read from the shared target-location table.
void spawnAnalysisTask(int location)
{
  // Nothing to do; a debug print of `location` could be added here.
  (void)location;
}
40 
41 #define MAX_TARGETS 1024
42 
43 // Goals: demonstrate use of SVM's atomics to do fine grain synchronization between the device and host.
44 // Concept: a device kernel is used to search an input image for regions that match a target pattern.
45 // The device immediately notifies the host when it finds a target (via an atomic operation that works across host and devices).
46 // The host is then able to spawn a task that further analyzes the target while the device continues searching for more targets.
test_svm_fine_grain_sync_buffers(cl_device_id deviceID,cl_context c,cl_command_queue queue,int num_elements)47 int test_svm_fine_grain_sync_buffers(cl_device_id deviceID, cl_context c, cl_command_queue queue, int num_elements)
48 {
49   clContextWrapper    context = NULL;
50   clProgramWrapper    program = NULL;
51   cl_uint     num_devices = 0;
52   cl_int      err = CL_SUCCESS;
53   clCommandQueueWrapper queues[MAXQ];
54 
55   err = create_cl_objects(deviceID, &find_targets_kernel[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_BUFFER | CL_DEVICE_SVM_ATOMICS);
56   if(err == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
57   if(err < 0) return -1; // fail test.
58 
59   clKernelWrapper kernel = clCreateKernel(program, "find_targets", &err);
60   test_error(err, "clCreateKernel failed");
61 
62   size_t num_pixels = num_elements;
63   //cl_uint num_pixels = 1024*1024*32;
64 
65   cl_uint *pInputImage      = (cl_uint*) clSVMAlloc(context, CL_MEM_READ_ONLY  | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_uint) * num_pixels, 0);
66   cl_uint *pNumTargetsFound = (cl_uint*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(cl_uint), 0);
67   cl_int  *pTargetLocations = (cl_int* ) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(cl_int) * MAX_TARGETS, 0);
68 
69   cl_uint targetDescriptor = 777;
70   *pNumTargetsFound = 0;
71   cl_uint i;
72   for(i=0; i < MAX_TARGETS; i++) pTargetLocations[i] = -1;
73   for(i=0; i < num_pixels; i++) pInputImage[i] = 0;
74   pInputImage[0] = targetDescriptor;
75   pInputImage[3] = targetDescriptor;
76   pInputImage[num_pixels - 1] = targetDescriptor;
77 
78   err |= clSetKernelArgSVMPointer(kernel, 0, pInputImage);
79   err |= clSetKernelArg(kernel, 1, sizeof(cl_uint), (void*) &targetDescriptor);
80   err |= clSetKernelArgSVMPointer(kernel, 2, pNumTargetsFound);
81   err |= clSetKernelArgSVMPointer(kernel, 3, pTargetLocations);
82   test_error(err, "clSetKernelArg failed");
83 
84   cl_event done;
85   err = clEnqueueNDRangeKernel(queues[0], kernel, 1, NULL, &num_pixels, NULL, 0, NULL, &done);
86   test_error(err,"clEnqueueNDRangeKernel failed");
87   clFlush(queues[0]);
88 
89 
90   i=0;
91   cl_int status;
92   // check for new targets, if found spawn a task to analyze target.
93   do {
94     err = clGetEventInfo(done,CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, NULL);
95     test_error(err,"clGetEventInfo failed");
96     if( AtomicLoadExplicit(&pTargetLocations[i], memory_order_relaxed) != -1)  // -1 indicates slot not used yet.
97     {
98       spawnAnalysisTask(pTargetLocations[i]);
99       i++;
100     }
101   } while (status != CL_COMPLETE || AtomicLoadExplicit(&pTargetLocations[i], memory_order_relaxed) != -1);
102 
103   clReleaseEvent(done);
104   clSVMFree(context, pInputImage);
105   clSVMFree(context, pNumTargetsFound);
106   clSVMFree(context, pTargetLocations);
107 
108   if(i != 3) return -1;
109   return 0;
110 }
111