1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "common.h"
17
18 const char *byte_manipulation_kernels[] = {
19 // Each device will write it's id into the bytes that it "owns", ownership is based on round robin (global_id % num_id)
20 // num_id is equal to number of SVM devices in the system plus one (for the host code).
21 // id is the index (id) of the device that this kernel is executing on.
22 // For example, if there are 2 SVM devices and the host; the buffer should look like this after each device and the host write their id's:
23 // 0, 1, 2, 0, 1, 2, 0, 1, 2...
24 "__kernel void write_owned_locations(__global char* a, uint num_id, uint id)\n"
25 "{\n"
26 " size_t i = get_global_id(0);\n"
27 " int owner = i % num_id;\n"
28 " if(id == owner) \n"
29 " a[i] = id;\n" // modify location if it belongs to this device, write id
30 "}\n"
31
32 // Verify that a device can see the byte sized updates from the other devices, sum up the device id's and see if they match expected value.
33 // Note: this must be called with a reduced NDRange so that neighbor acesses don't go past end of buffer.
34 // For example if there are two SVM devices and the host (3 total devices) the buffer should look like this:
35 // 0,1,2,0,1,2...
36 // and the expected sum at each point is 0+1+2 = 3.
37 "__kernel void sum_neighbor_locations(__global char* a, uint num_devices, volatile __global uint* error_count)\n"
38 "{\n"
39 " size_t i = get_global_id(0);\n"
40 " uint expected_sum = (num_devices * (num_devices - 1))/2;\n"
41 " uint sum = 0;\n"
42 " for(uint j=0; j<num_devices; j++) {\n"
43 " sum += a[i + j];\n" // add my neighbors to the right
44 " }\n"
45 " if(sum != expected_sum)\n"
46 " atomic_inc(error_count);\n"
47 "}\n"
48 };
49
50
51
test_svm_byte_granularity(cl_device_id deviceID,cl_context c,cl_command_queue queue,int num_elements)52 int test_svm_byte_granularity(cl_device_id deviceID, cl_context c, cl_command_queue queue, int num_elements)
53 {
54 clContextWrapper context;
55 clProgramWrapper program;
56 clKernelWrapper k1,k2;
57 clCommandQueueWrapper queues[MAXQ];
58
59 cl_uint num_devices = 0;
60 cl_int err = CL_SUCCESS;
61 cl_int rval = CL_SUCCESS;
62
63 err = create_cl_objects(deviceID, &byte_manipulation_kernels[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_BUFFER);
64 if(err == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
65 if(err < 0) return -1; // fail test.
66
67 cl_uint num_devices_plus_host = num_devices + 1;
68
69 k1 = clCreateKernel(program, "write_owned_locations", &err);
70 test_error(err, "clCreateKernel failed");
71 k2 = clCreateKernel(program, "sum_neighbor_locations", &err);
72 test_error(err, "clCreateKernel failed");
73
74
75 cl_char *pA = (cl_char*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_char) * num_elements, 0);
76
77 cl_uint **error_counts = (cl_uint**) malloc(sizeof(void*) * num_devices);
78
79 for(cl_uint i=0; i < num_devices; i++) {
80 error_counts[i] = (cl_uint*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_uint), 0);
81 *error_counts[i] = 0;
82 }
83 for(int i=0; i < num_elements; i++) pA[i] = -1;
84
85 err |= clSetKernelArgSVMPointer(k1, 0, pA);
86 err |= clSetKernelArg(k1, 1, sizeof(cl_uint), &num_devices_plus_host);
87 test_error(err, "clSetKernelArg failed");
88
89 // get all the devices going simultaneously
90 size_t element_num = num_elements;
91 for(cl_uint d=0; d < num_devices; d++) // device ids starting at 1.
92 {
93 err = clSetKernelArg(k1, 2, sizeof(cl_uint), &d);
94 test_error(err, "clSetKernelArg failed");
95 err = clEnqueueNDRangeKernel(queues[d], k1, 1, NULL, &element_num, NULL, 0, NULL, NULL);
96 test_error(err,"clEnqueueNDRangeKernel failed");
97 }
98
99 for(cl_uint d=0; d < num_devices; d++) clFlush(queues[d]);
100
101 cl_uint host_id = num_devices; // host code will take the id above the devices.
102 for(int i = (int)num_devices; i < num_elements; i+= num_devices_plus_host) pA[i] = host_id;
103
104 for(cl_uint id = 0; id < num_devices; id++) clFinish(queues[id]);
105
106 // now check that each device can see the byte writes made by the other devices.
107
108 err |= clSetKernelArgSVMPointer(k2, 0, pA);
109 err |= clSetKernelArg(k2, 1, sizeof(cl_uint), &num_devices_plus_host);
110 test_error(err, "clSetKernelArg failed");
111
112 // adjusted so k2 doesn't read past end of buffer
113 size_t adjusted_num_elements = num_elements - num_devices;
114 for(cl_uint id = 0; id < num_devices; id++)
115 {
116 err = clSetKernelArgSVMPointer(k2, 2, error_counts[id]);
117 test_error(err, "clSetKernelArg failed");
118
119 err = clEnqueueNDRangeKernel(queues[id], k2, 1, NULL, &adjusted_num_elements, NULL, 0, NULL, NULL);
120 test_error(err,"clEnqueueNDRangeKernel failed");
121 }
122
123 for(cl_uint id = 0; id < num_devices; id++) clFinish(queues[id]);
124
125 bool failed = false;
126
127 // see if any of the devices found errors
128 for(cl_uint i=0; i < num_devices; i++) {
129 if(*error_counts[i] > 0)
130 failed = true;
131 }
132 cl_uint expected = (num_devices_plus_host * (num_devices_plus_host - 1))/2;
133 // check that host can see the byte writes made by the devices.
134 for(cl_uint i = 0; i < num_elements - num_devices_plus_host; i++)
135 {
136 int sum = 0;
137 for(cl_uint j=0; j < num_devices_plus_host; j++) sum += pA[i+j];
138 if(sum != expected)
139 failed = true;
140 }
141
142 clSVMFree(context, pA);
143 for(cl_uint i=0; i < num_devices; i++) clSVMFree(context, error_counts[i]);
144
145 if(failed)
146 return -1;
147 return 0;
148 }
149