1 //
2 // Copyright 2012 Francisco Jerez
3 //
4 // Permission is hereby granted, free of charge, to any person obtaining a
5 // copy of this software and associated documentation files (the "Software"),
6 // to deal in the Software without restriction, including without limitation
7 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 // and/or sell copies of the Software, and to permit persons to whom the
9 // Software is furnished to do so, subject to the following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included in
12 // all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 // OTHER DEALINGS IN THE SOFTWARE.
21 //
22
23 #include "core/kernel.hpp"
24 #include "core/resource.hpp"
25 #include "util/factor.hpp"
26 #include "util/u_math.h"
27 #include "pipe/p_context.h"
28
29 using namespace clover;
30
kernel::kernel(clover::program &prog, const std::string &name,
               const std::vector<binary::argument> &bargs) :
   program(prog), _name(name), exec(*this),
   program_ref(prog._kernel_ref_counter) {
   // Only "general" arguments (the ones the application sets through
   // clSetKernelArg()) get a persistent argument object; the other
   // semantics are synthesized on the fly by exec_context::bind().
   for (auto &barg : bargs) {
      if (barg.semantic == binary::argument::general)
         _args.emplace_back(argument::create(barg));
   }
   // For every device the program was built for, upload the binary's
   // constant-data section (if present) into a read-only device buffer,
   // so constant_buffer arguments can reference it at launch time.
   for (auto &dev : prog.devices()) {
      auto &b = prog.build(dev).bin;
      auto bsym = find(name_equals(name), b.syms);
      const auto f = id_type_equals(bsym.section, binary::section::data_constant);
      // Skip devices whose binary carries no constant-data section.
      if (!any_of(f, b.secs))
         continue;

      auto mconst = find(f, b.secs);
      auto rb = std::make_unique<root_buffer>(prog.context(), std::vector<cl_mem_properties>(),
                                              CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
                                              mconst.size, mconst.data.data());
      _constant_buffers.emplace(&dev, std::move(rb));
   }
}
53
54 template<typename V>
55 static inline std::vector<uint>
pad_vector(command_queue & q,const V & v,uint x)56 pad_vector(command_queue &q, const V &v, uint x) {
57 std::vector<uint> w { v.begin(), v.end() };
58 w.resize(q.device().max_block_size().size(), x);
59 return w;
60 }
61
62 void
launch(command_queue & q,const std::vector<size_t> & grid_offset,const std::vector<size_t> & grid_size,const std::vector<size_t> & block_size)63 kernel::launch(command_queue &q,
64 const std::vector<size_t> &grid_offset,
65 const std::vector<size_t> &grid_size,
66 const std::vector<size_t> &block_size) {
67 const auto b = program().build(q.device()).bin;
68 const auto reduced_grid_size =
69 map(divides(), grid_size, block_size);
70
71 if (any_of(is_zero(), grid_size))
72 return;
73
74 void *st = exec.bind(&q, grid_offset);
75 struct pipe_grid_info info = {};
76
77 // The handles are created during exec_context::bind(), so we need make
78 // sure to call exec_context::bind() before retrieving them.
79 std::vector<uint32_t *> g_handles = map([&](size_t h) {
80 return (uint32_t *)&exec.input[h];
81 }, exec.g_handles);
82
83 q.pipe->bind_compute_state(q.pipe, st);
84 q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
85 0, exec.samplers.size(),
86 exec.samplers.data());
87
88 q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
89 exec.sviews.size(), 0, false, exec.sviews.data());
90 q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0,
91 exec.iviews.size(), 0, exec.iviews.data());
92 q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
93 exec.resources.data());
94 q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
95 exec.g_buffers.data(), g_handles.data());
96
97 // Fill information for the launch_grid() call.
98 info.work_dim = grid_size.size();
99 copy(pad_vector(q, block_size, 1), info.block);
100 copy(pad_vector(q, reduced_grid_size, 1), info.grid);
101 info.pc = find(name_equals(_name), b.syms).offset;
102 info.input = exec.input.data();
103
104 q.pipe->launch_grid(q.pipe, &info);
105
106 q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
107 q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
108 q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0,
109 0, exec.iviews.size(), NULL);
110 q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
111 0, exec.sviews.size(), false, NULL);
112 q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
113 exec.samplers.size(), NULL);
114
115 q.pipe->memory_barrier(q.pipe, PIPE_BARRIER_GLOBAL_BUFFER);
116 exec.unbind();
117 }
118
119 size_t
mem_local() const120 kernel::mem_local() const {
121 size_t sz = 0;
122
123 for (auto &arg : args()) {
124 if (dynamic_cast<local_argument *>(&arg))
125 sz += arg.storage();
126 }
127
128 return sz;
129 }
130
///
/// Private memory usage per work-item.  No accounting is implemented
/// here, so this always reports zero.
///
size_t
kernel::mem_private() const {
   return 0;
}
135
///
/// Name of the kernel function as recorded in the binary's symbols.
///
const std::string &
kernel::name() const {
   return _name;
}
140
141 std::vector<size_t>
optimal_block_size(const command_queue & q,const std::vector<size_t> & grid_size) const142 kernel::optimal_block_size(const command_queue &q,
143 const std::vector<size_t> &grid_size) const {
144 if (any_of(is_zero(), grid_size))
145 return grid_size;
146
147 return factor::find_grid_optimal_factor<size_t>(
148 q.device().max_threads_per_block(), q.device().max_block_size(),
149 grid_size);
150 }
151
///
/// Work-group size required by the kernel's reqd_work_group_size
/// attribute, as recorded in the binary's symbol table.
///
std::vector<size_t>
kernel::required_block_size() const {
   return find(name_equals(_name), program().symbols()).reqd_work_group_size;
}
156
///
/// Mutable view over the explicitly settable kernel arguments,
/// dereferencing the stored owning pointers.
///
kernel::argument_range
kernel::args() {
   return map(derefs(), _args);
}
161
///
/// Read-only view over the explicitly settable kernel arguments.
///
kernel::const_argument_range
kernel::args() const {
   return map(derefs(), _args);
}
166
167 std::vector<clover::binary::arg_info>
args_infos()168 kernel::args_infos() {
169 std::vector<clover::binary::arg_info> infos;
170 for (auto &barg: find(name_equals(_name), program().symbols()).args)
171 if (barg.semantic == clover::binary::argument::general)
172 infos.emplace_back(barg.info);
173
174 return infos;
175 }
176
///
/// Binary this kernel was compiled into for the device associated with
/// queue \a q.
///
const binary &
kernel::binary(const command_queue &q) const {
   return program().build(q.device()).bin;
}
181
/// Start out unbound: no queue, no printf handler, no local memory
/// accounted and no compute state created yet.
kernel::exec_context::exec_context(kernel &kern) :
   kern(kern), q(NULL), print_handler(), mem_local(0), st(NULL), cs() {
}
185
kernel::exec_context::~exec_context() {
   // A live compute state belongs to the pipe context of the queue it
   // was created on, so release it there.
   if (st)
      q->pipe->delete_compute_state(q->pipe, st);
}
190
///
/// Prepare this execution context for a launch on queue \a _q: marshal
/// every kernel argument into the input buffer and (re)create the pipe
/// compute state if needed.
///
/// \return the pipe compute state handle to bind for the launch.
/// \throws error(CL_OUT_OF_RESOURCES) if the driver fails to create the
///         compute state.
void *
kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
                           const std::vector<size_t> &grid_offset) {
   // After the swap, "q" is the queue of the current launch and "_q"
   // holds the previously bound queue (used below to detect a queue
   // change and to free the stale compute state).
   std::swap(q, _q);

   // Bind kernel arguments.
   auto &b = kern.program().build(q->device()).bin;
   auto bsym = find(name_equals(kern.name()), b.syms);
   auto bargs = bsym.args;
   auto msec = find(id_type_equals(bsym.section, binary::section::text_executable), b.secs);
   auto explicit_arg = kern._args.begin();

   for (auto &barg : bargs) {
      switch (barg.semantic) {
      case binary::argument::general:
         // User-supplied argument: consume the next explicit argument
         // object in declaration order.
         (*(explicit_arg++))->bind(*this, barg);
         break;

      case binary::argument::grid_dimension: {
         // Implicit argument: number of grid dimensions.
         const cl_uint dimension = grid_offset.size();
         auto arg = argument::create(barg);

         arg->set(sizeof(dimension), &dimension);
         arg->bind(*this, barg);
         break;
      }
      case binary::argument::grid_offset: {
         // Implicit argument: one value per dimension, padded with 0.
         for (cl_uint x : pad_vector(*q, grid_offset, 0)) {
            auto arg = argument::create(barg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, barg);
         }
         break;
      }
      case binary::argument::image_size: {
         // Implicit argument derived from the previously bound image
         // argument (hence explicit_arg - 1).
         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
         std::vector<cl_uint> image_size{
            static_cast<cl_uint>(img->width()),
            static_cast<cl_uint>(img->height()),
            static_cast<cl_uint>(img->depth())};
         for (auto x : image_size) {
            auto arg = argument::create(barg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, barg);
         }
         break;
      }
      case binary::argument::image_format: {
         // Implicit argument: channel data type and order of the
         // previously bound image argument.
         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
         cl_image_format fmt = img->format();
         std::vector<cl_uint> image_format{
            static_cast<cl_uint>(fmt.image_channel_data_type),
            static_cast<cl_uint>(fmt.image_channel_order)};
         for (auto x : image_format) {
            auto arg = argument::create(barg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, barg);
         }
         break;
      }
      case binary::argument::constant_buffer: {
         // Implicit argument: the per-device constant buffer uploaded by
         // the kernel constructor.
         auto arg = argument::create(barg);
         cl_mem buf = kern._constant_buffers.at(&q->device()).get();
         arg->set(sizeof(buf), &buf);
         arg->bind(*this, barg);
         break;
      }
      case binary::argument::printf_buffer: {
         // Implicit argument: buffer the device-side printf writes into;
         // drained by unbind() via print_handler->print().
         print_handler = printf_handler::create(q, b.printf_infos,
                                                b.printf_strings_in_buffer,
                                                q->device().max_printf_buffer_size());
         cl_mem print_mem = print_handler->get_mem();

         auto arg = argument::create(barg);
         arg->set(sizeof(cl_mem), &print_mem);
         arg->bind(*this, barg);
         break;
      }
      }
   }

   // Create a new compute state if anything changed.
   if (!st || q != _q ||
       cs.req_local_mem != mem_local ||
       cs.req_input_mem != input.size()) {
      if (st)
         _q->pipe->delete_compute_state(_q->pipe, st);

      cs.ir_type = q->device().ir_format();
      cs.prog = &(msec.data[0]);
      cs.req_local_mem = mem_local;
      cs.req_input_mem = input.size();
      st = q->pipe->create_compute_state(q->pipe, &cs);
      if (!st) {
         unbind(); // Cleanup
         throw error(CL_OUT_OF_RESOURCES);
      }
   }

   return st;
}
295
///
/// Undo the effects of bind(): flush any pending printf output, let
/// every explicit argument release its per-launch resources, then reset
/// all scratch state so the context can be reused for the next launch.
///
void
kernel::exec_context::unbind() {
   // Drain the device-side printf buffer before anything is torn down.
   if (print_handler)
      print_handler->print();

   for (auto &arg : kern.args())
      arg.unbind(*this);

   // Drop the per-launch bookkeeping accumulated by the argument
   // bind() methods.
   input.clear();
   samplers.clear();
   sviews.clear();
   iviews.clear();
   resources.clear();
   g_buffers.clear();
   g_handles.clear();
   mem_local = 0;
}
313
namespace {
   ///
   /// Return the raw byte representation of \a x in native byte order.
   ///
   template<typename T>
   std::vector<uint8_t>
   bytes(const T& x) {
      return { (uint8_t *)&x, (uint8_t *)&x + sizeof(x) };
   }

   ///
   /// Transform buffer \a v from the native byte order into the byte
   /// order specified by \a e.
   ///
   template<typename T>
   void
   byteswap(T &v, pipe_endian e) {
      if (PIPE_ENDIAN_NATIVE != e)
         std::reverse(v.begin(), v.end());
   }

   ///
   /// Pad buffer \a v to the next multiple of \a n.
   ///
   /// Note: this overload (buffer padding) shadows the global ::align()
   /// (integer rounding) within this file; local_argument::bind() uses
   /// the :: prefix to reach the latter.
   ///
   template<typename T>
   void
   align(T &v, size_t n) {
      // util_align_npot also handles non-power-of-two alignments.
      v.resize(util_align_npot(v.size(), n));
   }

   ///
   /// Most significant bit of the buffer \a s interpreted as an integer
   /// in native byte order, i.e. its sign bit.
   ///
   bool
   msb(const std::vector<uint8_t> &s) {
      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         return s.back() & 0x80;
      else
         return s.front() & 0x80;
   }

   ///
   /// Resize buffer \a v to size \a n using sign or zero extension
   /// according to \a ext.
   ///
   template<typename T>
   void
   extend(T &v, enum binary::argument::ext_type ext, size_t n) {
      const size_t m = std::min(v.size(), n);
      const bool sign_ext = (ext == binary::argument::sign_ext);
      const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0);
      T w(n, fill);

      // Copy the significant bytes into the end that holds the low-order
      // bytes for the native endianness; the rest keeps the fill value.
      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         std::copy_n(v.begin(), m, w.begin());
      else
         std::copy_n(v.end() - m, m, w.end() - m);

      std::swap(v, w);
   }

   ///
   /// Append buffer \a w to \a v.
   ///
   template<typename T>
   void
   insert(T &v, const T &w) {
      v.insert(v.end(), w.begin(), w.end());
   }

   ///
   /// Append \a n elements to the end of buffer \a v.
   ///
   /// \return the offset of the first newly added element.
   ///
   template<typename T>
   size_t
   allocate(T &v, size_t n) {
      size_t pos = v.size();
      v.resize(pos + n);
      return pos;
   }
}
389
390 std::unique_ptr<kernel::argument>
create(const binary::argument & barg)391 kernel::argument::create(const binary::argument &barg) {
392 switch (barg.type) {
393 case binary::argument::scalar:
394 return std::unique_ptr<kernel::argument>(new scalar_argument(barg.size));
395
396 case binary::argument::global:
397 return std::unique_ptr<kernel::argument>(new global_argument);
398
399 case binary::argument::local:
400 return std::unique_ptr<kernel::argument>(new local_argument);
401
402 case binary::argument::constant:
403 return std::unique_ptr<kernel::argument>(new constant_argument);
404
405 case binary::argument::image_rd:
406 return std::unique_ptr<kernel::argument>(new image_rd_argument);
407
408 case binary::argument::image_wr:
409 return std::unique_ptr<kernel::argument>(new image_wr_argument);
410
411 case binary::argument::sampler:
412 return std::unique_ptr<kernel::argument>(new sampler_argument);
413
414 }
415 throw error(CL_INVALID_KERNEL_DEFINITION);
416 }
417
/// Arguments start out unset; set()/set_svm() flip _set once the
/// application provides a value.
kernel::argument::argument() : _set(false) {
}
420
///
/// Whether the application has provided a value for this argument.
///
bool
kernel::argument::set() const {
   return _set;
}
425
///
/// Base-class default: arguments other than local_argument occupy no
/// __local storage (local_argument overrides this).
///
size_t
kernel::argument::storage() const {
   return 0;
}
430
/// \param size  Exact byte size the application must pass to set().
kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
}
433
434 void
set(size_t size,const void * value)435 kernel::scalar_argument::set(size_t size, const void *value) {
436 if (!value)
437 throw error(CL_INVALID_ARG_VALUE);
438
439 if (size != this->size)
440 throw error(CL_INVALID_ARG_SIZE);
441
442 v = { (uint8_t *)value, (uint8_t *)value + size };
443 _set = true;
444 }
445
void
kernel::scalar_argument::bind(exec_context &ctx,
                              const binary::argument &barg) {
   // Work on a copy so the stored value can be re-bound on later
   // launches unchanged.
   auto w = v;

   // Extend to the target size, fix the byte order for the device, then
   // append to the input buffer at the required alignment.
   extend(w, barg.ext_type, barg.target_size);
   byteswap(w, ctx.q->device().endianness());
   align(ctx.input, barg.target_align);
   insert(ctx.input, w);
}
456
void
kernel::scalar_argument::unbind(exec_context &ctx) {
   // Scalars pin no device resources; the bytes appended to ctx.input
   // are discarded wholesale by exec_context::unbind().
}
460
/// Starts out pointing at neither a cl_mem buffer nor an SVM allocation.
kernel::global_argument::global_argument() : buf(nullptr), svm(nullptr) {
}
463
464 void
set(size_t size,const void * value)465 kernel::global_argument::set(size_t size, const void *value) {
466 if (size != sizeof(cl_mem))
467 throw error(CL_INVALID_ARG_SIZE);
468
469 buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
470 svm = nullptr;
471 _set = true;
472 }
473
474 void
set_svm(const void * value)475 kernel::global_argument::set_svm(const void *value) {
476 svm = value;
477 buf = nullptr;
478 _set = true;
479 }
480
void
kernel::global_argument::bind(exec_context &ctx,
                              const binary::argument &barg) {
   align(ctx.input, barg.target_align);

   if (buf) {
      // Record where in the input buffer the driver-relocated buffer
      // address has to be patched in (see launch(): g_handles), and
      // which pipe resource backs it.
      const resource &r = buf->resource_in(*ctx.q);
      ctx.g_handles.push_back(ctx.input.size());
      ctx.g_buffers.push_back(r.pipe);

      // How to handle multi-demensional offsets?
      // We don't need to. Buffer offsets are always
      // one-dimensional.
      auto v = bytes(r.offset[0]);
      extend(v, barg.ext_type, barg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);
   } else if (svm) {
      // SVM pointers are passed by value, no relocation needed.
      auto v = bytes(svm);
      extend(v, barg.ext_type, barg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);
   } else {
      // Null pointer.
      allocate(ctx.input, barg.target_size);
   }
}
508
void
kernel::global_argument::unbind(exec_context &ctx) {
   // Nothing to release here: the g_buffers/g_handles entries pushed by
   // bind() are cleared wholesale by exec_context::unbind().
}
512
///
/// Number of bytes of __local storage the application requested for
/// this argument via set().
///
size_t
kernel::local_argument::storage() const {
   return _storage;
}
517
518 void
set(size_t size,const void * value)519 kernel::local_argument::set(size_t size, const void *value) {
520 if (value)
521 throw error(CL_INVALID_ARG_VALUE);
522
523 if (!size)
524 throw error(CL_INVALID_ARG_SIZE);
525
526 _storage = size;
527 _set = true;
528 }
529
void
kernel::local_argument::bind(exec_context &ctx,
                             const binary::argument &barg) {
   // Carve an aligned chunk out of the launch's local memory and pass
   // its byte offset to the kernel as the argument value.  Note the ::
   // prefix: this is the global integer-rounding align(), not the
   // buffer-padding helper defined in this file.
   ctx.mem_local = ::align(ctx.mem_local, barg.target_align);
   auto v = bytes(ctx.mem_local);

   extend(v, binary::argument::zero_ext, barg.target_size);
   byteswap(v, ctx.q->device().endianness());
   // The offset is passed as a pointer-sized value.
   align(ctx.input, ctx.q->device().address_bits() / 8);
   insert(ctx.input, v);

   ctx.mem_local += _storage;
}
543
void
kernel::local_argument::unbind(exec_context &ctx) {
   // Nothing to release: ctx.mem_local is reset by exec_context::unbind().
}
547
/// Starts out with no buffer set and no surface bound.
kernel::constant_argument::constant_argument() : buf(nullptr), st(nullptr) {
}
550
551 void
set(size_t size,const void * value)552 kernel::constant_argument::set(size_t size, const void *value) {
553 if (size != sizeof(cl_mem))
554 throw error(CL_INVALID_ARG_SIZE);
555
556 buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
557 _set = true;
558 }
559
void
kernel::constant_argument::bind(exec_context &ctx,
                                const binary::argument &barg) {
   align(ctx.input, barg.target_align);

   if (buf) {
      resource &r = buf->resource_in(*ctx.q);
      // Pack the compute-resource index into the top byte and the byte
      // offset into the low 24 bits.
      // NOTE(review): assumes buffer offsets fit in 24 bits -- verify
      // against the backends that decode this value.
      auto v = bytes(ctx.resources.size() << 24 | r.offset[0]);

      extend(v, binary::argument::zero_ext, barg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);

      // Bind the buffer as a read-only compute resource; released again
      // in unbind().
      st = r.bind_surface(*ctx.q, false);
      ctx.resources.push_back(st);
   } else {
      // Null pointer.
      allocate(ctx.input, barg.target_size);
   }
}
580
void
kernel::constant_argument::unbind(exec_context &ctx) {
   // Release the surface bound in bind(); nothing was bound for a null
   // buffer.
   if (buf)
      buf->resource_in(*ctx.q).unbind_surface(*ctx.q, st);
}
586
587 void
set(size_t size,const void * value)588 kernel::image_rd_argument::set(size_t size, const void *value) {
589 if (!value)
590 throw error(CL_INVALID_ARG_VALUE);
591
592 if (size != sizeof(cl_mem))
593 throw error(CL_INVALID_ARG_SIZE);
594
595 img = &obj<image>(*(cl_mem *)value);
596 _set = true;
597 }
598
void
kernel::image_rd_argument::bind(exec_context &ctx,
                                const binary::argument &barg) {
   // The value passed to the kernel is the index of the sampler view in
   // the launch's sampler-view list, so compute it before pushing.
   auto v = bytes(ctx.sviews.size());

   extend(v, binary::argument::zero_ext, barg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, barg.target_align);
   insert(ctx.input, v);

   // Bind a sampler view for the image; released again in unbind().
   st = img->resource_in(*ctx.q).bind_sampler_view(*ctx.q);
   ctx.sviews.push_back(st);
}
612
void
kernel::image_rd_argument::unbind(exec_context &ctx) {
   // Release the sampler view created in bind().
   img->resource_in(*ctx.q).unbind_sampler_view(*ctx.q, st);
}
617
618 void
set(size_t size,const void * value)619 kernel::image_wr_argument::set(size_t size, const void *value) {
620 if (!value)
621 throw error(CL_INVALID_ARG_VALUE);
622
623 if (size != sizeof(cl_mem))
624 throw error(CL_INVALID_ARG_SIZE);
625
626 img = &obj<image>(*(cl_mem *)value);
627 _set = true;
628 }
629
void
kernel::image_wr_argument::bind(exec_context &ctx,
                                const binary::argument &barg) {
   // The value passed to the kernel is the index of the image view in
   // the launch's image-view list, so compute it before pushing.
   auto v = bytes(ctx.iviews.size());

   extend(v, binary::argument::zero_ext, barg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, barg.target_align);
   insert(ctx.input, v);
   ctx.iviews.push_back(img->resource_in(*ctx.q).create_image_view(*ctx.q));
}
641
void
kernel::image_wr_argument::unbind(exec_context &ctx) {
   // Nothing to release per-argument: the image view pushed by bind()
   // lives in ctx.iviews, which exec_context::unbind() clears.
}
645
/// Starts out with no sampler object set and no pipe state bound.
kernel::sampler_argument::sampler_argument() : s(nullptr), st(nullptr) {
}
648
649 void
set(size_t size,const void * value)650 kernel::sampler_argument::set(size_t size, const void *value) {
651 if (!value)
652 throw error(CL_INVALID_SAMPLER);
653
654 if (size != sizeof(cl_sampler))
655 throw error(CL_INVALID_ARG_SIZE);
656
657 s = &obj(*(cl_sampler *)value);
658 _set = true;
659 }
660
void
kernel::sampler_argument::bind(exec_context &ctx,
                               const binary::argument &barg) {
   // Samplers occupy no space in the input buffer; they are bound as
   // pipe sampler state for the launch and released in unbind().
   st = s->bind(*ctx.q);
   ctx.samplers.push_back(st);
}
667
void
kernel::sampler_argument::unbind(exec_context &ctx) {
   // Release the sampler state created in bind().
   s->unbind(*ctx.q, st);
}
672