1 use crate::api::icd::*;
2 use crate::api::types::*;
3 use crate::api::util::*;
4 use crate::core::context::*;
5 use crate::core::device::*;
6 use crate::core::format::*;
7 use crate::core::gl::*;
8 use crate::core::queue::*;
9 use crate::core::util::*;
10 use crate::impl_cl_type_trait;
11 use crate::impl_cl_type_trait_base;
12 use crate::perf_warning;
13
14 use mesa_rust::pipe::context::*;
15 use mesa_rust::pipe::resource::*;
16 use mesa_rust::pipe::screen::ResourceType;
17 use mesa_rust::pipe::transfer::*;
18 use mesa_rust_gen::*;
19 use mesa_rust_util::conversion::*;
20 use mesa_rust_util::properties::Properties;
21 use mesa_rust_util::ptr::AllocSize;
22 use mesa_rust_util::ptr::TrackedPointers;
23 use rusticl_opencl_gen::*;
24
25 use std::alloc;
26 use std::alloc::Layout;
27 use std::cmp;
28 use std::collections::btree_map::Entry;
29 use std::collections::HashMap;
30 use std::convert::TryInto;
31 use std::mem;
32 use std::mem::size_of;
33 use std::ops::Deref;
34 use std::os::raw::c_void;
35 use std::ptr;
36 use std::sync::Arc;
37 use std::sync::Mutex;
38 use std::sync::MutexGuard;
39
40 struct Mapping<T> {
41 layout: Layout,
42 writes: bool,
43 ptr: Option<MutMemoryPtr>,
44 /// reference count from the API perspective. Once it reaches 0, we need to write back the
45 /// mapping's content to the GPU resource.
46 count: u32,
47 inner: T,
48 }
49
50 impl<T> Drop for Mapping<T> {
51 fn drop(&mut self) {
52 if let Some(ptr) = &self.ptr {
53 unsafe {
54 alloc::dealloc(ptr.as_ptr().cast(), self.layout);
55 }
56 }
57 }
58 }
59
60 impl<T> AllocSize<usize> for Mapping<T> {
61 fn size(&self) -> usize {
62 self.layout.size()
63 }
64 }
65
66 impl<T> Deref for Mapping<T> {
67 type Target = T;
68
69 fn deref(&self) -> &Self::Target {
70 &self.inner
71 }
72 }
73
74 struct BufferMapping {
75 offset: usize,
76 }
77
78 struct ImageMapping {
79 origin: CLVec<usize>,
80 region: CLVec<usize>,
81 }
82
83 #[repr(transparent)]
84 #[derive(Clone, Copy)]
85 pub struct ConstMemoryPtr {
86 ptr: *const c_void,
87 }
88 unsafe impl Send for ConstMemoryPtr {}
89 unsafe impl Sync for ConstMemoryPtr {}
90
91 impl ConstMemoryPtr {
92 pub fn as_ptr(&self) -> *const c_void {
93 self.ptr
94 }
95
96 /// # Safety
97 ///
98 /// Users need to ensure that `ptr` is only accessed in a thread-safe manner sufficient for
99 /// [Send] and [Sync]
100 pub unsafe fn from_ptr(ptr: *const c_void) -> Self {
101 Self { ptr: ptr }
102 }
103 }
104
105 impl From<MutMemoryPtr> for ConstMemoryPtr {
106 fn from(value: MutMemoryPtr) -> Self {
107 Self {
108 ptr: value.ptr.cast(),
109 }
110 }
111 }
112
113 #[repr(transparent)]
114 #[derive(Clone, Copy)]
115 pub struct MutMemoryPtr {
116 ptr: *mut c_void,
117 }
118 unsafe impl Send for MutMemoryPtr {}
119 unsafe impl Sync for MutMemoryPtr {}
120
121 impl MutMemoryPtr {
122 pub fn as_ptr(&self) -> *mut c_void {
123 self.ptr
124 }
125
126 /// # Safety
127 ///
128 /// Users need to ensure that `ptr` is only accessed in a thread-safe manner sufficient for
129 /// [Send] and [Sync]
130 pub unsafe fn from_ptr(ptr: *mut c_void) -> Self {
131 Self { ptr: ptr }
132 }
133 }
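// Illustrative only (not one of the real call sites): these wrappers exist so that raw
// pointers handed in through the CL API can cross thread boundaries, with the caller vouching
// that no data races happen on the pointed-to memory, e.g.:
//
//     // `host_ptr` would be a hypothetical pointer coming from the application.
//     let ptr = unsafe { MutMemoryPtr::from_ptr(host_ptr) };
//     let ro: ConstMemoryPtr = ptr.into();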
134
135 #[derive(Copy, Clone, PartialEq)]
136 pub enum ResourceValidityEntity {
137 Host,
138 Device(&'static Device),
139 }
140
141 /// Allocation with real GPU backing storage. Tracks which devices the content is valid on.
142 pub struct ResourceAllocation {
143 pub res: HashMap<&'static Device, Arc<PipeResource>>,
144 valid_on: Mutex<Vec<ResourceValidityEntity>>,
145 // it's a bit hacky, but storing the pointer as `usize` gives us `Send` and `Sync`. The
146 // application is required to ensure no data races exist on the memory anyway.
147 host_ptr: usize,
148 hostptr_devs: Vec<ResourceValidityEntity>,
149 // this might be non-zero for dma-buf imported resources
150 offset: usize,
151 }
152
153 impl ResourceAllocation {
154 /// # Panics
155 ///
156 /// valid_on needs to be a Vec with at least one element; it will panic otherwise.
157 fn get_best_valid_entity_for_transfer(
158 valid_on: &MutexGuard<Vec<ResourceValidityEntity>>,
159 ) -> ResourceValidityEntity {
160 // We want to avoid having to copy over the PCIe bus, so we prefer an entity which is either
161 // the host itself or a device using host memory.
162 let res = valid_on.iter().min_by_key(|entity| match entity {
163 ResourceValidityEntity::Host => 0,
164 ResourceValidityEntity::Device(dev) => {
165 if dev.unified_memory() {
166 1
167 } else {
168 2
169 }
170 }
171 });
172
173 *res.unwrap()
174 }
175
176 /// Small helper function to indicate when transparent migration is never required, e.g. if it's
177 /// a single device allocation with no hostptr.
178 fn can_skip_migration(&self) -> bool {
179 match self.hostptr_devs.len() {
180 // If storage isn't shared between devices, we only need to migrate when there is more
181 // than one device.
182 0 => self.res.len() == 1,
183
184 // If all devices use a host_ptr allocation, the content is automatically synchronized
185 // as they share the same storage. The - 1 is required as the Host is also part of
186 // `hostptr_devs`.
187 len => len - 1 == self.res.len(),
188 }
189 }
190
191 /// Returns the GPU resource for the device `ctx` is associated with. It will transparently
192 /// migrate the data to the GPU.
193 /// TODO: add a map function to return a mapping to the resource of one device the data is valid
194 /// on instead of migrating if the user would simply map the resource anyway.
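///
/// For illustration (hypothetical two-device scenario): if the content is only valid on
/// device A and device B requests `RWFlags::RD`, the data gets copied over and `valid_on`
/// becomes `[A, B]`; if B requests `RWFlags::WR` or `RWFlags::RW`, `valid_on` collapses to
/// just `[B]`.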
195 fn get_res_for_access(&self, ctx: &QueueContext, rw: RWFlags) -> CLResult<&Arc<PipeResource>> {
196 let dev = ctx.dev;
197 let dev_entity = ResourceValidityEntity::Device(dev);
198 let to_res = self.res.get(dev).ok_or(CL_OUT_OF_HOST_MEMORY)?;
199
200 // in most cases we can skip most of the work below.
201 if self.can_skip_migration() {
202 return Ok(to_res);
203 }
204
205 let Ok(mut valid_on) = self.valid_on.lock() else {
206 return Err(CL_OUT_OF_HOST_MEMORY);
207 };
208
209 // If the content isn't valid on dev we need to migrate it to it.
210 if matches!(rw, RWFlags::RD | RWFlags::RW) && !valid_on.contains(&dev_entity) {
211 // valid_on is a vec with at least one element, so this call won't panic.
212 let entity = Self::get_best_valid_entity_for_transfer(&valid_on);
213
214 let helper_ctx;
215 let map;
216 let flush;
217
218 if to_res.is_buffer() {
219 let ptr;
220 match entity {
221 ResourceValidityEntity::Host => {
222 flush = false;
223 ptr = self.host_ptr as *mut c_void;
224 }
225 ResourceValidityEntity::Device(dev) => {
226 flush = true;
227
228 let from_res = &self.res[dev];
229 helper_ctx = dev.helper_ctx();
230
231 // update the resource and wait for the operation to finish. We also map the resources
232 // unsynchronized, as we can't block or flush any other contexts here; this might cause
233 // deadlocks.
234 map = helper_ctx
235 .map_buffer_unsynchronized(
236 from_res,
237 0,
238 from_res.width() as i32,
239 RWFlags::RD,
240 )
241 .ok_or(CL_OUT_OF_HOST_MEMORY)?;
242
243 ptr = map.ptr();
244 }
245 }
246
247 ctx.buffer_subdata(to_res, 0, ptr, to_res.width());
248 } else {
249 let ResourceValidityEntity::Device(dev) = entity else {
250 // we don't support migrating from host_ptr for images yet. It's also not needed
251 // because the Image struct has a more optimized way of doing things there.
252 unimplemented!();
253 };
254
255 flush = true;
256 let from_res = &self.res[dev];
257 helper_ctx = dev.helper_ctx();
258
259 // update the resource and wait for the operation to finish. We also map the resources
260 // unsynchronized, as we can't block or flush any other contexts here; this might cause
261 // deadlocks.
262 let bx = pipe_box {
263 width: from_res.width() as i32,
264 height: from_res.height() as i32,
265 depth: from_res.depth() as i16,
266 ..Default::default()
267 };
268
269 map = helper_ctx
270 .map_texture_unsynchronized(from_res, &bx, RWFlags::RD)
271 .ok_or(CL_OUT_OF_HOST_MEMORY)?;
272
273 let row_pitch: u32 = map.row_pitch();
274 let slice_pitch: usize = map.slice_pitch();
275
276 let bx = pipe_box {
277 width: to_res.width() as i32,
278 height: to_res.height() as i32,
279 depth: to_res.depth() as i16,
280 ..Default::default()
281 };
282
283 ctx.texture_subdata(to_res, &bx, map.ptr(), row_pitch, slice_pitch);
284 }
285
286 // TODO: we really kinda need to figure out how we can make the compiler scream that
287 // temporarily mapped memory might be accessed at some random point in the future
288 // by a GPU unless its queues are flushed and processed.
289 if flush {
290 ctx.flush().wait();
291 }
292 }
293
294 if matches!(rw, RWFlags::WR | RWFlags::RW) {
295 // If the user writes to it, it's not valid on any other device anymore.
296 valid_on.clear();
297 }
298
299 if !valid_on.contains(&dev_entity) {
300 // if we update one hostptr resource, we update them all.
301 if self.hostptr_devs.contains(&dev_entity) {
302 valid_on.extend_from_slice(&self.hostptr_devs);
303 } else {
304 valid_on.push(ResourceValidityEntity::Device(dev));
305 }
306 }
307
308 Ok(to_res)
309 }
310
311 pub fn migrate_to_hostptr(&self, ctx: &QueueContext, rw: RWFlags) -> CLResult<()> {
312 let host_entity = ResourceValidityEntity::Host;
313 let host_ptr = self.host_ptr as *mut c_void;
314
315 // in most cases we can skip most of the work below.
316 if self.can_skip_migration() || host_ptr.is_null() {
317 return Ok(());
318 }
319
320 let Ok(mut valid_on) = self.valid_on.lock() else {
321 return Err(CL_OUT_OF_HOST_MEMORY);
322 };
323
324 // If the content isn't valid on the host we need to migrate it to it.
325 if matches!(rw, RWFlags::RD | RWFlags::RW) && !valid_on.contains(&host_entity) {
326 let ctx_dev_entity = ResourceValidityEntity::Device(ctx.dev);
327 let mut entity = ctx_dev_entity;
328
329 if !valid_on.contains(&entity) {
330 // valid_on is a vec with at least one element, so this call won't panic.
331 entity = Self::get_best_valid_entity_for_transfer(&valid_on);
332 }
333
334 debug_assert!(entity != ResourceValidityEntity::Host);
335
336 let ResourceValidityEntity::Device(from_dev) = entity else {
337 // we check if `valid_on` contains a host entity above, so this should never happen.
338 unreachable!();
339 };
340
341 let helper_ctx;
342 let map;
343 let from_res = &self.res[from_dev];
344
345 assert!(
346 from_res.is_buffer(),
347 "Transparent resource migration only supported on buffers."
348 );
349
350 if from_dev == ctx.dev {
351 map = ctx
352 .buffer_map(from_res, 0, from_res.width() as i32, RWFlags::RD)
353 .ok_or(CL_OUT_OF_HOST_MEMORY)?;
354 } else {
355 helper_ctx = from_dev.helper_ctx();
356 // update the resource and wait for the operation to finish. We also map the resources
357 // unsynchronized, as we can't block or flush any other contexts here; this might cause
358 // deadlocks.
359 map = helper_ctx
360 .map_buffer_unsynchronized(from_res, 0, from_res.width() as i32, RWFlags::RD)
361 .ok_or(CL_OUT_OF_HOST_MEMORY)?;
362 }
363
364 let ptr = map.ptr();
365 // SAFETY: The application promises that host_ptr is big enough to hold the entire
366 // content of the buffer, and `ptr` is the mapped resource containing at least
367 // `from_res.width()` bytes. Also, the two pointers do not overlap.
368 unsafe {
369 ptr::copy_nonoverlapping(ptr, host_ptr, from_res.width() as usize);
370 }
371 }
372
373 if matches!(rw, RWFlags::WR | RWFlags::RW) {
374 // If the user writes to it, it's not valid on any other device anymore.
375 valid_on.clear();
376 }
377
378 if !valid_on.contains(&host_entity) {
379 // if we update the hostptr, we update all devices having a hostptr allocation.
380 valid_on.extend_from_slice(&self.hostptr_devs);
381 }
382
383 Ok(())
384 }
385 }
386
387 pub struct SubAllocation {
388 mem: Mem,
389 // offset relative to the actual resource, not relative to `mem`. This saves us a few
390 // calculations and we only need the total amount anyway.
391 offset: usize,
392 }
393
394 /// Abstraction over the memory allocation. It might be real GPU backing storage or simply a
395 /// sub-allocation of an existing memory object.
396 enum Allocation {
397 Resource(ResourceAllocation),
398 SubAlloc(SubAllocation),
399 }
400
401 // TODO: - Once it's used for more stuff, it might make sense to split it into an Image and Buffer
402 // variant.
403 // - Instead of doing full migration every time, it could also do it for only parts of the
404 // allocation.
405 impl Allocation {
406 /// Creates a new allocation object assuming the initial data is valid on every device.
407 pub fn new(
408 res: HashMap<&'static Device, Arc<PipeResource>>,
409 offset: usize,
410 host_ptr: *mut c_void,
411 ) -> Self {
412 let hostptr_devs = if !host_ptr.is_null() {
413 res.iter()
414 // we only add devices we actually have a host ptr resource for
415 .filter_map(|(&dev, res)| {
416 res.is_user().then_some(ResourceValidityEntity::Device(dev))
417 })
418 // and the host itself
419 .chain([ResourceValidityEntity::Host])
420 .collect()
421 } else {
422 Vec::new()
423 };
424
425 let mut valid_on: Vec<_> = res
426 .keys()
427 .copied()
428 .map(ResourceValidityEntity::Device)
429 .collect();
430 if !host_ptr.is_null() {
431 valid_on.push(ResourceValidityEntity::Host);
432 }
433
434 Self::Resource(ResourceAllocation {
435 valid_on: Mutex::new(valid_on),
436 res: res,
437 host_ptr: host_ptr as usize,
438 hostptr_devs: hostptr_devs,
439 offset: offset,
440 })
441 }
442
443 fn new_sub(mem: Mem, offset: usize) -> Self {
444 Self::SubAlloc(SubAllocation {
445 // we precalculate the entire offset here.
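// Worked example (made-up numbers): a sub-buffer created at offset 32 of a parent whose
// dma-buf import already carries an offset of 16 stores 48 here, i.e. the offset into the
// real backing resource.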
446 offset: offset + mem.alloc.offset(),
447 mem: mem,
448 })
449 }
450
451 /// Returns true if the backing storage of the two objects is equal.
452 fn backing_resource_eq(&self, other: &Self) -> bool {
453 ptr::eq(self.get_real_resource(), other.get_real_resource())
454 }
455
456 /// Follows the sub-allocation chain until it hits a real GPU allocation.
457 fn get_real_resource(&self) -> &ResourceAllocation {
458 match self {
459 Allocation::SubAlloc(sub) => sub.mem.alloc.get_real_resource(),
460 Allocation::Resource(res) => res,
461 }
462 }
463
464 /// Returns the resource associated with `dev` without any data migration.
465 fn get_res_of_dev(&self, dev: &Device) -> CLResult<&Arc<PipeResource>> {
466 self.get_real_resource()
467 .res
468 .get(dev)
469 .ok_or(CL_OUT_OF_HOST_MEMORY)
470 }
471
472 /// Returns the resource associated with `ctx.dev` and transparently migrate the data.
473 fn get_res_for_access(&self, ctx: &QueueContext, rw: RWFlags) -> CLResult<&Arc<PipeResource>> {
474 self.get_real_resource().get_res_for_access(ctx, rw)
475 }
476
477 /// Migrates the content to the host. Does nothing if there is no host ptr.
478 pub fn _migrate_to_hostptr(&self, ctx: &QueueContext, rw: RWFlags) -> CLResult<()> {
479 self.get_real_resource().migrate_to_hostptr(ctx, rw)
480 }
481
482 pub fn host_ptr(&self) -> *mut c_void {
483 let mut host_ptr = self.get_real_resource().host_ptr;
484
485 // we can only apply the offset as long as the host_ptr isn't null.
486 if host_ptr != 0 {
487 host_ptr += self.offset();
488 }
489
490 host_ptr as _
491 }
492
493 fn is_user_alloc_for_dev(&self, dev: &Device) -> CLResult<bool> {
494 Ok(self.get_res_of_dev(dev)?.is_user())
495 }
496
497 fn offset(&self) -> usize {
498 match self {
499 Allocation::Resource(res) => res.offset,
500 Allocation::SubAlloc(sub) => sub.offset,
501 }
502 }
503 }
504
505 pub enum Mem {
506 Buffer(Arc<Buffer>),
507 Image(Arc<Image>),
508 }
509
510 impl Deref for Mem {
511 type Target = MemBase;
512
513 fn deref(&self) -> &Self::Target {
514 match self {
515 Self::Buffer(b) => &b.base,
516 Self::Image(i) => &i.base,
517 }
518 }
519 }
520
521 impl Mem {
522 pub fn is_mapped_ptr(&self, ptr: *mut c_void) -> bool {
523 match self {
524 Self::Buffer(b) => b.is_mapped_ptr(ptr),
525 Self::Image(i) => i.is_mapped_ptr(ptr),
526 }
527 }
528
529 pub fn sync_unmap(&self, ctx: &QueueContext, ptr: MutMemoryPtr) -> CLResult<()> {
530 match self {
531 Self::Buffer(b) => b.sync_unmap(ctx, ptr),
532 Self::Image(i) => i.sync_unmap(ctx, ptr),
533 }
534 }
535
536 pub fn unmap(&self, ptr: MutMemoryPtr) -> CLResult<bool> {
537 match self {
538 Self::Buffer(b) => b.unmap(ptr),
539 Self::Image(i) => i.unmap(ptr),
540 }
541 }
542 }
543
544 /// # Mapping memory
545 ///
546 /// Maps the queue associated device's resource.
547 ///
548 /// Mapping resources could have been quite straightforward if OpenCL didn't allow for so-called
549 /// non-blocking maps. Non-blocking maps shall return a valid pointer to the mapped region
550 /// immediately, but should not synchronize data (in case of shadow buffers) until after the map
551 /// event is reached in the queue. This makes it impossible to simply use pipe_transfers, as those
552 /// can't be explicitly synced by the frontend.
553 ///
554 /// In order to have a compliant implementation of the mapping API we have to consider the following
555 /// cases:
556 /// 1. Mapping a cl_mem object with CL_MEM_USE_HOST_PTR: We simply return the host_ptr.
557 /// Synchronization of shadowed host ptrs is done in `sync_shadow` on demand.
558 /// 2. Mapping linear resources on UMA systems: We simply create the pipe_transfer with
559 /// `PIPE_MAP_DIRECTLY` and `PIPE_MAP_UNSYNCHRONIZED` and return the attached pointer.
560 /// 3. On non-UMA systems, or when 2. fails (e.g. due to the resource being tiled), we
561 /// - create a shadow pipe_resource with `PIPE_USAGE_STAGING`,
562 /// `PIPE_RESOURCE_FLAG_MAP_PERSISTENT` and `PIPE_RESOURCE_FLAG_MAP_COHERENT`
563 /// - create a pipe_transfer with `PIPE_MAP_COHERENT`, `PIPE_MAP_PERSISTENT` and
564 /// `PIPE_MAP_UNSYNCHRONIZED`
565 /// - sync the shadow buffer like a host_ptr shadow buffer in 1.
566 ///
567 /// Taking this approach, we guarantee that we only copy when actually needed while making sure the
568 /// content behind the returned pointer is valid until unmapped.
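///
/// As a rough illustration only (a sketch of the intended flow, not the exact call sequence
/// the API layer uses), a non-blocking map of a shadowed buffer goes through:
///
/// ```ignore
/// let ptr = buffer.map(size, offset, /* writes */ true)?; // pointer is returned immediately
/// // ... once the map event executes on the queue:
/// buffer.sync_map(ctx, ptr)?;   // fills the shadow from the GPU resource
/// // ... the application reads/writes through `ptr` ...
/// buffer.sync_unmap(ctx, ptr)?; // writes the shadow back as the mapping was writable
/// buffer.unmap(ptr)?;           // drops the mapping reference
/// ```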
569 pub struct MemBase {
570 pub base: CLObjectBase<CL_INVALID_MEM_OBJECT>,
571 pub context: Arc<Context>,
572 pub mem_type: cl_mem_object_type,
573 pub flags: cl_mem_flags,
574 pub size: usize,
575 pub props: Properties<cl_mem_properties>,
576 pub cbs: Mutex<Vec<MemCB>>,
577 pub gl_obj: Option<GLObject>,
578 alloc: Allocation,
579 }
580
581 pub struct Buffer {
582 base: MemBase,
583 maps: Mutex<TrackedPointers<usize, Mapping<BufferMapping>>>,
584 }
585
586 pub struct Image {
587 base: MemBase,
588 pub image_format: cl_image_format,
589 pub pipe_format: pipe_format,
590 pub image_desc: cl_image_desc,
591 pub image_elem_size: u8,
592 maps: Mutex<TrackedPointers<usize, Mapping<ImageMapping>>>,
593 }
594
595 impl Deref for Buffer {
596 type Target = MemBase;
597
598 fn deref(&self) -> &Self::Target {
599 &self.base
600 }
601 }
602
603 impl Deref for Image {
604 type Target = MemBase;
605
606 fn deref(&self) -> &Self::Target {
607 &self.base
608 }
609 }
610
611 impl_cl_type_trait_base!(cl_mem, MemBase, [Buffer, Image], CL_INVALID_MEM_OBJECT);
612 impl_cl_type_trait!(cl_mem, Buffer, CL_INVALID_MEM_OBJECT, base.base);
613 impl_cl_type_trait!(cl_mem, Image, CL_INVALID_MEM_OBJECT, base.base);
614
615 pub trait CLImageDescInfo {
616 fn type_info(&self) -> (u8, bool);
617 fn pixels(&self) -> usize;
618 fn bx(&self) -> CLResult<pipe_box>;
619 fn row_pitch(&self) -> CLResult<u32>;
620 fn slice_pitch(&self) -> usize;
621 fn width(&self) -> CLResult<u32>;
622 fn height(&self) -> CLResult<u32>;
623 fn size(&self) -> CLVec<usize>;
624
625 fn dims(&self) -> u8 {
626 self.type_info().0
627 }
628
629 fn dims_with_array(&self) -> u8 {
630 let array: u8 = self.is_array().into();
631 self.dims() + array
632 }
633
634 fn has_slice(&self) -> bool {
635 self.dims() == 3 || self.is_array()
636 }
637
638 fn is_array(&self) -> bool {
639 self.type_info().1
640 }
641 }
642
643 impl CLImageDescInfo for cl_image_desc {
644 fn type_info(&self) -> (u8, bool) {
645 match self.image_type {
646 CL_MEM_OBJECT_IMAGE1D | CL_MEM_OBJECT_IMAGE1D_BUFFER => (1, false),
647 CL_MEM_OBJECT_IMAGE1D_ARRAY => (1, true),
648 CL_MEM_OBJECT_IMAGE2D => (2, false),
649 CL_MEM_OBJECT_IMAGE2D_ARRAY => (2, true),
650 CL_MEM_OBJECT_IMAGE3D => (3, false),
651 _ => panic!("unknown image_type {:x}", self.image_type),
652 }
653 }
654
655 fn pixels(&self) -> usize {
656 let mut res = self.image_width;
657 let dims = self.dims();
658
659 if dims > 1 {
660 res *= self.image_height;
661 }
662
663 if dims > 2 {
664 res *= self.image_depth;
665 }
666
667 if self.is_array() {
668 res *= self.image_array_size;
669 }
670
671 res
672 }
673
674 fn size(&self) -> CLVec<usize> {
675 let mut height = cmp::max(self.image_height, 1);
676 let mut depth = cmp::max(self.image_depth, 1);
677
678 match self.image_type {
679 CL_MEM_OBJECT_IMAGE1D_ARRAY => height = self.image_array_size,
680 CL_MEM_OBJECT_IMAGE2D_ARRAY => depth = self.image_array_size,
681 _ => {}
682 }
683
684 CLVec::new([self.image_width, height, depth])
685 }
686
687 fn bx(&self) -> CLResult<pipe_box> {
688 create_pipe_box(CLVec::default(), self.size(), self.image_type)
689 }
690
691 fn row_pitch(&self) -> CLResult<u32> {
692 self.image_row_pitch
693 .try_into_with_err(CL_OUT_OF_HOST_MEMORY)
694 }
695
696 fn slice_pitch(&self) -> usize {
697 self.image_slice_pitch
698 }
699
700 fn width(&self) -> CLResult<u32> {
701 self.image_width.try_into_with_err(CL_OUT_OF_HOST_MEMORY)
702 }
703
704 fn height(&self) -> CLResult<u32> {
705 self.image_height.try_into_with_err(CL_OUT_OF_HOST_MEMORY)
706 }
707 }
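// Worked example for the impl above (hypothetical descriptor): a CL_MEM_OBJECT_IMAGE2D_ARRAY
// of 640x480 with image_array_size = 6 reports dims() == 2, is_array() == true,
// pixels() == 640 * 480 * 6 and size() == [640, 480, 6], as the array size takes the place of
// the depth.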
708
709 fn sw_copy(
710 src: *const c_void,
711 dst: *mut c_void,
712 region: &CLVec<usize>,
713 src_origin: &CLVec<usize>,
714 src_row_pitch: usize,
715 src_slice_pitch: usize,
716 dst_origin: &CLVec<usize>,
717 dst_row_pitch: usize,
718 dst_slice_pitch: usize,
719 pixel_size: u8,
720 ) {
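// Byte addresses below are computed as origin · [pixel_size, row_pitch, slice_pitch]; e.g.
// (made-up pitches) the pixel at (x, y, z) with a pixel size of 4, a row pitch of 1024 and a
// slice pitch of 491520 starts at byte offset 4*x + 1024*y + 491520*z. Rows are copied one at
// a time unless source and destination rows can be copied as one contiguous block.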
721 let pixel_size = pixel_size as usize;
722 for z in 0..region[2] {
723 if src_row_pitch == dst_row_pitch && region[1] * pixel_size == src_row_pitch {
724 unsafe {
725 ptr::copy(
726 src.byte_add(
727 (*src_origin + [0, 0, z]) * [pixel_size, src_row_pitch, src_slice_pitch],
728 ),
729 dst.byte_add(
730 (*dst_origin + [0, 0, z]) * [pixel_size, dst_row_pitch, dst_slice_pitch],
731 ),
732 region[0] * region[1] * pixel_size,
733 )
734 }
735 } else {
736 for y in 0..region[1] {
737 unsafe {
738 ptr::copy(
739 src.byte_add(
740 (*src_origin + [0, y, z])
741 * [pixel_size, src_row_pitch, src_slice_pitch],
742 ),
743 dst.byte_add(
744 (*dst_origin + [0, y, z])
745 * [pixel_size, dst_row_pitch, dst_slice_pitch],
746 ),
747 region[0] * pixel_size,
748 )
749 };
750 }
751 }
752 }
753 }
754
755 impl MemBase {
756 pub fn new_buffer(
757 context: Arc<Context>,
758 flags: cl_mem_flags,
759 size: usize,
760 mut host_ptr: *mut c_void,
761 props: Properties<cl_mem_properties>,
762 ) -> CLResult<Arc<Buffer>> {
763 let res_type = if bit_check(flags, CL_MEM_ALLOC_HOST_PTR) {
764 ResourceType::Staging
765 } else {
766 ResourceType::Normal
767 };
768
769 let buffer = context.create_buffer(
770 size,
771 host_ptr,
772 bit_check(flags, CL_MEM_COPY_HOST_PTR),
773 res_type,
774 )?;
775
776 // We can only keep the host_ptr when `CL_MEM_USE_HOST_PTR` is set.
777 if !bit_check(flags, CL_MEM_USE_HOST_PTR) {
778 host_ptr = ptr::null_mut()
779 }
780
781 let alloc = Allocation::new(buffer, 0, host_ptr);
782 Ok(Arc::new(Buffer {
783 base: Self {
784 base: CLObjectBase::new(RusticlTypes::Buffer),
785 context: context,
786 mem_type: CL_MEM_OBJECT_BUFFER,
787 flags: flags,
788 size: size,
789 props: props,
790 gl_obj: None,
791 cbs: Mutex::new(Vec::new()),
792 alloc: alloc,
793 },
794 maps: Mutex::new(TrackedPointers::new()),
795 }))
796 }
797
798 pub fn new_sub_buffer(
799 parent: Arc<Buffer>,
800 flags: cl_mem_flags,
801 offset: usize,
802 size: usize,
803 ) -> Arc<Buffer> {
804 Arc::new(Buffer {
805 base: Self {
806 base: CLObjectBase::new(RusticlTypes::Buffer),
807 context: Arc::clone(&parent.context),
808 mem_type: CL_MEM_OBJECT_BUFFER,
809 flags: flags,
810 size: size,
811 props: Properties::default(),
812 gl_obj: None,
813 cbs: Mutex::new(Vec::new()),
814 alloc: Allocation::new_sub(Mem::Buffer(parent), offset),
815 },
816 maps: Mutex::new(TrackedPointers::new()),
817 })
818 }
819
820 pub fn new_image(
821 context: Arc<Context>,
822 parent: Option<Mem>,
823 flags: cl_mem_flags,
824 image_format: &cl_image_format,
825 mut image_desc: cl_image_desc,
826 image_elem_size: u8,
827 mut host_ptr: *mut c_void,
828 props: Properties<cl_mem_properties>,
829 ) -> CLResult<Arc<Image>> {
830 // we have to sanitize the image_desc a little for internal use
831 let api_image_desc = image_desc;
832 let dims = image_desc.dims();
833 let is_array = image_desc.is_array();
834 if dims < 3 {
835 image_desc.image_depth = 1;
836 }
837 if dims < 2 {
838 image_desc.image_height = 1;
839 }
840 if !is_array {
841 image_desc.image_array_size = 1;
842 }
843
844 let res_type = if bit_check(flags, CL_MEM_ALLOC_HOST_PTR) {
845 ResourceType::Staging
846 } else {
847 ResourceType::Normal
848 };
849
850 let alloc = if let Some(parent) = parent {
851 Allocation::new_sub(parent, 0)
852 } else {
853 let mut texture = context.create_texture(
854 &image_desc,
855 image_format,
856 host_ptr,
857 bit_check(flags, CL_MEM_COPY_HOST_PTR),
858 res_type,
859 );
860
861 // if allocating a Staging resource fails, just try again with Normal, as
862 // `CL_MEM_ALLOC_HOST_PTR` is just a performance hint.
863 if res_type == ResourceType::Staging && texture.is_err() {
864 texture = context.create_texture(
865 &image_desc,
866 image_format,
867 host_ptr,
868 bit_check(flags, CL_MEM_COPY_HOST_PTR),
869 ResourceType::Normal,
870 )
871 }
872
873 // We can only keep the host_ptr when `CL_MEM_USE_HOST_PTR` is set.
874 if !bit_check(flags, CL_MEM_USE_HOST_PTR) {
875 host_ptr = ptr::null_mut()
876 }
877
878 Allocation::new(texture?, 0, host_ptr)
879 };
880
881 let pipe_format = image_format.to_pipe_format().unwrap();
882 Ok(Arc::new(Image {
883 base: Self {
884 base: CLObjectBase::new(RusticlTypes::Image),
885 context: context,
886 mem_type: image_desc.image_type,
887 flags: flags,
888 size: image_desc.pixels() * image_format.pixel_size().unwrap() as usize,
889 props: props,
890 gl_obj: None,
891 cbs: Mutex::new(Vec::new()),
892 alloc: alloc,
893 },
894 image_format: *image_format,
895 pipe_format: pipe_format,
896 image_desc: api_image_desc,
897 image_elem_size: image_elem_size,
898 maps: Mutex::new(TrackedPointers::new()),
899 }))
900 }
901
902 pub fn arc_from_raw(ptr: cl_mem) -> CLResult<Mem> {
903 let mem = Self::ref_from_raw(ptr)?;
904 match mem.base.get_type()? {
905 RusticlTypes::Buffer => Ok(Mem::Buffer(Buffer::arc_from_raw(ptr)?)),
906 RusticlTypes::Image => Ok(Mem::Image(Image::arc_from_raw(ptr)?)),
907 _ => Err(CL_INVALID_MEM_OBJECT),
908 }
909 }
910
911 pub fn arcs_from_arr(objs: *const cl_mem, count: u32) -> CLResult<Vec<Mem>> {
912 let count = count as usize;
913 let mut res = Vec::with_capacity(count);
914 for i in 0..count {
915 res.push(Self::arc_from_raw(unsafe { *objs.add(i) })?);
916 }
917 Ok(res)
918 }
919
920 pub fn from_gl(
921 context: Arc<Context>,
922 flags: cl_mem_flags,
923 gl_export_manager: &GLExportManager,
924 ) -> CLResult<cl_mem> {
925 let export_in = &gl_export_manager.export_in;
926 let export_out = &gl_export_manager.export_out;
927
928 let (mem_type, gl_object_type) = target_from_gl(export_in.target)?;
929 let gl_mem_props = gl_export_manager.get_gl_mem_props()?;
930
931 // Handle Buffers
932 let (image_format, pipe_format, rusticl_type) = if gl_export_manager.is_gl_buffer() {
933 (
934 cl_image_format::default(),
935 pipe_format::PIPE_FORMAT_NONE,
936 RusticlTypes::Buffer,
937 )
938 } else {
939 let image_format =
940 format_from_gl(export_out.internal_format).ok_or(CL_OUT_OF_HOST_MEMORY)?;
941 (
942 image_format,
943 image_format.to_pipe_format().unwrap(),
944 RusticlTypes::Image,
945 )
946 };
947
948 let imported_gl_tex = context.import_gl_buffer(
949 export_out.dmabuf_fd as u32,
950 export_out.modifier,
951 mem_type,
952 export_in.target,
953 image_format,
954 gl_mem_props.clone(),
955 )?;
956
957 // Cube map faces are not linear in memory, so copy all contents
958 // of the desired face into a 2D image and copy it back after GL release.
959 let (shadow_map, texture) = if is_cube_map_face(export_in.target) {
960 let shadow = create_shadow_slice(&imported_gl_tex, image_format)?;
961
962 let mut res_map = HashMap::new();
963 shadow
964 .iter()
965 .map(|(k, v)| {
966 let gl_res = Arc::clone(imported_gl_tex.get(k).unwrap());
967 res_map.insert(Arc::clone(v), gl_res);
968 })
969 .for_each(drop);
970
971 (Some(res_map), shadow)
972 } else {
973 (None, imported_gl_tex)
974 };
975
976 // it's kinda not supported, but we want to know if anything actually hits this as it's
977 // certainly not tested by the CL CTS.
978 if mem_type != CL_MEM_OBJECT_BUFFER {
979 assert_eq!(gl_mem_props.offset, 0);
980 }
981
982 let base = Self {
983 base: CLObjectBase::new(rusticl_type),
984 context: context,
985 mem_type: mem_type,
986 flags: flags,
987 size: gl_mem_props.size(),
988 props: Properties::default(),
989 gl_obj: Some(GLObject {
990 gl_object_target: gl_export_manager.export_in.target,
991 gl_object_type: gl_object_type,
992 gl_object_name: export_in.obj,
993 shadow_map: shadow_map,
994 }),
995 cbs: Mutex::new(Vec::new()),
996 alloc: Allocation::new(texture, gl_mem_props.offset as usize, ptr::null_mut()),
997 };
998
999 Ok(if rusticl_type == RusticlTypes::Buffer {
1000 Arc::new(Buffer {
1001 base: base,
1002 maps: Mutex::new(TrackedPointers::new()),
1003 })
1004 .into_cl()
1005 } else {
1006 Arc::new(Image {
1007 base: base,
1008 image_format: image_format,
1009 pipe_format: pipe_format,
1010 image_desc: cl_image_desc {
1011 image_type: mem_type,
1012 image_width: gl_mem_props.width as usize,
1013 image_height: gl_mem_props.height as usize,
1014 image_depth: gl_mem_props.depth as usize,
1015 image_array_size: gl_mem_props.array_size as usize,
1016 image_row_pitch: 0,
1017 image_slice_pitch: 0,
1018 num_mip_levels: 0,
1019 num_samples: 0,
1020 ..Default::default()
1021 },
1022 image_elem_size: gl_mem_props.pixel_size,
1023 maps: Mutex::new(TrackedPointers::new()),
1024 })
1025 .into_cl()
1026 })
1027 }
1028
1029 pub fn is_buffer(&self) -> bool {
1030 self.mem_type == CL_MEM_OBJECT_BUFFER
1031 }
1032
1033 /// Checks if the backing memory is actually the same memory object.
1034 pub fn backing_memory_eq(&self, other: &Self) -> bool {
1035 self.alloc.backing_resource_eq(&other.alloc)
1036 }
1037
1038 // this is kinda bogus, because that won't work with system SVM, but the spec wants us to
1039 // implement this.
1040 pub fn is_svm(&self) -> bool {
1041 self.context
1042 .find_svm_alloc(self.host_ptr() as usize)
1043 .is_some()
1044 && bit_check(self.flags, CL_MEM_USE_HOST_PTR)
1045 }
1046
1047 pub fn get_res_for_access(
1048 &self,
1049 ctx: &QueueContext,
1050 rw: RWFlags,
1051 ) -> CLResult<&Arc<PipeResource>> {
1052 self.alloc.get_res_for_access(ctx, rw)
1053 }
1054
1055 /// Returns the parent memory object or None if self isn't a sub allocated memory object.
1056 pub fn parent(&self) -> Option<&Mem> {
1057 match &self.alloc {
1058 Allocation::SubAlloc(sub) => Some(&sub.mem),
1059 Allocation::Resource(_) => None,
1060 }
1061 }
1062
1063 pub fn host_ptr(&self) -> *mut c_void {
1064 self.alloc.host_ptr()
1065 }
1066
1067 fn is_pure_user_memory(&self, d: &Device) -> CLResult<bool> {
1068 // 1D buffer objects are weird. The parent memory object can be a host_ptr thing, but we are
1069 // not allowed to actually return a pointer based on the host_ptr when mapping.
1070 Ok(self.alloc.is_user_alloc_for_dev(d)? && !self.host_ptr().is_null())
1071 }
1072
1073 fn map<T>(
1074 &self,
1075 offset: usize,
1076 layout: Layout,
1077 writes: bool,
1078 maps: &Mutex<TrackedPointers<usize, Mapping<T>>>,
1079 inner: T,
1080 ) -> CLResult<MutMemoryPtr> {
1081 let host_ptr = self.host_ptr();
1082 let ptr = unsafe {
1083 let ptr = if !host_ptr.is_null() {
1084 host_ptr.byte_add(offset)
1085 } else {
1086 alloc::alloc(layout).cast()
1087 };
1088
1089 MutMemoryPtr::from_ptr(ptr)
1090 };
1091
1092 match maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
1093 Entry::Occupied(mut e) => {
1094 debug_assert!(!host_ptr.is_null());
1095 e.get_mut().count += 1;
1096 }
1097 Entry::Vacant(e) => {
1098 e.insert(Mapping {
1099 layout: layout,
1100 writes: writes,
1101 ptr: host_ptr.is_null().then_some(ptr),
1102 count: 1,
1103 inner: inner,
1104 });
1105 }
1106 }
1107
1108 Ok(ptr)
1109 }
1110 }
1111
1112 impl Drop for MemBase {
1113 fn drop(&mut self) {
1114 let cbs = mem::take(self.cbs.get_mut().unwrap());
1115 for cb in cbs.into_iter().rev() {
1116 cb.call(self);
1117 }
1118 }
1119 }
1120
1121 impl Buffer {
1122 fn apply_offset(&self, offset: usize) -> CLResult<usize> {
1123 self.offset()
1124 .checked_add(offset)
1125 .ok_or(CL_OUT_OF_HOST_MEMORY)
1126 }
1127
1128 pub fn copy_rect(
1129 &self,
1130 dst: &Self,
1131 ctx: &QueueContext,
1132 region: &CLVec<usize>,
1133 src_origin: &CLVec<usize>,
1134 src_row_pitch: usize,
1135 src_slice_pitch: usize,
1136 dst_origin: &CLVec<usize>,
1137 dst_row_pitch: usize,
1138 dst_slice_pitch: usize,
1139 ) -> CLResult<()> {
1140 let (offset, size) =
1141 CLVec::calc_offset_size(src_origin, region, [1, src_row_pitch, src_slice_pitch]);
1142 let tx_src = self.tx(ctx, offset, size, RWFlags::RD)?;
1143
1144 let (offset, size) =
1145 CLVec::calc_offset_size(dst_origin, region, [1, dst_row_pitch, dst_slice_pitch]);
1146 let tx_dst = dst.tx(ctx, offset, size, RWFlags::WR)?;
1147
1148 perf_warning!("clEnqueueCopyBufferRect stalls the GPU");
1149
1150 // TODO check to use hw accelerated paths (e.g. resource_copy_region or blits)
1151 sw_copy(
1152 tx_src.ptr(),
1153 tx_dst.ptr(),
1154 region,
1155 &CLVec::default(),
1156 src_row_pitch,
1157 src_slice_pitch,
1158 &CLVec::default(),
1159 dst_row_pitch,
1160 dst_slice_pitch,
1161 1,
1162 );
1163
1164 Ok(())
1165 }
1166
1167 pub fn copy_to_buffer(
1168 &self,
1169 ctx: &QueueContext,
1170 dst: &Buffer,
1171 src_offset: usize,
1172 dst_offset: usize,
1173 size: usize,
1174 ) -> CLResult<()> {
1175 let src_offset = self.apply_offset(src_offset)?;
1176 let dst_offset = dst.apply_offset(dst_offset)?;
1177 let src_res = self.get_res_for_access(ctx, RWFlags::RD)?;
1178 let dst_res = dst.get_res_for_access(ctx, RWFlags::WR)?;
1179
1180 let bx = create_pipe_box(
1181 [src_offset, 0, 0].into(),
1182 [size, 1, 1].into(),
1183 CL_MEM_OBJECT_BUFFER,
1184 )?;
1185 let dst_origin: [u32; 3] = [dst_offset.try_into_with_err(CL_OUT_OF_HOST_MEMORY)?, 0, 0];
1186
1187 ctx.resource_copy_region(src_res, dst_res, &dst_origin, &bx);
1188 Ok(())
1189 }
1190
1191 pub fn copy_to_image(
1192 &self,
1193 ctx: &QueueContext,
1194 dst: &Image,
1195 src_offset: usize,
1196 dst_origin: CLVec<usize>,
1197 region: &CLVec<usize>,
1198 ) -> CLResult<()> {
1199 let src_offset = self.apply_offset(src_offset)?;
1200 let bpp = dst.image_format.pixel_size().unwrap().into();
1201 let src_pitch = [bpp, bpp * region[0], bpp * region[0] * region[1]];
1202 let size = CLVec::calc_size(region, src_pitch);
1203 let tx_src = self.tx(ctx, src_offset, size, RWFlags::RD)?;
1204
1205 // If the image is created from a buffer, use the image's row and slice pitch instead
1206 let tx_dst;
1207 let dst_pitch;
1208 if let Some(Mem::Buffer(buffer)) = dst.parent() {
1209 dst_pitch = [
1210 bpp,
1211 dst.image_desc.row_pitch()? as usize,
1212 dst.image_desc.slice_pitch(),
1213 ];
1214
1215 let (offset, size) = CLVec::calc_offset_size(dst_origin, region, dst_pitch);
1216 tx_dst = buffer.tx(ctx, offset, size, RWFlags::WR)?;
1217 } else {
1218 tx_dst = dst.tx_image(
1219 ctx,
1220 &create_pipe_box(dst_origin, *region, dst.mem_type)?,
1221 RWFlags::WR,
1222 )?;
1223
1224 dst_pitch = [1, tx_dst.row_pitch() as usize, tx_dst.slice_pitch()];
1225 }
1226
1227 // Those pitch values cannot have a 0 in any of their coordinates
1228 debug_assert!(src_pitch[0] != 0 && src_pitch[1] != 0 && src_pitch[2] != 0);
1229 debug_assert!(dst_pitch[0] != 0 && dst_pitch[1] != 0 && dst_pitch[2] != 0);
1230
1231 perf_warning!("clEnqueueCopyBufferToImage stalls the GPU");
1232
1233 sw_copy(
1234 tx_src.ptr(),
1235 tx_dst.ptr(),
1236 region,
1237 &CLVec::default(),
1238 src_pitch[1],
1239 src_pitch[2],
1240 &CLVec::default(),
1241 dst_pitch[1],
1242 dst_pitch[2],
1243 bpp as u8,
1244 );
1245 Ok(())
1246 }
1247
1248 pub fn fill(
1249 &self,
1250 ctx: &QueueContext,
1251 pattern: &[u8],
1252 offset: usize,
1253 size: usize,
1254 ) -> CLResult<()> {
1255 let offset = self.apply_offset(offset)?;
1256 let res = self.get_res_for_access(ctx, RWFlags::WR)?;
1257 ctx.clear_buffer(
1258 res,
1259 pattern,
1260 offset.try_into_with_err(CL_OUT_OF_HOST_MEMORY)?,
1261 size.try_into_with_err(CL_OUT_OF_HOST_MEMORY)?,
1262 );
1263 Ok(())
1264 }
1265
1266 fn is_mapped_ptr(&self, ptr: *mut c_void) -> bool {
1267 let mut maps = self.maps.lock().unwrap();
1268 let entry = maps.entry(ptr as usize);
1269 matches!(entry, Entry::Occupied(entry) if entry.get().count > 0)
1270 }
1271
1272 pub fn map(&self, size: usize, offset: usize, writes: bool) -> CLResult<MutMemoryPtr> {
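// The shadow allocation is aligned to size_of::<[cl_ulong; 16]>() (128 bytes), which is
// generous enough for any OpenCL built-in type (e.g. long16/double16).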
1273 let layout =
1274 unsafe { Layout::from_size_align_unchecked(size, size_of::<[cl_ulong; 16]>()) };
1275 self.base.map(
1276 offset,
1277 layout,
1278 writes,
1279 &self.maps,
1280 BufferMapping { offset: offset },
1281 )
1282 }
1283
1284 pub fn offset(&self) -> usize {
1285 self.alloc.offset()
1286 }
1287
1288 pub fn read(
1289 &self,
1290 ctx: &QueueContext,
1291 offset: usize,
1292 ptr: MutMemoryPtr,
1293 size: usize,
1294 ) -> CLResult<()> {
1295 let ptr = ptr.as_ptr();
1296 let tx = self.tx(ctx, offset, size, RWFlags::RD)?;
1297
1298 perf_warning!("clEnqueueReadBuffer and clEnqueueMapBuffer stall the GPU");
1299
1300 unsafe {
1301 ptr::copy(tx.ptr(), ptr, size);
1302 }
1303
1304 Ok(())
1305 }
1306
1307 pub fn read_rect(
1308 &self,
1309 dst: MutMemoryPtr,
1310 ctx: &QueueContext,
1311 region: &CLVec<usize>,
1312 src_origin: &CLVec<usize>,
1313 src_row_pitch: usize,
1314 src_slice_pitch: usize,
1315 dst_origin: &CLVec<usize>,
1316 dst_row_pitch: usize,
1317 dst_slice_pitch: usize,
1318 ) -> CLResult<()> {
1319 let dst = dst.as_ptr();
1320 let (offset, size) =
1321 CLVec::calc_offset_size(src_origin, region, [1, src_row_pitch, src_slice_pitch]);
1322 let tx = self.tx(ctx, offset, size, RWFlags::RD)?;
1323
1324 perf_warning!("clEnqueueReadBufferRect stalls the GPU");
1325
1326 sw_copy(
1327 tx.ptr(),
1328 dst,
1329 region,
1330 &CLVec::default(),
1331 src_row_pitch,
1332 src_slice_pitch,
1333 dst_origin,
1334 dst_row_pitch,
1335 dst_slice_pitch,
1336 1,
1337 );
1338
1339 Ok(())
1340 }
1341
1342 pub fn sync_map(&self, ctx: &QueueContext, ptr: MutMemoryPtr) -> CLResult<()> {
1343 let maps = self.maps.lock().unwrap();
1344 let Some(mapping) = maps.find_alloc_precise(ptr.as_ptr() as usize) else {
1345 return Err(CL_INVALID_VALUE);
1346 };
1347
1348 // in this case we only need to migrate to the device if the data is located on a device not
1349 // having a userptr allocation.
1350 if self.is_pure_user_memory(ctx.dev)? {
1351 let rw = if mapping.writes {
1352 RWFlags::RW
1353 } else {
1354 RWFlags::RD
1355 };
1356
1357 let _ = self.get_res_for_access(ctx, rw)?;
1358 return Ok(());
1359 }
1360
1361 self.read(ctx, mapping.offset, ptr, mapping.size())
1362 }
1363
1364 pub fn sync_unmap(&self, ctx: &QueueContext, ptr: MutMemoryPtr) -> CLResult<()> {
1365 // no need to update
1366 if self.is_pure_user_memory(ctx.dev)? {
1367 return Ok(());
1368 }
1369
1370 match self.maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
1371 Entry::Vacant(_) => Err(CL_INVALID_VALUE),
1372 Entry::Occupied(entry) => {
1373 let mapping = entry.get();
1374
1375 if mapping.writes {
1376 self.write(ctx, mapping.offset, ptr.into(), mapping.size())?;
1377 }
1378
1379 // only remove if the mapping wasn't reused in the meantime
1380 if mapping.count == 0 {
1381 entry.remove();
1382 }
1383
1384 Ok(())
1385 }
1386 }
1387 }
1388
1389 fn tx<'a>(
1390 &self,
1391 ctx: &'a QueueContext,
1392 offset: usize,
1393 size: usize,
1394 rw: RWFlags,
1395 ) -> CLResult<PipeTransfer<'a>> {
1396 let offset = self.apply_offset(offset)?;
1397 let r = self.get_res_for_access(ctx, rw)?;
1398
1399 ctx.buffer_map(
1400 r,
1401 offset.try_into_with_err(CL_OUT_OF_HOST_MEMORY)?,
1402 size.try_into_with_err(CL_OUT_OF_HOST_MEMORY)?,
1403 rw,
1404 )
1405 .ok_or(CL_OUT_OF_RESOURCES)
1406 }
1407
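/// Drops one API-level reference to the mapping at `ptr` and reports whether it was the last
/// one. For example (hypothetical sequence), mapping the same `CL_MEM_USE_HOST_PTR` region
/// twice bumps the count to 2, so only the second `unmap` returns `Ok(true)`.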
1408 pub fn unmap(&self, ptr: MutMemoryPtr) -> CLResult<bool> {
1409 match self.maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
1410 Entry::Vacant(_) => Err(CL_INVALID_VALUE),
1411 Entry::Occupied(mut entry) => {
1412 let entry = entry.get_mut();
1413 debug_assert!(entry.count > 0);
1414 entry.count -= 1;
1415 Ok(entry.count == 0)
1416 }
1417 }
1418 }
1419
1420 pub fn write(
1421 &self,
1422 ctx: &QueueContext,
1423 offset: usize,
1424 ptr: ConstMemoryPtr,
1425 size: usize,
1426 ) -> CLResult<()> {
1427 let ptr = ptr.as_ptr();
1428 let offset = self.apply_offset(offset)?;
1429 let r = self.get_res_for_access(ctx, RWFlags::WR)?;
1430
1431 perf_warning!("clEnqueueWriteBuffer and clEnqueueUnmapMemObject might stall the GPU");
1432
1433 ctx.buffer_subdata(
1434 r,
1435 offset.try_into_with_err(CL_OUT_OF_HOST_MEMORY)?,
1436 ptr,
1437 size.try_into_with_err(CL_OUT_OF_HOST_MEMORY)?,
1438 );
1439 Ok(())
1440 }
1441
1442 pub fn write_rect(
1443 &self,
1444 src: ConstMemoryPtr,
1445 ctx: &QueueContext,
1446 region: &CLVec<usize>,
1447 src_origin: &CLVec<usize>,
1448 src_row_pitch: usize,
1449 src_slice_pitch: usize,
1450 dst_origin: &CLVec<usize>,
1451 dst_row_pitch: usize,
1452 dst_slice_pitch: usize,
1453 ) -> CLResult<()> {
1454 let src = src.as_ptr();
1455 let (offset, size) =
1456 CLVec::calc_offset_size(dst_origin, region, [1, dst_row_pitch, dst_slice_pitch]);
1457 let tx = self.tx(ctx, offset, size, RWFlags::WR)?;
1458
1459 perf_warning!("clEnqueueWriteBufferRect stalls the GPU");
1460
1461 sw_copy(
1462 src,
1463 tx.ptr(),
1464 region,
1465 src_origin,
1466 src_row_pitch,
1467 src_slice_pitch,
1468 &CLVec::default(),
1469 dst_row_pitch,
1470 dst_slice_pitch,
1471 1,
1472 );
1473
1474 Ok(())
1475 }
1476 }
1477
1478 impl Image {
1479 pub fn copy_to_buffer(
1480 &self,
1481 ctx: &QueueContext,
1482 dst: &Buffer,
1483 src_origin: CLVec<usize>,
1484 dst_offset: usize,
1485 region: &CLVec<usize>,
1486 ) -> CLResult<()> {
1487 let bpp = self.image_format.pixel_size().unwrap().into();
1488
1489 let src_pitch;
1490 let tx_src;
1491 if let Some(Mem::Buffer(buffer)) = self.parent() {
1492 src_pitch = [
1493 bpp,
1494 self.image_desc.row_pitch()? as usize,
1495 self.image_desc.slice_pitch(),
1496 ];
1497 let (offset, size) = CLVec::calc_offset_size(src_origin, region, src_pitch);
1498 tx_src = buffer.tx(ctx, offset, size, RWFlags::RD)?;
1499 } else {
1500 tx_src = self.tx_image(
1501 ctx,
1502 &create_pipe_box(src_origin, *region, self.mem_type)?,
1503 RWFlags::RD,
1504 )?;
1505 src_pitch = [1, tx_src.row_pitch() as usize, tx_src.slice_pitch()];
1506 }
1507
1508 // If the image is created from a buffer, use the image's row and slice pitch instead
1509 let dst_pitch = [bpp, bpp * region[0], bpp * region[0] * region[1]];
1510
1511 let dst_origin: CLVec<usize> = [dst_offset, 0, 0].into();
1512 let (offset, size) = CLVec::calc_offset_size(dst_origin, region, dst_pitch);
1513 let tx_dst = dst.tx(ctx, offset, size, RWFlags::WR)?;
1514
1515 // Those pitch values cannot have a 0 in any of their coordinates
1516 debug_assert!(src_pitch[0] != 0 && src_pitch[1] != 0 && src_pitch[2] != 0);
1517 debug_assert!(dst_pitch[0] != 0 && dst_pitch[1] != 0 && dst_pitch[2] != 0);
1518
1519 perf_warning!("clEnqueueCopyImageToBuffer stalls the GPU");
1520
1521 sw_copy(
1522 tx_src.ptr(),
1523 tx_dst.ptr(),
1524 region,
1525 &CLVec::default(),
1526 src_pitch[1],
1527 src_pitch[2],
1528 &CLVec::default(),
1529 dst_pitch[1],
1530 dst_pitch[2],
1531 bpp as u8,
1532 );
1533 Ok(())
1534 }
1535
1536 pub fn copy_to_image(
1537 &self,
1538 ctx: &QueueContext,
1539 dst: &Image,
1540 src_origin: CLVec<usize>,
1541 dst_origin: CLVec<usize>,
1542 region: &CLVec<usize>,
1543 ) -> CLResult<()> {
1544 let src_res = self.get_res_for_access(ctx, RWFlags::RD)?;
1545 let dst_res = dst.get_res_for_access(ctx, RWFlags::WR)?;
1546
1547 // We just want to use sw_copy if the mem objects have different types or if the copy can have
1548 // custom strides (image2d created from buffers/images)
1549 if self.is_parent_buffer() || dst.is_parent_buffer() {
1550 let bpp = self.image_format.pixel_size().unwrap().into();
1551
1552 let tx_src;
1553 let tx_dst;
1554 let dst_pitch;
1555 let src_pitch;
1556 if let Some(Mem::Buffer(buffer)) = self.parent() {
1557 src_pitch = [
1558 bpp,
1559 self.image_desc.row_pitch()? as usize,
1560 self.image_desc.slice_pitch(),
1561 ];
1562
1563 let (offset, size) = CLVec::calc_offset_size(src_origin, region, src_pitch);
1564 tx_src = buffer.tx(ctx, offset, size, RWFlags::RD)?;
1565 } else {
1566 tx_src = self.tx_image(
1567 ctx,
1568 &create_pipe_box(src_origin, *region, self.mem_type)?,
1569 RWFlags::RD,
1570 )?;
1571
1572 src_pitch = [1, tx_src.row_pitch() as usize, tx_src.slice_pitch()];
1573 }
1574
1575 if let Some(Mem::Buffer(buffer)) = dst.parent() {
1576 // If the image is created from a buffer, use the image's row and slice pitch instead
1577 dst_pitch = [
1578 bpp,
1579 dst.image_desc.row_pitch()? as usize,
1580 dst.image_desc.slice_pitch(),
1581 ];
1582
1583 let (offset, size) = CLVec::calc_offset_size(dst_origin, region, dst_pitch);
1584 tx_dst = buffer.tx(ctx, offset, size, RWFlags::WR)?;
1585 } else {
1586 tx_dst = dst.tx_image(
1587 ctx,
1588 &create_pipe_box(dst_origin, *region, dst.mem_type)?,
1589 RWFlags::WR,
1590 )?;
1591
1592 dst_pitch = [1, tx_dst.row_pitch() as usize, tx_dst.slice_pitch()];
1593 }
1594
1595 // Those pitch values cannot have a 0 in any of their coordinates
1596 debug_assert!(src_pitch[0] != 0 && src_pitch[1] != 0 && src_pitch[2] != 0);
1597 debug_assert!(dst_pitch[0] != 0 && dst_pitch[1] != 0 && dst_pitch[2] != 0);
1598
1599 perf_warning!(
1600 "clEnqueueCopyImage stalls the GPU when src or dst are created from a buffer"
1601 );
1602
1603 sw_copy(
1604 tx_src.ptr(),
1605 tx_dst.ptr(),
1606 region,
1607 &CLVec::default(),
1608 src_pitch[1],
1609 src_pitch[2],
1610 &CLVec::default(),
1611 dst_pitch[1],
1612 dst_pitch[2],
1613 bpp as u8,
1614 )
1615 } else {
1616 let bx = create_pipe_box(src_origin, *region, self.mem_type)?;
1617 let mut dst_origin: [u32; 3] = dst_origin.try_into()?;
1618
1619 if self.mem_type == CL_MEM_OBJECT_IMAGE1D_ARRAY {
1620 (dst_origin[1], dst_origin[2]) = (dst_origin[2], dst_origin[1]);
1621 }
1622
1623 ctx.resource_copy_region(src_res, dst_res, &dst_origin, &bx);
1624 }
1625 Ok(())
1626 }
1627
1628 pub fn fill(
1629 &self,
1630 ctx: &QueueContext,
1631 pattern: [u32; 4],
1632 origin: &CLVec<usize>,
1633 region: &CLVec<usize>,
1634 ) -> CLResult<()> {
1635 let res = self.get_res_for_access(ctx, RWFlags::WR)?;
1636
1637 // make sure we allocate multiples of 4 bytes so drivers don't read out of bounds or
1638 // unaligned.
1639 let pixel_size: usize = self.image_format.pixel_size().unwrap().into();
1640 let mut new_pattern: Vec<u32> = vec![0; pixel_size.div_ceil(size_of::<u32>())];
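// e.g. (illustrative formats) a CL_RGBA + CL_UNORM_INT8 image has a pixel size of 4, so a
// single u32 is enough, while CL_RGBA + CL_FLOAT needs four u32s for its 16-byte pixels.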
1641
1642 // SAFETY: pointers have to be valid for read/writes of exactly one pixel of their
1643 // respective format.
1644 // `new_pattern` has the correct size due to the `size` above.
1645 // `pattern` is validated through the CL API, which allows undefined behavior if its rules
1646 // are not followed.
1647 unsafe {
1648 util_format_pack_rgba(
1649 self.pipe_format,
1650 new_pattern.as_mut_ptr().cast(),
1651 pattern.as_ptr().cast(),
1652 1,
1653 );
1654 }
1655
1656 // If the image is created from a buffer, use clear_image_buffer instead
1657 if self.is_parent_buffer() {
1658 let strides = (
1659 self.image_desc.row_pitch()? as usize,
1660 self.image_desc.slice_pitch(),
1661 );
1662 ctx.clear_image_buffer(res, &new_pattern, origin, region, strides, pixel_size);
1663 } else {
1664 let bx = create_pipe_box(*origin, *region, self.mem_type)?;
1665 ctx.clear_texture(res, &new_pattern, &bx);
1666 }
1667
1668 Ok(())
1669 }
1670
1671 fn is_mapped_ptr(&self, ptr: *mut c_void) -> bool {
1672 let mut maps = self.maps.lock().unwrap();
1673 let entry = maps.entry(ptr as usize);
1674 matches!(entry, Entry::Occupied(entry) if entry.get().count > 0)
1675 }
1676
1677 pub fn is_parent_buffer(&self) -> bool {
1678 matches!(self.parent(), Some(Mem::Buffer(_)))
1679 }
1680
1681 pub fn map(
1682 &self,
1683 origin: CLVec<usize>,
1684 region: CLVec<usize>,
1685 row_pitch: &mut usize,
1686 slice_pitch: &mut usize,
1687 writes: bool,
1688 ) -> CLResult<MutMemoryPtr> {
1689 let pixel_size = self.image_format.pixel_size().unwrap() as usize;
1690
1691 *row_pitch = self.image_desc.row_pitch()? as usize;
1692 *slice_pitch = self.image_desc.slice_pitch();
1693
1694 let offset = CLVec::calc_offset(origin, [pixel_size, *row_pitch, *slice_pitch]);
1695
1696 // From the CL Spec:
1697 //
1698 // The pointer returned maps a 1D, 2D or 3D region starting at origin and is at least
1699 // region[0] pixels in size for a 1D image, 1D image buffer or 1D image array,
1700 // (image_row_pitch × region[1]) pixels in size for a 2D image or 2D image array, and
1701 // (image_slice_pitch × region[2]) pixels in size for a 3D image. The result of a memory
1702 // access outside this region is undefined.
1703 //
1704 // It's not guaranteed that the row_pitch is taken into account for 1D images, but the CL
1705 // CTS relies on this behavior.
1706 //
1707 // Also note that the spec wording is wrong in regard to arrays, which need to take the
1708 // image_slice_pitch into account.
1709 let size = if self.image_desc.is_array() || self.image_desc.dims() == 3 {
1710 debug_assert_ne!(*slice_pitch, 0);
1711 // the slice count is in region[1] for 1D array images
1712 if self.mem_type == CL_MEM_OBJECT_IMAGE1D_ARRAY {
1713 region[1] * *slice_pitch
1714 } else {
1715 region[2] * *slice_pitch
1716 }
1717 } else {
1718 debug_assert_ne!(*row_pitch, 0);
1719 region[1] * *row_pitch
1720 };
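// e.g. (made-up descriptor): mapping region [64, 10, 1] of a 1D image array with a slice
// pitch of 4096 bytes yields a 10 * 4096 byte mapping, while a 64x32 2D image with a row
// pitch of 256 bytes yields 32 * 256 bytes.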
1721
1722 let layout;
1723 unsafe {
1724 layout = Layout::from_size_align_unchecked(size, size_of::<[u32; 4]>());
1725 }
1726
1727 self.base.map(
1728 offset,
1729 layout,
1730 writes,
1731 &self.maps,
1732 ImageMapping {
1733 origin: origin,
1734 region: region,
1735 },
1736 )
1737 }
1738
1739 fn pipe_image_host_access(&self) -> u16 {
1740 // those flags are all mutually exclusive
1741 (if bit_check(self.flags, CL_MEM_HOST_READ_ONLY) {
1742 PIPE_IMAGE_ACCESS_READ
1743 } else if bit_check(self.flags, CL_MEM_HOST_WRITE_ONLY) {
1744 PIPE_IMAGE_ACCESS_WRITE
1745 } else if bit_check(self.flags, CL_MEM_HOST_NO_ACCESS) {
1746 0
1747 } else {
1748 PIPE_IMAGE_ACCESS_READ_WRITE
1749 }) as u16
1750 }
1751
1752 pub fn read(
1753 &self,
1754 dst: MutMemoryPtr,
1755 ctx: &QueueContext,
1756 region: &CLVec<usize>,
1757 src_origin: &CLVec<usize>,
1758 dst_row_pitch: usize,
1759 dst_slice_pitch: usize,
1760 ) -> CLResult<()> {
1761 let dst = dst.as_ptr();
1762 let pixel_size = self.image_format.pixel_size().unwrap();
1763
1764 let tx;
1765 let src_row_pitch;
1766 let src_slice_pitch;
1767 if let Some(Mem::Buffer(buffer)) = self.parent() {
1768 src_row_pitch = self.image_desc.image_row_pitch;
1769 src_slice_pitch = self.image_desc.image_slice_pitch;
1770
1771 let (offset, size) = CLVec::calc_offset_size(
1772 src_origin,
1773 region,
1774 [pixel_size.into(), src_row_pitch, src_slice_pitch],
1775 );
1776
1777 tx = buffer.tx(ctx, offset, size, RWFlags::RD)?;
1778 } else {
1779 let bx = create_pipe_box(*src_origin, *region, self.mem_type)?;
1780 tx = self.tx_image(ctx, &bx, RWFlags::RD)?;
1781 src_row_pitch = tx.row_pitch() as usize;
1782 src_slice_pitch = tx.slice_pitch();
1783 };
1784
1785 perf_warning!("clEnqueueReadImage and clEnqueueMapImage stall the GPU");
1786
1787 sw_copy(
1788 tx.ptr(),
1789 dst,
1790 region,
1791 &CLVec::default(),
1792 src_row_pitch,
1793 src_slice_pitch,
1794 &CLVec::default(),
1795 dst_row_pitch,
1796 dst_slice_pitch,
1797 pixel_size,
1798 );
1799
1800 Ok(())
1801 }
1802
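    /// Makes the content of the mapping at `ptr` valid for host access. For purely user-memory
    /// backed allocations it's enough to validate the resource for the device; otherwise the
    /// mapped region is read back into the host-side allocation.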
    pub fn sync_map(&self, ctx: &QueueContext, ptr: MutMemoryPtr) -> CLResult<()> {
        let maps = self.maps.lock().unwrap();
        let Some(mapping) = maps.find_alloc_precise(ptr.as_ptr() as usize) else {
            return Err(CL_INVALID_VALUE);
        };

        // In this case we only need to migrate to the device if the data is located on a device
        // without a userptr allocation.
        if self.is_pure_user_memory(ctx.dev)? {
            let rw = if mapping.writes {
                RWFlags::RW
            } else {
                RWFlags::RD
            };

            let _ = self.get_res_for_access(ctx, rw)?;
            return Ok(());
        }

        let row_pitch = self.image_desc.row_pitch()? as usize;
        let slice_pitch = self.image_desc.slice_pitch();

        self.read(
            ptr,
            ctx,
            &mapping.region,
            &mapping.origin,
            row_pitch,
            slice_pitch,
        )
    }

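    /// Writes the content of the mapping at `ptr` back to the resource if the mapping was
    /// created for writing, and drops the bookkeeping entry once it's no longer referenced.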
    pub fn sync_unmap(&self, ctx: &QueueContext, ptr: MutMemoryPtr) -> CLResult<()> {
        // No need to update purely user-memory backed allocations.
        if self.is_pure_user_memory(ctx.dev)? {
            return Ok(());
        }

        match self.maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
            Entry::Vacant(_) => Err(CL_INVALID_VALUE),
            Entry::Occupied(entry) => {
                let mapping = entry.get();
                let row_pitch = self.image_desc.row_pitch()? as usize;
                let slice_pitch = self.image_desc.slice_pitch();

                if mapping.writes {
                    self.write(
                        ptr.into(),
                        ctx,
                        &mapping.region,
                        row_pitch,
                        slice_pitch,
                        &mapping.origin,
                    )?;
                }

                // Only remove the entry if the mapping wasn't reused in the meantime.
                if mapping.count == 0 {
                    entry.remove();
                }

                Ok(())
            }
        }
    }

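    /// Maps the image resource for host access and returns the resulting transfer.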
    fn tx_image<'a>(
        &self,
        ctx: &'a QueueContext,
        bx: &pipe_box,
        rw: RWFlags,
    ) -> CLResult<PipeTransfer<'a>> {
        let r = self.get_res_for_access(ctx, rw)?;
        ctx.texture_map(r, bx, rw).ok_or(CL_OUT_OF_RESOURCES)
    }

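    /// Drops one API reference from the mapping at `ptr`. Returns `Ok(true)` if this was the
    /// last reference.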
    pub fn unmap(&self, ptr: MutMemoryPtr) -> CLResult<bool> {
        match self.maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
            Entry::Vacant(_) => Err(CL_INVALID_VALUE),
            Entry::Occupied(mut entry) => {
                let entry = entry.get_mut();
                debug_assert!(entry.count > 0);
                entry.count -= 1;
                Ok(entry.count == 0)
            }
        }
    }

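    /// Writes host data at `src` into the given image region. Images created on top of a buffer
    /// are updated through the parent buffer's transfer; all other images go through
    /// texture_subdata.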
    pub fn write(
        &self,
        src: ConstMemoryPtr,
        ctx: &QueueContext,
        region: &CLVec<usize>,
        src_row_pitch: usize,
        mut src_slice_pitch: usize,
        dst_origin: &CLVec<usize>,
    ) -> CLResult<()> {
        let src = src.as_ptr();
        let dst_row_pitch = self.image_desc.image_row_pitch;
        let dst_slice_pitch = self.image_desc.image_slice_pitch;

        // texture_subdata most likely maps the resource anyway
        perf_warning!("clEnqueueWriteImage and clEnqueueUnmapMemObject stall the GPU");

        if let Some(Mem::Buffer(buffer)) = self.parent() {
            let pixel_size = self.image_format.pixel_size().unwrap();
            let (offset, size) = CLVec::calc_offset_size(
                dst_origin,
                region,
                [pixel_size.into(), dst_row_pitch, dst_slice_pitch],
            );
            let tx = buffer.tx(ctx, offset, size, RWFlags::WR)?;

            sw_copy(
                src,
                tx.ptr(),
                region,
                &CLVec::default(),
                src_row_pitch,
                src_slice_pitch,
                &CLVec::default(),
                dst_row_pitch,
                dst_slice_pitch,
                pixel_size,
            );
        } else {
            let res = self.get_res_for_access(ctx, RWFlags::WR)?;
            let bx = create_pipe_box(*dst_origin, *region, self.mem_type)?;

            if self.mem_type == CL_MEM_OBJECT_IMAGE1D_ARRAY {
                src_slice_pitch = src_row_pitch;
            }

            ctx.texture_subdata(
                res,
                &bx,
                src,
                src_row_pitch.try_into_with_err(CL_OUT_OF_HOST_MEMORY)?,
                src_slice_pitch,
            );
        }
        Ok(())
    }

    /// Creates metadata for when a 2D image or sampler view is created over a buffer resource.
    fn buffer_2d_info(&self) -> CLResult<AppImgInfo> {
        Ok(AppImgInfo::new(
            self.image_desc.row_pitch()? / self.image_elem_size as u32,
            self.image_desc.width()?,
            self.image_desc.height()?,
        ))
    }

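    /// Creates a pipe sampler view for reading from this image, special-casing 2D and 1D images
    /// backed by a buffer resource.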
    pub fn sampler_view<'c>(&self, ctx: &'c QueueContext) -> CLResult<PipeSamplerView<'c, '_>> {
        let res = self.get_res_for_access(ctx, RWFlags::RD)?;

        let template = if res.is_buffer() && self.mem_type == CL_MEM_OBJECT_IMAGE2D {
            res.pipe_sampler_view_template_2d_buffer(self.pipe_format, &self.buffer_2d_info()?)
        } else if res.is_buffer() {
            // we need to pass in the size of the buffer, not the width.
            let size = self.size.try_into_with_err(CL_OUT_OF_RESOURCES)?;
            res.pipe_sampler_view_template_1d_buffer(self.pipe_format, size)
        } else {
            res.pipe_sampler_view_template()
        };

        PipeSamplerView::new(ctx, res, &template).ok_or(CL_OUT_OF_HOST_MEMORY)
    }

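    /// Creates a pipe image view for binding this image as a shader image, again special-casing
    /// 2D and 1D images backed by a buffer resource.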
    pub fn image_view(&self, ctx: &QueueContext, read_write: bool) -> CLResult<PipeImageView> {
        let rw = if read_write { RWFlags::RW } else { RWFlags::WR };

        let res = self.get_res_for_access(ctx, rw)?;
        if res.is_buffer() && self.mem_type == CL_MEM_OBJECT_IMAGE2D {
            Ok(res.pipe_image_view_2d_buffer(
                self.pipe_format,
                read_write,
                self.pipe_image_host_access(),
                &self.buffer_2d_info()?,
            ))
        } else if res.is_buffer() {
            let size = self.size.try_into_with_err(CL_OUT_OF_RESOURCES)?;
            Ok(res.pipe_image_view_1d_buffer(
                self.pipe_format,
                read_write,
                self.pipe_image_host_access(),
                size,
            ))
        } else {
            Ok(res.pipe_image_view(read_write, self.pipe_image_host_access()))
        }
    }
}

pub struct Sampler {
    pub base: CLObjectBase<CL_INVALID_SAMPLER>,
    pub context: Arc<Context>,
    pub normalized_coords: bool,
    pub addressing_mode: cl_addressing_mode,
    pub filter_mode: cl_filter_mode,
    pub props: Properties<cl_sampler_properties>,
}

impl_cl_type_trait!(cl_sampler, Sampler, CL_INVALID_SAMPLER);

impl Sampler {
    pub fn new(
        context: Arc<Context>,
        normalized_coords: bool,
        addressing_mode: cl_addressing_mode,
        filter_mode: cl_filter_mode,
        props: Properties<cl_sampler_properties>,
    ) -> Arc<Sampler> {
        Arc::new(Self {
            base: CLObjectBase::new(RusticlTypes::Sampler),
            context,
            normalized_coords,
            addressing_mode,
            filter_mode,
            props,
        })
    }

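    /// Converts the sampler state as encoded for NIR (raw `u32` enum values) into the
    /// corresponding CL addressing mode, filter mode and normalized-coordinates flag.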
    pub fn nir_to_cl(
        addressing_mode: u32,
        filter_mode: u32,
        normalized_coords: u32,
    ) -> (cl_addressing_mode, cl_filter_mode, bool) {
        let addr_mode = match addressing_mode {
            cl_sampler_addressing_mode::SAMPLER_ADDRESSING_MODE_NONE => CL_ADDRESS_NONE,
            cl_sampler_addressing_mode::SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE => {
                CL_ADDRESS_CLAMP_TO_EDGE
            }
            cl_sampler_addressing_mode::SAMPLER_ADDRESSING_MODE_CLAMP => CL_ADDRESS_CLAMP,
            cl_sampler_addressing_mode::SAMPLER_ADDRESSING_MODE_REPEAT => CL_ADDRESS_REPEAT,
            cl_sampler_addressing_mode::SAMPLER_ADDRESSING_MODE_REPEAT_MIRRORED => {
                CL_ADDRESS_MIRRORED_REPEAT
            }
            _ => panic!("unknown addressing_mode"),
        };

        let filter = match filter_mode {
            cl_sampler_filter_mode::SAMPLER_FILTER_MODE_NEAREST => CL_FILTER_NEAREST,
            cl_sampler_filter_mode::SAMPLER_FILTER_MODE_LINEAR => CL_FILTER_LINEAR,
            _ => panic!("unknown filter_mode"),
        };

        (addr_mode, filter, normalized_coords != 0)
    }

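    /// Translates an (addressing mode, filter mode, normalized coords) triple into the
    /// equivalent gallium `pipe_sampler_state`.
    ///
    /// A minimal usage sketch (not compiled as a doc test):
    ///
    /// ```ignore
    /// let state = Sampler::cl_to_pipe((CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, true));
    /// ```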
    pub fn cl_to_pipe(
        (addressing_mode, filter_mode, normalized_coords): (
            cl_addressing_mode,
            cl_filter_mode,
            bool,
        ),
    ) -> pipe_sampler_state {
        let mut res = pipe_sampler_state::default();

        let wrap = match addressing_mode {
            CL_ADDRESS_CLAMP_TO_EDGE => pipe_tex_wrap::PIPE_TEX_WRAP_CLAMP_TO_EDGE,
            CL_ADDRESS_CLAMP => pipe_tex_wrap::PIPE_TEX_WRAP_CLAMP_TO_BORDER,
            CL_ADDRESS_REPEAT => pipe_tex_wrap::PIPE_TEX_WRAP_REPEAT,
            CL_ADDRESS_MIRRORED_REPEAT => pipe_tex_wrap::PIPE_TEX_WRAP_MIRROR_REPEAT,
            // TODO: what's a reasonable default?
            _ => pipe_tex_wrap::PIPE_TEX_WRAP_CLAMP_TO_EDGE,
        };

        let img_filter = match filter_mode {
            CL_FILTER_NEAREST => pipe_tex_filter::PIPE_TEX_FILTER_NEAREST,
            CL_FILTER_LINEAR => pipe_tex_filter::PIPE_TEX_FILTER_LINEAR,
            _ => panic!("unknown filter_mode"),
        };

        res.set_min_img_filter(img_filter);
        res.set_mag_img_filter(img_filter);
        res.set_unnormalized_coords((!normalized_coords).into());
        res.set_wrap_r(wrap);
        res.set_wrap_s(wrap);
        res.set_wrap_t(wrap);

        res
    }

    pub fn pipe(&self) -> pipe_sampler_state {
        Self::cl_to_pipe((
            self.addressing_mode,
            self.filter_mode,
            self.normalized_coords,
        ))
    }
}