1 use crate::api::icd::*;
2 use crate::api::types::*;
3 use crate::api::util::*;
4 use crate::core::context::*;
5 use crate::core::device::*;
6 use crate::core::format::*;
7 use crate::core::gl::*;
8 use crate::core::queue::*;
9 use crate::core::util::*;
10 use crate::impl_cl_type_trait;
11 use crate::impl_cl_type_trait_base;
12 use crate::perf_warning;
13 
14 use mesa_rust::pipe::context::*;
15 use mesa_rust::pipe::resource::*;
16 use mesa_rust::pipe::screen::ResourceType;
17 use mesa_rust::pipe::transfer::*;
18 use mesa_rust_gen::*;
19 use mesa_rust_util::conversion::*;
20 use mesa_rust_util::properties::Properties;
21 use mesa_rust_util::ptr::AllocSize;
22 use mesa_rust_util::ptr::TrackedPointers;
23 use rusticl_opencl_gen::*;
24 
25 use std::alloc;
26 use std::alloc::Layout;
27 use std::cmp;
28 use std::collections::btree_map::Entry;
29 use std::collections::HashMap;
30 use std::convert::TryInto;
31 use std::mem;
32 use std::mem::size_of;
33 use std::ops::Deref;
34 use std::os::raw::c_void;
35 use std::ptr;
36 use std::sync::Arc;
37 use std::sync::Mutex;
38 use std::sync::MutexGuard;
39 
40 struct Mapping<T> {
41     layout: Layout,
42     writes: bool,
43     ptr: Option<MutMemoryPtr>,
44     /// reference count from the API perspective. Once it reaches 0, we need to write back the
45     /// mapping's content to the GPU resource.
46     count: u32,
47     inner: T,
48 }
49 
50 impl<T> Drop for Mapping<T> {
51     fn drop(&mut self) {
52         if let Some(ptr) = &self.ptr {
53             unsafe {
54                 alloc::dealloc(ptr.as_ptr().cast(), self.layout);
55             }
56         }
57     }
58 }
59 
60 impl<T> AllocSize<usize> for Mapping<T> {
61     fn size(&self) -> usize {
62         self.layout.size()
63     }
64 }
65 
66 impl<T> Deref for Mapping<T> {
67     type Target = T;
68 
69     fn deref(&self) -> &Self::Target {
70         &self.inner
71     }
72 }
73 
74 struct BufferMapping {
75     offset: usize,
76 }
77 
78 struct ImageMapping {
79     origin: CLVec<usize>,
80     region: CLVec<usize>,
81 }
82 
83 #[repr(transparent)]
84 #[derive(Clone, Copy)]
85 pub struct ConstMemoryPtr {
86     ptr: *const c_void,
87 }
88 unsafe impl Send for ConstMemoryPtr {}
89 unsafe impl Sync for ConstMemoryPtr {}
90 
91 impl ConstMemoryPtr {
92     pub fn as_ptr(&self) -> *const c_void {
93         self.ptr
94     }
95 
96     /// # Safety
97     ///
98     /// Users need to ensure that `ptr` is only accessed in a thread-safe manner sufficient for
99     /// [Send] and [Sync]
100     pub unsafe fn from_ptr(ptr: *const c_void) -> Self {
101         Self { ptr: ptr }
102     }
103 }
104 
105 impl From<MutMemoryPtr> for ConstMemoryPtr {
106     fn from(value: MutMemoryPtr) -> Self {
107         Self {
108             ptr: value.ptr.cast(),
109         }
110     }
111 }
112 
113 #[repr(transparent)]
114 #[derive(Clone, Copy)]
115 pub struct MutMemoryPtr {
116     ptr: *mut c_void,
117 }
118 unsafe impl Send for MutMemoryPtr {}
119 unsafe impl Sync for MutMemoryPtr {}
120 
121 impl MutMemoryPtr {
122     pub fn as_ptr(&self) -> *mut c_void {
123         self.ptr
124     }
125 
126     /// # Safety
127     ///
128     /// Users need to ensure that `ptr` is only accessed in a thread-safe manner sufficient for
129     /// [Send] and [Sync]
130     pub unsafe fn from_ptr(ptr: *mut c_void) -> Self {
131         Self { ptr: ptr }
132     }
133 }
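
// A minimal usage sketch for the pointer wrappers above; it is illustrative only, and the
// `data` buffer and variable names are assumptions, not part of this file:
//
//     let mut data = [0u8; 64];
//     // SAFETY: `data` lives long enough and is only accessed from one thread here.
//     let ptr = unsafe { MutMemoryPtr::from_ptr(data.as_mut_ptr().cast()) };
//     let ro = ConstMemoryPtr::from(ptr);
//     assert_eq!(ro.as_ptr(), ptr.as_ptr() as *const _);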
134 
135 #[derive(Copy, Clone, PartialEq)]
136 pub enum ResourceValidityEntity {
137     Host,
138     Device(&'static Device),
139 }
140 
141 /// Allocation with real GPU backing storage. Tracks which devices the content is valid on.
142 pub struct ResourceAllocation {
143     pub res: HashMap<&'static Device, Arc<PipeResource>>,
144     valid_on: Mutex<Vec<ResourceValidityEntity>>,
145     // it's a bit hacky, but storing the pointer as `usize` gives us `Send` and `Sync`. The
146     // application is required to ensure no data races exist on the memory anyway.
147     host_ptr: usize,
148     hostptr_devs: Vec<ResourceValidityEntity>,
149     // this might be non-zero for dma-buf imported resources
150     offset: usize,
151 }
152 
153 impl ResourceAllocation {
154     /// # Panics
155     ///
156     /// valid_on needs to be a Vec with at least one element; this function will panic otherwise.
157     fn get_best_valid_entity_for_transfer(
158         valid_on: &MutexGuard<Vec<ResourceValidityEntity>>,
159     ) -> ResourceValidityEntity {
160         // We want to avoid having to copy over the PCIe bus, so we prefer an entity which is either
161         // the host itself or a device using host memory.
162         let res = valid_on.iter().min_by_key(|entity| match entity {
163             ResourceValidityEntity::Host => 0,
164             ResourceValidityEntity::Device(dev) => {
165                 if dev.unified_memory() {
166                     1
167                 } else {
168                     2
169                 }
170             }
171         });
172 
173         *res.unwrap()
174     }
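
    // Hedged illustration of the ranking above (the devices named here are assumptions, not
    // from this file): with valid_on = [Device(discrete_gpu), Host], the keys are 2 and 0, so
    // min_by_key picks the Host and the transfer avoids a read over the PCIe bus. With
    // valid_on = [Device(igpu_with_unified_memory), Device(discrete_gpu)], the keys are 1 and
    // 2, so the unified-memory device is used as the copy source.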
175 
176     /// Small helper function to indicate when transparent migration is never required, e.g. if it's
177     /// a single device allocation with no hostptr.
178     fn can_skip_migration(&self) -> bool {
179         match self.hostptr_devs.len() {
180             // If storage isn't shared between devices, we only need to migrate when there is more
181             // than one device.
182             0 => self.res.len() == 1,
183 
184             // If all devices use a host_ptr allocation, the content is automatically synchronized
185             // as they share the same storage. The - 1 is required as the Host is also part of
186             // `hostptr_devs`.
187             len => len - 1 == self.res.len(),
188         }
189     }
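
    // Hedged worked example for the check above (the counts are assumptions): a two-device
    // allocation without a host_ptr has res.len() == 2 and an empty hostptr_devs, so
    // migration cannot be skipped. If instead both devices use the application's host_ptr,
    // hostptr_devs holds both device entities plus Host (len == 3), 3 - 1 == res.len() holds,
    // and all copies already share the same storage.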
190 
191     /// Returns the GPU resource for the device `ctx` is associated with. It will transparently
192     /// migrate the data to the GPU.
193     /// TODO: add a map function to return a mapping to the resource of one device the data is valid
194     ///       on instead of migrating if the user would simply map the resource anyway.
195     fn get_res_for_access(&self, ctx: &QueueContext, rw: RWFlags) -> CLResult<&Arc<PipeResource>> {
196         let dev = ctx.dev;
197         let dev_entity = ResourceValidityEntity::Device(dev);
198         let to_res = self.res.get(dev).ok_or(CL_OUT_OF_HOST_MEMORY)?;
199 
200         // in most cases we can skip most of the work below.
201         if self.can_skip_migration() {
202             return Ok(to_res);
203         }
204 
205         let Ok(mut valid_on) = self.valid_on.lock() else {
206             return Err(CL_OUT_OF_HOST_MEMORY);
207         };
208 
209         // If the content isn't valid on `dev`, we need to migrate it there.
210         if matches!(rw, RWFlags::RD | RWFlags::RW) && !valid_on.contains(&dev_entity) {
211             // valid_on is a vec with at least one element, so this call won't panic.
212             let entity = Self::get_best_valid_entity_for_transfer(&valid_on);
213 
214             let helper_ctx;
215             let map;
216             let flush;
217 
218             if to_res.is_buffer() {
219                 let ptr;
220                 match entity {
221                     ResourceValidityEntity::Host => {
222                         flush = false;
223                         ptr = self.host_ptr as *mut c_void;
224                     }
225                     ResourceValidityEntity::Device(dev) => {
226                         flush = true;
227 
228                         let from_res = &self.res[dev];
229                         helper_ctx = dev.helper_ctx();
230 
231                         // update the resource and wait for the operation to finish. We also map the resources
232                         // unsynchronized as we can't block or flush any other contexts here as this might cause
233                         // deadlocks.
234                         map = helper_ctx
235                             .map_buffer_unsynchronized(
236                                 from_res,
237                                 0,
238                                 from_res.width() as i32,
239                                 RWFlags::RD,
240                             )
241                             .ok_or(CL_OUT_OF_HOST_MEMORY)?;
242 
243                         ptr = map.ptr();
244                     }
245                 }
246 
247                 ctx.buffer_subdata(to_res, 0, ptr, to_res.width());
248             } else {
249                 let ResourceValidityEntity::Device(dev) = entity else {
250                     // we don't support migrating from host_ptr for images yet. It's also not needed
251                     // because the Image struct has a more optimized way of doing things there.
252                     unimplemented!();
253                 };
254 
255                 flush = true;
256                 let from_res = &self.res[dev];
257                 helper_ctx = dev.helper_ctx();
258 
259                 // update the resource and wait for the operation to finish. We also map the resources
260                 // unsynchronized as we can't block or flush any other contexts here as this might cause
261                 // deadlocks.
262                 let bx = pipe_box {
263                     width: from_res.width() as i32,
264                     height: from_res.height() as i32,
265                     depth: from_res.depth() as i16,
266                     ..Default::default()
267                 };
268 
269                 map = helper_ctx
270                     .map_texture_unsynchronized(from_res, &bx, RWFlags::RD)
271                     .ok_or(CL_OUT_OF_HOST_MEMORY)?;
272 
273                 let row_pitch: u32 = map.row_pitch();
274                 let slice_pitch: usize = map.slice_pitch();
275 
276                 let bx = pipe_box {
277                     width: to_res.width() as i32,
278                     height: to_res.height() as i32,
279                     depth: to_res.depth() as i16,
280                     ..Default::default()
281                 };
282 
283                 ctx.texture_subdata(to_res, &bx, map.ptr(), row_pitch, slice_pitch);
284             }
285 
286             // TODO: we really kinda need to figure out how we can make the compiler scream that
287             //       temporarily mapped memory might be accessed at some random point in the future
288             //       by a GPU unless its queues are flushed and processed.
289             if flush {
290                 ctx.flush().wait();
291             }
292         }
293 
294         if matches!(rw, RWFlags::WR | RWFlags::RW) {
295             // If the user writes to it, it's not valid on any other device anymore.
296             valid_on.clear();
297         }
298 
299         if !valid_on.contains(&dev_entity) {
300             // if we update one hostptr resource, we update them all.
301             if self.hostptr_devs.contains(&dev_entity) {
302                 valid_on.extend_from_slice(&self.hostptr_devs);
303             } else {
304                 valid_on.push(ResourceValidityEntity::Device(dev));
305             }
306         }
307 
308         Ok(to_res)
309     }
310 
311     pub fn migrate_to_hostptr(&self, ctx: &QueueContext, rw: RWFlags) -> CLResult<()> {
312         let host_entity = ResourceValidityEntity::Host;
313         let host_ptr = self.host_ptr as *mut c_void;
314 
315         // in most cases we can skip most of the work below.
316         if self.can_skip_migration() || host_ptr.is_null() {
317             return Ok(());
318         }
319 
320         let Ok(mut valid_on) = self.valid_on.lock() else {
321             return Err(CL_OUT_OF_HOST_MEMORY);
322         };
323 
324         // If the content isn't valid on the host, we need to migrate it there.
325         if matches!(rw, RWFlags::RD | RWFlags::RW) && !valid_on.contains(&host_entity) {
326             let ctx_dev_entity = ResourceValidityEntity::Device(ctx.dev);
327             let mut entity = ctx_dev_entity;
328 
329             if !valid_on.contains(&entity) {
330                 // valid_on is a vec with at least one element, so this call won't panic.
331                 entity = Self::get_best_valid_entity_for_transfer(&valid_on);
332             }
333 
334             debug_assert!(entity != ResourceValidityEntity::Host);
335 
336             let ResourceValidityEntity::Device(from_dev) = entity else {
337                 // we check if `valid_on` contains a host entity above, so this should never happen.
338                 unreachable!();
339             };
340 
341             let helper_ctx;
342             let map;
343             let from_res = &self.res[from_dev];
344 
345             assert!(
346                 from_res.is_buffer(),
347                 "Transparent resource migration only supported on buffers."
348             );
349 
350             if from_dev == ctx.dev {
351                 map = ctx
352                     .buffer_map(from_res, 0, from_res.width() as i32, RWFlags::RD)
353                     .ok_or(CL_OUT_OF_HOST_MEMORY)?;
354             } else {
355                 helper_ctx = from_dev.helper_ctx();
356                 // update the resource and wait for the operation to finish. We also map the resources
357                 // unsynchronized as we can't block or flush any other contexts here as this might cause
358                 // deadlocks.
359                 map = helper_ctx
360                     .map_buffer_unsynchronized(from_res, 0, from_res.width() as i32, RWFlags::RD)
361                     .ok_or(CL_OUT_OF_HOST_MEMORY)?;
362             }
363 
364             let ptr = map.ptr();
365             // SAFETY: The application promises that host_ptr is big enough to hold the entire
366             //         content of the buffer, and `ptr` is the mapped resource containing at least
367             //         `from_res.width()` bytes. Also both pointers do not overlap.
368             unsafe {
369                 ptr::copy_nonoverlapping(ptr, host_ptr, from_res.width() as usize);
370             }
371         }
372 
373         if matches!(rw, RWFlags::WR | RWFlags::RW) {
374             // If the user writes to it, it's not valid on any other device anymore.
375             valid_on.clear();
376         }
377 
378         if !valid_on.contains(&host_entity) {
379             // if we update the hostptr, we update all devices having a hostptr allocation.
380             valid_on.extend_from_slice(&self.hostptr_devs);
381         }
382 
383         Ok(())
384     }
385 }
386 
387 pub struct SubAllocation {
388     mem: Mem,
389     // offset relative to the actual resource, not relative to `mem`. This saves us a few
390     // calculations and we only need the total amount anyway.
391     offset: usize,
392 }
393 
394 /// Abstraction over the memory allocation. It might be real GPU backing storage or simply a
395 /// sub-allocation over an existing memory object.
396 enum Allocation {
397     Resource(ResourceAllocation),
398     SubAlloc(SubAllocation),
399 }
400 
401 // TODO: - Once it's used for more stuff might make sense to split it into an Image and Buffer
402 //         variant.
403 //       - Instead of doing full migration every time, it could also do it for only parts of the
404 //         allocation.
405 impl Allocation {
406     /// Creates a new allocation object assuming the initial data is valid on every device.
407     pub fn new(
408         res: HashMap<&'static Device, Arc<PipeResource>>,
409         offset: usize,
410         host_ptr: *mut c_void,
411     ) -> Self {
412         let hostptr_devs = if !host_ptr.is_null() {
413             res.iter()
414                 // we only add devices we actually have a host ptr resource for
415                 .filter_map(|(&dev, res)| {
416                     res.is_user().then_some(ResourceValidityEntity::Device(dev))
417                 })
418                 // and the host itself
419                 .chain([ResourceValidityEntity::Host])
420                 .collect()
421         } else {
422             Vec::new()
423         };
424 
425         let mut valid_on: Vec<_> = res
426             .keys()
427             .copied()
428             .map(ResourceValidityEntity::Device)
429             .collect();
430         if !host_ptr.is_null() {
431             valid_on.push(ResourceValidityEntity::Host);
432         }
433 
434         Self::Resource(ResourceAllocation {
435             valid_on: Mutex::new(valid_on),
436             res: res,
437             host_ptr: host_ptr as usize,
438             hostptr_devs: hostptr_devs,
439             offset: offset,
440         })
441     }
442 
443     fn new_sub(mem: Mem, offset: usize) -> Self {
444         Self::SubAlloc(SubAllocation {
445             // we precalculate the entire offset here.
446             offset: offset + mem.alloc.offset(),
447             mem: mem,
448         })
449     }
450 
451     /// Returns true if the backing storage of the two objects is equal.
452     fn backing_resource_eq(&self, other: &Self) -> bool {
453         ptr::eq(self.get_real_resource(), other.get_real_resource())
454     }
455 
456     /// Follows the sub-allocation chain until it hits a real GPU allocation.
457     fn get_real_resource(&self) -> &ResourceAllocation {
458         match self {
459             Allocation::SubAlloc(sub) => sub.mem.alloc.get_real_resource(),
460             Allocation::Resource(res) => res,
461         }
462     }
463 
464     /// Returns the resource associated with `dev` without any data migration.
465     fn get_res_of_dev(&self, dev: &Device) -> CLResult<&Arc<PipeResource>> {
466         self.get_real_resource()
467             .res
468             .get(dev)
469             .ok_or(CL_OUT_OF_HOST_MEMORY)
470     }
471 
472     /// Returns the resource associated with `ctx.dev` and transparently migrate the data.
473     fn get_res_for_access(&self, ctx: &QueueContext, rw: RWFlags) -> CLResult<&Arc<PipeResource>> {
474         self.get_real_resource().get_res_for_access(ctx, rw)
475     }
476 
477     /// Migrates the content to the host. Fails if there is no host ptr.
478     pub fn _migrate_to_hostptr(&self, ctx: &QueueContext, rw: RWFlags) -> CLResult<()> {
479         self.get_real_resource().migrate_to_hostptr(ctx, rw)
480     }
481 
482     pub fn host_ptr(&self) -> *mut c_void {
483         let mut host_ptr = self.get_real_resource().host_ptr;
484 
485         // we can only apply the offset as long as the host_ptr isn't null.
486         if host_ptr != 0 {
487             host_ptr += self.offset();
488         }
489 
490         host_ptr as _
491     }
492 
493     fn is_user_alloc_for_dev(&self, dev: &Device) -> CLResult<bool> {
494         Ok(self.get_res_of_dev(dev)?.is_user())
495     }
496 
497     fn offset(&self) -> usize {
498         match self {
499             Allocation::Resource(res) => res.offset,
500             Allocation::SubAlloc(sub) => sub.offset,
501         }
502     }
503 }
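
// Hedged illustration of the sub-allocation chaining above (the sizes and offsets are
// assumptions): a sub-buffer created at offset 16 of a parent buffer stores
// Allocation::SubAlloc { offset: 16 + parent_offset, .. }, and get_real_resource() follows
// sub.mem.alloc until it reaches the parent's Allocation::Resource. All sub-allocations
// therefore share one ResourceAllocation, which is what backing_resource_eq compares by
// address.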
504 
505 pub enum Mem {
506     Buffer(Arc<Buffer>),
507     Image(Arc<Image>),
508 }
509 
510 impl Deref for Mem {
511     type Target = MemBase;
512 
513     fn deref(&self) -> &Self::Target {
514         match self {
515             Self::Buffer(b) => &b.base,
516             Self::Image(i) => &i.base,
517         }
518     }
519 }
520 
521 impl Mem {
522     pub fn is_mapped_ptr(&self, ptr: *mut c_void) -> bool {
523         match self {
524             Self::Buffer(b) => b.is_mapped_ptr(ptr),
525             Self::Image(i) => i.is_mapped_ptr(ptr),
526         }
527     }
528 
529     pub fn sync_unmap(&self, ctx: &QueueContext, ptr: MutMemoryPtr) -> CLResult<()> {
530         match self {
531             Self::Buffer(b) => b.sync_unmap(ctx, ptr),
532             Self::Image(i) => i.sync_unmap(ctx, ptr),
533         }
534     }
535 
536     pub fn unmap(&self, ptr: MutMemoryPtr) -> CLResult<bool> {
537         match self {
538             Self::Buffer(b) => b.unmap(ptr),
539             Self::Image(i) => i.unmap(ptr),
540         }
541     }
542 }
543 
544 /// # Mapping memory
545 ///
546 /// Maps the resource of the device associated with the queue.
547 ///
548 /// Mapping resources would be quite straightforward if OpenCL didn't allow so-called
549 /// non-blocking maps. Non-blocking maps shall return a valid pointer to the mapped region
550 /// immediately, but should not synchronize data (in the case of shadow buffers) until after the map
551 /// event is reached in the queue. This makes it impossible to simply use pipe_transfers, as those
552 /// can't be explicitly synced by the frontend.
553 ///
554 /// In order to have a compliant implementation of the mapping API we have to consider the following
555 /// cases:
556 ///   1. Mapping a cl_mem object with CL_MEM_USE_HOST_PTR: We simply return the host_ptr.
557 ///      Synchronization of shadowed host ptrs is done in `sync_shadow` on demand.
558 ///   2. Mapping linear resources on UMA systems: We simply create the pipe_transfer with
559 ///      `PIPE_MAP_DIRECTLY` and `PIPE_MAP_UNSYNCHRONIZED` and return the attached pointer.
560 ///   3. On non-UMA systems or when 2. fails (e.g. due to the resource being tiled) we
561 ///      - create a shadow pipe_resource with `PIPE_USAGE_STAGING`,
562 ///        `PIPE_RESOURCE_FLAG_MAP_PERSISTENT` and `PIPE_RESOURCE_FLAG_MAP_COHERENT`
563 ///      - create a pipe_transfer with `PIPE_MAP_COHERENT`, `PIPE_MAP_PERSISTENT` and
564 ///        `PIPE_MAP_UNSYNCHRONIZED`
565 ///      - sync the shadow buffer like a host_ptr shadow buffer in 1.
566 ///
567 /// Taking this approach we guarantee that we only copy when actually needed while making sure the
568 /// content behind the returned pointer is valid until unmapped.
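///
/// A hedged sketch of the resulting call flow for a non-blocking map, using the methods
/// defined further down in this file (the `buffer`, `ctx`, `size` and `offset` bindings are
/// assumptions, not part of this file):
///
/// ```ignore
/// // clEnqueueMapBuffer with blocking_map = CL_FALSE must hand out a pointer immediately,
/// // even though the data behind it is only synchronized once the map event executes.
/// let ptr = buffer.map(size, offset, /* writes */ true)?;
/// // ... when the map event is processed on the queue:
/// buffer.sync_map(&ctx, ptr)?;
/// // ... and once the unmap event enqueued by clEnqueueUnmapMemObject is processed:
/// buffer.sync_unmap(&ctx, ptr)?;
/// buffer.unmap(ptr)?;
/// ```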
569 pub struct MemBase {
570     pub base: CLObjectBase<CL_INVALID_MEM_OBJECT>,
571     pub context: Arc<Context>,
572     pub mem_type: cl_mem_object_type,
573     pub flags: cl_mem_flags,
574     pub size: usize,
575     pub props: Properties<cl_mem_properties>,
576     pub cbs: Mutex<Vec<MemCB>>,
577     pub gl_obj: Option<GLObject>,
578     alloc: Allocation,
579 }
580 
581 pub struct Buffer {
582     base: MemBase,
583     maps: Mutex<TrackedPointers<usize, Mapping<BufferMapping>>>,
584 }
585 
586 pub struct Image {
587     base: MemBase,
588     pub image_format: cl_image_format,
589     pub pipe_format: pipe_format,
590     pub image_desc: cl_image_desc,
591     pub image_elem_size: u8,
592     maps: Mutex<TrackedPointers<usize, Mapping<ImageMapping>>>,
593 }
594 
595 impl Deref for Buffer {
596     type Target = MemBase;
597 
598     fn deref(&self) -> &Self::Target {
599         &self.base
600     }
601 }
602 
603 impl Deref for Image {
604     type Target = MemBase;
605 
606     fn deref(&self) -> &Self::Target {
607         &self.base
608     }
609 }
610 
611 impl_cl_type_trait_base!(cl_mem, MemBase, [Buffer, Image], CL_INVALID_MEM_OBJECT);
612 impl_cl_type_trait!(cl_mem, Buffer, CL_INVALID_MEM_OBJECT, base.base);
613 impl_cl_type_trait!(cl_mem, Image, CL_INVALID_MEM_OBJECT, base.base);
614 
615 pub trait CLImageDescInfo {
616     fn type_info(&self) -> (u8, bool);
617     fn pixels(&self) -> usize;
618     fn bx(&self) -> CLResult<pipe_box>;
619     fn row_pitch(&self) -> CLResult<u32>;
620     fn slice_pitch(&self) -> usize;
621     fn width(&self) -> CLResult<u32>;
622     fn height(&self) -> CLResult<u32>;
623     fn size(&self) -> CLVec<usize>;
624 
625     fn dims(&self) -> u8 {
626         self.type_info().0
627     }
628 
629     fn dims_with_array(&self) -> u8 {
630         let array: u8 = self.is_array().into();
631         self.dims() + array
632     }
633 
634     fn has_slice(&self) -> bool {
635         self.dims() == 3 || self.is_array()
636     }
637 
638     fn is_array(&self) -> bool {
639         self.type_info().1
640     }
641 }
642 
643 impl CLImageDescInfo for cl_image_desc {
644     fn type_info(&self) -> (u8, bool) {
645         match self.image_type {
646             CL_MEM_OBJECT_IMAGE1D | CL_MEM_OBJECT_IMAGE1D_BUFFER => (1, false),
647             CL_MEM_OBJECT_IMAGE1D_ARRAY => (1, true),
648             CL_MEM_OBJECT_IMAGE2D => (2, false),
649             CL_MEM_OBJECT_IMAGE2D_ARRAY => (2, true),
650             CL_MEM_OBJECT_IMAGE3D => (3, false),
651             _ => panic!("unknown image_type {:x}", self.image_type),
652         }
653     }
654 
655     fn pixels(&self) -> usize {
656         let mut res = self.image_width;
657         let dims = self.dims();
658 
659         if dims > 1 {
660             res *= self.image_height;
661         }
662 
663         if dims > 2 {
664             res *= self.image_depth;
665         }
666 
667         if self.is_array() {
668             res *= self.image_array_size;
669         }
670 
671         res
672     }
673 
674     fn size(&self) -> CLVec<usize> {
675         let mut height = cmp::max(self.image_height, 1);
676         let mut depth = cmp::max(self.image_depth, 1);
677 
678         match self.image_type {
679             CL_MEM_OBJECT_IMAGE1D_ARRAY => height = self.image_array_size,
680             CL_MEM_OBJECT_IMAGE2D_ARRAY => depth = self.image_array_size,
681             _ => {}
682         }
683 
684         CLVec::new([self.image_width, height, depth])
685     }
686 
687     fn bx(&self) -> CLResult<pipe_box> {
688         create_pipe_box(CLVec::default(), self.size(), self.image_type)
689     }
690 
691     fn row_pitch(&self) -> CLResult<u32> {
692         self.image_row_pitch
693             .try_into_with_err(CL_OUT_OF_HOST_MEMORY)
694     }
695 
696     fn slice_pitch(&self) -> usize {
697         self.image_slice_pitch
698     }
699 
700     fn width(&self) -> CLResult<u32> {
701         self.image_width.try_into_with_err(CL_OUT_OF_HOST_MEMORY)
702     }
703 
704     fn height(&self) -> CLResult<u32> {
705         self.image_height.try_into_with_err(CL_OUT_OF_HOST_MEMORY)
706     }
707 }
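
// Hedged worked example for the helpers above (all values are assumptions): a
// CL_MEM_OBJECT_IMAGE2D_ARRAY with image_width = 640, image_height = 480,
// image_array_size = 6 and image_depth <= 1 yields type_info() == (2, true), dims() == 2,
// dims_with_array() == 3, has_slice() == true, pixels() == 640 * 480 * 6, and
// size() == [640, 480, 6], because the array size is folded into the depth component for
// 2D arrays.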
708 
709 fn sw_copy(
710     src: *const c_void,
711     dst: *mut c_void,
712     region: &CLVec<usize>,
713     src_origin: &CLVec<usize>,
714     src_row_pitch: usize,
715     src_slice_pitch: usize,
716     dst_origin: &CLVec<usize>,
717     dst_row_pitch: usize,
718     dst_slice_pitch: usize,
719     pixel_size: u8,
720 ) {
721     let pixel_size = pixel_size as usize;
722     for z in 0..region[2] {
723         if src_row_pitch == dst_row_pitch && region[0] * pixel_size == src_row_pitch {
724             unsafe {
725                 ptr::copy(
726                     src.byte_add(
727                         (*src_origin + [0, 0, z]) * [pixel_size, src_row_pitch, src_slice_pitch],
728                     ),
729                     dst.byte_add(
730                         (*dst_origin + [0, 0, z]) * [pixel_size, dst_row_pitch, dst_slice_pitch],
731                     ),
732                     region[0] * region[1] * pixel_size,
733                 )
734             }
735         } else {
736             for y in 0..region[1] {
737                 unsafe {
738                     ptr::copy(
739                         src.byte_add(
740                             (*src_origin + [0, y, z])
741                                 * [pixel_size, src_row_pitch, src_slice_pitch],
742                         ),
743                         dst.byte_add(
744                             (*dst_origin + [0, y, z])
745                                 * [pixel_size, dst_row_pitch, dst_slice_pitch],
746                         ),
747                         region[0] * pixel_size,
748                     )
749                 };
750             }
751         }
752     }
753 }
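
// Hedged worked example of the addressing in sw_copy (the pitches are assumptions): with
// pixel_size = 4, a row pitch of 1024 and a slice pitch of 65536, the byte offset of
// origin + [x, y, z] is x * 4 + y * 1024 + z * 65536, i.e. the dot product
// (*origin + [x, y, z]) * [pixel_size, row_pitch, slice_pitch]. The fast path above copies a
// whole slice at once, which is only valid when both row pitches equal
// region[0] * pixel_size, i.e. the rows are tightly packed; otherwise it falls back to
// copying region[0] * pixel_size bytes per row.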
754 
755 impl MemBase {
756     pub fn new_buffer(
757         context: Arc<Context>,
758         flags: cl_mem_flags,
759         size: usize,
760         mut host_ptr: *mut c_void,
761         props: Properties<cl_mem_properties>,
762     ) -> CLResult<Arc<Buffer>> {
763         let res_type = if bit_check(flags, CL_MEM_ALLOC_HOST_PTR) {
764             ResourceType::Staging
765         } else {
766             ResourceType::Normal
767         };
768 
769         let buffer = context.create_buffer(
770             size,
771             host_ptr,
772             bit_check(flags, CL_MEM_COPY_HOST_PTR),
773             res_type,
774         )?;
775 
776         // We can only keep the host_ptr when `CL_MEM_USE_HOST_PTR` is set.
777         if !bit_check(flags, CL_MEM_USE_HOST_PTR) {
778             host_ptr = ptr::null_mut()
779         }
780 
781         let alloc = Allocation::new(buffer, 0, host_ptr);
782         Ok(Arc::new(Buffer {
783             base: Self {
784                 base: CLObjectBase::new(RusticlTypes::Buffer),
785                 context: context,
786                 mem_type: CL_MEM_OBJECT_BUFFER,
787                 flags: flags,
788                 size: size,
789                 props: props,
790                 gl_obj: None,
791                 cbs: Mutex::new(Vec::new()),
792                 alloc: alloc,
793             },
794             maps: Mutex::new(TrackedPointers::new()),
795         }))
796     }
797 
798     pub fn new_sub_buffer(
799         parent: Arc<Buffer>,
800         flags: cl_mem_flags,
801         offset: usize,
802         size: usize,
803     ) -> Arc<Buffer> {
804         Arc::new(Buffer {
805             base: Self {
806                 base: CLObjectBase::new(RusticlTypes::Buffer),
807                 context: Arc::clone(&parent.context),
808                 mem_type: CL_MEM_OBJECT_BUFFER,
809                 flags: flags,
810                 size: size,
811                 props: Properties::default(),
812                 gl_obj: None,
813                 cbs: Mutex::new(Vec::new()),
814                 alloc: Allocation::new_sub(Mem::Buffer(parent), offset),
815             },
816             maps: Mutex::new(TrackedPointers::new()),
817         })
818     }
819 
820     pub fn new_image(
821         context: Arc<Context>,
822         parent: Option<Mem>,
823         flags: cl_mem_flags,
824         image_format: &cl_image_format,
825         mut image_desc: cl_image_desc,
826         image_elem_size: u8,
827         mut host_ptr: *mut c_void,
828         props: Properties<cl_mem_properties>,
829     ) -> CLResult<Arc<Image>> {
830         // we have to sanitize the image_desc a little for internal use
831         let api_image_desc = image_desc;
832         let dims = image_desc.dims();
833         let is_array = image_desc.is_array();
834         if dims < 3 {
835             image_desc.image_depth = 1;
836         }
837         if dims < 2 {
838             image_desc.image_height = 1;
839         }
840         if !is_array {
841             image_desc.image_array_size = 1;
842         }
843 
844         let res_type = if bit_check(flags, CL_MEM_ALLOC_HOST_PTR) {
845             ResourceType::Staging
846         } else {
847             ResourceType::Normal
848         };
849 
850         let alloc = if let Some(parent) = parent {
851             Allocation::new_sub(parent, 0)
852         } else {
853             let mut texture = context.create_texture(
854                 &image_desc,
855                 image_format,
856                 host_ptr,
857                 bit_check(flags, CL_MEM_COPY_HOST_PTR),
858                 res_type,
859             );
860 
861             // if allocating a Staging resource fails, just try with Normal, as
862             // `CL_MEM_ALLOC_HOST_PTR` is just a performance hint.
863             if res_type == ResourceType::Staging && texture.is_err() {
864                 texture = context.create_texture(
865                     &image_desc,
866                     image_format,
867                     host_ptr,
868                     bit_check(flags, CL_MEM_COPY_HOST_PTR),
869                     ResourceType::Normal,
870                 )
871             }
872 
873             // We can only keep the host_ptr when `CL_MEM_USE_HOST_PTR` is set.
874             if !bit_check(flags, CL_MEM_USE_HOST_PTR) {
875                 host_ptr = ptr::null_mut()
876             }
877 
878             Allocation::new(texture?, 0, host_ptr)
879         };
880 
881         let pipe_format = image_format.to_pipe_format().unwrap();
882         Ok(Arc::new(Image {
883             base: Self {
884                 base: CLObjectBase::new(RusticlTypes::Image),
885                 context: context,
886                 mem_type: image_desc.image_type,
887                 flags: flags,
888                 size: image_desc.pixels() * image_format.pixel_size().unwrap() as usize,
889                 props: props,
890                 gl_obj: None,
891                 cbs: Mutex::new(Vec::new()),
892                 alloc: alloc,
893             },
894             image_format: *image_format,
895             pipe_format: pipe_format,
896             image_desc: api_image_desc,
897             image_elem_size: image_elem_size,
898             maps: Mutex::new(TrackedPointers::new()),
899         }))
900     }
901 
902     pub fn arc_from_raw(ptr: cl_mem) -> CLResult<Mem> {
903         let mem = Self::ref_from_raw(ptr)?;
904         match mem.base.get_type()? {
905             RusticlTypes::Buffer => Ok(Mem::Buffer(Buffer::arc_from_raw(ptr)?)),
906             RusticlTypes::Image => Ok(Mem::Image(Image::arc_from_raw(ptr)?)),
907             _ => Err(CL_INVALID_MEM_OBJECT),
908         }
909     }
910 
911     pub fn arcs_from_arr(objs: *const cl_mem, count: u32) -> CLResult<Vec<Mem>> {
912         let count = count as usize;
913         let mut res = Vec::with_capacity(count);
914         for i in 0..count {
915             res.push(Self::arc_from_raw(unsafe { *objs.add(i) })?);
916         }
917         Ok(res)
918     }
919 
920     pub fn from_gl(
921         context: Arc<Context>,
922         flags: cl_mem_flags,
923         gl_export_manager: &GLExportManager,
924     ) -> CLResult<cl_mem> {
925         let export_in = &gl_export_manager.export_in;
926         let export_out = &gl_export_manager.export_out;
927 
928         let (mem_type, gl_object_type) = target_from_gl(export_in.target)?;
929         let gl_mem_props = gl_export_manager.get_gl_mem_props()?;
930 
931         // Handle Buffers
932         let (image_format, pipe_format, rusticl_type) = if gl_export_manager.is_gl_buffer() {
933             (
934                 cl_image_format::default(),
935                 pipe_format::PIPE_FORMAT_NONE,
936                 RusticlTypes::Buffer,
937             )
938         } else {
939             let image_format =
940                 format_from_gl(export_out.internal_format).ok_or(CL_OUT_OF_HOST_MEMORY)?;
941             (
942                 image_format,
943                 image_format.to_pipe_format().unwrap(),
944                 RusticlTypes::Image,
945             )
946         };
947 
948         let imported_gl_tex = context.import_gl_buffer(
949             export_out.dmabuf_fd as u32,
950             export_out.modifier,
951             mem_type,
952             export_in.target,
953             image_format,
954             gl_mem_props.clone(),
955         )?;
956 
957         // Cube map faces are not linear in memory, so copy all contents
958         // of the desired face into a 2D image and copy it back after GL release.
959         let (shadow_map, texture) = if is_cube_map_face(export_in.target) {
960             let shadow = create_shadow_slice(&imported_gl_tex, image_format)?;
961 
962             let mut res_map = HashMap::new();
963             shadow
964                 .iter()
965                 .map(|(k, v)| {
966                     let gl_res = Arc::clone(imported_gl_tex.get(k).unwrap());
967                     res_map.insert(Arc::clone(v), gl_res);
968                 })
969                 .for_each(drop);
970 
971             (Some(res_map), shadow)
972         } else {
973             (None, imported_gl_tex)
974         };
975 
976         // it's kinda not supported, but we want to know if anything actually hits this as it's
977         // certainly not tested by the CL CTS.
978         if mem_type != CL_MEM_OBJECT_BUFFER {
979             assert_eq!(gl_mem_props.offset, 0);
980         }
981 
982         let base = Self {
983             base: CLObjectBase::new(rusticl_type),
984             context: context,
985             mem_type: mem_type,
986             flags: flags,
987             size: gl_mem_props.size(),
988             props: Properties::default(),
989             gl_obj: Some(GLObject {
990                 gl_object_target: gl_export_manager.export_in.target,
991                 gl_object_type: gl_object_type,
992                 gl_object_name: export_in.obj,
993                 shadow_map: shadow_map,
994             }),
995             cbs: Mutex::new(Vec::new()),
996             alloc: Allocation::new(texture, gl_mem_props.offset as usize, ptr::null_mut()),
997         };
998 
999         Ok(if rusticl_type == RusticlTypes::Buffer {
1000             Arc::new(Buffer {
1001                 base: base,
1002                 maps: Mutex::new(TrackedPointers::new()),
1003             })
1004             .into_cl()
1005         } else {
1006             Arc::new(Image {
1007                 base: base,
1008                 image_format: image_format,
1009                 pipe_format: pipe_format,
1010                 image_desc: cl_image_desc {
1011                     image_type: mem_type,
1012                     image_width: gl_mem_props.width as usize,
1013                     image_height: gl_mem_props.height as usize,
1014                     image_depth: gl_mem_props.depth as usize,
1015                     image_array_size: gl_mem_props.array_size as usize,
1016                     image_row_pitch: 0,
1017                     image_slice_pitch: 0,
1018                     num_mip_levels: 0,
1019                     num_samples: 0,
1020                     ..Default::default()
1021                 },
1022                 image_elem_size: gl_mem_props.pixel_size,
1023                 maps: Mutex::new(TrackedPointers::new()),
1024             })
1025             .into_cl()
1026         })
1027     }
1028 
1029     pub fn is_buffer(&self) -> bool {
1030         self.mem_type == CL_MEM_OBJECT_BUFFER
1031     }
1032 
1033     /// Checks if the backing memory is actually the same memory object.
1034     pub fn backing_memory_eq(&self, other: &Self) -> bool {
1035         self.alloc.backing_resource_eq(&other.alloc)
1036     }
1037 
1038     // this is kinda bogus, because that won't work with system SVM, but the spec wants us to
1039     // implement this.
1040     pub fn is_svm(&self) -> bool {
1041         self.context
1042             .find_svm_alloc(self.host_ptr() as usize)
1043             .is_some()
1044             && bit_check(self.flags, CL_MEM_USE_HOST_PTR)
1045     }
1046 
1047     pub fn get_res_for_access(
1048         &self,
1049         ctx: &QueueContext,
1050         rw: RWFlags,
1051     ) -> CLResult<&Arc<PipeResource>> {
1052         self.alloc.get_res_for_access(ctx, rw)
1053     }
1054 
1055     /// Returns the parent memory object or None if self isn't a sub-allocated memory object.
1056     pub fn parent(&self) -> Option<&Mem> {
1057         match &self.alloc {
1058             Allocation::SubAlloc(sub) => Some(&sub.mem),
1059             Allocation::Resource(_) => None,
1060         }
1061     }
1062 
1063     pub fn host_ptr(&self) -> *mut c_void {
1064         self.alloc.host_ptr()
1065     }
1066 
1067     fn is_pure_user_memory(&self, d: &Device) -> CLResult<bool> {
1068         // 1D buffer objects are weird. The parent memory object can be a host_ptr thing, but we are
1069         // not allowed to actually return a pointer based on the host_ptr when mapping.
1070         Ok(self.alloc.is_user_alloc_for_dev(d)? && !self.host_ptr().is_null())
1071     }
1072 
1073     fn map<T>(
1074         &self,
1075         offset: usize,
1076         layout: Layout,
1077         writes: bool,
1078         maps: &Mutex<TrackedPointers<usize, Mapping<T>>>,
1079         inner: T,
1080     ) -> CLResult<MutMemoryPtr> {
1081         let host_ptr = self.host_ptr();
1082         let ptr = unsafe {
1083             let ptr = if !host_ptr.is_null() {
1084                 host_ptr.byte_add(offset)
1085             } else {
1086                 alloc::alloc(layout).cast()
1087             };
1088 
1089             MutMemoryPtr::from_ptr(ptr)
1090         };
1091 
1092         match maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
1093             Entry::Occupied(mut e) => {
1094                 debug_assert!(!host_ptr.is_null());
1095                 e.get_mut().count += 1;
1096             }
1097             Entry::Vacant(e) => {
1098                 e.insert(Mapping {
1099                     layout: layout,
1100                     writes: writes,
1101                     ptr: host_ptr.is_null().then_some(ptr),
1102                     count: 1,
1103                     inner: inner,
1104                 });
1105             }
1106         }
1107 
1108         Ok(ptr)
1109     }
1110 }
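
// Hedged illustration of MemBase::map's bookkeeping (the scenario is an assumption): for a
// CL_MEM_USE_HOST_PTR buffer, the returned pointer is host_ptr + offset, so mapping the same
// offset twice hits the Entry::Occupied arm and only bumps `count`. Without a host_ptr, a
// fresh shadow allocation is created per mapping and `ptr` is stored in the Mapping, so Drop
// can free it once the last unmap drops `count` to 0 and the entry is removed.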
1111 
1112 impl Drop for MemBase {
1113     fn drop(&mut self) {
1114         let cbs = mem::take(self.cbs.get_mut().unwrap());
1115         for cb in cbs.into_iter().rev() {
1116             cb.call(self);
1117         }
1118     }
1119 }
1120 
1121 impl Buffer {
1122     fn apply_offset(&self, offset: usize) -> CLResult<usize> {
1123         self.offset()
1124             .checked_add(offset)
1125             .ok_or(CL_OUT_OF_HOST_MEMORY)
1126     }
1127 
1128     pub fn copy_rect(
1129         &self,
1130         dst: &Self,
1131         ctx: &QueueContext,
1132         region: &CLVec<usize>,
1133         src_origin: &CLVec<usize>,
1134         src_row_pitch: usize,
1135         src_slice_pitch: usize,
1136         dst_origin: &CLVec<usize>,
1137         dst_row_pitch: usize,
1138         dst_slice_pitch: usize,
1139     ) -> CLResult<()> {
1140         let (offset, size) =
1141             CLVec::calc_offset_size(src_origin, region, [1, src_row_pitch, src_slice_pitch]);
1142         let tx_src = self.tx(ctx, offset, size, RWFlags::RD)?;
1143 
1144         let (offset, size) =
1145             CLVec::calc_offset_size(dst_origin, region, [1, dst_row_pitch, dst_slice_pitch]);
1146         let tx_dst = dst.tx(ctx, offset, size, RWFlags::WR)?;
1147 
1148         perf_warning!("clEnqueueCopyBufferRect stalls the GPU");
1149 
1150         // TODO check to use hw accelerated paths (e.g. resource_copy_region or blits)
1151         sw_copy(
1152             tx_src.ptr(),
1153             tx_dst.ptr(),
1154             region,
1155             &CLVec::default(),
1156             src_row_pitch,
1157             src_slice_pitch,
1158             &CLVec::default(),
1159             dst_row_pitch,
1160             dst_slice_pitch,
1161             1,
1162         );
1163 
1164         Ok(())
1165     }
1166 
1167     pub fn copy_to_buffer(
1168         &self,
1169         ctx: &QueueContext,
1170         dst: &Buffer,
1171         src_offset: usize,
1172         dst_offset: usize,
1173         size: usize,
1174     ) -> CLResult<()> {
1175         let src_offset = self.apply_offset(src_offset)?;
1176         let dst_offset = dst.apply_offset(dst_offset)?;
1177         let src_res = self.get_res_for_access(ctx, RWFlags::RD)?;
1178         let dst_res = dst.get_res_for_access(ctx, RWFlags::WR)?;
1179 
1180         let bx = create_pipe_box(
1181             [src_offset, 0, 0].into(),
1182             [size, 1, 1].into(),
1183             CL_MEM_OBJECT_BUFFER,
1184         )?;
1185         let dst_origin: [u32; 3] = [dst_offset.try_into_with_err(CL_OUT_OF_HOST_MEMORY)?, 0, 0];
1186 
1187         ctx.resource_copy_region(src_res, dst_res, &dst_origin, &bx);
1188         Ok(())
1189     }
1190 
1191     pub fn copy_to_image(
1192         &self,
1193         ctx: &QueueContext,
1194         dst: &Image,
1195         src_offset: usize,
1196         dst_origin: CLVec<usize>,
1197         region: &CLVec<usize>,
1198     ) -> CLResult<()> {
1199         let src_offset = self.apply_offset(src_offset)?;
1200         let bpp = dst.image_format.pixel_size().unwrap().into();
1201         let src_pitch = [bpp, bpp * region[0], bpp * region[0] * region[1]];
1202         let size = CLVec::calc_size(region, src_pitch);
1203         let tx_src = self.tx(ctx, src_offset, size, RWFlags::RD)?;
1204 
1205         // If the image is created from a buffer, use the image's slice and row pitch instead
1206         let tx_dst;
1207         let dst_pitch;
1208         if let Some(Mem::Buffer(buffer)) = dst.parent() {
1209             dst_pitch = [
1210                 bpp,
1211                 dst.image_desc.row_pitch()? as usize,
1212                 dst.image_desc.slice_pitch(),
1213             ];
1214 
1215             let (offset, size) = CLVec::calc_offset_size(dst_origin, region, dst_pitch);
1216             tx_dst = buffer.tx(ctx, offset, size, RWFlags::WR)?;
1217         } else {
1218             tx_dst = dst.tx_image(
1219                 ctx,
1220                 &create_pipe_box(dst_origin, *region, dst.mem_type)?,
1221                 RWFlags::WR,
1222             )?;
1223 
1224             dst_pitch = [1, tx_dst.row_pitch() as usize, tx_dst.slice_pitch()];
1225         }
1226 
1227         // Those pitch values cannot have a 0 value in any of their coordinates
1228         debug_assert!(src_pitch[0] != 0 && src_pitch[1] != 0 && src_pitch[2] != 0);
1229         debug_assert!(dst_pitch[0] != 0 && dst_pitch[1] != 0 && dst_pitch[2] != 0);
1230 
1231         perf_warning!("clEnqueueCopyBufferToImage stalls the GPU");
1232 
1233         sw_copy(
1234             tx_src.ptr(),
1235             tx_dst.ptr(),
1236             region,
1237             &CLVec::default(),
1238             src_pitch[1],
1239             src_pitch[2],
1240             &CLVec::default(),
1241             dst_pitch[1],
1242             dst_pitch[2],
1243             bpp as u8,
1244         );
1245         Ok(())
1246     }
1247 
1248     pub fn fill(
1249         &self,
1250         ctx: &QueueContext,
1251         pattern: &[u8],
1252         offset: usize,
1253         size: usize,
1254     ) -> CLResult<()> {
1255         let offset = self.apply_offset(offset)?;
1256         let res = self.get_res_for_access(ctx, RWFlags::WR)?;
1257         ctx.clear_buffer(
1258             res,
1259             pattern,
1260             offset.try_into_with_err(CL_OUT_OF_HOST_MEMORY)?,
1261             size.try_into_with_err(CL_OUT_OF_HOST_MEMORY)?,
1262         );
1263         Ok(())
1264     }
1265 
1266     fn is_mapped_ptr(&self, ptr: *mut c_void) -> bool {
1267         let mut maps = self.maps.lock().unwrap();
1268         let entry = maps.entry(ptr as usize);
1269         matches!(entry, Entry::Occupied(entry) if entry.get().count > 0)
1270     }
1271 
1272     pub fn map(&self, size: usize, offset: usize, writes: bool) -> CLResult<MutMemoryPtr> {
1273         let layout =
1274             unsafe { Layout::from_size_align_unchecked(size, size_of::<[cl_ulong; 16]>()) };
1275         self.base.map(
1276             offset,
1277             layout,
1278             writes,
1279             &self.maps,
1280             BufferMapping { offset: offset },
1281         )
1282     }
1283 
1284     pub fn offset(&self) -> usize {
1285         self.alloc.offset()
1286     }
1287 
1288     pub fn read(
1289         &self,
1290         ctx: &QueueContext,
1291         offset: usize,
1292         ptr: MutMemoryPtr,
1293         size: usize,
1294     ) -> CLResult<()> {
1295         let ptr = ptr.as_ptr();
1296         let tx = self.tx(ctx, offset, size, RWFlags::RD)?;
1297 
1298         perf_warning!("clEnqueueReadBuffer and clEnqueueMapBuffer stall the GPU");
1299 
1300         unsafe {
1301             ptr::copy(tx.ptr(), ptr, size);
1302         }
1303 
1304         Ok(())
1305     }
1306 
1307     pub fn read_rect(
1308         &self,
1309         dst: MutMemoryPtr,
1310         ctx: &QueueContext,
1311         region: &CLVec<usize>,
1312         src_origin: &CLVec<usize>,
1313         src_row_pitch: usize,
1314         src_slice_pitch: usize,
1315         dst_origin: &CLVec<usize>,
1316         dst_row_pitch: usize,
1317         dst_slice_pitch: usize,
1318     ) -> CLResult<()> {
1319         let dst = dst.as_ptr();
1320         let (offset, size) =
1321             CLVec::calc_offset_size(src_origin, region, [1, src_row_pitch, src_slice_pitch]);
1322         let tx = self.tx(ctx, offset, size, RWFlags::RD)?;
1323 
1324         perf_warning!("clEnqueueReadBufferRect stalls the GPU");
1325 
1326         sw_copy(
1327             tx.ptr(),
1328             dst,
1329             region,
1330             &CLVec::default(),
1331             src_row_pitch,
1332             src_slice_pitch,
1333             dst_origin,
1334             dst_row_pitch,
1335             dst_slice_pitch,
1336             1,
1337         );
1338 
1339         Ok(())
1340     }
1341 
1342     pub fn sync_map(&self, ctx: &QueueContext, ptr: MutMemoryPtr) -> CLResult<()> {
1343         let maps = self.maps.lock().unwrap();
1344         let Some(mapping) = maps.find_alloc_precise(ptr.as_ptr() as usize) else {
1345             return Err(CL_INVALID_VALUE);
1346         };
1347 
1348         // in this case we only need to migrate to the device if the data is located on a device not
1349         // having a userptr allocation.
1350         if self.is_pure_user_memory(ctx.dev)? {
1351             let rw = if mapping.writes {
1352                 RWFlags::RW
1353             } else {
1354                 RWFlags::RD
1355             };
1356 
1357             let _ = self.get_res_for_access(ctx, rw)?;
1358             return Ok(());
1359         }
1360 
1361         self.read(ctx, mapping.offset, ptr, mapping.size())
1362     }
1363 
1364     pub fn sync_unmap(&self, ctx: &QueueContext, ptr: MutMemoryPtr) -> CLResult<()> {
1365         // no need to update
1366         if self.is_pure_user_memory(ctx.dev)? {
1367             return Ok(());
1368         }
1369 
1370         match self.maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
1371             Entry::Vacant(_) => Err(CL_INVALID_VALUE),
1372             Entry::Occupied(entry) => {
1373                 let mapping = entry.get();
1374 
1375                 if mapping.writes {
1376                     self.write(ctx, mapping.offset, ptr.into(), mapping.size())?;
1377                 }
1378 
1379                 // only remove if the mapping wasn't reused in the meantime
1380                 if mapping.count == 0 {
1381                     entry.remove();
1382                 }
1383 
1384                 Ok(())
1385             }
1386         }
1387     }
1388 
1389     fn tx<'a>(
1390         &self,
1391         ctx: &'a QueueContext,
1392         offset: usize,
1393         size: usize,
1394         rw: RWFlags,
1395     ) -> CLResult<PipeTransfer<'a>> {
1396         let offset = self.apply_offset(offset)?;
1397         let r = self.get_res_for_access(ctx, rw)?;
1398 
1399         ctx.buffer_map(
1400             r,
1401             offset.try_into_with_err(CL_OUT_OF_HOST_MEMORY)?,
1402             size.try_into_with_err(CL_OUT_OF_HOST_MEMORY)?,
1403             rw,
1404         )
1405         .ok_or(CL_OUT_OF_RESOURCES)
1406     }
1407 
1408     pub fn unmap(&self, ptr: MutMemoryPtr) -> CLResult<bool> {
1409         match self.maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
1410             Entry::Vacant(_) => Err(CL_INVALID_VALUE),
1411             Entry::Occupied(mut entry) => {
1412                 let entry = entry.get_mut();
1413                 debug_assert!(entry.count > 0);
1414                 entry.count -= 1;
1415                 Ok(entry.count == 0)
1416             }
1417         }
1418     }
1419 
1420     pub fn write(
1421         &self,
1422         ctx: &QueueContext,
1423         offset: usize,
1424         ptr: ConstMemoryPtr,
1425         size: usize,
1426     ) -> CLResult<()> {
1427         let ptr = ptr.as_ptr();
1428         let offset = self.apply_offset(offset)?;
1429         let r = self.get_res_for_access(ctx, RWFlags::WR)?;
1430 
1431         perf_warning!("clEnqueueWriteBuffer and clEnqueueUnmapMemObject might stall the GPU");
1432 
1433         ctx.buffer_subdata(
1434             r,
1435             offset.try_into_with_err(CL_OUT_OF_HOST_MEMORY)?,
1436             ptr,
1437             size.try_into_with_err(CL_OUT_OF_HOST_MEMORY)?,
1438         );
1439         Ok(())
1440     }
1441 
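    /// Writes a rectangular region from host memory at `src` into this buffer. The destination
    /// window is mapped based on `dst_origin` and the destination pitches; assuming
    /// `calc_offset_size` yields the dot product of the origin with the pitch vector (which is
    /// how it is used throughout this file), a hypothetical `dst_origin == [16, 2, 0]` with
    /// `dst_row_pitch == 256` starts the mapping at byte `16 + 2 * 256 == 528` of the buffer.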
    pub fn write_rect(
        &self,
        src: ConstMemoryPtr,
        ctx: &QueueContext,
        region: &CLVec<usize>,
        src_origin: &CLVec<usize>,
        src_row_pitch: usize,
        src_slice_pitch: usize,
        dst_origin: &CLVec<usize>,
        dst_row_pitch: usize,
        dst_slice_pitch: usize,
    ) -> CLResult<()> {
        let src = src.as_ptr();
        let (offset, size) =
            CLVec::calc_offset_size(dst_origin, region, [1, dst_row_pitch, dst_slice_pitch]);
        let tx = self.tx(ctx, offset, size, RWFlags::WR)?;

        perf_warning!("clEnqueueWriteBufferRect stalls the GPU");

        sw_copy(
            src,
            tx.ptr(),
            region,
            src_origin,
            src_row_pitch,
            src_slice_pitch,
            &CLVec::default(),
            dst_row_pitch,
            dst_slice_pitch,
            1,
        );

        Ok(())
    }
}

impl Image {
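    /// Copies `region` of the image into `dst` starting at `dst_offset`. The destination is
    /// treated as tightly packed, i.e. `dst_pitch == [bpp, bpp * region[0], bpp * region[0] *
    /// region[1]]`; for an assumed 4-byte-per-pixel format and `region == [16, 16, 1]` that is a
    /// row pitch of 64 and a slice pitch of 1024 bytes.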
    pub fn copy_to_buffer(
        &self,
        ctx: &QueueContext,
        dst: &Buffer,
        src_origin: CLVec<usize>,
        dst_offset: usize,
        region: &CLVec<usize>,
    ) -> CLResult<()> {
        let bpp = self.image_format.pixel_size().unwrap().into();

        let src_pitch;
        let tx_src;
        if let Some(Mem::Buffer(buffer)) = self.parent() {
            // If the image was created from a buffer, use the image's row and slice pitch
            // instead.
            src_pitch = [
                bpp,
                self.image_desc.row_pitch()? as usize,
                self.image_desc.slice_pitch(),
            ];
            let (offset, size) = CLVec::calc_offset_size(src_origin, region, src_pitch);
            tx_src = buffer.tx(ctx, offset, size, RWFlags::RD)?;
        } else {
            tx_src = self.tx_image(
                ctx,
                &create_pipe_box(src_origin, *region, self.mem_type)?,
                RWFlags::RD,
            )?;
            src_pitch = [1, tx_src.row_pitch() as usize, tx_src.slice_pitch()];
        }

        // The destination buffer is tightly packed.
        let dst_pitch = [bpp, bpp * region[0], bpp * region[0] * region[1]];

        let dst_origin: CLVec<usize> = [dst_offset, 0, 0].into();
        let (offset, size) = CLVec::calc_offset_size(dst_origin, region, dst_pitch);
        let tx_dst = dst.tx(ctx, offset, size, RWFlags::WR)?;

        // None of the pitch values may contain a zero component.
        debug_assert!(src_pitch[0] != 0 && src_pitch[1] != 0 && src_pitch[2] != 0);
        debug_assert!(dst_pitch[0] != 0 && dst_pitch[1] != 0 && dst_pitch[2] != 0);

        perf_warning!("clEnqueueCopyImageToBuffer stalls the GPU");

        sw_copy(
            tx_src.ptr(),
            tx_dst.ptr(),
            region,
            &CLVec::default(),
            src_pitch[1],
            src_pitch[2],
            &CLVec::default(),
            dst_pitch[1],
            dst_pitch[2],
            bpp as u8,
        );
        Ok(())
    }

    pub fn copy_to_image(
        &self,
        ctx: &QueueContext,
        dst: &Image,
        src_origin: CLVec<usize>,
        dst_origin: CLVec<usize>,
        region: &CLVec<usize>,
    ) -> CLResult<()> {
        let src_res = self.get_res_for_access(ctx, RWFlags::RD)?;
        let dst_res = dst.get_res_for_access(ctx, RWFlags::WR)?;

        // We only want to use sw_copy when the mem objects have different types or when the copy
        // can have custom strides (e.g. an image2d created from a buffer or another image).
        if self.is_parent_buffer() || dst.is_parent_buffer() {
            let bpp = self.image_format.pixel_size().unwrap().into();

            let tx_src;
            let tx_dst;
            let dst_pitch;
            let src_pitch;
            if let Some(Mem::Buffer(buffer)) = self.parent() {
                src_pitch = [
                    bpp,
                    self.image_desc.row_pitch()? as usize,
                    self.image_desc.slice_pitch(),
                ];

                let (offset, size) = CLVec::calc_offset_size(src_origin, region, src_pitch);
                tx_src = buffer.tx(ctx, offset, size, RWFlags::RD)?;
            } else {
                tx_src = self.tx_image(
                    ctx,
                    &create_pipe_box(src_origin, *region, self.mem_type)?,
                    RWFlags::RD,
                )?;

                src_pitch = [1, tx_src.row_pitch() as usize, tx_src.slice_pitch()];
            }

            if let Some(Mem::Buffer(buffer)) = dst.parent() {
                // If the image was created from a buffer, use the image's row and slice pitch
                // instead.
                dst_pitch = [
                    bpp,
                    dst.image_desc.row_pitch()? as usize,
                    dst.image_desc.slice_pitch(),
                ];

                let (offset, size) = CLVec::calc_offset_size(dst_origin, region, dst_pitch);
                tx_dst = buffer.tx(ctx, offset, size, RWFlags::WR)?;
            } else {
                tx_dst = dst.tx_image(
                    ctx,
                    &create_pipe_box(dst_origin, *region, dst.mem_type)?,
                    RWFlags::WR,
                )?;

                dst_pitch = [1, tx_dst.row_pitch() as usize, tx_dst.slice_pitch()];
            }

            // None of the pitch values may contain a zero component.
            debug_assert!(src_pitch[0] != 0 && src_pitch[1] != 0 && src_pitch[2] != 0);
            debug_assert!(dst_pitch[0] != 0 && dst_pitch[1] != 0 && dst_pitch[2] != 0);

            perf_warning!(
                "clEnqueueCopyImage stalls the GPU when src or dst are created from a buffer"
            );

            sw_copy(
                tx_src.ptr(),
                tx_dst.ptr(),
                region,
                &CLVec::default(),
                src_pitch[1],
                src_pitch[2],
                &CLVec::default(),
                dst_pitch[1],
                dst_pitch[2],
                bpp as u8,
            )
        } else {
            let bx = create_pipe_box(src_origin, *region, self.mem_type)?;
            let mut dst_origin: [u32; 3] = dst_origin.try_into()?;

            if self.mem_type == CL_MEM_OBJECT_IMAGE1D_ARRAY {
                (dst_origin[1], dst_origin[2]) = (dst_origin[2], dst_origin[1]);
            }

            ctx.resource_copy_region(src_res, dst_res, &dst_origin, &bx);
        }
        Ok(())
    }

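    /// Fills `region` of the image starting at `origin` with `pattern` packed into the image's
    /// format. The scratch pattern is allocated in `u32` units, so its size is always a multiple
    /// of four bytes: a hypothetical 4-byte-per-pixel format packs into a single `u32`, while a
    /// 16-byte-per-pixel one needs `16.div_ceil(4) == 4` of them.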
    pub fn fill(
        &self,
        ctx: &QueueContext,
        pattern: [u32; 4],
        origin: &CLVec<usize>,
        region: &CLVec<usize>,
    ) -> CLResult<()> {
        let res = self.get_res_for_access(ctx, RWFlags::WR)?;

        // Make sure we allocate a multiple of 4 bytes so drivers don't read out of bounds or
        // unaligned.
        let pixel_size: usize = self.image_format.pixel_size().unwrap().into();
        let mut new_pattern: Vec<u32> = vec![0; pixel_size.div_ceil(size_of::<u32>())];

        // SAFETY: pointers have to be valid for reads/writes of exactly one pixel of their
        // respective format.
        // `new_pattern` has the correct size due to the allocation above.
        // `pattern` is validated through the CL API, and not following the CL API rules is
        // undefined behavior anyway.
        unsafe {
            util_format_pack_rgba(
                self.pipe_format,
                new_pattern.as_mut_ptr().cast(),
                pattern.as_ptr().cast(),
                1,
            );
        }

        // If the image was created from a buffer, use clear_image_buffer instead.
        if self.is_parent_buffer() {
            let strides = (
                self.image_desc.row_pitch()? as usize,
                self.image_desc.slice_pitch(),
            );
            ctx.clear_image_buffer(res, &new_pattern, origin, region, strides, pixel_size);
        } else {
            let bx = create_pipe_box(*origin, *region, self.mem_type)?;
            ctx.clear_texture(res, &new_pattern, &bx);
        }

        Ok(())
    }

    fn is_mapped_ptr(&self, ptr: *mut c_void) -> bool {
        let mut maps = self.maps.lock().unwrap();
        let entry = maps.entry(ptr as usize);
        matches!(entry, Entry::Occupied(entry) if entry.get().count > 0)
    }

    pub fn is_parent_buffer(&self) -> bool {
        matches!(self.parent(), Some(Mem::Buffer(_)))
    }

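    /// Creates a host-side mapping covering `region` of the image starting at `origin` (or bumps
    /// the reference count of an existing one) and reports the pitches the mapped data uses. The
    /// backing allocation is sized as computed below; with assumed numbers, a 2D image with a row
    /// pitch of 4096 bytes and `region == [16, 16, 1]` maps `region[1] * row_pitch == 65536`
    /// bytes, while a 1D image array with a slice pitch of 4096 bytes and `region == [16, 4, 1]`
    /// maps `region[1] * slice_pitch == 16384` bytes.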
    pub fn map(
        &self,
        origin: CLVec<usize>,
        region: CLVec<usize>,
        row_pitch: &mut usize,
        slice_pitch: &mut usize,
        writes: bool,
    ) -> CLResult<MutMemoryPtr> {
        let pixel_size = self.image_format.pixel_size().unwrap() as usize;

        *row_pitch = self.image_desc.row_pitch()? as usize;
        *slice_pitch = self.image_desc.slice_pitch();

        let offset = CLVec::calc_offset(origin, [pixel_size, *row_pitch, *slice_pitch]);

        // From the CL Spec:
        //
        //   The pointer returned maps a 1D, 2D or 3D region starting at origin and is at least
        //   region[0] pixels in size for a 1D image, 1D image buffer or 1D image array,
        //   (image_row_pitch × region[1]) pixels in size for a 2D image or 2D image array, and
        //   (image_slice_pitch × region[2]) pixels in size for a 3D image. The result of a memory
        //   access outside this region is undefined.
        //
        // It's not guaranteed that the row_pitch is taken into account for 1D images, but the CL
        // CTS relies on this behavior.
        //
        // Also note that the spec wording is wrong with regard to arrays, which need to take the
        // image_slice_pitch into account.
        let size = if self.image_desc.is_array() || self.image_desc.dims() == 3 {
            debug_assert_ne!(*slice_pitch, 0);
            // the slice count is in region[1] for 1D array images
            if self.mem_type == CL_MEM_OBJECT_IMAGE1D_ARRAY {
                region[1] * *slice_pitch
            } else {
                region[2] * *slice_pitch
            }
        } else {
            debug_assert_ne!(*row_pitch, 0);
            region[1] * *row_pitch
        };

        let layout = unsafe { Layout::from_size_align_unchecked(size, size_of::<[u32; 4]>()) };

        self.base.map(
            offset,
            layout,
            writes,
            &self.maps,
            ImageMapping { origin, region },
        )
    }

    fn pipe_image_host_access(&self) -> u16 {
        // those flags are all mutually exclusive
        (if bit_check(self.flags, CL_MEM_HOST_READ_ONLY) {
            PIPE_IMAGE_ACCESS_READ
        } else if bit_check(self.flags, CL_MEM_HOST_WRITE_ONLY) {
            PIPE_IMAGE_ACCESS_WRITE
        } else if bit_check(self.flags, CL_MEM_HOST_NO_ACCESS) {
            0
        } else {
            PIPE_IMAGE_ACCESS_READ_WRITE
        }) as u16
    }

    pub fn read(
        &self,
        dst: MutMemoryPtr,
        ctx: &QueueContext,
        region: &CLVec<usize>,
        src_origin: &CLVec<usize>,
        dst_row_pitch: usize,
        dst_slice_pitch: usize,
    ) -> CLResult<()> {
        let dst = dst.as_ptr();
        let pixel_size = self.image_format.pixel_size().unwrap();

        let tx;
        let src_row_pitch;
        let src_slice_pitch;
        if let Some(Mem::Buffer(buffer)) = self.parent() {
            src_row_pitch = self.image_desc.image_row_pitch;
            src_slice_pitch = self.image_desc.image_slice_pitch;

            let (offset, size) = CLVec::calc_offset_size(
                src_origin,
                region,
                [pixel_size.into(), src_row_pitch, src_slice_pitch],
            );

            tx = buffer.tx(ctx, offset, size, RWFlags::RD)?;
        } else {
            let bx = create_pipe_box(*src_origin, *region, self.mem_type)?;
            tx = self.tx_image(ctx, &bx, RWFlags::RD)?;
            src_row_pitch = tx.row_pitch() as usize;
            src_slice_pitch = tx.slice_pitch();
        };

        perf_warning!("clEnqueueReadImage and clEnqueueMapImage stall the GPU");

        sw_copy(
            tx.ptr(),
            dst,
            region,
            &CLVec::default(),
            src_row_pitch,
            src_slice_pitch,
            &CLVec::default(),
            dst_row_pitch,
            dst_slice_pitch,
            pixel_size,
        );

        Ok(())
    }

    pub fn sync_map(&self, ctx: &QueueContext, ptr: MutMemoryPtr) -> CLResult<()> {
        let maps = self.maps.lock().unwrap();
        let Some(mapping) = maps.find_alloc_precise(ptr.as_ptr() as usize) else {
            return Err(CL_INVALID_VALUE);
        };

        // In this case we only need to migrate to the device if the data is located on a device
        // that doesn't have a userptr allocation.
        if self.is_pure_user_memory(ctx.dev)? {
            let rw = if mapping.writes {
                RWFlags::RW
            } else {
                RWFlags::RD
            };

            let _ = self.get_res_for_access(ctx, rw)?;
            return Ok(());
        }

        let row_pitch = self.image_desc.row_pitch()? as usize;
        let slice_pitch = self.image_desc.slice_pitch();

        self.read(
            ptr,
            ctx,
            &mapping.region,
            &mapping.origin,
            row_pitch,
            slice_pitch,
        )
    }

    pub fn sync_unmap(&self, ctx: &QueueContext, ptr: MutMemoryPtr) -> CLResult<()> {
        // Pure user memory is always up to date, so there is nothing to write back.
        if self.is_pure_user_memory(ctx.dev)? {
            return Ok(());
        }

        match self.maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
            Entry::Vacant(_) => Err(CL_INVALID_VALUE),
            Entry::Occupied(entry) => {
                let mapping = entry.get();
                let row_pitch = self.image_desc.row_pitch()? as usize;
                let slice_pitch = self.image_desc.slice_pitch();

                if mapping.writes {
                    self.write(
                        ptr.into(),
                        ctx,
                        &mapping.region,
                        row_pitch,
                        slice_pitch,
                        &mapping.origin,
                    )?;
                }

                // Only remove the mapping if it wasn't reused in the meantime.
                if mapping.count == 0 {
                    entry.remove();
                }

                Ok(())
            }
        }
    }

    fn tx_image<'a>(
        &self,
        ctx: &'a QueueContext,
        bx: &pipe_box,
        rw: RWFlags,
    ) -> CLResult<PipeTransfer<'a>> {
        let r = self.get_res_for_access(ctx, rw)?;
        ctx.texture_map(r, bx, rw).ok_or(CL_OUT_OF_RESOURCES)
    }

    pub fn unmap(&self, ptr: MutMemoryPtr) -> CLResult<bool> {
        match self.maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
            Entry::Vacant(_) => Err(CL_INVALID_VALUE),
            Entry::Occupied(mut entry) => {
                let entry = entry.get_mut();
                debug_assert!(entry.count > 0);
                entry.count -= 1;
                Ok(entry.count == 0)
            }
        }
    }

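    /// Writes host data at `src` into `region` of the image starting at `dst_origin`. Images
    /// created on top of a buffer go through a buffer mapping and `sw_copy`; everything else is
    /// uploaded with `texture_subdata`, where 1D image arrays reuse the row pitch as the slice
    /// pitch.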
    pub fn write(
        &self,
        src: ConstMemoryPtr,
        ctx: &QueueContext,
        region: &CLVec<usize>,
        src_row_pitch: usize,
        mut src_slice_pitch: usize,
        dst_origin: &CLVec<usize>,
    ) -> CLResult<()> {
        let src = src.as_ptr();
        let dst_row_pitch = self.image_desc.image_row_pitch;
        let dst_slice_pitch = self.image_desc.image_slice_pitch;

        // texture_subdata most likely maps the resource anyway
        perf_warning!("clEnqueueWriteImage and clEnqueueUnmapMemObject stall the GPU");

        if let Some(Mem::Buffer(buffer)) = self.parent() {
            let pixel_size = self.image_format.pixel_size().unwrap();
            let (offset, size) = CLVec::calc_offset_size(
                dst_origin,
                region,
                [pixel_size.into(), dst_row_pitch, dst_slice_pitch],
            );
            let tx = buffer.tx(ctx, offset, size, RWFlags::WR)?;

            sw_copy(
                src,
                tx.ptr(),
                region,
                &CLVec::default(),
                src_row_pitch,
                src_slice_pitch,
                &CLVec::default(),
                dst_row_pitch,
                dst_slice_pitch,
                pixel_size,
            );
        } else {
            let res = self.get_res_for_access(ctx, RWFlags::WR)?;
            let bx = create_pipe_box(*dst_origin, *region, self.mem_type)?;

            if self.mem_type == CL_MEM_OBJECT_IMAGE1D_ARRAY {
                src_slice_pitch = src_row_pitch;
            }

            ctx.texture_subdata(
                res,
                &bx,
                src,
                src_row_pitch.try_into_with_err(CL_OUT_OF_HOST_MEMORY)?,
                src_slice_pitch,
            );
        }
        Ok(())
    }

    /// Creates metadata when a 2D image or sampler view is created over a buffer resource.
    fn buffer_2d_info(&self) -> CLResult<AppImgInfo> {
        Ok(AppImgInfo::new(
            self.image_desc.row_pitch()? / self.image_elem_size as u32,
            self.image_desc.width()?,
            self.image_desc.height()?,
        ))
    }

    pub fn sampler_view<'c>(&self, ctx: &'c QueueContext) -> CLResult<PipeSamplerView<'c, '_>> {
        let res = self.get_res_for_access(ctx, RWFlags::RD)?;

        let template = if res.is_buffer() && self.mem_type == CL_MEM_OBJECT_IMAGE2D {
            res.pipe_sampler_view_template_2d_buffer(self.pipe_format, &self.buffer_2d_info()?)
        } else if res.is_buffer() {
            // we need to pass in the size of the buffer, not the width.
            let size = self.size.try_into_with_err(CL_OUT_OF_RESOURCES)?;
            res.pipe_sampler_view_template_1d_buffer(self.pipe_format, size)
        } else {
            res.pipe_sampler_view_template()
        };

        PipeSamplerView::new(ctx, res, &template).ok_or(CL_OUT_OF_HOST_MEMORY)
    }

    pub fn image_view(&self, ctx: &QueueContext, read_write: bool) -> CLResult<PipeImageView> {
        let rw = if read_write { RWFlags::RW } else { RWFlags::WR };

        let res = self.get_res_for_access(ctx, rw)?;
        if res.is_buffer() && self.mem_type == CL_MEM_OBJECT_IMAGE2D {
            Ok(res.pipe_image_view_2d_buffer(
                self.pipe_format,
                read_write,
                self.pipe_image_host_access(),
                &self.buffer_2d_info()?,
            ))
        } else if res.is_buffer() {
            let size = self.size.try_into_with_err(CL_OUT_OF_RESOURCES)?;
            Ok(res.pipe_image_view_1d_buffer(
                self.pipe_format,
                read_write,
                self.pipe_image_host_access(),
                size,
            ))
        } else {
            Ok(res.pipe_image_view(read_write, self.pipe_image_host_access()))
        }
    }
}

pub struct Sampler {
    pub base: CLObjectBase<CL_INVALID_SAMPLER>,
    pub context: Arc<Context>,
    pub normalized_coords: bool,
    pub addressing_mode: cl_addressing_mode,
    pub filter_mode: cl_filter_mode,
    pub props: Properties<cl_sampler_properties>,
}

impl_cl_type_trait!(cl_sampler, Sampler, CL_INVALID_SAMPLER);

impl Sampler {
    pub fn new(
        context: Arc<Context>,
        normalized_coords: bool,
        addressing_mode: cl_addressing_mode,
        filter_mode: cl_filter_mode,
        props: Properties<cl_sampler_properties>,
    ) -> Arc<Sampler> {
        Arc::new(Self {
            base: CLObjectBase::new(RusticlTypes::Sampler),
            context,
            normalized_coords,
            addressing_mode,
            filter_mode,
            props,
        })
    }

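    /// Translates sampler parameters coming from NIR into their CL representation.
    ///
    /// A rough round-trip sketch (the inputs are assumed example values, not taken from a real
    /// shader, hence not a runnable doctest):
    ///
    /// ```ignore
    /// let (addr, filter, norm) = Sampler::nir_to_cl(
    ///     cl_sampler_addressing_mode::SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE,
    ///     cl_sampler_filter_mode::SAMPLER_FILTER_MODE_NEAREST,
    ///     1,
    /// );
    /// assert_eq!((addr, filter, norm), (CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, true));
    ///
    /// // ...and on into a gallium sampler state for binding.
    /// let _pipe_state = Sampler::cl_to_pipe((addr, filter, norm));
    /// ```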
    pub fn nir_to_cl(
        addressing_mode: u32,
        filter_mode: u32,
        normalized_coords: u32,
    ) -> (cl_addressing_mode, cl_filter_mode, bool) {
        let addr_mode = match addressing_mode {
            cl_sampler_addressing_mode::SAMPLER_ADDRESSING_MODE_NONE => CL_ADDRESS_NONE,
            cl_sampler_addressing_mode::SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE => {
                CL_ADDRESS_CLAMP_TO_EDGE
            }
            cl_sampler_addressing_mode::SAMPLER_ADDRESSING_MODE_CLAMP => CL_ADDRESS_CLAMP,
            cl_sampler_addressing_mode::SAMPLER_ADDRESSING_MODE_REPEAT => CL_ADDRESS_REPEAT,
            cl_sampler_addressing_mode::SAMPLER_ADDRESSING_MODE_REPEAT_MIRRORED => {
                CL_ADDRESS_MIRRORED_REPEAT
            }
            _ => panic!("unknown addressing_mode"),
        };

        let filter = match filter_mode {
            cl_sampler_filter_mode::SAMPLER_FILTER_MODE_NEAREST => CL_FILTER_NEAREST,
            cl_sampler_filter_mode::SAMPLER_FILTER_MODE_LINEAR => CL_FILTER_LINEAR,
            _ => panic!("unknown filter_mode"),
        };

        (addr_mode, filter, normalized_coords != 0)
    }

    pub fn cl_to_pipe(
        (addressing_mode, filter_mode, normalized_coords): (
            cl_addressing_mode,
            cl_filter_mode,
            bool,
        ),
    ) -> pipe_sampler_state {
        let mut res = pipe_sampler_state::default();

        let wrap = match addressing_mode {
            CL_ADDRESS_CLAMP_TO_EDGE => pipe_tex_wrap::PIPE_TEX_WRAP_CLAMP_TO_EDGE,
            CL_ADDRESS_CLAMP => pipe_tex_wrap::PIPE_TEX_WRAP_CLAMP_TO_BORDER,
            CL_ADDRESS_REPEAT => pipe_tex_wrap::PIPE_TEX_WRAP_REPEAT,
            CL_ADDRESS_MIRRORED_REPEAT => pipe_tex_wrap::PIPE_TEX_WRAP_MIRROR_REPEAT,
            // TODO: what's a reasonable default?
            _ => pipe_tex_wrap::PIPE_TEX_WRAP_CLAMP_TO_EDGE,
        };

        let img_filter = match filter_mode {
            CL_FILTER_NEAREST => pipe_tex_filter::PIPE_TEX_FILTER_NEAREST,
            CL_FILTER_LINEAR => pipe_tex_filter::PIPE_TEX_FILTER_LINEAR,
            _ => panic!("unknown filter_mode"),
        };

        res.set_min_img_filter(img_filter);
        res.set_mag_img_filter(img_filter);
        res.set_unnormalized_coords((!normalized_coords).into());
        res.set_wrap_r(wrap);
        res.set_wrap_s(wrap);
        res.set_wrap_t(wrap);

        res
    }

    pub fn pipe(&self) -> pipe_sampler_state {
        Self::cl_to_pipe((
            self.addressing_mode,
            self.filter_mode,
            self.normalized_coords,
        ))
    }
}