• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /*
2   * Intel MIC Platform Software Stack (MPSS)
3   *
4   * This file is provided under a dual BSD/GPLv2 license.  When using or
5   * redistributing this file, you may do so under either license.
6   *
7   * GPL LICENSE SUMMARY
8   *
9   * Copyright(c) 2014 Intel Corporation.
10   *
11   * This program is free software; you can redistribute it and/or modify
12   * it under the terms of version 2 of the GNU General Public License as
13   * published by the Free Software Foundation.
14   *
15   * This program is distributed in the hope that it will be useful, but
16   * WITHOUT ANY WARRANTY; without even the implied warranty of
17   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18   * General Public License for more details.
19   *
20   * BSD LICENSE
21   *
22   * Copyright(c) 2014 Intel Corporation.
23   *
24   * Redistribution and use in source and binary forms, with or without
25   * modification, are permitted provided that the following conditions
26   * are met:
27   *
28   * * Redistributions of source code must retain the above copyright
29   *   notice, this list of conditions and the following disclaimer.
30   * * Redistributions in binary form must reproduce the above copyright
31   *   notice, this list of conditions and the following disclaimer in
32   *   the documentation and/or other materials provided with the
33   *   distribution.
34   * * Neither the name of Intel Corporation nor the names of its
35   *   contributors may be used to endorse or promote products derived
36   *   from this software without specific prior written permission.
37   *
38   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
39   * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
40   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
41   * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
42   * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
44   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
45   * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
46   * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
47   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
48   * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
49   *
50   * Intel SCIF driver.
51   *
52   */
53  #ifndef __SCIF_H__
54  #define __SCIF_H__
55  
56  #include <linux/types.h>
57  #include <linux/poll.h>
58  #include <linux/device.h>
59  #include <linux/scif_ioctl.h>
60  
61  #define SCIF_ACCEPT_SYNC	1	/* scif_accept() flag: block until a connection request is presented */
62  #define SCIF_SEND_BLOCK		1	/* scif_send() flag: block until the entire message is sent */
63  #define SCIF_RECV_BLOCK		1	/* scif_recv() flag: block until the entire message is received */
64  
65  enum {	/* values OR'ed together to form the prot_flags argument of scif_register() */
66  	SCIF_PROT_READ = (1 << 0),	/* allow read operations from the window */
67  	SCIF_PROT_WRITE = (1 << 1)	/* allow write operations to the window */
68  };
69  
70  enum {	/* mapping flags; SCIF_MAP_FIXED is described under scif_register() map_flags */
71  	SCIF_MAP_FIXED = 0x10,	/* the window offset must be exactly the offset passed by the caller */
72  	SCIF_MAP_KERNEL	= 0x20,	/* NOTE(review): not described in the visible documentation - confirm meaning */
73  };
74  
75  enum {	/* NOTE(review): presumably flags for the scif_fence_*() calls, documented outside this view - confirm */
76  	SCIF_FENCE_INIT_SELF = (1 << 0),
77  	SCIF_FENCE_INIT_PEER = (1 << 1),
78  	SCIF_SIGNAL_LOCAL = (1 << 4),
79  	SCIF_SIGNAL_REMOTE = (1 << 5)
80  };
81  
82  enum {	/* values OR'ed together to form the rma_flags argument, e.g. of scif_readfrom() */
83  	SCIF_RMA_USECPU = (1 << 0),	/* perform the transfer using the CPU, otherwise use the DMA engine */
84  	SCIF_RMA_USECACHE = (1 << 1),	/* NOTE(review): not described in the visible documentation - confirm meaning */
85  	SCIF_RMA_SYNC = (1 << 2),	/* perform the transfer synchronously, returning after it has completed */
86  	SCIF_RMA_ORDERED = (1 << 3)	/* last (partial) cacheline becomes visible after all other transferred data */
87  };
88  
89  /* End of SCIF admin reserved ports; scif_bind() lets only privileged processes bind ports below 1024 */
90  #define SCIF_ADMIN_PORT_END	1024
91  
92  /* End of SCIF reserved ports; scif_bind() with pn == 0 assigns a port >= SCIF_PORT_RSVD */
93  #define SCIF_PORT_RSVD		1088
94  
95  typedef struct scif_endpt *scif_epd_t;	/* endpoint descriptor passed to the scif_*() calls */
96  typedef struct scif_pinned_pages *scif_pinned_pages_t;	/* pointer to struct scif_pinned_pages, which is not defined in the visible part of this header */
97  
98  /**
99   * struct scif_range - SCIF registered range used in kernel mode
100   * @cookie: cookie used internally by SCIF
101   * @nr_pages: number of pages, each of size PAGE_SIZE
102   * @prot_flags: read/write protection flags
103   * @phys_addr: array of bus addresses
104   * @va: array of kernel virtual addresses backed by the pages in the phys_addr
105   *	array. The va is populated only when called on the host for a remote
106   *	SCIF connection on MIC. This is required to support the use case of DMA
107   *	between MIC and another device which is not a SCIF node, e.g. an IB or
108   *	Ethernet NIC.
109   */
110  struct scif_range {
111  	void *cookie;
112  	int nr_pages;
113  	int prot_flags;
114  	dma_addr_t *phys_addr;
115  	void __iomem **va;
116  };
117  
118  /**
119   * struct scif_pollepd - SCIF endpoint to be monitored via scif_poll()
120   * @epd: SCIF endpoint to monitor
121   * @events: events requested by the caller
122   * @revents: events returned to the caller
123   */
124  struct scif_pollepd {
125  	scif_epd_t epd;
126  	short events;
127  	short revents;
128  };
129  
130  /**
131   * struct scif_peer_dev - representation of a peer SCIF device
132   *
133   * Peer devices show up as PCIe devices for the mgmt node but not the cards.
134   * The mgmt node discovers all the cards on the PCIe bus and informs the other
135   * cards about their peers. Upon notification of a peer, a node adds a peer
136   * device to the peer bus to maintain symmetry in the way devices are
137   * discovered across all nodes in the SCIF network.
138   *
139   * @dev: underlying device
140   * @dnode: the destination node which this device will communicate with
141   */
142  struct scif_peer_dev {
143  	struct device dev;
144  	u8 dnode;
145  };
146  
147  /**
148   * struct scif_client - representation of a SCIF client
149   * @name: client name
150   * @probe: client method called when a peer device is registered
151   * @remove: client method called when a peer device is unregistered
152   * @si: subsys_interface used internally for implementing SCIF clients
153   */
154  struct scif_client {
155  	const char *name;
156  	void (*probe)(struct scif_peer_dev *spdev);
157  	void (*remove)(struct scif_peer_dev *spdev);
158  	struct subsys_interface si;
159  };
160  
161  #define SCIF_OPEN_FAILED ((scif_epd_t)-1)	/* returned by scif_open() on failure in user mode */
162  #define SCIF_REGISTER_FAILED ((off_t)-1)	/* returned by scif_register() on failure in user mode */
163  #define SCIF_MMAP_FAILED ((void *)-1)	/* NOTE(review): presumably the failure value of a SCIF mmap call - confirm */
164  
165  /**
166   * scif_open() - Create an endpoint
167   *
168   * Return:
169   * Upon successful completion, scif_open() returns an endpoint descriptor to
170   * be used in subsequent SCIF function calls to refer to that endpoint;
171   * otherwise in user mode SCIF_OPEN_FAILED (that is ((scif_epd_t)-1)) is
172   * returned and errno is set to indicate the error; in kernel mode a NULL
173   * scif_epd_t is returned.
174   *
175   * Errors:
176   * ENOMEM - Insufficient kernel memory was available
177   */
178  scif_epd_t scif_open(void);
179  
180  /**
181   * scif_bind() - Bind an endpoint to a port
182   * @epd:	endpoint descriptor
183   * @pn:		port number
184   *
185   * scif_bind() binds endpoint epd to port pn, where pn is a port number on the
186   * local node. If pn is zero, a port number greater than or equal to
187   * SCIF_PORT_RSVD is assigned and returned. Each endpoint may be bound to
188   * exactly one local port. Ports less than 1024 when requested can only be bound
189   * by system (or root) processes or by processes executed by privileged users.
190   *
191   * Return:
192   * Upon successful completion, scif_bind() returns the port number to which epd
193   * is bound; otherwise in user mode -1 is returned and errno is set to
194   * indicate the error; in kernel mode the negative of one of the following
195   * errors is returned.
196   *
197   * Errors:
198   * EBADF, ENOTTY - epd is not a valid endpoint descriptor
199   * EINVAL - The endpoint or the port is already bound
200   * EISCONN - The endpoint is already connected
201   * ENOSPC - No port number available for assignment
202   * EACCES - The port requested is protected and the user is not the superuser
203   */
204  int scif_bind(scif_epd_t epd, u16 pn);
205  
206  /**
207   * scif_listen() - Listen for connections on an endpoint
208   * @epd:	endpoint descriptor
209   * @backlog:	maximum pending connection requests
210   *
211   * scif_listen() marks the endpoint epd as a listening endpoint - that is, as
212   * an endpoint that will be used to accept incoming connection requests. Once
213   * so marked, the endpoint is said to be in the listening state and may not be
214   * used as the endpoint of a connection.
215   *
216   * The endpoint, epd, must have been bound to a port.
217   *
218   * The backlog argument defines the maximum length to which the queue of
219   * pending connections for epd may grow. If a connection request arrives when
220   * the queue is full, the client may receive an error with an indication that
221   * the connection was refused.
222   *
223   * Return:
224   * Upon successful completion, scif_listen() returns 0; otherwise in user mode
225   * -1 is returned and errno is set to indicate the error; in kernel mode the
226   * negative of one of the following errors is returned.
227   *
228   * Errors:
229   * EBADF, ENOTTY - epd is not a valid endpoint descriptor
230   * EINVAL - The endpoint is not bound to a port
231   * EISCONN - The endpoint is already connected or listening
232   */
233  int scif_listen(scif_epd_t epd, int backlog);
234  
235  /**
236   * scif_connect() - Initiate a connection on a port
237   * @epd:	endpoint descriptor
238   * @dst:	global id of port to which to connect
239   *
240   * The scif_connect() function requests the connection of endpoint epd to remote
241   * port dst. If the connection is successful, a peer endpoint, bound to dst, is
242   * created on node dst.node. On successful return, the connection is complete.
243   *
244   * If the endpoint epd has not already been bound to a port, scif_connect()
245   * will bind it to an unused local port.
246   *
247   * A connection is terminated when an endpoint of the connection is closed,
248   * either explicitly by scif_close(), or when a process that owns one of the
249   * endpoints of the connection is terminated.
250   *
251   * In user space, scif_connect() supports an asynchronous connection mode
252   * if the application has set the O_NONBLOCK flag on the endpoint via the
253   * fcntl() system call. Setting this flag will result in the calling process
254   * not waiting during scif_connect().
255   *
256   * Return:
257   * Upon successful completion, scif_connect() returns the port ID to which the
258   * endpoint, epd, is bound; otherwise in user mode -1 is returned and errno is
259   * set to indicate the error; in kernel mode the negative of one of the
260   * following errors is returned.
261   *
262   * Errors:
263   * EBADF, ENOTTY - epd is not a valid endpoint descriptor
264   * ECONNREFUSED - The destination was not listening for connections or refused
265   * the connection request
266   * EINVAL - dst.port is not a valid port ID
267   * EISCONN - The endpoint is already connected
268   * ENOMEM - No buffer space is available
269   * ENODEV - The destination node does not exist, or the node is lost, or it
270   * existed but is not currently in the network since it may have crashed
271   * ENOSPC - No port number available for assignment
272   * EOPNOTSUPP - The endpoint is listening and cannot be connected
273   */
274  int scif_connect(scif_epd_t epd, struct scif_port_id *dst);
275  
276  /**
277   * scif_accept() - Accept a connection on an endpoint
278   * @epd:	endpoint descriptor
279   * @peer:	global id of port to which connected
280   * @newepd:	new connected endpoint descriptor
281   * @flags:	flags
282   *
283   * The scif_accept() call extracts the first connection request from the queue
284   * of pending connections for the port on which epd is listening. scif_accept()
285   * creates a new endpoint, bound to the same port as epd, and allocates a new
286   * SCIF endpoint descriptor, returned in newepd, for the endpoint. The new
287   * endpoint is connected to the endpoint through which the connection was
288   * requested. epd is unaffected by this call, and remains in the listening
289   * state.
290   *
291   * On successful return, peer holds the global port identifier (node id and
292   * local port number) of the port which requested the connection.
293   *
294   * A connection is terminated when an endpoint of the connection is closed,
295   * either explicitly by scif_close(), or when a process that owns one of the
296   * endpoints of the connection is terminated.
297   *
298   * The number of connections that can (subsequently) be accepted on epd is only
299   * limited by system resources (memory).
300   *
301   * The flags argument is formed by OR'ing together zero or more of the
302   * following values.
303   * SCIF_ACCEPT_SYNC - block until a connection request is presented. If
304   *			SCIF_ACCEPT_SYNC is not in flags, and no pending
305   *			connections are present on the queue, scif_accept()
306   *			fails with an EAGAIN error
307   *
308   * In user mode, the select() and poll() functions can be used to determine
309   * when there is a connection request. In kernel mode, the scif_poll()
310   * function may be used for this purpose. A readable event will be delivered
311   * when a connection is requested.
312   *
313   * Return:
314   * Upon successful completion, scif_accept() returns 0; otherwise in user mode
315   * -1 is returned and errno is set to indicate the error; in kernel mode the
316   * negative of one of the following errors is returned.
317   *
318   * Errors:
319   * EAGAIN - SCIF_ACCEPT_SYNC is not set and no connections are present to be
320   * accepted, or SCIF_ACCEPT_SYNC is not set and the remote node failed to
321   * complete its connection request
322   * EBADF, ENOTTY - epd is not a valid endpoint descriptor
323   * EINTR - Interrupted function
324   * EINVAL - epd is not a listening endpoint, or flags is invalid, or peer is
325   * NULL, or newepd is NULL
326   * ENODEV - The requesting node is lost, or it existed but is not currently in
327   * the network since it may have crashed
328   * ENOMEM - Not enough space
329   * ENOENT - Secondary part of epd registration failed
330   */
331  int scif_accept(scif_epd_t epd, struct scif_port_id *peer, scif_epd_t
332  		*newepd, int flags);
333  
334  /**
335   * scif_close() - Close an endpoint
336   * @epd:	endpoint descriptor
337   *
338   * scif_close() closes an endpoint and performs necessary teardown of
339   * facilities associated with that endpoint.
340   *
341   * If epd is a listening endpoint then it will no longer accept connection
342   * requests on the port to which it is bound. Any pending connection requests
343   * are rejected.
344   *
345   * If epd is a connected endpoint, then its peer endpoint is also closed. RMAs
346   * which are in progress through epd or its peer endpoint will complete before
347   * scif_close() returns. Registered windows of the local and peer endpoints are
348   * released as if scif_unregister() was called against each window.
349   *
350   * Closing a SCIF endpoint does not affect local registered memory mapped by
351   * a SCIF endpoint on a remote node. The local memory remains mapped by the peer
352   * SCIF endpoint until explicitly removed by the peer calling munmap().
353   *
354   * If the peer endpoint's receive queue is not empty at the time that epd is
355   * closed, then the peer endpoint can be passed as the endpoint parameter to
356   * scif_recv() until the receive queue is empty.
357   *
358   * epd is freed and may no longer be accessed.
359   *
360   * Return:
361   * Upon successful completion, scif_close() returns 0; otherwise in user mode
362   * -1 is returned and errno is set to indicate the error; in kernel mode the
363   * negative of one of the following errors is returned.
364   *
365   * Errors:
366   * EBADF, ENOTTY - epd is not a valid endpoint descriptor
367   */
368  int scif_close(scif_epd_t epd);
369  
370  /**
371   * scif_send() - Send a message
372   * @epd:	endpoint descriptor
373   * @msg:	message buffer address
374   * @len:	message length
375   * @flags:	blocking mode flags
376   *
377   * scif_send() sends data to the peer of endpoint epd. Up to len bytes of data
378   * are copied from memory starting at address msg. On successful execution the
379   * return value of scif_send() is the number of bytes that were sent, and is
380   * zero if no bytes were sent because len was zero. scif_send() may be called
381   * only when the endpoint is in a connected state.
382   *
383   * If a scif_send() call is non-blocking, then it sends only those bytes which
384   * can be sent without waiting, up to a maximum of len bytes.
385   *
386   * If a scif_send() call is blocking, then it normally returns after sending
387   * all len bytes. If a blocking call is interrupted or the connection is
388   * reset, the call is considered successful if some bytes were sent or len is
389   * zero, otherwise the call is considered unsuccessful.
390   *
391   * In user mode, the select() and poll() functions can be used to determine
392   * when the send queue is not full. In kernel mode, the scif_poll() function
393   * may be used for this purpose.
394   *
395   * It is recommended that scif_send()/scif_recv() only be used for short
396   * control-type message communication between SCIF endpoints. The SCIF RMA
397   * APIs are expected to provide better performance for transfer sizes of
398   * 1024 bytes or longer for the current MIC hardware and software
399   * implementation.
400   *
401   * scif_send() will block until the entire message is sent if SCIF_SEND_BLOCK
402   * is passed as the flags argument.
403   *
404   * Return:
405   * Upon successful completion, scif_send() returns the number of bytes sent;
406   * otherwise in user mode -1 is returned and errno is set to indicate the
407   * error; in kernel mode the negative of one of the following errors is
408   * returned.
409   *
410   * Errors:
411   * EBADF, ENOTTY - epd is not a valid endpoint descriptor
412   * ECONNRESET - Connection reset by peer
413   * EINVAL - flags is invalid, or len is negative
414   * ENODEV - The remote node is lost, or it existed but is not currently in the
415   * network since it may have crashed
416   * ENOMEM - Not enough space
417   * ENOTCONN - The endpoint is not connected
418   */
419  int scif_send(scif_epd_t epd, void *msg, int len, int flags);
420  
421  /**
422   * scif_recv() - Receive a message
423   * @epd:	endpoint descriptor
424   * @msg:	message buffer address
425   * @len:	message buffer length
426   * @flags:	blocking mode flags
427   *
428   * scif_recv() receives data from the peer of endpoint epd. Up to len bytes of
429   * data are copied to memory starting at address msg. On successful execution
430   * the return value of scif_recv() is the number of bytes that were received,
431   * and is zero if no bytes were received because len was zero. scif_recv() may
432   * be called only when the endpoint is in a connected state.
433   *
434   * If a scif_recv() call is non-blocking, then it receives only those bytes
435   * which can be received without waiting, up to a maximum of len bytes.
436   *
437   * If a scif_recv() call is blocking, then it normally returns after receiving
438   * all len bytes. If the blocking call was interrupted due to a disconnection,
439   * subsequent calls to scif_recv() will copy all bytes received up to the point
440   * of disconnection.
441   *
442   * In user mode, the select() and poll() functions can be used to determine
443   * when data is available to be received. In kernel mode, the scif_poll()
444   * function may be used for this purpose.
445   *
446   * It is recommended that scif_send()/scif_recv() only be used for short
447   * control-type message communication between SCIF endpoints. The SCIF RMA
448   * APIs are expected to provide better performance for transfer sizes of
449   * 1024 bytes or longer for the current MIC hardware and software
450   * implementation.
451   *
452   * scif_recv() will block until the entire message is received if
453   * SCIF_RECV_BLOCK is passed as the flags argument.
454   *
455   * Return:
456   * Upon successful completion, scif_recv() returns the number of bytes
457   * received; otherwise in user mode -1 is returned and errno is set to
458   * indicate the error; in kernel mode the negative of one of the following
459   * errors is returned.
460   *
461   * Errors:
462   * EAGAIN - The destination node is returning from a low power state
463   * EBADF, ENOTTY - epd is not a valid endpoint descriptor
464   * ECONNRESET - Connection reset by peer
465   * EINVAL - flags is invalid, or len is negative
466   * ENODEV - The remote node is lost, or it existed but is not currently in the
467   * network since it may have crashed
468   * ENOMEM - Not enough space
469   * ENOTCONN - The endpoint is not connected
470   */
471  int scif_recv(scif_epd_t epd, void *msg, int len, int flags);
472  
473  /**
474   * scif_register() - Mark a memory region for remote access.
475   * @epd:		endpoint descriptor
476   * @addr:		starting virtual address
477   * @len:		length of range
478   * @offset:		offset of window
479   * @prot_flags:		read/write protection flags
480   * @map_flags:		mapping flags
481   *
482   * The scif_register() function opens a window, a range of whole pages of the
483   * registered address space of the endpoint epd, starting at offset po and
484   * continuing for len bytes. The value of po, further described below, is a
485   * function of the parameters offset and len, and the value of map_flags. Each
486   * page of the window represents the physical memory page which backs the
487   * corresponding page of the range of virtual address pages starting at addr
488   * and continuing for len bytes. addr and len are constrained to be multiples
489   * of the page size. A successful scif_register() call returns po.
490   *
491   * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset
492   * exactly, and offset is constrained to be a multiple of the page size. The
493   * mapping established by scif_register() will not replace any existing
494   * registration; an error is returned if any page within the range [offset,
495   * offset + len - 1] intersects an existing window.
496   *
497   * When SCIF_MAP_FIXED is not set, the implementation uses offset in an
498   * implementation-defined manner to arrive at po. The po value so chosen will
499   * be an area of the registered address space that the implementation deems
500   * suitable for a mapping of len bytes. An offset value of 0 is interpreted as
501   * granting the implementation complete freedom in selecting po, subject to
502   * constraints described below. A non-zero value of offset is taken to be a
503   * suggestion of an offset near which the mapping should be placed. When the
504   * implementation selects a value for po, it does not replace any extant
505   * window. In all cases, po will be a multiple of the page size.
506   *
507   * The physical pages which are so represented by a window are available for
508   * access in calls to mmap(), scif_readfrom(), scif_writeto(),
509   * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the
510   * physical pages represented by the window will not be reused by the memory
511   * subsystem for any other purpose. Note that the same physical page may be
512   * represented by multiple windows.
513   *
514   * Subsequent operations which change the memory pages to which virtual
515   * addresses are mapped (such as mmap(), munmap()) have no effect on
516   * existing windows.
517   *
518   * If the process will fork(), it is recommended that the registered
519   * virtual address range be marked with MADV_DONTFORK. Doing so will prevent
520   * problems due to copy-on-write semantics.
521   *
522   * The prot_flags argument is formed by OR'ing together one or more of the
523   * following values.
524   * SCIF_PROT_READ - allow read operations from the window
525   * SCIF_PROT_WRITE - allow write operations to the window
526   *
527   * Return:
528   * Upon successful completion, scif_register() returns the offset at which the
529   * mapping was placed (po); otherwise in user mode SCIF_REGISTER_FAILED (that
530   * is ((off_t)-1)) is returned and errno is set to indicate the error; in
531   * kernel mode the negative of one of the following errors is returned.
532   *
533   * Errors:
534   * EADDRINUSE - SCIF_MAP_FIXED is set in map_flags, and pages in the range
535   * [offset, offset + len - 1] are already registered
536   * EAGAIN - The mapping could not be performed due to lack of resources
537   * EBADF, ENOTTY - epd is not a valid endpoint descriptor
538   * ECONNRESET - Connection reset by peer
539   * EINVAL - map_flags is invalid, or prot_flags is invalid, or SCIF_MAP_FIXED is
540   * set in map_flags and offset is not a multiple of the page size, or addr is
541   * not a multiple of the page size, or len is not a multiple of the page size
542   * or is 0, or offset is negative
543   * ENODEV - The remote node is lost, or it existed but is not currently in the
544   * network since it may have crashed
545   * ENOMEM - Not enough space
546   * ENOTCONN - The endpoint is not connected
547   */
548  off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
549  		    int prot_flags, int map_flags);
550  
551  /**
552   * scif_unregister() - Close previously registered windows.
553   * @epd:	endpoint descriptor
554   * @offset:	start of range to unregister
555   * @len:	length of range to unregister
556   *
557   * The scif_unregister() function closes those previously registered windows
558   * which are entirely within the range [offset, offset + len - 1]. It is an
559   * error to specify a range which intersects only a subrange of a window.
560   *
561   * On a successful return, pages within the window may no longer be specified
562   * in calls to mmap(), scif_readfrom(), scif_writeto(), scif_vreadfrom(),
563   * scif_vwriteto(), scif_get_pages(), and scif_fence_signal(). The window,
564   * however, continues to exist until all previous references against it are
565   * removed. A window is referenced if there is a mapping to it created by
566   * mmap(), or if scif_get_pages() was called against the window
567   * (and the pages have not been returned via scif_put_pages()). A window is
568   * also referenced while an RMA, in which some range of the window is a source
569   * or destination, is in progress. Finally a window is referenced while some
570   * offset in that window was specified to scif_fence_signal(), and the RMAs
571   * marked by that call to scif_fence_signal() have not completed. While a
572   * window is in this state, its registered address space pages are not
573   * available for use in a new registered window.
574   *
575   * When all such references to the window have been removed, its references to
576   * all the physical pages which it represents are removed. Similarly, the
577   * registered address space pages of the window become available for
578   * registration in a new window.
579   *
580   * Return:
581   * Upon successful completion, scif_unregister() returns 0; otherwise in user
582   * mode -1 is returned and errno is set to indicate the error; in kernel mode
583   * the negative of one of the following errors is returned. In the event of an
584   * error, no windows are unregistered.
585   *
586   * Errors:
587   * EBADF, ENOTTY - epd is not a valid endpoint descriptor
588   * ECONNRESET - Connection reset by peer
589   * EINVAL - The range [offset, offset + len - 1] intersects a subrange of a
590   * window, or offset is negative
591   * ENODEV - The remote node is lost, or it existed but is not currently in the
592   * network since it may have crashed
593   * ENOTCONN - The endpoint is not connected
594   * ENXIO - Offsets in the range [offset, offset + len - 1] are invalid for the
595   * registered address space of epd
596   */
597  int scif_unregister(scif_epd_t epd, off_t offset, size_t len);
598  
599  /**
600   * scif_readfrom() - Copy from a remote address space
601   * @epd:	endpoint descriptor
602   * @loffset:	offset in local registered address space to
603   *		which to copy
604   * @len:	length of range to copy
605   * @roffset:	offset in remote registered address space
606   *		from which to copy
607   * @rma_flags:	transfer mode flags
608   *
609   * scif_readfrom() copies len bytes from the remote registered address space of
610   * the peer of endpoint epd, starting at the offset roffset to the local
611   * registered address space of epd, starting at the offset loffset.
612   *
613   * Each of the specified ranges [loffset, loffset + len - 1] and [roffset,
614   * roffset + len - 1] must be within some registered window or windows of the
615   * local and remote nodes. A range may intersect multiple registered windows,
616   * but only if those windows are contiguous in the registered address space.
617   *
618   * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
619   * programmed read/writes. Otherwise the data is copied using DMA. If rma_flags
620   * includes SCIF_RMA_SYNC, then scif_readfrom() will return after the transfer
621   * is complete. Otherwise, the transfer may be performed asynchronously. The
622   * order in which any two asynchronous RMA operations complete is
623   * non-deterministic. The synchronization functions, scif_fence_mark()/
624   * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
625   * the completion of asynchronous RMA operations on the same endpoint.
626   *
627   * The DMA transfer of individual bytes is not guaranteed to complete in
628   * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
629   * cacheline or partial cacheline of the source range will become visible on
630   * the destination node after all other transferred data in the source
631   * range has become visible on the destination node.
632   *
633   * The optimal DMA performance will likely be realized if both
634   * loffset and roffset are cacheline aligned (are a multiple of 64). Lower
635   * performance will likely be realized if loffset and roffset are not
636   * cacheline aligned but are separated by some multiple of 64. The lowest level
637   * of performance is likely if loffset and roffset are not separated by a
638   * multiple of 64.
639   *
640   * The rma_flags argument is formed by OR'ing together zero or more of the
641   * following values.
642   * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
643   *	engine.
644   * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
645   *		transfer has completed. Passing this flag results in the
646   *		current implementation busy waiting and consuming CPU cycles
647   *		while the DMA transfer is in progress for best performance by
648   *		avoiding the interrupt latency.
649   * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
650   *		the source range becomes visible on the destination node
651   *		after all other transferred data in the source range has
652   *		become visible on the destination
653   *
654   * Return:
655   * Upon successful completion, scif_readfrom() returns 0; otherwise in user
656   * mode -1 is returned and errno is set to indicate the error; in kernel mode
657   * the negative of one of the following errors is returned.
658   *
659   * Errors:
660   * EACCES - Attempt to write to a read-only range
661   * EBADF, ENOTTY - epd is not a valid endpoint descriptor
662   * ECONNRESET - Connection reset by peer
663   * EINVAL - rma_flags is invalid
664   * ENODEV - The remote node is lost, or it existed but is not currently in the
665   * network since it may have crashed
666   * ENOTCONN - The endpoint is not connected
667   * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered
668   * address space of epd, or the range [roffset, roffset + len - 1] is invalid
669   * for the registered address space of the peer of epd, or loffset or roffset
670   * is negative
671   */
672  int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t
673  		  roffset, int rma_flags);
674  
675  /**
676   * scif_writeto() - Copy to a remote address space
677   * @epd:	endpoint descriptor
678   * @loffset:	offset in local registered address space
679   *		from which to copy
680   * @len:	length of range to copy
681   * @roffset:	offset in remote registered address space to
682   *		which to copy
683   * @rma_flags:	transfer mode flags
684   *
685   * scif_writeto() copies len bytes from the local registered address space of
686   * epd, starting at the offset loffset to the remote registered address space
687   * of the peer of endpoint epd, starting at the offset roffset.
688   *
689   * Each of the specified ranges [loffset, loffset + len - 1] and [roffset,
690   * roffset + len - 1] must be within some registered window or windows of the
691   * local and remote nodes. A range may intersect multiple registered windows,
692   * but only if those windows are contiguous in the registered address space.
693   *
694   * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
695   * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
696   * flags includes SCIF_RMA_SYNC, then scif_writeto() will return after the
697   * transfer is complete. Otherwise, the transfer may be performed asynchron-
698   * ously. The order in which any two asynchronous RMA operations complete
699   * is non-deterministic. The synchronization functions, scif_fence_mark()/
700   * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
701   * the completion of asynchronous RMA operations on the same endpoint.
702   *
703   * The DMA transfer of individual bytes is not guaranteed to complete in
704   * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
705   * cacheline or partial cacheline of the source range will become visible on
706   * the destination node after all other transferred data in the source
707   * range has become visible on the destination node.
708   *
709   * The optimal DMA performance will likely be realized if both
710   * loffset and roffset are cacheline aligned (are a multiple of 64). Lower
711   * performance will likely be realized if loffset and roffset are not cacheline
712   * aligned but are separated by some multiple of 64. The lowest level of
713   * performance is likely if loffset and roffset are not separated by a multiple
714   * of 64.
715   *
716   * The rma_flags argument is formed by ORing together zero or more of the
717   * following values.
718   * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
719   *			engine.
720   * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
721   *		transfer has completed. Passing this flag results in the
722   *		current implementation busy waiting and consuming CPU cycles
723   *		while the DMA transfer is in progress for best performance by
724   *		avoiding the interrupt latency.
725   * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
726   *		the source range becomes visible on the destination node
727   *		after all other transferred data in the source range has
728   *		become visible on the destination
729   *
730   * Return:
731   * Upon successful completion, scif_writeto() returns 0; otherwise in user
732   * mode -1 is returned and errno is set to indicate the error; in kernel mode
733   * the negative of one of the following errors is returned.
734   *
735   * Errors:
736   * EACCES - Attempt to write to a read-only range
737   * EBADF, ENOTTY - epd is not a valid endpoint descriptor
738   * ECONNRESET - Connection reset by peer
739   * EINVAL - rma_flags is invalid
740   * ENODEV - The remote node is lost or existed, but is not currently in the
741   * network since it may have crashed
742   * ENOTCONN - The endpoint is not connected
743   * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered
744   * address space of epd, or, The range [roffset, roffset + len - 1] is invalid
745   * for the registered address space of the peer of epd, or loffset or roffset
746   * is negative
747   */
748  int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t
749  		 roffset, int rma_flags);
750  
751  /**
752   * scif_vreadfrom() - Copy from a remote address space
753   * @epd:	endpoint descriptor
754   * @addr:	address to which to copy
755   * @len:	length of range to copy
756   * @roffset:	offset in remote registered address space
757   *		from which to copy
758   * @rma_flags:	transfer mode flags
759   *
760   * scif_vreadfrom() copies len bytes from the remote registered address
761   * space of the peer of endpoint epd, starting at the offset roffset, to local
762   * memory, starting at addr.
763   *
764   * The specified range [roffset, roffset + len - 1] must be within some
765   * registered window or windows of the remote nodes. The range may
766   * intersect multiple registered windows, but only if those windows are
767   * contiguous in the registered address space.
768   *
769   * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
770   * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
771   * flags includes SCIF_RMA_SYNC, then scif_vreadfrom() will return after the
772   * transfer is complete. Otherwise, the transfer may be performed asynchron-
773   * ously. The order in which any two asynchronous RMA operations complete
774   * is non-deterministic. The synchronization functions, scif_fence_mark()/
775   * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
776   * the completion of asynchronous RMA operations on the same endpoint.
777   *
778   * The DMA transfer of individual bytes is not guaranteed to complete in
779   * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
780   * cacheline or partial cacheline of the source range will become visible on
781   * the destination node after all other transferred data in the source
782   * range has become visible on the destination node.
783   *
784   * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
785   * the specified local memory range may remain in a pinned state even after
786   * the specified transfer completes. This may reduce overhead if some or all of
787   * the same virtual address range is referenced in a subsequent call of
788   * scif_vreadfrom() or scif_vwriteto().
789   *
790   * The optimal DMA performance will likely be realized if both
791   * addr and roffset are cacheline aligned (are a multiple of 64). Lower
792   * performance will likely be realized if addr and roffset are not
793   * cacheline aligned but are separated by some multiple of 64. The lowest level
794   * of performance is likely if addr and roffset are not separated by a
795   * multiple of 64.
796   *
797   * The rma_flags argument is formed by ORing together zero or more of the
798   * following values.
799   * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
800   *	engine.
801   * SCIF_RMA_USECACHE - enable registration caching
802   * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
803   *		transfer has completed. Passing this flag results in the
804   *		current implementation busy waiting and consuming CPU cycles
805   *		while the DMA transfer is in progress for best performance by
806   *		avoiding the interrupt latency.
807   * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
808   *	the source range becomes visible on the destination node
809   *	after all other transferred data in the source range has
810   *	become visible on the destination
811   *
812   * Return:
813   * Upon successful completion, scif_vreadfrom() returns 0; otherwise in user
814   * mode -1 is returned and errno is set to indicate the error; in kernel mode
815   * the negative of one of the following errors is returned.
816   *
817   * Errors:
818   * EACCES - Attempt to write to a read-only range
819   * EBADF, ENOTTY - epd is not a valid endpoint descriptor
820   * ECONNRESET - Connection reset by peer
821   * EINVAL - rma_flags is invalid
822   * ENODEV - The remote node is lost or existed, but is not currently in the
823   * network since it may have crashed
824   * ENOTCONN - The endpoint is not connected
825   * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the
826   * registered address space of the peer of epd
827   */
828  int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset,
829  		   int rma_flags);
830  
831  /**
832   * scif_vwriteto() - Copy to a remote address space
833   * @epd:	endpoint descriptor
834   * @addr:	address from which to copy
835   * @len:	length of range to copy
836   * @roffset:	offset in remote registered address space to
837   *		which to copy
838   * @rma_flags:	transfer mode flags
839   *
840   * scif_vwriteto() copies len bytes from the local memory, starting at addr, to
841   * the remote registered address space of the peer of endpoint epd, starting at
842   * the offset roffset.
843   *
844   * The specified range [roffset, roffset + len - 1] must be within some
845   * registered window or windows of the remote nodes. The range may intersect
846   * multiple registered windows, but only if those windows are contiguous in the
847   * registered address space.
848   *
849   * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
850   * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
851   * flags includes SCIF_RMA_SYNC, then scif_vwriteto() will return after the
852   * transfer is complete. Otherwise, the transfer may be performed asynchron-
853   * ously. The order in which any two asynchronous RMA operations complete
854   * is non-deterministic. The synchronization functions, scif_fence_mark()/
855   * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
856   * the completion of asynchronous RMA operations on the same endpoint.
857   *
858   * The DMA transfer of individual bytes is not guaranteed to complete in
859   * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
860   * cacheline or partial cacheline of the source range will become visible on
861   * the destination node after all other transferred data in the source
862   * range has become visible on the destination node.
863   *
864   * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
865   * the specified local memory range may remain in a pinned state even after
866   * the specified transfer completes. This may reduce overhead if some or all of
867   * the same virtual address range is referenced in a subsequent call of
868   * scif_vreadfrom() or scif_vwriteto().
869   *
870   * The optimal DMA performance will likely be realized if both
871   * addr and roffset are cacheline aligned (are a multiple of 64). Lower
872   * performance will likely be realized if addr and roffset are not cacheline
873   * aligned but are separated by some multiple of 64. The lowest level of
874   * performance is likely if addr and roffset are not separated by a multiple of
875   * 64.
876   *
877   * The rma_flags argument is formed by ORing together zero or more of the
878   * following values.
879   * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
880   *	engine.
881   * SCIF_RMA_USECACHE - allow registration caching
882   * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
883   *		transfer has completed. Passing this flag results in the
884   *		current implementation busy waiting and consuming CPU cycles
885   *		while the DMA transfer is in progress for best performance by
886   *		avoiding the interrupt latency.
887   * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
888   *		the source range becomes visible on the destination node
889   *		after all other transferred data in the source range has
890   *		become visible on the destination
891   *
892   * Return:
893   * Upon successful completion, scif_vwriteto() returns 0; otherwise in user
894   * mode -1 is returned and errno is set to indicate the error; in kernel mode
895   * the negative of one of the following errors is returned.
896   *
897   * Errors:
898   * EACCES - Attempt to write to a read-only range
899   * EBADF, ENOTTY - epd is not a valid endpoint descriptor
900   * ECONNRESET - Connection reset by peer
901   * EINVAL - rma_flags is invalid
902   * ENODEV - The remote node is lost or existed, but is not currently in the
903   * network since it may have crashed
904   * ENOTCONN - The endpoint is not connected
905   * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the
906   * registered address space of the peer of epd
907   */
908  int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t roffset,
909  		  int rma_flags);
910  
911  /**
912   * scif_fence_mark() - Mark previously issued RMAs
913   * @epd:	endpoint descriptor
914   * @flags:	control flags
915   * @mark:	marked value returned as output.
916   *
917   * scif_fence_mark() returns after marking the current set of all uncompleted
918   * RMAs initiated through the endpoint epd or the current set of all
919   * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are
920   * marked with a value returned at mark. The application may subsequently call
921   * scif_fence_wait(), passing the value returned at mark, to await completion
922   * of all RMAs so marked.
923   *
924   * The flags argument has exactly one of the following values.
925   * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint
926   *	epd are marked
927   * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer
928   *	of endpoint epd are marked
929   *
930   * Return:
931   * Upon successful completion, scif_fence_mark() returns 0; otherwise in user
932   * mode -1 is returned and errno is set to indicate the error; in kernel mode
933   * the negative of one of the following errors is returned.
934   *
935   * Errors:
936   * EBADF, ENOTTY - epd is not a valid endpoint descriptor
937   * ECONNRESET - Connection reset by peer
938   * EINVAL - flags is invalid
939   * ENODEV - The remote node is lost or existed, but is not currently in the
940   * network since it may have crashed
941   * ENOTCONN - The endpoint is not connected
942   * ENOMEM - Insufficient kernel memory was available
943   */
944  int scif_fence_mark(scif_epd_t epd, int flags, int *mark);
945  
946  /**
947   * scif_fence_wait() - Wait for completion of marked RMAs
948   * @epd:	endpoint descriptor
949   * @mark:	mark request
950   *
951   * scif_fence_wait() returns after all RMAs marked with mark have completed.
952   * The value passed in mark must have been obtained in a previous call to
953   * scif_fence_mark().
954   *
955   * Return:
956   * Upon successful completion, scif_fence_wait() returns 0; otherwise in user
957   * mode -1 is returned and errno is set to indicate the error; in kernel mode
958   * the negative of one of the following errors is returned.
959   *
960   * Errors:
961   * EBADF, ENOTTY - epd is not a valid endpoint descriptor
962   * ECONNRESET - Connection reset by peer
963   * ENODEV - The remote node is lost or existed, but is not currently in the
964   * network since it may have crashed
965   * ENOTCONN - The endpoint is not connected
966   * ENOMEM - Insufficient kernel memory was available
967   */
968  int scif_fence_wait(scif_epd_t epd, int mark);
969  
970  /**
971   * scif_fence_signal() - Request a memory update on completion of RMAs
972   * @epd:	endpoint descriptor
973   * @loff:	local offset
974   * @lval:	local value to write to loff
975   * @roff:	remote offset
976   * @rval:	remote value to write to roff
977   * @flags:	flags
978   *
979   * scif_fence_signal() returns after marking the current set of all uncompleted
980   * RMAs initiated through the endpoint epd or marking the current set of all
981   * uncompleted RMAs initiated through the peer of endpoint epd.
982   *
983   * If flags includes SCIF_SIGNAL_LOCAL, then on completion of the RMAs in the
984   * marked set, lval is written to memory at the address corresponding to offset
985   * loff in the local registered address space of epd. loff must be within a
986   * registered window. If flags includes SCIF_SIGNAL_REMOTE, then on completion
987   * of the RMAs in the marked set, rval is written to memory at the address
988   * corresponding to offset roff in the remote registered address space of epd.
989   * roff must be within a remote registered window of the peer of epd. Note
990   * that any specified offset must be DWORD (4 byte / 32 bit) aligned.
991   *
992   * The flags argument is formed by OR'ing together the following.
993   * Exactly one of the following values.
994   * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint
995   *	epd are marked
996   * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer
997   *	of endpoint epd are marked
998   * One or more of the following values.
999   * SCIF_SIGNAL_LOCAL - On completion of the marked set of RMAs, write lval to
1000   *	memory at the address corresponding to offset loff in the local
1001   *	registered address space of epd.
1002   * SCIF_SIGNAL_REMOTE - On completion of the marked set of RMAs, write rval to
1003   *	memory at the address corresponding to offset roff in the remote
1004   *	registered address space of epd.
1005   *
1006   * Return:
1007   * Upon successful completion, scif_fence_signal() returns 0; otherwise in
1008   * user mode -1 is returned and errno is set to indicate the error; in kernel
1009   * mode the negative of one of the following errors is returned.
1010   *
1011   * Errors:
1012   * EBADF, ENOTTY - epd is not a valid endpoint descriptor
1013   * ECONNRESET - Connection reset by peer
1014   * EINVAL - flags is invalid, or loff or roff are not DWORD aligned
1015   * ENODEV - The remote node is lost or existed, but is not currently in the
1016   * network since it may have crashed
1017   * ENOTCONN - The endpoint is not connected
1018   * ENXIO - loff is invalid for the registered address space of epd, or roff is
1019   * invalid for the registered address space of the peer of epd
1020   */
1021  int scif_fence_signal(scif_epd_t epd, off_t loff, u64 lval, off_t roff,
1022  		      u64 rval, int flags);
1023  
1024  /**
1025   * scif_get_node_ids() - Return information about online nodes
1026   * @nodes:	array in which to return online node IDs
1027   * @len:	number of entries in the nodes array
1028   * @self:	address to place the node ID of the local node
1029   *
1030   * scif_get_node_ids() fills in the nodes array with up to len node IDs of the
1031   * nodes in the SCIF network. If there is not enough space in nodes, as
1032   * indicated by the len parameter, only len node IDs are returned in nodes. The
1033   * return value of scif_get_node_ids() is the total number of nodes currently in
1034   * the SCIF network. By checking the return value against the len parameter,
1035   * the user may determine if enough space for nodes was allocated.
1036   *
1037   * The node ID of the local node is returned at self.
1038   *
1039   * Return:
1040   * Upon successful completion, scif_get_node_ids() returns the actual number of
1041   * online nodes in the SCIF network including 'self'; otherwise in user mode
1042   * -1 is returned and errno is set to indicate the error; in kernel mode no
1043   * errors are returned.
1044   */
1045  int scif_get_node_ids(u16 *nodes, int len, u16 *self);
1046  
1047  /**
1048   * scif_pin_pages() - Pin a set of pages
1049   * @addr:		Virtual address of range to pin
1050   * @len:		Length of range to pin
1051   * @prot_flags:		Page protection flags
1052   * @map_flags:		Page classification flags
1053   * @pinned_pages:	Handle to pinned pages
1054   *
1055   * scif_pin_pages() pins (locks in physical memory) the physical pages which
1056   * back the range of virtual address pages starting at addr and continuing for
1057   * len bytes. addr and len are constrained to be multiples of the page size. A
1058   * successful scif_pin_pages() call returns a handle to pinned_pages which may
1059   * be used in subsequent calls to scif_register_pinned_pages().
1060   *
1061   * The pages will remain pinned as long as there is a reference against the
1062   * scif_pinned_pages_t value returned by scif_pin_pages() and until
1063   * scif_unpin_pages() is called, passing the scif_pinned_pages_t value. A
1064   * reference is added to a scif_pinned_pages_t value each time a window is
1065   * created by calling scif_register_pinned_pages() and passing the
1066   * scif_pinned_pages_t value. A reference is removed from a
1067   * scif_pinned_pages_t value each time such a window is deleted.
1068   *
1069   * Subsequent operations which change the memory pages to which virtual
1070   * addresses are mapped (such as mmap(), munmap()) have no effect on the
1071   * scif_pinned_pages_t value or windows created against it.
1072   *
1073   * If the process will fork(), it is recommended that the registered
1074   * virtual address range be marked with MADV_DONTFORK. Doing so will prevent
1075   * problems due to copy-on-write semantics.
1076   *
1077   * The prot_flags argument is formed by OR'ing together one or more of the
1078   * following values.
1079   * SCIF_PROT_READ - allow read operations against the pages
1080   * SCIF_PROT_WRITE - allow write operations against the pages
1081   * The map_flags argument can be set as SCIF_MAP_KERNEL to interpret addr as a
1082   * kernel space address. By default, addr is interpreted as a user space
1083   * address.
1084   *
1085   * Return:
1086   * Upon successful completion, scif_pin_pages() returns 0; otherwise the
1087   * negative of one of the following errors is returned.
1088   *
1089   * Errors:
1090   * EINVAL - prot_flags is invalid, map_flags is invalid, or offset is negative
1091   * ENOMEM - Not enough space
1092   */
1093  int scif_pin_pages(void *addr, size_t len, int prot_flags, int map_flags,
1094  		   scif_pinned_pages_t *pinned_pages);
1095  
1096  /**
1097   * scif_unpin_pages() - Unpin a set of pages
1098   * @pinned_pages:	Handle to pinned pages to be unpinned
1099   *
1100   * scif_unpin_pages() prevents scif_register_pinned_pages() from registering new
1101   * windows against pinned_pages. The physical pages represented by pinned_pages
1102   * will remain pinned until all windows previously registered against
1103   * pinned_pages are deleted, i.e. the window is scif_unregister()'d and all
1104   * references to the window are removed (see scif_unregister()).
1105   *
1106   * pinned_pages must have been obtained from a previous call to
1107   * scif_pin_pages(). After calling scif_unpin_pages(), it is an error to pass
1108   * pinned_pages to scif_register_pinned_pages().
1109   *
1110   * Return:
1111   * Upon successful completion, scif_unpin_pages() returns 0; otherwise the
1112   * negative of one of the following errors is returned.
1113   *
1114   * Errors:
1115   * EINVAL - pinned_pages is not valid
1116   */
1117  int scif_unpin_pages(scif_pinned_pages_t pinned_pages);
1118  
1119  /**
1120   * scif_register_pinned_pages() - Mark a memory region for remote access.
1121   * @epd:		endpoint descriptor
1122   * @pinned_pages:	Handle to pinned pages
1123   * @offset:		Registered address space offset
1124   * @map_flags:		Flags which control where pages are mapped
1125   *
1126   * The scif_register_pinned_pages() function opens a window, a range of whole
1127   * pages of the registered address space of the endpoint epd, starting at
1128   * offset po. The value of po, further described below, is a function of the
1129   * parameters offset and pinned_pages, and the value of map_flags. Each page of
1130   * the window represents a corresponding physical memory page of the range
1131   * represented by pinned_pages; the length of the window is the same as the
1132   * length of range represented by pinned_pages. A successful
1133   * scif_register_pinned_pages() call returns po as the return value.
1134   *
1135   * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset
1136   * exactly, and offset is constrained to be a multiple of the page size. The
1137   * mapping established by scif_register_pinned_pages() will not replace any
1138   * existing registration; an error is returned if any page of the new window
1139   * would intersect an existing window.
1140   *
1141   * When SCIF_MAP_FIXED is not set, the implementation uses offset in an
1142   * implementation-defined manner to arrive at po. The po so chosen will be an
1143   * area of the registered address space that the implementation deems suitable
1144   * for a mapping of the required size. An offset value of 0 is interpreted as
1145   * granting the implementation complete freedom in selecting po, subject to
1146   * constraints described below. A non-zero value of offset is taken to be a
1147   * suggestion of an offset near which the mapping should be placed. When the
1148   * implementation selects a value for po, it does not replace any extant
1149   * window. In all cases, po will be a multiple of the page size.
1150   *
1151   * The physical pages which are so represented by a window are available for
1152   * access in calls to scif_get_pages(), scif_readfrom(), scif_writeto(),
1153   * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the
1154   * physical pages represented by the window will not be reused by the memory
1155   * subsystem for any other purpose. Note that the same physical page may be
1156   * represented by multiple windows.
1157   *
1158   * Windows created by scif_register_pinned_pages() are unregistered by
1159   * scif_unregister().
1160   *
1161   * The map_flags argument can be set to SCIF_MAP_FIXED, which causes offset to
1162   * be interpreted as the exact registered offset (po) of the new window.
1163   *
1164   * Return:
1165   * Upon successful completion, scif_register_pinned_pages() returns the offset
1166   * at which the mapping was placed (po); otherwise the negative of one of the
1167   * following errors is returned.
1168   *
1169   * Errors:
1170   * EADDRINUSE - SCIF_MAP_FIXED is set in map_flags and pages in the new window
1171   * would intersect an existing window
1172   * EAGAIN - The mapping could not be performed due to lack of resources
1173   * ECONNRESET - Connection reset by peer
1174   * EINVAL - map_flags is invalid, or SCIF_MAP_FIXED is set in map_flags, and
1175   * offset is not a multiple of the page size, or offset is negative
1176   * ENODEV - The remote node is lost or existed, but is not currently in the
1177   * network since it may have crashed
1178   * ENOMEM - Not enough space
1179   * ENOTCONN - The endpoint is not connected
1180   */
1181  off_t scif_register_pinned_pages(scif_epd_t epd,
1182  				 scif_pinned_pages_t pinned_pages,
1183  				 off_t offset, int map_flags);
1184  
1185  /**
1186   * scif_get_pages() - Add references to remote registered pages
1187   * @epd:	endpoint descriptor
1188   * @offset:	remote registered offset
1189   * @len:	length of range of pages
1190   * @pages:	returned scif_range structure
1191   *
1192   * scif_get_pages() returns the addresses of the physical pages represented by
1193   * those pages of the registered address space of the peer of epd, starting at
1194   * offset and continuing for len bytes. offset and len are constrained to be
1195   * multiples of the page size.
1196   *
1197   * All of the pages in the specified range [offset, offset + len - 1] must be
1198   * within a single window of the registered address space of the peer of epd.
1199   *
1200   * The addresses are returned as a virtually contiguous array pointed to by the
1201   * phys_addr component of the scif_range structure whose address is returned in
1202   * pages. The nr_pages component of scif_range is the length of the array. The
1203   * prot_flags component of scif_range holds the protection flag value passed
1204   * when the pages were registered.
1205   *
1206   * Each physical page whose address is returned by scif_get_pages() remains
1207   * available and will not be released for reuse until the scif_range structure
1208   * is returned in a call to scif_put_pages(). The scif_range structure returned
1209   * by scif_get_pages() must be unmodified.
1210   *
1211   * It is an error to call scif_close() on an endpoint on which a scif_range
1212   * structure of that endpoint has not been returned to scif_put_pages().
1213   *
1214   * Return:
1215   * Upon successful completion, scif_get_pages() returns 0; otherwise the
1216   * negative of one of the following errors is returned.
1217   * Errors:
1218   * ECONNRESET - Connection reset by peer.
1219   * EINVAL - offset is not a multiple of the page size, or offset is negative, or
1220   * len is not a multiple of the page size
1221   * ENODEV - The remote node is lost or existed, but is not currently in the
1222   * network since it may have crashed
1223   * ENOTCONN - The endpoint is not connected
1224   * ENXIO - Offsets in the range [offset, offset + len - 1] are invalid
1225   * for the registered address space of the peer of epd
1226   */
1227  int scif_get_pages(scif_epd_t epd, off_t offset, size_t len,
1228  		   struct scif_range **pages);
1229  
1230  /**
1231   * scif_put_pages() - Remove references from remote registered pages
1232   * @pages:	pages to be returned
1233   *
1234   * scif_put_pages() releases a scif_range structure previously obtained by
1235   * calling scif_get_pages(). The physical pages represented by pages may
1236   * be reused when the window which represented those pages is unregistered.
1237   * Therefore, those pages must not be accessed after calling scif_put_pages().
1238   *
1239   * Return:
1240   * Upon successful completion, scif_put_pages() returns 0; otherwise the
1241   * negative of one of the following errors is returned.
1242   * Errors:
1243   * EINVAL - pages does not point to a valid scif_range structure, or
1244   * the scif_range structure pointed to by pages was already returned
1245   * ENODEV - The remote node is lost or existed, but is not currently in the
1246   * network since it may have crashed
1247   * ENOTCONN - The endpoint is not connected
1248   */
1249  int scif_put_pages(struct scif_range *pages);
1250  
1251  /**
1252   * scif_poll() - Wait for some event on a set of endpoints
1253   * @epds:	Array of scif_pollepd entries, one per endpoint of interest
1254   * @nepds:	Number of entries in the epds array
1255   * @timeout:	Maximum time to block, in milliseconds; negative means no limit
1256   *
1257   * scif_poll() waits for one of a set of endpoints to become ready to perform
1258   * an I/O operation.
1259   *
1260   * The epds argument specifies the endpoint descriptors to be examined and the
1261   * events of interest for each endpoint descriptor. epds is a pointer to an
1262   * array with one member for each open endpoint descriptor of interest.
1263   *
1264   * The number of items in the epds array is specified in nepds. The epd field
1265   * of scif_pollepd is an endpoint descriptor of an open endpoint. The events
1266   * field is a bitmask specifying the events which the application is
1267   * interested in. The revents field is an output parameter, filled by the
1268   * kernel with the events that actually occurred. The bits returned in revents
1269   * can include any of those specified in events, or any of the values POLLERR,
1270   * POLLHUP, or POLLNVAL. (These three bits are meaningless in the events
1271   * field, and will be set in the revents field whenever the corresponding
1272   * condition is true.)
1273   *
1274   * If no requested event (and no error) has occurred on any endpoint
1275   * descriptor, scif_poll() blocks until an event occurs or the timeout expires.
1276   *
1277   * The timeout argument specifies an upper limit on the time for which
1278   * scif_poll() will block, in milliseconds. Specifying a negative value in
1279   * timeout means an infinite timeout.
1280   *
1281   * The following bits may be set in events and returned in revents:
1282   * POLLIN - Data may be received without blocking. For a connected
1283   * endpoint, this means that scif_recv() may be called without blocking. For a
1284   * listening endpoint, this means that scif_accept() may be called without
1285   * blocking.
1286   * POLLOUT - Data may be sent without blocking. For a connected endpoint, this
1287   * means that scif_send() may be called without blocking. POLLOUT may also be
1288   * used to block waiting for a non-blocking connect to complete. This bit value
1289   * has no meaning for a listening endpoint and is ignored if specified.
1290   *
1291   * The following bits are only returned in revents, and are ignored if set in
1292   * events:
1293   * POLLERR - An error occurred on the endpoint
1294   * POLLHUP - The connection to the peer endpoint was disconnected
1295   * POLLNVAL - The specified endpoint descriptor is invalid
1296   *
1297   * Return:
1298   * Upon successful completion, scif_poll() returns a non-negative value. A
1299   * positive value indicates the total number of endpoint descriptors that have
1300   * been selected (that is, endpoint descriptors for which the revents member is
1301   * non-zero). A value of 0 indicates that the call timed out and no endpoint
1302   * descriptors have been selected. Otherwise in user mode -1 is returned and
1303   * errno is set to indicate the error; in kernel mode the negative of one of
1304   * the following errors is returned.
1305   *
1306   * Errors:
1307   * EINTR - A signal occurred before any requested event
1308   * EINVAL - The nepds argument is greater than {OPEN_MAX}
1309   * ENOMEM - There was no space to allocate file descriptor tables
1310   */
1311  int scif_poll(struct scif_pollepd *epds, unsigned int nepds, long timeout);
1312  
1313  /**
1314   * scif_client_register() - Register a SCIF client
1315   * @client:	Client to be registered
1316   *
1317   * scif_client_register() registers a SCIF client. The client's probe()
1318   * method is called when SCIF peer devices come online, and its remove()
1319   * method is called when those peer devices disappear.
1320   *
1321   * Return:
1322   * Upon successful completion, scif_client_register() returns a non-negative
1323   * value. Otherwise it returns the same value that the kernel's
1324   * subsys_interface_register() returned.
1325   */
1326  int scif_client_register(struct scif_client *client);
1327  
1328  /**
1329   * scif_client_unregister() - Unregister a SCIF client
1330   * @client:	Client to be unregistered
1331   *
1332   * scif_client_unregister() unregisters the given SCIF client.
1333   *
1334   * Return:
1335   * None (the function has a void return type)
1336   */
1337  void scif_client_unregister(struct scif_client *client);
1338  
1339  #endif /* __SCIF_H__ */
1340