1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (C) 2017, Microsoft Corporation.
4 *
5 * Author(s): Long Li <longli@microsoft.com>
6 */
7 #include <linux/module.h>
8 #include <linux/highmem.h>
9 #include <linux/folio_queue.h>
10 #include "../common/smbdirect/smbdirect_pdu.h"
11 #include "smbdirect.h"
12 #include "cifs_debug.h"
13 #include "cifsproto.h"
14 #include "smb2proto.h"
15
16 static struct smbd_response *get_receive_buffer(
17 struct smbd_connection *info);
18 static void put_receive_buffer(
19 struct smbd_connection *info,
20 struct smbd_response *response);
21 static int allocate_receive_buffers(struct smbd_connection *info, int num_buf);
22 static void destroy_receive_buffers(struct smbd_connection *info);
23
24 static void enqueue_reassembly(
25 struct smbd_connection *info,
26 struct smbd_response *response, int data_length);
27 static struct smbd_response *_get_first_reassembly(
28 struct smbd_connection *info);
29
30 static int smbd_post_recv(
31 struct smbd_connection *info,
32 struct smbd_response *response);
33
34 static int smbd_post_send_empty(struct smbd_connection *info);
35
36 static void destroy_mr_list(struct smbd_connection *info);
37 static int allocate_mr_list(struct smbd_connection *info);
38
39 struct smb_extract_to_rdma {
40 struct ib_sge *sge;
41 unsigned int nr_sge;
42 unsigned int max_sge;
43 struct ib_device *device;
44 u32 local_dma_lkey;
45 enum dma_data_direction direction;
46 };
47 static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
48 struct smb_extract_to_rdma *rdma);
49
50 /* Port numbers for SMBD transport */
51 #define SMB_PORT 445
52 #define SMBD_PORT 5445
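
/*
 * smbd_get_connection() tries SMBD_PORT (5445) first and falls back to the
 * standard SMB_PORT (445) if that attempt fails.
 */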
53
54 /* Address lookup and resolve timeout in ms */
55 #define RDMA_RESOLVE_TIMEOUT 5000
56
57 /* SMBD negotiation timeout in seconds */
58 #define SMBD_NEGOTIATE_TIMEOUT 120
59
/* SMBD minimum receive size and fragmented size as defined in [MS-SMBD] */
61 #define SMBD_MIN_RECEIVE_SIZE 128
62 #define SMBD_MIN_FRAGMENTED_SIZE 131072
63
64 /*
65 * Default maximum number of RDMA read/write outstanding on this connection
 * This value may be lowered during QP creation to fit the hardware limit
67 */
68 #define SMBD_CM_RESPONDER_RESOURCES 32
69
70 /* Maximum number of retries on data transfer operations */
71 #define SMBD_CM_RETRY 6
72 /* No need to retry on Receiver Not Ready since SMBD manages credits */
73 #define SMBD_CM_RNR_RETRY 0
74
75 /*
76 * User configurable initial values per SMBD transport connection
77 * as defined in [MS-SMBD] 3.1.1.1
 * These may change after SMBD negotiation
79 */
80 /* The local peer's maximum number of credits to grant to the peer */
81 int smbd_receive_credit_max = 255;
82
/* The number of send credits to request from the remote peer */
84 int smbd_send_credit_target = 255;
85
/* The maximum single-message size that can be sent to the remote peer */
87 int smbd_max_send_size = 1364;
88
89 /* The maximum fragmented upper-layer payload receive size supported */
90 int smbd_max_fragmented_recv_size = 1024 * 1024;
91
92 /* The maximum single-message size which can be received */
93 int smbd_max_receive_size = 1364;
94
95 /* The timeout to initiate send of a keepalive message on idle */
96 int smbd_keep_alive_interval = 120;
97
98 /*
99 * User configurable initial values for RDMA transport
100 * The actual values used may be lower and are limited to hardware capabilities
101 */
102 /* Default maximum number of pages in a single RDMA write/read */
103 int smbd_max_frmr_depth = 2048;
104
/* If the payload is smaller than this many bytes, use RDMA send/recv instead of read/write */
106 int rdma_readwrite_threshold = 4096;
107
108 /* Transport logging functions
 * Logging is organized into classes. They can be OR'ed to select what is
 * logged via the module parameter smbd_logging_class,
111 * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
112 * log_rdma_event()
113 */
114 #define LOG_OUTGOING 0x1
115 #define LOG_INCOMING 0x2
116 #define LOG_READ 0x4
117 #define LOG_WRITE 0x8
118 #define LOG_RDMA_SEND 0x10
119 #define LOG_RDMA_RECV 0x20
120 #define LOG_KEEP_ALIVE 0x40
121 #define LOG_RDMA_EVENT 0x80
122 #define LOG_RDMA_MR 0x100
123 static unsigned int smbd_logging_class;
124 module_param(smbd_logging_class, uint, 0644);
125 MODULE_PARM_DESC(smbd_logging_class,
126 "Logging class for SMBD transport 0x0 to 0x100");
127
128 #define ERR 0x0
129 #define INFO 0x1
130 static unsigned int smbd_logging_level = ERR;
131 module_param(smbd_logging_level, uint, 0644);
132 MODULE_PARM_DESC(smbd_logging_level,
133 "Logging level for SMBD transport, 0 (default): error, 1: info");
134
135 #define log_rdma(level, class, fmt, args...) \
136 do { \
137 if (level <= smbd_logging_level || class & smbd_logging_class) \
138 cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
139 } while (0)
140
141 #define log_outgoing(level, fmt, args...) \
142 log_rdma(level, LOG_OUTGOING, fmt, ##args)
143 #define log_incoming(level, fmt, args...) \
144 log_rdma(level, LOG_INCOMING, fmt, ##args)
145 #define log_read(level, fmt, args...) log_rdma(level, LOG_READ, fmt, ##args)
146 #define log_write(level, fmt, args...) log_rdma(level, LOG_WRITE, fmt, ##args)
147 #define log_rdma_send(level, fmt, args...) \
148 log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
149 #define log_rdma_recv(level, fmt, args...) \
150 log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
151 #define log_keep_alive(level, fmt, args...) \
152 log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
153 #define log_rdma_event(level, fmt, args...) \
154 log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
155 #define log_rdma_mr(level, fmt, args...) \
156 log_rdma(level, LOG_RDMA_MR, fmt, ##args)
157
static void smbd_disconnect_rdma_work(struct work_struct *work)
159 {
160 struct smbd_connection *info =
161 container_of(work, struct smbd_connection, disconnect_work);
162 struct smbdirect_socket *sc = &info->socket;
163
164 if (sc->status == SMBDIRECT_SOCKET_CONNECTED) {
165 sc->status = SMBDIRECT_SOCKET_DISCONNECTING;
166 rdma_disconnect(sc->rdma.cm_id);
167 }
168 }
169
static void smbd_disconnect_rdma_connection(struct smbd_connection *info)
171 {
172 queue_work(info->workqueue, &info->disconnect_work);
173 }
174
175 /* Upcall from RDMA CM */
static int smbd_conn_upcall(
177 struct rdma_cm_id *id, struct rdma_cm_event *event)
178 {
179 struct smbd_connection *info = id->context;
180 struct smbdirect_socket *sc = &info->socket;
181
182 log_rdma_event(INFO, "event=%d status=%d\n",
183 event->event, event->status);
184
185 switch (event->event) {
186 case RDMA_CM_EVENT_ADDR_RESOLVED:
187 case RDMA_CM_EVENT_ROUTE_RESOLVED:
188 info->ri_rc = 0;
189 complete(&info->ri_done);
190 break;
191
192 case RDMA_CM_EVENT_ADDR_ERROR:
193 info->ri_rc = -EHOSTUNREACH;
194 complete(&info->ri_done);
195 break;
196
197 case RDMA_CM_EVENT_ROUTE_ERROR:
198 info->ri_rc = -ENETUNREACH;
199 complete(&info->ri_done);
200 break;
201
202 case RDMA_CM_EVENT_ESTABLISHED:
203 log_rdma_event(INFO, "connected event=%d\n", event->event);
204 sc->status = SMBDIRECT_SOCKET_CONNECTED;
205 wake_up_interruptible(&info->conn_wait);
206 break;
207
208 case RDMA_CM_EVENT_CONNECT_ERROR:
209 case RDMA_CM_EVENT_UNREACHABLE:
210 case RDMA_CM_EVENT_REJECTED:
211 log_rdma_event(INFO, "connecting failed event=%d\n", event->event);
212 sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
213 wake_up_interruptible(&info->conn_wait);
214 break;
215
216 case RDMA_CM_EVENT_DEVICE_REMOVAL:
217 case RDMA_CM_EVENT_DISCONNECTED:
218 /* This happens when we fail the negotiation */
219 if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) {
220 sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
221 wake_up(&info->conn_wait);
222 break;
223 }
224
225 sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
226 wake_up_interruptible(&info->disconn_wait);
227 wake_up_interruptible(&info->wait_reassembly_queue);
228 wake_up_interruptible_all(&info->wait_send_queue);
229 break;
230
231 default:
232 break;
233 }
234
235 return 0;
236 }
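
/*
 * Summary of the event handling above: the address/route resolution events
 * only complete ri_done, which smbd_create_id() waits on; ESTABLISHED and
 * the connect-failure events wake conn_wait so _smbd_get_connection() can
 * check sc->status; DISCONNECTED/DEVICE_REMOVAL wakes all waiters so that
 * blocked senders and readers observe the disconnect.
 */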
237
238 /* Upcall from RDMA QP */
239 static void
smbd_qp_async_error_upcall(struct ib_event *event, void *context)
241 {
242 struct smbd_connection *info = context;
243
244 log_rdma_event(ERR, "%s on device %s info %p\n",
245 ib_event_msg(event->event), event->device->name, info);
246
247 switch (event->event) {
248 case IB_EVENT_CQ_ERR:
249 case IB_EVENT_QP_FATAL:
250 smbd_disconnect_rdma_connection(info);
251 break;
252
253 default:
254 break;
255 }
256 }
257
static inline void *smbd_request_payload(struct smbd_request *request)
259 {
260 return (void *)request->packet;
261 }
262
static inline void *smbd_response_payload(struct smbd_response *response)
264 {
265 return (void *)response->packet;
266 }
267
268 /* Called when a RDMA send is done */
static void send_done(struct ib_cq *cq, struct ib_wc *wc)
270 {
271 int i;
272 struct smbd_request *request =
273 container_of(wc->wr_cqe, struct smbd_request, cqe);
274 struct smbd_connection *info = request->info;
275 struct smbdirect_socket *sc = &info->socket;
276
277 log_rdma_send(INFO, "smbd_request 0x%p completed wc->status=%d\n",
278 request, wc->status);
279
280 for (i = 0; i < request->num_sge; i++)
281 ib_dma_unmap_single(sc->ib.dev,
282 request->sge[i].addr,
283 request->sge[i].length,
284 DMA_TO_DEVICE);
285
286 if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
287 log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n",
288 wc->status, wc->opcode);
289 mempool_free(request, info->request_mempool);
290 smbd_disconnect_rdma_connection(info);
291 return;
292 }
293
294 if (atomic_dec_and_test(&request->info->send_pending))
295 wake_up(&request->info->wait_send_pending);
296
297 wake_up(&request->info->wait_post_send);
298
299 mempool_free(request, request->info->request_mempool);
300 }
301
static void dump_smbdirect_negotiate_resp(struct smbdirect_negotiate_resp *resp)
303 {
304 log_rdma_event(INFO, "resp message min_version %u max_version %u negotiated_version %u credits_requested %u credits_granted %u status %u max_readwrite_size %u preferred_send_size %u max_receive_size %u max_fragmented_size %u\n",
305 resp->min_version, resp->max_version,
306 resp->negotiated_version, resp->credits_requested,
307 resp->credits_granted, resp->status,
308 resp->max_readwrite_size, resp->preferred_send_size,
309 resp->max_receive_size, resp->max_fragmented_size);
310 }
311
312 /*
 * Process a negotiation response message, according to [MS-SMBD] 3.1.5.7
314 * response, packet_length: the negotiation response message
315 * return value: true if negotiation is a success, false if failed
316 */
static bool process_negotiation_response(
318 struct smbd_response *response, int packet_length)
319 {
320 struct smbd_connection *info = response->info;
321 struct smbdirect_socket *sc = &info->socket;
322 struct smbdirect_socket_parameters *sp = &sc->parameters;
323 struct smbdirect_negotiate_resp *packet = smbd_response_payload(response);
324
325 if (packet_length < sizeof(struct smbdirect_negotiate_resp)) {
326 log_rdma_event(ERR,
327 "error: packet_length=%d\n", packet_length);
328 return false;
329 }
330
331 if (le16_to_cpu(packet->negotiated_version) != SMBDIRECT_V1) {
332 log_rdma_event(ERR, "error: negotiated_version=%x\n",
333 le16_to_cpu(packet->negotiated_version));
334 return false;
335 }
336 info->protocol = le16_to_cpu(packet->negotiated_version);
337
338 if (packet->credits_requested == 0) {
339 log_rdma_event(ERR, "error: credits_requested==0\n");
340 return false;
341 }
342 info->receive_credit_target = le16_to_cpu(packet->credits_requested);
343
344 if (packet->credits_granted == 0) {
345 log_rdma_event(ERR, "error: credits_granted==0\n");
346 return false;
347 }
348 atomic_set(&info->send_credits, le16_to_cpu(packet->credits_granted));
349
350 atomic_set(&info->receive_credits, 0);
351
352 if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) {
353 log_rdma_event(ERR, "error: preferred_send_size=%d\n",
354 le32_to_cpu(packet->preferred_send_size));
355 return false;
356 }
357 sp->max_recv_size = le32_to_cpu(packet->preferred_send_size);
358
359 if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) {
360 log_rdma_event(ERR, "error: max_receive_size=%d\n",
361 le32_to_cpu(packet->max_receive_size));
362 return false;
363 }
364 sp->max_send_size = min_t(u32, sp->max_send_size,
365 le32_to_cpu(packet->max_receive_size));
366
367 if (le32_to_cpu(packet->max_fragmented_size) <
368 SMBD_MIN_FRAGMENTED_SIZE) {
369 log_rdma_event(ERR, "error: max_fragmented_size=%d\n",
370 le32_to_cpu(packet->max_fragmented_size));
371 return false;
372 }
373 sp->max_fragmented_send_size =
374 le32_to_cpu(packet->max_fragmented_size);
375 info->rdma_readwrite_threshold =
376 rdma_readwrite_threshold > sp->max_fragmented_send_size ?
377 sp->max_fragmented_send_size :
378 rdma_readwrite_threshold;
379
380
381 sp->max_read_write_size = min_t(u32,
382 le32_to_cpu(packet->max_readwrite_size),
383 info->max_frmr_depth * PAGE_SIZE);
384 info->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE;
385
386 return true;
387 }
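
/*
 * A worked example with the module defaults above: if the peer advertises
 * preferred_send_size=1364, max_receive_size=8192 and max_fragmented_size=1MB,
 * then max_recv_size becomes 1364, max_send_size stays min(1364, 8192) = 1364,
 * max_fragmented_send_size becomes 1MB, and max_read_write_size is the smaller
 * of the peer's max_readwrite_size and max_frmr_depth * PAGE_SIZE.
 * (Illustrative peer values only.)
 */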
388
static void smbd_post_send_credits(struct work_struct *work)
390 {
391 int ret = 0;
392 int rc;
393 struct smbd_response *response;
394 struct smbd_connection *info =
395 container_of(work, struct smbd_connection,
396 post_send_credits_work);
397 struct smbdirect_socket *sc = &info->socket;
398
399 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
400 wake_up(&info->wait_receive_queues);
401 return;
402 }
403
404 if (info->receive_credit_target >
405 atomic_read(&info->receive_credits)) {
406 while (true) {
407 response = get_receive_buffer(info);
408 if (!response)
409 break;
410
411 response->type = SMBD_TRANSFER_DATA;
412 response->first_segment = false;
413 rc = smbd_post_recv(info, response);
414 if (rc) {
415 log_rdma_recv(ERR,
416 "post_recv failed rc=%d\n", rc);
417 put_receive_buffer(info, response);
418 break;
419 }
420
421 ret++;
422 }
423 }
424
425 spin_lock(&info->lock_new_credits_offered);
426 info->new_credits_offered += ret;
427 spin_unlock(&info->lock_new_credits_offered);
428
429 /* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */
430 info->send_immediate = true;
431 if (atomic_read(&info->receive_credits) <
432 info->receive_credit_target - 1) {
433 if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
434 info->send_immediate) {
435 log_keep_alive(INFO, "send an empty message\n");
436 smbd_post_send_empty(info);
437 }
438 }
439 }
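
/*
 * The credits accumulated in new_credits_offered above are not announced by
 * this work item directly; they are picked up by manage_credits_prior_sending()
 * and piggybacked on the next outgoing packet, or carried by the empty message
 * posted here when the peer is running low on receive credits.
 */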
440
441 /* Called from softirq, when recv is done */
static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
443 {
444 struct smbdirect_data_transfer *data_transfer;
445 struct smbd_response *response =
446 container_of(wc->wr_cqe, struct smbd_response, cqe);
447 struct smbd_connection *info = response->info;
448 int data_length = 0;
449
450 log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n",
451 response, response->type, wc->status, wc->opcode,
452 wc->byte_len, wc->pkey_index);
453
454 if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
455 log_rdma_recv(INFO, "wc->status=%d opcode=%d\n",
456 wc->status, wc->opcode);
457 goto error;
458 }
459
460 ib_dma_sync_single_for_cpu(
461 wc->qp->device,
462 response->sge.addr,
463 response->sge.length,
464 DMA_FROM_DEVICE);
465
466 switch (response->type) {
467 /* SMBD negotiation response */
468 case SMBD_NEGOTIATE_RESP:
469 dump_smbdirect_negotiate_resp(smbd_response_payload(response));
470 info->full_packet_received = true;
471 info->negotiate_done =
472 process_negotiation_response(response, wc->byte_len);
473 put_receive_buffer(info, response);
474 complete(&info->negotiate_completion);
475 return;
476
477 /* SMBD data transfer packet */
478 case SMBD_TRANSFER_DATA:
479 data_transfer = smbd_response_payload(response);
480 data_length = le32_to_cpu(data_transfer->data_length);
481
482 if (data_length) {
483 if (info->full_packet_received)
484 response->first_segment = true;
485
486 if (le32_to_cpu(data_transfer->remaining_data_length))
487 info->full_packet_received = false;
488 else
489 info->full_packet_received = true;
490 }
491
492 atomic_dec(&info->receive_credits);
493 info->receive_credit_target =
494 le16_to_cpu(data_transfer->credits_requested);
495 if (le16_to_cpu(data_transfer->credits_granted)) {
496 atomic_add(le16_to_cpu(data_transfer->credits_granted),
497 &info->send_credits);
498 /*
499 * We have new send credits granted from remote peer
500 * If any sender is waiting for credits, unblock it
501 */
502 wake_up_interruptible(&info->wait_send_queue);
503 }
504
505 log_incoming(INFO, "data flags %d data_offset %d data_length %d remaining_data_length %d\n",
506 le16_to_cpu(data_transfer->flags),
507 le32_to_cpu(data_transfer->data_offset),
508 le32_to_cpu(data_transfer->data_length),
509 le32_to_cpu(data_transfer->remaining_data_length));
510
511 /* Send a KEEP_ALIVE response right away if requested */
512 info->keep_alive_requested = KEEP_ALIVE_NONE;
513 if (le16_to_cpu(data_transfer->flags) &
514 SMBDIRECT_FLAG_RESPONSE_REQUESTED) {
515 info->keep_alive_requested = KEEP_ALIVE_PENDING;
516 }
517
518 /*
 * If this is a packet with a data payload, place the data in the
 * reassembly queue and wake up the reading thread
521 */
522 if (data_length) {
523 enqueue_reassembly(info, response, data_length);
524 wake_up_interruptible(&info->wait_reassembly_queue);
525 } else
526 put_receive_buffer(info, response);
527
528 return;
529 }
530
531 /*
532 * This is an internal error!
533 */
534 log_rdma_recv(ERR, "unexpected response type=%d\n", response->type);
535 WARN_ON_ONCE(response->type != SMBD_TRANSFER_DATA);
536 error:
537 put_receive_buffer(info, response);
538 smbd_disconnect_rdma_connection(info);
539 }
540
static struct rdma_cm_id *smbd_create_id(
542 struct smbd_connection *info,
543 struct sockaddr *dstaddr, int port)
544 {
545 struct rdma_cm_id *id;
546 int rc;
547 __be16 *sport;
548
549 id = rdma_create_id(&init_net, smbd_conn_upcall, info,
550 RDMA_PS_TCP, IB_QPT_RC);
551 if (IS_ERR(id)) {
552 rc = PTR_ERR(id);
553 log_rdma_event(ERR, "rdma_create_id() failed %i\n", rc);
554 return id;
555 }
556
557 if (dstaddr->sa_family == AF_INET6)
558 sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
559 else
560 sport = &((struct sockaddr_in *)dstaddr)->sin_port;
561
562 *sport = htons(port);
563
564 init_completion(&info->ri_done);
565 info->ri_rc = -ETIMEDOUT;
566
567 rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr,
568 RDMA_RESOLVE_TIMEOUT);
569 if (rc) {
570 log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc);
571 goto out;
572 }
573 rc = wait_for_completion_interruptible_timeout(
574 &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
575 /* e.g. if interrupted returns -ERESTARTSYS */
576 if (rc < 0) {
577 log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
578 goto out;
579 }
580 rc = info->ri_rc;
581 if (rc) {
582 log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
583 goto out;
584 }
585
586 info->ri_rc = -ETIMEDOUT;
587 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
588 if (rc) {
589 log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc);
590 goto out;
591 }
592 rc = wait_for_completion_interruptible_timeout(
593 &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
594 /* e.g. if interrupted returns -ERESTARTSYS */
595 if (rc < 0) {
596 log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
597 goto out;
598 }
599 rc = info->ri_rc;
600 if (rc) {
601 log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
602 goto out;
603 }
604
605 return id;
606
607 out:
608 rdma_destroy_id(id);
609 return ERR_PTR(rc);
610 }
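
/*
 * Both rdma_resolve_addr() and rdma_resolve_route() above are asynchronous;
 * their results arrive via smbd_conn_upcall(), which records the outcome in
 * info->ri_rc and completes info->ri_done. The same completion is reused for
 * the address and the route phase, each guarded by its own timeout.
 */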
611
612 /*
613 * Test if FRWR (Fast Registration Work Requests) is supported on the device
 * This implementation requires FRWR for RDMA read/write
615 * return value: true if it is supported
616 */
static bool frwr_is_supported(struct ib_device_attr *attrs)
618 {
619 if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
620 return false;
621 if (attrs->max_fast_reg_page_list_len == 0)
622 return false;
623 return true;
624 }
625
static int smbd_ia_open(
627 struct smbd_connection *info,
628 struct sockaddr *dstaddr, int port)
629 {
630 struct smbdirect_socket *sc = &info->socket;
631 int rc;
632
633 sc->rdma.cm_id = smbd_create_id(info, dstaddr, port);
634 if (IS_ERR(sc->rdma.cm_id)) {
635 rc = PTR_ERR(sc->rdma.cm_id);
636 goto out1;
637 }
638 sc->ib.dev = sc->rdma.cm_id->device;
639
640 if (!frwr_is_supported(&sc->ib.dev->attrs)) {
641 log_rdma_event(ERR, "Fast Registration Work Requests (FRWR) is not supported\n");
642 log_rdma_event(ERR, "Device capability flags = %llx max_fast_reg_page_list_len = %u\n",
643 sc->ib.dev->attrs.device_cap_flags,
644 sc->ib.dev->attrs.max_fast_reg_page_list_len);
645 rc = -EPROTONOSUPPORT;
646 goto out2;
647 }
648 info->max_frmr_depth = min_t(int,
649 smbd_max_frmr_depth,
650 sc->ib.dev->attrs.max_fast_reg_page_list_len);
651 info->mr_type = IB_MR_TYPE_MEM_REG;
652 if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
653 info->mr_type = IB_MR_TYPE_SG_GAPS;
654
655 sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
656 if (IS_ERR(sc->ib.pd)) {
657 rc = PTR_ERR(sc->ib.pd);
658 log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
659 goto out2;
660 }
661
662 return 0;
663
664 out2:
665 rdma_destroy_id(sc->rdma.cm_id);
666 sc->rdma.cm_id = NULL;
667
668 out1:
669 return rc;
670 }
671
672 /*
673 * Send a negotiation request message to the peer
674 * The negotiation procedure is in [MS-SMBD] 3.1.5.2 and 3.1.5.3
675 * After negotiation, the transport is connected and ready for
676 * carrying upper layer SMB payload
677 */
static int smbd_post_send_negotiate_req(struct smbd_connection *info)
679 {
680 struct smbdirect_socket *sc = &info->socket;
681 struct smbdirect_socket_parameters *sp = &sc->parameters;
682 struct ib_send_wr send_wr;
683 int rc = -ENOMEM;
684 struct smbd_request *request;
685 struct smbdirect_negotiate_req *packet;
686
687 request = mempool_alloc(info->request_mempool, GFP_KERNEL);
688 if (!request)
689 return rc;
690
691 request->info = info;
692
693 packet = smbd_request_payload(request);
694 packet->min_version = cpu_to_le16(SMBDIRECT_V1);
695 packet->max_version = cpu_to_le16(SMBDIRECT_V1);
696 packet->reserved = 0;
697 packet->credits_requested = cpu_to_le16(sp->send_credit_target);
698 packet->preferred_send_size = cpu_to_le32(sp->max_send_size);
699 packet->max_receive_size = cpu_to_le32(sp->max_recv_size);
700 packet->max_fragmented_size =
701 cpu_to_le32(sp->max_fragmented_recv_size);
702
703 request->num_sge = 1;
704 request->sge[0].addr = ib_dma_map_single(
705 sc->ib.dev, (void *)packet,
706 sizeof(*packet), DMA_TO_DEVICE);
707 if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
708 rc = -EIO;
709 goto dma_mapping_failed;
710 }
711
712 request->sge[0].length = sizeof(*packet);
713 request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
714
715 ib_dma_sync_single_for_device(
716 sc->ib.dev, request->sge[0].addr,
717 request->sge[0].length, DMA_TO_DEVICE);
718
719 request->cqe.done = send_done;
720
721 send_wr.next = NULL;
722 send_wr.wr_cqe = &request->cqe;
723 send_wr.sg_list = request->sge;
724 send_wr.num_sge = request->num_sge;
725 send_wr.opcode = IB_WR_SEND;
726 send_wr.send_flags = IB_SEND_SIGNALED;
727
728 log_rdma_send(INFO, "sge addr=0x%llx length=%u lkey=0x%x\n",
729 request->sge[0].addr,
730 request->sge[0].length, request->sge[0].lkey);
731
732 atomic_inc(&info->send_pending);
733 rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
734 if (!rc)
735 return 0;
736
737 /* if we reach here, post send failed */
738 log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
739 atomic_dec(&info->send_pending);
740 ib_dma_unmap_single(sc->ib.dev, request->sge[0].addr,
741 request->sge[0].length, DMA_TO_DEVICE);
742
743 smbd_disconnect_rdma_connection(info);
744
745 dma_mapping_failed:
746 mempool_free(request, info->request_mempool);
747 return rc;
748 }
749
750 /*
751 * Extend the credits to remote peer
752 * This implements [MS-SMBD] 3.1.5.9
 * The idea is that we should extend credits to the remote peer as quickly
 * as allowed, to maintain data flow. We allocate as many receive
 * buffers as possible, and extend the receive credits to the remote peer.
 * return value: the new credits being granted.
757 */
static int manage_credits_prior_sending(struct smbd_connection *info)
759 {
760 int new_credits;
761
762 spin_lock(&info->lock_new_credits_offered);
763 new_credits = info->new_credits_offered;
764 info->new_credits_offered = 0;
765 spin_unlock(&info->lock_new_credits_offered);
766
767 return new_credits;
768 }
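
/*
 * The value returned here becomes packet->credits_granted in
 * smbd_post_send_iter(); if that send later fails, the caller adds the
 * credits back to new_credits_offered so they can be offered again.
 */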
769
770 /*
771 * Check if we need to send a KEEP_ALIVE message
 * The idle connection timer triggers a KEEP_ALIVE message when it expires.
 * SMBDIRECT_FLAG_RESPONSE_REQUESTED is set in the message flags to have the peer send
774 * back a response.
775 * return value:
776 * 1 if SMBDIRECT_FLAG_RESPONSE_REQUESTED needs to be set
777 * 0: otherwise
778 */
static int manage_keep_alive_before_sending(struct smbd_connection *info)
780 {
781 if (info->keep_alive_requested == KEEP_ALIVE_PENDING) {
782 info->keep_alive_requested = KEEP_ALIVE_SENT;
783 return 1;
784 }
785 return 0;
786 }
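
/*
 * recv_done() sets KEEP_ALIVE_PENDING when a received packet carries
 * SMBDIRECT_FLAG_RESPONSE_REQUESTED; the next outgoing packet then moves the
 * state to KEEP_ALIVE_SENT and sets the flag, and idle_connection_timer()
 * treats any state other than KEEP_ALIVE_NONE at expiry as an error and
 * disconnects.
 */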
787
788 /* Post the send request */
static int smbd_post_send(struct smbd_connection *info,
790 struct smbd_request *request)
791 {
792 struct smbdirect_socket *sc = &info->socket;
793 struct smbdirect_socket_parameters *sp = &sc->parameters;
794 struct ib_send_wr send_wr;
795 int rc, i;
796
797 for (i = 0; i < request->num_sge; i++) {
798 log_rdma_send(INFO,
799 "rdma_request sge[%d] addr=0x%llx length=%u\n",
800 i, request->sge[i].addr, request->sge[i].length);
801 ib_dma_sync_single_for_device(
802 sc->ib.dev,
803 request->sge[i].addr,
804 request->sge[i].length,
805 DMA_TO_DEVICE);
806 }
807
808 request->cqe.done = send_done;
809
810 send_wr.next = NULL;
811 send_wr.wr_cqe = &request->cqe;
812 send_wr.sg_list = request->sge;
813 send_wr.num_sge = request->num_sge;
814 send_wr.opcode = IB_WR_SEND;
815 send_wr.send_flags = IB_SEND_SIGNALED;
816
817 rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
818 if (rc) {
819 log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
820 smbd_disconnect_rdma_connection(info);
821 rc = -EAGAIN;
822 } else
823 /* Reset timer for idle connection after packet is sent */
824 mod_delayed_work(info->workqueue, &info->idle_timer_work,
825 msecs_to_jiffies(sp->keepalive_interval_msec));
826
827 return rc;
828 }
829
static int smbd_post_send_iter(struct smbd_connection *info,
831 struct iov_iter *iter,
832 int *_remaining_data_length)
833 {
834 struct smbdirect_socket *sc = &info->socket;
835 struct smbdirect_socket_parameters *sp = &sc->parameters;
836 int i, rc;
837 int header_length;
838 int data_length;
839 struct smbd_request *request;
840 struct smbdirect_data_transfer *packet;
841 int new_credits = 0;
842
843 wait_credit:
844 /* Wait for send credits. A SMBD packet needs one credit */
845 rc = wait_event_interruptible(info->wait_send_queue,
846 atomic_read(&info->send_credits) > 0 ||
847 sc->status != SMBDIRECT_SOCKET_CONNECTED);
848 if (rc)
849 goto err_wait_credit;
850
851 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
852 log_outgoing(ERR, "disconnected not sending on wait_credit\n");
853 rc = -EAGAIN;
854 goto err_wait_credit;
855 }
856 if (unlikely(atomic_dec_return(&info->send_credits) < 0)) {
857 atomic_inc(&info->send_credits);
858 goto wait_credit;
859 }
860
861 wait_send_queue:
862 wait_event(info->wait_post_send,
863 atomic_read(&info->send_pending) < sp->send_credit_target ||
864 sc->status != SMBDIRECT_SOCKET_CONNECTED);
865
866 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
867 log_outgoing(ERR, "disconnected not sending on wait_send_queue\n");
868 rc = -EAGAIN;
869 goto err_wait_send_queue;
870 }
871
872 if (unlikely(atomic_inc_return(&info->send_pending) >
873 sp->send_credit_target)) {
874 atomic_dec(&info->send_pending);
875 goto wait_send_queue;
876 }
877
878 request = mempool_alloc(info->request_mempool, GFP_KERNEL);
879 if (!request) {
880 rc = -ENOMEM;
881 goto err_alloc;
882 }
883
884 request->info = info;
885 memset(request->sge, 0, sizeof(request->sge));
886
887 /* Fill in the data payload to find out how much data we can add */
888 if (iter) {
889 struct smb_extract_to_rdma extract = {
890 .nr_sge = 1,
891 .max_sge = SMBDIRECT_MAX_SEND_SGE,
892 .sge = request->sge,
893 .device = sc->ib.dev,
894 .local_dma_lkey = sc->ib.pd->local_dma_lkey,
895 .direction = DMA_TO_DEVICE,
896 };
897 size_t payload_len = umin(*_remaining_data_length,
898 sp->max_send_size - sizeof(*packet));
899
900 rc = smb_extract_iter_to_rdma(iter, payload_len,
901 &extract);
902 if (rc < 0)
903 goto err_dma;
904 data_length = rc;
905 request->num_sge = extract.nr_sge;
906 *_remaining_data_length -= data_length;
907 } else {
908 data_length = 0;
909 request->num_sge = 1;
910 }
911
912 /* Fill in the packet header */
913 packet = smbd_request_payload(request);
914 packet->credits_requested = cpu_to_le16(sp->send_credit_target);
915
916 new_credits = manage_credits_prior_sending(info);
917 atomic_add(new_credits, &info->receive_credits);
918 packet->credits_granted = cpu_to_le16(new_credits);
919
920 info->send_immediate = false;
921
922 packet->flags = 0;
923 if (manage_keep_alive_before_sending(info))
924 packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED);
925
926 packet->reserved = 0;
927 if (!data_length)
928 packet->data_offset = 0;
929 else
930 packet->data_offset = cpu_to_le32(24);
931 packet->data_length = cpu_to_le32(data_length);
932 packet->remaining_data_length = cpu_to_le32(*_remaining_data_length);
933 packet->padding = 0;
934
935 log_outgoing(INFO, "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n",
936 le16_to_cpu(packet->credits_requested),
937 le16_to_cpu(packet->credits_granted),
938 le32_to_cpu(packet->data_offset),
939 le32_to_cpu(packet->data_length),
940 le32_to_cpu(packet->remaining_data_length));
941
942 /* Map the packet to DMA */
943 header_length = sizeof(struct smbdirect_data_transfer);
944 /* If this is a packet without payload, don't send padding */
945 if (!data_length)
946 header_length = offsetof(struct smbdirect_data_transfer, padding);
947
948 request->sge[0].addr = ib_dma_map_single(sc->ib.dev,
949 (void *)packet,
950 header_length,
951 DMA_TO_DEVICE);
952 if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
953 rc = -EIO;
954 request->sge[0].addr = 0;
955 goto err_dma;
956 }
957
958 request->sge[0].length = header_length;
959 request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
960
961 rc = smbd_post_send(info, request);
962 if (!rc)
963 return 0;
964
965 err_dma:
966 for (i = 0; i < request->num_sge; i++)
967 if (request->sge[i].addr)
968 ib_dma_unmap_single(sc->ib.dev,
969 request->sge[i].addr,
970 request->sge[i].length,
971 DMA_TO_DEVICE);
972 mempool_free(request, info->request_mempool);
973
974 /* roll back receive credits and credits to be offered */
975 spin_lock(&info->lock_new_credits_offered);
976 info->new_credits_offered += new_credits;
977 spin_unlock(&info->lock_new_credits_offered);
978 atomic_sub(new_credits, &info->receive_credits);
979
980 err_alloc:
981 if (atomic_dec_and_test(&info->send_pending))
982 wake_up(&info->wait_send_pending);
983
984 err_wait_send_queue:
985 /* roll back send credits and pending */
986 atomic_inc(&info->send_credits);
987
988 err_wait_credit:
989 return rc;
990 }
991
992 /*
993 * Send an empty message
 * An empty message is used to extend credits to the peer and to keep the
 * connection alive while there is no upper-layer payload to send at the time
996 */
static int smbd_post_send_empty(struct smbd_connection *info)
998 {
999 int remaining_data_length = 0;
1000
1001 info->count_send_empty++;
1002 return smbd_post_send_iter(info, NULL, &remaining_data_length);
1003 }
1004
static int smbd_post_send_full_iter(struct smbd_connection *info,
1006 struct iov_iter *iter,
1007 int *_remaining_data_length)
1008 {
1009 int rc = 0;
1010
1011 /*
1012 * smbd_post_send_iter() respects the
1013 * negotiated max_send_size, so we need to
1014 * loop until the full iter is posted
1015 */
1016
1017 while (iov_iter_count(iter) > 0) {
1018 rc = smbd_post_send_iter(info, iter, _remaining_data_length);
1019 if (rc < 0)
1020 break;
1021 }
1022
1023 return rc;
1024 }
1025
1026 /*
1027 * Post a receive request to the transport
1028 * The remote peer can only send data when a receive request is posted
 * The interaction is controlled by the send/receive credit system
1030 */
static int smbd_post_recv(
1032 struct smbd_connection *info, struct smbd_response *response)
1033 {
1034 struct smbdirect_socket *sc = &info->socket;
1035 struct smbdirect_socket_parameters *sp = &sc->parameters;
1036 struct ib_recv_wr recv_wr;
1037 int rc = -EIO;
1038
1039 response->sge.addr = ib_dma_map_single(
1040 sc->ib.dev, response->packet,
1041 sp->max_recv_size, DMA_FROM_DEVICE);
1042 if (ib_dma_mapping_error(sc->ib.dev, response->sge.addr))
1043 return rc;
1044
1045 response->sge.length = sp->max_recv_size;
1046 response->sge.lkey = sc->ib.pd->local_dma_lkey;
1047
1048 response->cqe.done = recv_done;
1049
1050 recv_wr.wr_cqe = &response->cqe;
1051 recv_wr.next = NULL;
1052 recv_wr.sg_list = &response->sge;
1053 recv_wr.num_sge = 1;
1054
1055 rc = ib_post_recv(sc->ib.qp, &recv_wr, NULL);
1056 if (rc) {
1057 ib_dma_unmap_single(sc->ib.dev, response->sge.addr,
1058 response->sge.length, DMA_FROM_DEVICE);
1059 response->sge.length = 0;
1060 smbd_disconnect_rdma_connection(info);
1061 log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
1062 }
1063
1064 return rc;
1065 }
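
/*
 * response->sge.length doubles as a "DMA mapped" flag: it is set after a
 * successful mapping, cleared on the error path above, and checked by
 * put_receive_buffer() to decide whether the buffer still needs unmapping.
 */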
1066
1067 /* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */
static int smbd_negotiate(struct smbd_connection *info)
1069 {
1070 int rc;
1071 struct smbd_response *response = get_receive_buffer(info);
1072
1073 response->type = SMBD_NEGOTIATE_RESP;
1074 rc = smbd_post_recv(info, response);
1075 log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n",
1076 rc, response->sge.addr,
1077 response->sge.length, response->sge.lkey);
1078 if (rc) {
1079 put_receive_buffer(info, response);
1080 return rc;
1081 }
1082
1083 init_completion(&info->negotiate_completion);
1084 info->negotiate_done = false;
1085 rc = smbd_post_send_negotiate_req(info);
1086 if (rc)
1087 return rc;
1088
1089 rc = wait_for_completion_interruptible_timeout(
1090 &info->negotiate_completion, SMBD_NEGOTIATE_TIMEOUT * HZ);
1091 log_rdma_event(INFO, "wait_for_completion_timeout rc=%d\n", rc);
1092
1093 if (info->negotiate_done)
1094 return 0;
1095
1096 if (rc == 0)
1097 rc = -ETIMEDOUT;
1098 else if (rc == -ERESTARTSYS)
1099 rc = -EINTR;
1100 else
1101 rc = -ENOTCONN;
1102
1103 return rc;
1104 }
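
/*
 * Error mapping for the wait above: a return of 0 means the 120 second
 * negotiation timeout expired (-ETIMEDOUT), -ERESTARTSYS means the wait was
 * interrupted (-EINTR), and a completed wait without negotiate_done set means
 * the peer's response was rejected by process_negotiation_response()
 * (-ENOTCONN).
 */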
1105
1106 /*
1107 * Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1
 * This is a queue for reassembling upper layer payload and presenting it to
 * the upper layer. All incoming payload goes to the reassembly queue, whether
 * or not reassembly is required. The upper layer code reads from the queue for all
1111 * incoming payloads.
1112 * Put a received packet to the reassembly queue
1113 * response: the packet received
1114 * data_length: the size of payload in this packet
1115 */
static void enqueue_reassembly(
1117 struct smbd_connection *info,
1118 struct smbd_response *response,
1119 int data_length)
1120 {
1121 spin_lock(&info->reassembly_queue_lock);
1122 list_add_tail(&response->list, &info->reassembly_queue);
1123 info->reassembly_queue_length++;
1124 /*
1125 * Make sure reassembly_data_length is updated after list and
1126 * reassembly_queue_length are updated. On the dequeue side
1127 * reassembly_data_length is checked without a lock to determine
 * if reassembly_queue_length and list are up to date
1129 */
1130 virt_wmb();
1131 info->reassembly_data_length += data_length;
1132 spin_unlock(&info->reassembly_queue_lock);
1133 info->count_reassembly_queue++;
1134 info->count_enqueue_reassembly_queue++;
1135 }
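
/*
 * The virt_wmb() above pairs with the virt_rmb() in smbd_recv(): once the
 * reader observes the larger reassembly_data_length it is guaranteed to also
 * see the updated list and reassembly_queue_length.
 */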
1136
1137 /*
1138 * Get the first entry at the front of reassembly queue
1139 * Caller is responsible for locking
1140 * return value: the first entry if any, NULL if queue is empty
1141 */
static struct smbd_response *_get_first_reassembly(struct smbd_connection *info)
1143 {
1144 struct smbd_response *ret = NULL;
1145
1146 if (!list_empty(&info->reassembly_queue)) {
1147 ret = list_first_entry(
1148 &info->reassembly_queue,
1149 struct smbd_response, list);
1150 }
1151 return ret;
1152 }
1153
1154 /*
1155 * Get a receive buffer
1156 * For each remote send, we need to post a receive. The receive buffers are
 * pre-allocated when the transport is established.
1158 * return value: the receive buffer, NULL if none is available
1159 */
static struct smbd_response *get_receive_buffer(struct smbd_connection *info)
1161 {
1162 struct smbd_response *ret = NULL;
1163 unsigned long flags;
1164
1165 spin_lock_irqsave(&info->receive_queue_lock, flags);
1166 if (!list_empty(&info->receive_queue)) {
1167 ret = list_first_entry(
1168 &info->receive_queue,
1169 struct smbd_response, list);
1170 list_del(&ret->list);
1171 info->count_receive_queue--;
1172 info->count_get_receive_buffer++;
1173 }
1174 spin_unlock_irqrestore(&info->receive_queue_lock, flags);
1175
1176 return ret;
1177 }
1178
1179 /*
1180 * Return a receive buffer
 * When a receive buffer is returned, we can post a new receive and extend
1182 * more receive credits to remote peer. This is done immediately after a
1183 * receive buffer is returned.
1184 */
static void put_receive_buffer(
1186 struct smbd_connection *info, struct smbd_response *response)
1187 {
1188 struct smbdirect_socket *sc = &info->socket;
1189 unsigned long flags;
1190
1191 if (likely(response->sge.length != 0)) {
1192 ib_dma_unmap_single(sc->ib.dev,
1193 response->sge.addr,
1194 response->sge.length,
1195 DMA_FROM_DEVICE);
1196 response->sge.length = 0;
1197 }
1198
1199 spin_lock_irqsave(&info->receive_queue_lock, flags);
1200 list_add_tail(&response->list, &info->receive_queue);
1201 info->count_receive_queue++;
1202 info->count_put_receive_buffer++;
1203 spin_unlock_irqrestore(&info->receive_queue_lock, flags);
1204
1205 queue_work(info->workqueue, &info->post_send_credits_work);
1206 }
1207
1208 /* Preallocate all receive buffer on transport establishment */
static int allocate_receive_buffers(struct smbd_connection *info, int num_buf)
1210 {
1211 int i;
1212 struct smbd_response *response;
1213
1214 INIT_LIST_HEAD(&info->reassembly_queue);
1215 spin_lock_init(&info->reassembly_queue_lock);
1216 info->reassembly_data_length = 0;
1217 info->reassembly_queue_length = 0;
1218
1219 INIT_LIST_HEAD(&info->receive_queue);
1220 spin_lock_init(&info->receive_queue_lock);
1221 info->count_receive_queue = 0;
1222
1223 init_waitqueue_head(&info->wait_receive_queues);
1224
1225 for (i = 0; i < num_buf; i++) {
1226 response = mempool_alloc(info->response_mempool, GFP_KERNEL);
1227 if (!response)
1228 goto allocate_failed;
1229
1230 response->info = info;
1231 response->sge.length = 0;
1232 list_add_tail(&response->list, &info->receive_queue);
1233 info->count_receive_queue++;
1234 }
1235
1236 return 0;
1237
1238 allocate_failed:
1239 while (!list_empty(&info->receive_queue)) {
1240 response = list_first_entry(
1241 &info->receive_queue,
1242 struct smbd_response, list);
1243 list_del(&response->list);
1244 info->count_receive_queue--;
1245
1246 mempool_free(response, info->response_mempool);
1247 }
1248 return -ENOMEM;
1249 }
1250
static void destroy_receive_buffers(struct smbd_connection *info)
1252 {
1253 struct smbd_response *response;
1254
1255 while ((response = get_receive_buffer(info)))
1256 mempool_free(response, info->response_mempool);
1257 }
1258
1259 /* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
static void idle_connection_timer(struct work_struct *work)
1261 {
1262 struct smbd_connection *info = container_of(
1263 work, struct smbd_connection,
1264 idle_timer_work.work);
1265 struct smbdirect_socket *sc = &info->socket;
1266 struct smbdirect_socket_parameters *sp = &sc->parameters;
1267
1268 if (info->keep_alive_requested != KEEP_ALIVE_NONE) {
1269 log_keep_alive(ERR,
1270 "error status info->keep_alive_requested=%d\n",
1271 info->keep_alive_requested);
1272 smbd_disconnect_rdma_connection(info);
1273 return;
1274 }
1275
1276 log_keep_alive(INFO, "about to send an empty idle message\n");
1277 smbd_post_send_empty(info);
1278
1279 /* Setup the next idle timeout work */
1280 queue_delayed_work(info->workqueue, &info->idle_timer_work,
1281 msecs_to_jiffies(sp->keepalive_interval_msec));
1282 }
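
/*
 * The idle timer is re-armed after every successful smbd_post_send(), so this
 * work only runs after keepalive_interval_msec of outbound silence; the empty
 * message it sends also carries any receive credits accumulated in
 * new_credits_offered.
 */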
1283
1284 /*
1285 * Destroy the transport and related RDMA and memory resources
 * Need to go through all the pending counters and make sure no one is using
 * the transport while it is being destroyed
1288 */
void smbd_destroy(struct TCP_Server_Info *server)
1290 {
1291 struct smbd_connection *info = server->smbd_conn;
1292 struct smbdirect_socket *sc;
1293 struct smbdirect_socket_parameters *sp;
1294 struct smbd_response *response;
1295 unsigned long flags;
1296
1297 if (!info) {
1298 log_rdma_event(INFO, "rdma session already destroyed\n");
1299 return;
1300 }
1301 sc = &info->socket;
1302 sp = &sc->parameters;
1303
1304 log_rdma_event(INFO, "destroying rdma session\n");
1305 if (sc->status != SMBDIRECT_SOCKET_DISCONNECTED) {
1306 rdma_disconnect(sc->rdma.cm_id);
1307 log_rdma_event(INFO, "wait for transport being disconnected\n");
1308 wait_event_interruptible(
1309 info->disconn_wait,
1310 sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
1311 }
1312
1313 log_rdma_event(INFO, "cancelling post_send_credits_work\n");
1314 disable_work_sync(&info->post_send_credits_work);
1315
1316 log_rdma_event(INFO, "destroying qp\n");
1317 ib_drain_qp(sc->ib.qp);
1318 rdma_destroy_qp(sc->rdma.cm_id);
1319 sc->ib.qp = NULL;
1320
1321 log_rdma_event(INFO, "cancelling idle timer\n");
1322 cancel_delayed_work_sync(&info->idle_timer_work);
1323
1324 /* It's not possible for upper layer to get to reassembly */
1325 log_rdma_event(INFO, "drain the reassembly queue\n");
1326 do {
1327 spin_lock_irqsave(&info->reassembly_queue_lock, flags);
1328 response = _get_first_reassembly(info);
1329 if (response) {
1330 list_del(&response->list);
1331 spin_unlock_irqrestore(
1332 &info->reassembly_queue_lock, flags);
1333 put_receive_buffer(info, response);
1334 } else
1335 spin_unlock_irqrestore(
1336 &info->reassembly_queue_lock, flags);
1337 } while (response);
1338 info->reassembly_data_length = 0;
1339
1340 log_rdma_event(INFO, "free receive buffers\n");
1341 wait_event(info->wait_receive_queues,
1342 info->count_receive_queue == sp->recv_credit_max);
1343 destroy_receive_buffers(info);
1344
1345 /*
1346 * For performance reasons, memory registration and deregistration
1347 * are not locked by srv_mutex. It is possible some processes are
1348 * blocked on transport srv_mutex while holding memory registration.
1349 * Release the transport srv_mutex to allow them to hit the failure
1350 * path when sending data, and then release memory registrations.
1351 */
1352 log_rdma_event(INFO, "freeing mr list\n");
1353 wake_up_interruptible_all(&info->wait_mr);
1354 while (atomic_read(&info->mr_used_count)) {
1355 cifs_server_unlock(server);
1356 msleep(1000);
1357 cifs_server_lock(server);
1358 }
1359 destroy_mr_list(info);
1360
1361 ib_free_cq(sc->ib.send_cq);
1362 ib_free_cq(sc->ib.recv_cq);
1363 ib_dealloc_pd(sc->ib.pd);
1364 rdma_destroy_id(sc->rdma.cm_id);
1365
1366 /* free mempools */
1367 mempool_destroy(info->request_mempool);
1368 kmem_cache_destroy(info->request_cache);
1369
1370 mempool_destroy(info->response_mempool);
1371 kmem_cache_destroy(info->response_cache);
1372
1373 sc->status = SMBDIRECT_SOCKET_DESTROYED;
1374
1375 destroy_workqueue(info->workqueue);
1376 log_rdma_event(INFO, "rdma session destroyed\n");
1377 kfree(info);
1378 server->smbd_conn = NULL;
1379 }
1380
1381 /*
1382 * Reconnect this SMBD connection, called from upper layer
1383 * return value: 0 on success, or actual error code
1384 */
int smbd_reconnect(struct TCP_Server_Info *server)
1386 {
1387 log_rdma_event(INFO, "reconnecting rdma session\n");
1388
1389 if (!server->smbd_conn) {
1390 log_rdma_event(INFO, "rdma session already destroyed\n");
1391 goto create_conn;
1392 }
1393
1394 /*
 * This is possible if the transport is disconnected and we haven't received
 * the notification from RDMA, but the upper layer has detected a timeout
1397 */
1398 if (server->smbd_conn->socket.status == SMBDIRECT_SOCKET_CONNECTED) {
1399 log_rdma_event(INFO, "disconnecting transport\n");
1400 smbd_destroy(server);
1401 }
1402
1403 create_conn:
1404 log_rdma_event(INFO, "creating rdma session\n");
1405 server->smbd_conn = smbd_get_connection(
1406 server, (struct sockaddr *) &server->dstaddr);
1407
1408 if (server->smbd_conn) {
1409 cifs_dbg(VFS, "RDMA transport re-established\n");
1410 trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr);
1411 return 0;
1412 }
1413 trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr);
1414 return -ENOENT;
1415 }
1416
static void destroy_caches_and_workqueue(struct smbd_connection *info)
1418 {
1419 destroy_receive_buffers(info);
1420 destroy_workqueue(info->workqueue);
1421 mempool_destroy(info->response_mempool);
1422 kmem_cache_destroy(info->response_cache);
1423 mempool_destroy(info->request_mempool);
1424 kmem_cache_destroy(info->request_cache);
1425 }
1426
1427 #define MAX_NAME_LEN 80
static int allocate_caches_and_workqueue(struct smbd_connection *info)
1429 {
1430 struct smbdirect_socket *sc = &info->socket;
1431 struct smbdirect_socket_parameters *sp = &sc->parameters;
1432 char name[MAX_NAME_LEN];
1433 int rc;
1434
1435 if (WARN_ON_ONCE(sp->max_recv_size < sizeof(struct smbdirect_data_transfer)))
1436 return -ENOMEM;
1437
1438 scnprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
1439 info->request_cache =
1440 kmem_cache_create(
1441 name,
1442 sizeof(struct smbd_request) +
1443 sizeof(struct smbdirect_data_transfer),
1444 0, SLAB_HWCACHE_ALIGN, NULL);
1445 if (!info->request_cache)
1446 return -ENOMEM;
1447
1448 info->request_mempool =
1449 mempool_create(sp->send_credit_target, mempool_alloc_slab,
1450 mempool_free_slab, info->request_cache);
1451 if (!info->request_mempool)
1452 goto out1;
1453
1454 scnprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
1455
1456 struct kmem_cache_args response_args = {
1457 .align = __alignof__(struct smbd_response),
1458 .useroffset = (offsetof(struct smbd_response, packet) +
1459 sizeof(struct smbdirect_data_transfer)),
1460 .usersize = sp->max_recv_size - sizeof(struct smbdirect_data_transfer),
1461 };
1462 info->response_cache =
1463 kmem_cache_create(name,
1464 sizeof(struct smbd_response) + sp->max_recv_size,
1465 &response_args, SLAB_HWCACHE_ALIGN);
1466 if (!info->response_cache)
1467 goto out2;
1468
1469 info->response_mempool =
1470 mempool_create(sp->recv_credit_max, mempool_alloc_slab,
1471 mempool_free_slab, info->response_cache);
1472 if (!info->response_mempool)
1473 goto out3;
1474
1475 scnprintf(name, MAX_NAME_LEN, "smbd_%p", info);
1476 info->workqueue = create_workqueue(name);
1477 if (!info->workqueue)
1478 goto out4;
1479
1480 rc = allocate_receive_buffers(info, sp->recv_credit_max);
1481 if (rc) {
1482 log_rdma_event(ERR, "failed to allocate receive buffers\n");
1483 goto out5;
1484 }
1485
1486 return 0;
1487
1488 out5:
1489 destroy_workqueue(info->workqueue);
1490 out4:
1491 mempool_destroy(info->response_mempool);
1492 out3:
1493 kmem_cache_destroy(info->response_cache);
1494 out2:
1495 mempool_destroy(info->request_mempool);
1496 out1:
1497 kmem_cache_destroy(info->request_cache);
1498 return -ENOMEM;
1499 }
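
/*
 * Note the asymmetry above: request slabs only need room for the SMBD header
 * (payload is attached through separate SGEs), while response slabs reserve a
 * full max_recv_size because received data lands directly in the packet
 * buffer posted to the receive queue.
 */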
1500
1501 /* Create a SMBD connection, called by upper layer */
static struct smbd_connection *_smbd_get_connection(
1503 struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port)
1504 {
1505 int rc;
1506 struct smbd_connection *info;
1507 struct smbdirect_socket *sc;
1508 struct smbdirect_socket_parameters *sp;
1509 struct rdma_conn_param conn_param;
1510 struct ib_qp_init_attr qp_attr;
1511 struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
1512 struct ib_port_immutable port_immutable;
1513 u32 ird_ord_hdr[2];
1514
1515 info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
1516 if (!info)
1517 return NULL;
1518 sc = &info->socket;
1519 sp = &sc->parameters;
1520
1521 sc->status = SMBDIRECT_SOCKET_CONNECTING;
1522 rc = smbd_ia_open(info, dstaddr, port);
1523 if (rc) {
1524 log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc);
1525 goto create_id_failed;
1526 }
1527
1528 if (smbd_send_credit_target > sc->ib.dev->attrs.max_cqe ||
1529 smbd_send_credit_target > sc->ib.dev->attrs.max_qp_wr) {
1530 log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
1531 smbd_send_credit_target,
1532 sc->ib.dev->attrs.max_cqe,
1533 sc->ib.dev->attrs.max_qp_wr);
1534 goto config_failed;
1535 }
1536
1537 if (smbd_receive_credit_max > sc->ib.dev->attrs.max_cqe ||
1538 smbd_receive_credit_max > sc->ib.dev->attrs.max_qp_wr) {
1539 log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
1540 smbd_receive_credit_max,
1541 sc->ib.dev->attrs.max_cqe,
1542 sc->ib.dev->attrs.max_qp_wr);
1543 goto config_failed;
1544 }
1545
1546 sp->recv_credit_max = smbd_receive_credit_max;
1547 sp->send_credit_target = smbd_send_credit_target;
1548 sp->max_send_size = smbd_max_send_size;
1549 sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
1550 sp->max_recv_size = smbd_max_receive_size;
1551 sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000;
1552
1553 if (sc->ib.dev->attrs.max_send_sge < SMBDIRECT_MAX_SEND_SGE ||
1554 sc->ib.dev->attrs.max_recv_sge < SMBDIRECT_MAX_RECV_SGE) {
1555 log_rdma_event(ERR,
1556 "device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
1557 IB_DEVICE_NAME_MAX,
1558 sc->ib.dev->name,
1559 sc->ib.dev->attrs.max_send_sge,
1560 sc->ib.dev->attrs.max_recv_sge);
1561 goto config_failed;
1562 }
1563
1564 sc->ib.send_cq =
1565 ib_alloc_cq_any(sc->ib.dev, info,
1566 sp->send_credit_target, IB_POLL_SOFTIRQ);
1567 if (IS_ERR(sc->ib.send_cq)) {
1568 sc->ib.send_cq = NULL;
1569 goto alloc_cq_failed;
1570 }
1571
1572 sc->ib.recv_cq =
1573 ib_alloc_cq_any(sc->ib.dev, info,
1574 sp->recv_credit_max, IB_POLL_SOFTIRQ);
1575 if (IS_ERR(sc->ib.recv_cq)) {
1576 sc->ib.recv_cq = NULL;
1577 goto alloc_cq_failed;
1578 }
1579
1580 memset(&qp_attr, 0, sizeof(qp_attr));
1581 qp_attr.event_handler = smbd_qp_async_error_upcall;
1582 qp_attr.qp_context = info;
1583 qp_attr.cap.max_send_wr = sp->send_credit_target;
1584 qp_attr.cap.max_recv_wr = sp->recv_credit_max;
1585 qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SEND_SGE;
1586 qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_RECV_SGE;
1587 qp_attr.cap.max_inline_data = 0;
1588 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
1589 qp_attr.qp_type = IB_QPT_RC;
1590 qp_attr.send_cq = sc->ib.send_cq;
1591 qp_attr.recv_cq = sc->ib.recv_cq;
1592 qp_attr.port_num = ~0;
1593
1594 rc = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr);
1595 if (rc) {
1596 log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc);
1597 goto create_qp_failed;
1598 }
1599 sc->ib.qp = sc->rdma.cm_id->qp;
1600
1601 memset(&conn_param, 0, sizeof(conn_param));
1602 conn_param.initiator_depth = 0;
1603
1604 conn_param.responder_resources =
1605 min(sc->ib.dev->attrs.max_qp_rd_atom,
1606 SMBD_CM_RESPONDER_RESOURCES);
1607 info->responder_resources = conn_param.responder_resources;
1608 log_rdma_mr(INFO, "responder_resources=%d\n",
1609 info->responder_resources);
1610
1611 /* Need to send IRD/ORD in private data for iWARP */
1612 sc->ib.dev->ops.get_port_immutable(
1613 sc->ib.dev, sc->rdma.cm_id->port_num, &port_immutable);
1614 if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
1615 ird_ord_hdr[0] = info->responder_resources;
1616 ird_ord_hdr[1] = 1;
1617 conn_param.private_data = ird_ord_hdr;
1618 conn_param.private_data_len = sizeof(ird_ord_hdr);
1619 } else {
1620 conn_param.private_data = NULL;
1621 conn_param.private_data_len = 0;
1622 }
1623
1624 conn_param.retry_count = SMBD_CM_RETRY;
1625 conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY;
1626 conn_param.flow_control = 0;
1627
1628 log_rdma_event(INFO, "connecting to IP %pI4 port %d\n",
1629 &addr_in->sin_addr, port);
1630
1631 init_waitqueue_head(&info->conn_wait);
1632 init_waitqueue_head(&info->disconn_wait);
1633 init_waitqueue_head(&info->wait_reassembly_queue);
1634 rc = rdma_connect(sc->rdma.cm_id, &conn_param);
1635 if (rc) {
1636 log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc);
1637 goto rdma_connect_failed;
1638 }
1639
1640 wait_event_interruptible_timeout(
1641 info->conn_wait,
1642 sc->status != SMBDIRECT_SOCKET_CONNECTING,
1643 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
1644
1645 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
1646 log_rdma_event(ERR, "rdma_connect failed port=%d\n", port);
1647 goto rdma_connect_failed;
1648 }
1649
1650 log_rdma_event(INFO, "rdma_connect connected\n");
1651
1652 rc = allocate_caches_and_workqueue(info);
1653 if (rc) {
1654 log_rdma_event(ERR, "cache allocation failed\n");
1655 goto allocate_cache_failed;
1656 }
1657
1658 init_waitqueue_head(&info->wait_send_queue);
1659 INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer);
1660 queue_delayed_work(info->workqueue, &info->idle_timer_work,
1661 msecs_to_jiffies(sp->keepalive_interval_msec));
1662
1663 init_waitqueue_head(&info->wait_send_pending);
1664 atomic_set(&info->send_pending, 0);
1665
1666 init_waitqueue_head(&info->wait_post_send);
1667
1668 INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work);
1669 INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits);
1670 info->new_credits_offered = 0;
1671 spin_lock_init(&info->lock_new_credits_offered);
1672
1673 rc = smbd_negotiate(info);
1674 if (rc) {
1675 log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc);
1676 goto negotiation_failed;
1677 }
1678
1679 rc = allocate_mr_list(info);
1680 if (rc) {
1681 log_rdma_mr(ERR, "memory registration allocation failed\n");
1682 goto allocate_mr_failed;
1683 }
1684
1685 return info;
1686
1687 allocate_mr_failed:
/* At this point, a full transport shutdown is needed */
1689 server->smbd_conn = info;
1690 smbd_destroy(server);
1691 return NULL;
1692
1693 negotiation_failed:
1694 cancel_delayed_work_sync(&info->idle_timer_work);
1695 destroy_caches_and_workqueue(info);
1696 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
1697 rdma_disconnect(sc->rdma.cm_id);
1698 wait_event(info->conn_wait,
1699 sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
1700
1701 allocate_cache_failed:
1702 rdma_connect_failed:
1703 rdma_destroy_qp(sc->rdma.cm_id);
1704
1705 create_qp_failed:
1706 alloc_cq_failed:
1707 if (sc->ib.send_cq)
1708 ib_free_cq(sc->ib.send_cq);
1709 if (sc->ib.recv_cq)
1710 ib_free_cq(sc->ib.recv_cq);
1711
1712 config_failed:
1713 ib_dealloc_pd(sc->ib.pd);
1714 rdma_destroy_id(sc->rdma.cm_id);
1715
1716 create_id_failed:
1717 kfree(info);
1718 return NULL;
1719 }
1720
1721 struct smbd_connection *smbd_get_connection(
1722 struct TCP_Server_Info *server, struct sockaddr *dstaddr)
1723 {
1724 struct smbd_connection *ret;
1725 int port = SMBD_PORT;
1726
1727 try_again:
1728 ret = _smbd_get_connection(server, dstaddr, port);
1729
1730 /* Try SMB_PORT if SMBD_PORT doesn't work */
1731 if (!ret && port == SMBD_PORT) {
1732 port = SMB_PORT;
1733 goto try_again;
1734 }
1735 return ret;
1736 }
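/*
 * Hypothetical usage sketch (assumed, not copied from the connection
 * code): a caller establishing the SMBDirect transport would do roughly:
 *
 *	server->smbd_conn = smbd_get_connection(
 *		server, (struct sockaddr *)&server->dstaddr);
 *	if (!server->smbd_conn)
 *		rc = -ENOENT;
 */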
1737
1738 /*
1739 * Receive data from the transport's receive reassembly queue
1740 * All incoming data packets are placed in the reassembly queue
1741 * msg: the msghdr describing the buffer to read data into
1742 * size: the length of data to read
1743 * return value: actual data read
1744 *
1745 * Note: this implementation copies the data from the reassembly queue into
1746 * receive buffers used by the upper layer. This is not the optimal code path.
1747 * A better way would be to let the upper layer borrow the buffer from the
1748 * reassembly queue instead of allocating its own, and return it after the
1749 * data is consumed. But that requires more changes to the upper layer code,
1750 * and packet boundaries must be considered while packets are still being reassembled.
1751 */
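/*
 * Hypothetical caller sketch (assumed, not copied from the demultiplex
 * thread): the upper layer typically reads the 4-byte RFC1002 length
 * first and then the body, e.g.:
 *
 *	struct kvec iov = { .iov_base = buf, .iov_len = 4 };
 *	struct msghdr msg = { };
 *
 *	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, 4);
 *	length = smbd_recv(server->smbd_conn, &msg);
 */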
1752 int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
1753 {
1754 struct smbdirect_socket *sc = &info->socket;
1755 struct smbd_response *response;
1756 struct smbdirect_data_transfer *data_transfer;
1757 size_t size = iov_iter_count(&msg->msg_iter);
1758 int to_copy, to_read, data_read, offset;
1759 u32 data_length, remaining_data_length, data_offset;
1760 int rc;
1761
1762 if (WARN_ON_ONCE(iov_iter_rw(&msg->msg_iter) == WRITE))
1763 return -EINVAL; /* It's a bug in upper layer to get there */
1764
1765 again:
1766 /*
1767 * No need to hold the reassembly queue lock all the time as we are
1768 * the only one reading from the front of the queue. The transport
1769 * may add more entries to the back of the queue at the same time
1770 */
1771 log_read(INFO, "size=%zd info->reassembly_data_length=%d\n", size,
1772 info->reassembly_data_length);
1773 if (info->reassembly_data_length >= size) {
1774 int queue_length;
1775 int queue_removed = 0;
1776
1777 /*
1778 * Need to make sure reassembly_data_length is read before
1779 * reading reassembly_queue_length and calling
1780 * _get_first_reassembly. This call is lock free
1781 * as we never read the end of the queue, which is being
1782 * updated in SOFTIRQ context as more data is received
1783 */
1784 virt_rmb();
1785 queue_length = info->reassembly_queue_length;
1786 data_read = 0;
1787 to_read = size;
1788 offset = info->first_entry_offset;
1789 while (data_read < size) {
1790 response = _get_first_reassembly(info);
1791 data_transfer = smbd_response_payload(response);
1792 data_length = le32_to_cpu(data_transfer->data_length);
1793 remaining_data_length =
1794 le32_to_cpu(
1795 data_transfer->remaining_data_length);
1796 data_offset = le32_to_cpu(data_transfer->data_offset);
1797
1798 /*
1799 * The upper layer expects RFC1002 length at the
1800 * beginning of the payload. Return it to indicate
1801 * the total length of the packet. This minimizes the
1802 * change to the upper layer packet processing logic. This
1803 * will eventually be removed when an intermediate
1804 * transport layer is added
1805 */
1806 if (response->first_segment && size == 4) {
1807 unsigned int rfc1002_len =
1808 data_length + remaining_data_length;
1809 __be32 rfc1002_hdr = cpu_to_be32(rfc1002_len);
1810 if (copy_to_iter(&rfc1002_hdr, sizeof(rfc1002_hdr),
1811 &msg->msg_iter) != sizeof(rfc1002_hdr))
1812 return -EFAULT;
1813 data_read = 4;
1814 response->first_segment = false;
1815 log_read(INFO, "returning rfc1002 length %d\n",
1816 rfc1002_len);
1817 goto read_rfc1002_done;
1818 }
1819
1820 to_copy = min_t(int, data_length - offset, to_read);
1821 if (copy_to_iter((char *)data_transfer + data_offset + offset,
1822 to_copy, &msg->msg_iter) != to_copy)
1823 return -EFAULT;
1824
1825 /* move on to the next buffer? */
1826 if (to_copy == data_length - offset) {
1827 queue_length--;
1828 /*
1829 * No need to lock if we are not at the
1830 * end of the queue
1831 */
1832 if (queue_length)
1833 list_del(&response->list);
1834 else {
1835 spin_lock_irq(
1836 &info->reassembly_queue_lock);
1837 list_del(&response->list);
1838 spin_unlock_irq(
1839 &info->reassembly_queue_lock);
1840 }
1841 queue_removed++;
1842 info->count_reassembly_queue--;
1843 info->count_dequeue_reassembly_queue++;
1844 put_receive_buffer(info, response);
1845 offset = 0;
1846 log_read(INFO, "put_receive_buffer offset=0\n");
1847 } else
1848 offset += to_copy;
1849
1850 to_read -= to_copy;
1851 data_read += to_copy;
1852
1853 log_read(INFO, "_get_first_reassembly memcpy %d bytes data_transfer_length-offset=%d after that to_read=%d data_read=%d offset=%d\n",
1854 to_copy, data_length - offset,
1855 to_read, data_read, offset);
1856 }
1857
1858 spin_lock_irq(&info->reassembly_queue_lock);
1859 info->reassembly_data_length -= data_read;
1860 info->reassembly_queue_length -= queue_removed;
1861 spin_unlock_irq(&info->reassembly_queue_lock);
1862
1863 info->first_entry_offset = offset;
1864 log_read(INFO, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n",
1865 data_read, info->reassembly_data_length,
1866 info->first_entry_offset);
1867 read_rfc1002_done:
1868 return data_read;
1869 }
1870
1871 log_read(INFO, "wait_event on more data\n");
1872 rc = wait_event_interruptible(
1873 info->wait_reassembly_queue,
1874 info->reassembly_data_length >= size ||
1875 sc->status != SMBDIRECT_SOCKET_CONNECTED);
1876 /* Don't return any data if interrupted */
1877 if (rc)
1878 return rc;
1879
1880 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
1881 log_read(ERR, "disconnected\n");
1882 return -ECONNABORTED;
1883 }
1884
1885 goto again;
1886 }
1887
1888 /*
1889 * Send data to transport
1890 * Each rqst is transported as an SMBDirect payload
1891 * rqst: the data to write
1892 * return value: 0 on successful write, otherwise error code
1893 */
1894 int smbd_send(struct TCP_Server_Info *server,
1895 int num_rqst, struct smb_rqst *rqst_array)
1896 {
1897 struct smbd_connection *info = server->smbd_conn;
1898 struct smbdirect_socket *sc = &info->socket;
1899 struct smbdirect_socket_parameters *sp = &sc->parameters;
1900 struct smb_rqst *rqst;
1901 struct iov_iter iter;
1902 unsigned int remaining_data_length, klen;
1903 int rc, i, rqst_idx;
1904
1905 if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
1906 return -EAGAIN;
1907
1908 /*
1909 * Add in the page array if there is one. The caller needs to set
1910 * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and
1911 * ends at a page boundary
1912 */
1913 remaining_data_length = 0;
1914 for (i = 0; i < num_rqst; i++)
1915 remaining_data_length += smb_rqst_len(server, &rqst_array[i]);
1916
1917 if (unlikely(remaining_data_length > sp->max_fragmented_send_size)) {
1918 /* The payload must never exceed the negotiated maximum fragmented send size */
1919 log_write(ERR, "payload size %d > max size %d\n",
1920 remaining_data_length, sp->max_fragmented_send_size);
1921 return -EINVAL;
1922 }
1923
1924 log_write(INFO, "num_rqst=%d total length=%u\n",
1925 num_rqst, remaining_data_length);
1926
1927 rqst_idx = 0;
1928 do {
1929 rqst = &rqst_array[rqst_idx];
1930
1931 cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n",
1932 rqst_idx, smb_rqst_len(server, rqst));
1933 for (i = 0; i < rqst->rq_nvec; i++)
1934 dump_smb(rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len);
1935
1936 log_write(INFO, "RDMA-WR[%u] nvec=%d len=%u iter=%zu rqlen=%lu\n",
1937 rqst_idx, rqst->rq_nvec, remaining_data_length,
1938 iov_iter_count(&rqst->rq_iter), smb_rqst_len(server, rqst));
1939
1940 /* Send the metadata pages. */
1941 klen = 0;
1942 for (i = 0; i < rqst->rq_nvec; i++)
1943 klen += rqst->rq_iov[i].iov_len;
1944 iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);
1945
1946 rc = smbd_post_send_full_iter(info, &iter, &remaining_data_length);
1947 if (rc < 0)
1948 break;
1949
1950 if (iov_iter_count(&rqst->rq_iter) > 0) {
1951 /* And then the data pages if there are any */
1952 rc = smbd_post_send_full_iter(info, &rqst->rq_iter,
1953 &remaining_data_length);
1954 if (rc < 0)
1955 break;
1956 }
1957
1958 } while (++rqst_idx < num_rqst);
1959
1960 /*
1961 * As an optimization, we don't wait for individual I/O to finish
1962 * before sending the next one.
1963 * Send them all and wait for the pending send count to reach 0,
1964 * which means all the I/Os have been posted and we are good to return
1965 */
1966
1967 wait_event(info->wait_send_pending,
1968 atomic_read(&info->send_pending) == 0 ||
1969 sc->status != SMBDIRECT_SOCKET_CONNECTED);
1970
1971 if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0)
1972 rc = -EAGAIN;
1973
1974 return rc;
1975 }
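/*
 * Hypothetical dispatch sketch (assumed, not copied from the transport
 * layer): the SMB2 send path would pick this function when the
 * connection uses RDMA, e.g.:
 *
 *	if (server->rdma && server->smbd_conn)
 *		rc = smbd_send(server, num_rqst, rqst_array);
 */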
1976
1977 static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc)
1978 {
1979 struct smbd_mr *mr;
1980 struct ib_cqe *cqe;
1981
1982 if (wc->status) {
1983 log_rdma_mr(ERR, "status=%d\n", wc->status);
1984 cqe = wc->wr_cqe;
1985 mr = container_of(cqe, struct smbd_mr, cqe);
1986 smbd_disconnect_rdma_connection(mr->conn);
1987 }
1988 }
1989
1990 /*
1991 * The work queue function that recovers MRs
1992 * We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used
1993 * again. Both calls are slow, so finish them in a workqueue. This will not
1994 * block the I/O path.
1995 * There is one workqueue that recovers MRs; there is no need to lock as the
1996 * I/O requests calling smbd_register_mr will never update the links in the
1997 * mr_list.
1998 */
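/*
 * Rough MR life cycle implemented by the functions below:
 *
 *	MR_READY ---get_mr()---> MR_REGISTERED ---deregister/invalidate--->
 *	MR_INVALIDATED ---> MR_READY
 *
 * Any failure moves the MR to MR_ERROR, and smbd_mr_recovery_work()
 * reallocates it and returns it to MR_READY.
 */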
1999 static void smbd_mr_recovery_work(struct work_struct *work)
2000 {
2001 struct smbd_connection *info =
2002 container_of(work, struct smbd_connection, mr_recovery_work);
2003 struct smbdirect_socket *sc = &info->socket;
2004 struct smbd_mr *smbdirect_mr;
2005 int rc;
2006
2007 list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
2008 if (smbdirect_mr->state == MR_ERROR) {
2009
2010 /* recover this MR entry */
2011 rc = ib_dereg_mr(smbdirect_mr->mr);
2012 if (rc) {
2013 log_rdma_mr(ERR,
2014 "ib_dereg_mr failed rc=%x\n",
2015 rc);
2016 smbd_disconnect_rdma_connection(info);
2017 continue;
2018 }
2019
2020 smbdirect_mr->mr = ib_alloc_mr(
2021 sc->ib.pd, info->mr_type,
2022 info->max_frmr_depth);
2023 if (IS_ERR(smbdirect_mr->mr)) {
2024 log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
2025 info->mr_type,
2026 info->max_frmr_depth);
2027 smbd_disconnect_rdma_connection(info);
2028 continue;
2029 }
2030 } else
2031 /* This MR is being used, don't recover it */
2032 continue;
2033
2034 smbdirect_mr->state = MR_READY;
2035
2036 /* smbdirect_mr->state is updated by this function
2037 * and is read and updated by I/O issuing CPUs trying
2038 * to get an MR; the call to atomic_inc_return
2039 * implies a memory barrier and guarantees this
2040 * value is updated before waking up any calls to
2041 * get_mr() from the I/O issuing CPUs
2042 */
2043 if (atomic_inc_return(&info->mr_ready_count) == 1)
2044 wake_up_interruptible(&info->wait_mr);
2045 }
2046 }
2047
2048 static void destroy_mr_list(struct smbd_connection *info)
2049 {
2050 struct smbdirect_socket *sc = &info->socket;
2051 struct smbd_mr *mr, *tmp;
2052
2053 cancel_work_sync(&info->mr_recovery_work);
2054 list_for_each_entry_safe(mr, tmp, &info->mr_list, list) {
2055 if (mr->state == MR_INVALIDATED)
2056 ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl,
2057 mr->sgt.nents, mr->dir);
2058 ib_dereg_mr(mr->mr);
2059 kfree(mr->sgt.sgl);
2060 kfree(mr);
2061 }
2062 }
2063
2064 /*
2065 * Allocate MRs used for RDMA read/write
2066 * The number of MRs will not exceed hardware capability in responder_resources
2067 * All MRs are kept in mr_list. The MR can be recovered after it's used
2068 * Recovery is done in smbd_mr_recovery_work. The content of a list entry changes
2069 * as MRs are used and recovered for I/O, but the list links will not change
2070 */
2071 static int allocate_mr_list(struct smbd_connection *info)
2072 {
2073 struct smbdirect_socket *sc = &info->socket;
2074 int i;
2075 struct smbd_mr *smbdirect_mr, *tmp;
2076
2077 INIT_LIST_HEAD(&info->mr_list);
2078 init_waitqueue_head(&info->wait_mr);
2079 spin_lock_init(&info->mr_list_lock);
2080 atomic_set(&info->mr_ready_count, 0);
2081 atomic_set(&info->mr_used_count, 0);
2082 init_waitqueue_head(&info->wait_for_mr_cleanup);
2083 INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work);
2084 /* Allocate more MRs (2x) than hardware responder_resources */
2085 for (i = 0; i < info->responder_resources * 2; i++) {
2086 smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL);
2087 if (!smbdirect_mr)
2088 goto cleanup_entries;
2089 smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, info->mr_type,
2090 info->max_frmr_depth);
2091 if (IS_ERR(smbdirect_mr->mr)) {
2092 log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
2093 info->mr_type, info->max_frmr_depth);
2094 goto out;
2095 }
2096 smbdirect_mr->sgt.sgl = kcalloc(info->max_frmr_depth,
2097 sizeof(struct scatterlist),
2098 GFP_KERNEL);
2099 if (!smbdirect_mr->sgt.sgl) {
2100 log_rdma_mr(ERR, "failed to allocate sgl\n");
2101 ib_dereg_mr(smbdirect_mr->mr);
2102 goto out;
2103 }
2104 smbdirect_mr->state = MR_READY;
2105 smbdirect_mr->conn = info;
2106
2107 list_add_tail(&smbdirect_mr->list, &info->mr_list);
2108 atomic_inc(&info->mr_ready_count);
2109 }
2110 return 0;
2111
2112 out:
2113 kfree(smbdirect_mr);
2114 cleanup_entries:
2115 list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) {
2116 list_del(&smbdirect_mr->list);
2117 ib_dereg_mr(smbdirect_mr->mr);
2118 kfree(smbdirect_mr->sgt.sgl);
2119 kfree(smbdirect_mr);
2120 }
2121 return -ENOMEM;
2122 }
2123
2124 /*
2125 * Get an MR from mr_list. This function waits until there is at least one
2126 * MR available in the list. It may access the list while
2127 * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock
2128 * as they never modify the same places. However, there may be several CPUs
2129 * issuing I/O trying to get an MR at the same time; mr_list_lock is used to
2130 * protect against this situation.
2131 */
2132 static struct smbd_mr *get_mr(struct smbd_connection *info)
2133 {
2134 struct smbdirect_socket *sc = &info->socket;
2135 struct smbd_mr *ret;
2136 int rc;
2137 again:
2138 rc = wait_event_interruptible(info->wait_mr,
2139 atomic_read(&info->mr_ready_count) ||
2140 sc->status != SMBDIRECT_SOCKET_CONNECTED);
2141 if (rc) {
2142 log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc);
2143 return NULL;
2144 }
2145
2146 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
2147 log_rdma_mr(ERR, "sc->status=%x\n", sc->status);
2148 return NULL;
2149 }
2150
2151 spin_lock(&info->mr_list_lock);
2152 list_for_each_entry(ret, &info->mr_list, list) {
2153 if (ret->state == MR_READY) {
2154 ret->state = MR_REGISTERED;
2155 spin_unlock(&info->mr_list_lock);
2156 atomic_dec(&info->mr_ready_count);
2157 atomic_inc(&info->mr_used_count);
2158 return ret;
2159 }
2160 }
2161
2162 spin_unlock(&info->mr_list_lock);
2163 /*
2164 * It is possible that we could fail to get an MR because other processes may
2165 * try to acquire an MR at the same time. If this is the case, retry.
2166 */
2167 goto again;
2168 }
2169
2170 /*
2171 * Transcribe the pages from an iterator into an MR scatterlist.
2172 */
2173 static int smbd_iter_to_mr(struct smbd_connection *info,
2174 struct iov_iter *iter,
2175 struct sg_table *sgt,
2176 unsigned int max_sg)
2177 {
2178 int ret;
2179
2180 memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist));
2181
2182 ret = extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0);
2183 WARN_ON(ret < 0);
2184 if (sgt->nents > 0)
2185 sg_mark_end(&sgt->sgl[sgt->nents - 1]);
2186 return ret;
2187 }
2188
2189 /*
2190 * Register memory for RDMA read/write
2191 * iter: the buffer to register memory with
2192 * writing: true if this is a RDMA write (SMB read), false for RDMA read
2193 * need_invalidate: true if this MR needs to be locally invalidated after I/O
2194 * return value: the MR registered, NULL if failed.
2195 */
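/*
 * Hypothetical caller sketch (assumed, not copied from the SMB2 layer):
 * for RDMA offload the caller registers the payload and advertises the
 * resulting rkey/offset/length in its request buffer descriptor "desc"
 * (a placeholder name here):
 *
 *	mr = smbd_register_mr(server->smbd_conn, &rqst->rq_iter,
 *			      true, need_invalidate);
 *	if (!mr)
 *		return -EAGAIN;
 *	desc->token  = cpu_to_le32(mr->mr->rkey);
 *	desc->offset = cpu_to_le64(mr->mr->iova);
 *	desc->length = cpu_to_le32(mr->mr->length);
 */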
2196 struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
2197 struct iov_iter *iter,
2198 bool writing, bool need_invalidate)
2199 {
2200 struct smbdirect_socket *sc = &info->socket;
2201 struct smbd_mr *smbdirect_mr;
2202 int rc, num_pages;
2203 enum dma_data_direction dir;
2204 struct ib_reg_wr *reg_wr;
2205
2206 num_pages = iov_iter_npages(iter, info->max_frmr_depth + 1);
2207 if (num_pages > info->max_frmr_depth) {
2208 log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n",
2209 num_pages, info->max_frmr_depth);
2210 WARN_ON_ONCE(1);
2211 return NULL;
2212 }
2213
2214 smbdirect_mr = get_mr(info);
2215 if (!smbdirect_mr) {
2216 log_rdma_mr(ERR, "get_mr returning NULL\n");
2217 return NULL;
2218 }
2219
2220 dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
2221 smbdirect_mr->dir = dir;
2222 smbdirect_mr->need_invalidate = need_invalidate;
2223 smbdirect_mr->sgt.nents = 0;
2224 smbdirect_mr->sgt.orig_nents = 0;
2225
2226 log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n",
2227 num_pages, iov_iter_count(iter), info->max_frmr_depth);
2228 smbd_iter_to_mr(info, iter, &smbdirect_mr->sgt, info->max_frmr_depth);
2229
2230 rc = ib_dma_map_sg(sc->ib.dev, smbdirect_mr->sgt.sgl,
2231 smbdirect_mr->sgt.nents, dir);
2232 if (!rc) {
2233 log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
2234 num_pages, dir, rc);
2235 goto dma_map_error;
2236 }
2237
2238 rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgt.sgl,
2239 smbdirect_mr->sgt.nents, NULL, PAGE_SIZE);
2240 if (rc != smbdirect_mr->sgt.nents) {
2241 log_rdma_mr(ERR,
2242 "ib_map_mr_sg failed rc = %d nents = %x\n",
2243 rc, smbdirect_mr->sgt.nents);
2244 goto map_mr_error;
2245 }
2246
2247 ib_update_fast_reg_key(smbdirect_mr->mr,
2248 ib_inc_rkey(smbdirect_mr->mr->rkey));
2249 reg_wr = &smbdirect_mr->wr;
2250 reg_wr->wr.opcode = IB_WR_REG_MR;
2251 smbdirect_mr->cqe.done = register_mr_done;
2252 reg_wr->wr.wr_cqe = &smbdirect_mr->cqe;
2253 reg_wr->wr.num_sge = 0;
2254 reg_wr->wr.send_flags = IB_SEND_SIGNALED;
2255 reg_wr->mr = smbdirect_mr->mr;
2256 reg_wr->key = smbdirect_mr->mr->rkey;
2257 reg_wr->access = writing ?
2258 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
2259 IB_ACCESS_REMOTE_READ;
2260
2261 /*
2262 * There is no need to wait for completion of ib_post_send
2263 * on IB_WR_REG_MR. Hardware enforces a barrier and order of execution
2264 * on the next ib_post_send when we actually send I/O to remote peer
2265 */
2266 rc = ib_post_send(sc->ib.qp, &reg_wr->wr, NULL);
2267 if (!rc)
2268 return smbdirect_mr;
2269
2270 log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",
2271 rc, reg_wr->key);
2272
2273 /* If all failed, attempt to recover this MR by setting it to MR_ERROR */
2274 map_mr_error:
2275 ib_dma_unmap_sg(sc->ib.dev, smbdirect_mr->sgt.sgl,
2276 smbdirect_mr->sgt.nents, smbdirect_mr->dir);
2277
2278 dma_map_error:
2279 smbdirect_mr->state = MR_ERROR;
2280 if (atomic_dec_and_test(&info->mr_used_count))
2281 wake_up(&info->wait_for_mr_cleanup);
2282
2283 smbd_disconnect_rdma_connection(info);
2284
2285 return NULL;
2286 }
2287
2288 static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
2289 {
2290 struct smbd_mr *smbdirect_mr;
2291 struct ib_cqe *cqe;
2292
2293 cqe = wc->wr_cqe;
2294 smbdirect_mr = container_of(cqe, struct smbd_mr, cqe);
2295 smbdirect_mr->state = MR_INVALIDATED;
2296 if (wc->status != IB_WC_SUCCESS) {
2297 log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status);
2298 smbdirect_mr->state = MR_ERROR;
2299 }
2300 complete(&smbdirect_mr->invalidate_done);
2301 }
2302
2303 /*
2304 * Deregister an MR after I/O is done
2305 * This function may wait if remote invalidation is not used
2306 * and we have to locally invalidate the buffer to prevent the data from being
2307 * modified by the remote peer after the upper layer consumes it
2308 */
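/*
 * Hypothetical caller sketch (assumed): once the response is processed,
 * the MR obtained from smbd_register_mr() is released:
 *
 *	rc = smbd_deregister_mr(mr);
 *	mr = NULL;
 */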
2309 int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
2310 {
2311 struct ib_send_wr *wr;
2312 struct smbd_connection *info = smbdirect_mr->conn;
2313 struct smbdirect_socket *sc = &info->socket;
2314 int rc = 0;
2315
2316 if (smbdirect_mr->need_invalidate) {
2317 /* Need to finish local invalidation before returning */
2318 wr = &smbdirect_mr->inv_wr;
2319 wr->opcode = IB_WR_LOCAL_INV;
2320 smbdirect_mr->cqe.done = local_inv_done;
2321 wr->wr_cqe = &smbdirect_mr->cqe;
2322 wr->num_sge = 0;
2323 wr->ex.invalidate_rkey = smbdirect_mr->mr->rkey;
2324 wr->send_flags = IB_SEND_SIGNALED;
2325
2326 init_completion(&smbdirect_mr->invalidate_done);
2327 rc = ib_post_send(sc->ib.qp, wr, NULL);
2328 if (rc) {
2329 log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
2330 smbd_disconnect_rdma_connection(info);
2331 goto done;
2332 }
2333 wait_for_completion(&smbdirect_mr->invalidate_done);
2334 smbdirect_mr->need_invalidate = false;
2335 } else
2336 /*
2337 * For remote invalidation, just set it to MR_INVALIDATED
2338 * and defer to mr_recovery_work to recover the MR for next use
2339 */
2340 smbdirect_mr->state = MR_INVALIDATED;
2341
2342 if (smbdirect_mr->state == MR_INVALIDATED) {
2343 ib_dma_unmap_sg(
2344 sc->ib.dev, smbdirect_mr->sgt.sgl,
2345 smbdirect_mr->sgt.nents,
2346 smbdirect_mr->dir);
2347 smbdirect_mr->state = MR_READY;
2348 if (atomic_inc_return(&info->mr_ready_count) == 1)
2349 wake_up_interruptible(&info->wait_mr);
2350 } else
2351 /*
2352 * Schedule the work to do MR recovery for future I/Os; MR
2353 * recovery is slow and we don't want it to block the current I/O
2354 */
2355 queue_work(info->workqueue, &info->mr_recovery_work);
2356
2357 done:
2358 if (atomic_dec_and_test(&info->mr_used_count))
2359 wake_up(&info->wait_for_mr_cleanup);
2360
2361 return rc;
2362 }
2363
2364 static bool smb_set_sge(struct smb_extract_to_rdma *rdma,
2365 struct page *lowest_page, size_t off, size_t len)
2366 {
2367 struct ib_sge *sge = &rdma->sge[rdma->nr_sge];
2368 u64 addr;
2369
2370 addr = ib_dma_map_page(rdma->device, lowest_page,
2371 off, len, rdma->direction);
2372 if (ib_dma_mapping_error(rdma->device, addr))
2373 return false;
2374
2375 sge->addr = addr;
2376 sge->length = len;
2377 sge->lkey = rdma->local_dma_lkey;
2378 rdma->nr_sge++;
2379 return true;
2380 }
2381
2382 /*
2383 * Extract page fragments from a BVEC-class iterator and add them to an RDMA
2384 * element list. The pages are not pinned.
2385 */
2386 static ssize_t smb_extract_bvec_to_rdma(struct iov_iter *iter,
2387 struct smb_extract_to_rdma *rdma,
2388 ssize_t maxsize)
2389 {
2390 const struct bio_vec *bv = iter->bvec;
2391 unsigned long start = iter->iov_offset;
2392 unsigned int i;
2393 ssize_t ret = 0;
2394
2395 for (i = 0; i < iter->nr_segs; i++) {
2396 size_t off, len;
2397
2398 len = bv[i].bv_len;
2399 if (start >= len) {
2400 start -= len;
2401 continue;
2402 }
2403
2404 len = min_t(size_t, maxsize, len - start);
2405 off = bv[i].bv_offset + start;
2406
2407 if (!smb_set_sge(rdma, bv[i].bv_page, off, len))
2408 return -EIO;
2409
2410 ret += len;
2411 maxsize -= len;
2412 if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0)
2413 break;
2414 start = 0;
2415 }
2416
2417 if (ret > 0)
2418 iov_iter_advance(iter, ret);
2419 return ret;
2420 }
2421
2422 /*
2423 * Extract fragments from a KVEC-class iterator and add them to an RDMA list.
2424 * This can deal with vmalloc'd buffers as well as kmalloc'd or static buffers.
2425 * The pages are not pinned.
2426 */
2427 static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter,
2428 struct smb_extract_to_rdma *rdma,
2429 ssize_t maxsize)
2430 {
2431 const struct kvec *kv = iter->kvec;
2432 unsigned long start = iter->iov_offset;
2433 unsigned int i;
2434 ssize_t ret = 0;
2435
2436 for (i = 0; i < iter->nr_segs; i++) {
2437 struct page *page;
2438 unsigned long kaddr;
2439 size_t off, len, seg;
2440
2441 len = kv[i].iov_len;
2442 if (start >= len) {
2443 start -= len;
2444 continue;
2445 }
2446
2447 kaddr = (unsigned long)kv[i].iov_base + start;
2448 off = kaddr & ~PAGE_MASK;
2449 len = min_t(size_t, maxsize, len - start);
2450 kaddr &= PAGE_MASK;
2451
2452 maxsize -= len;
2453 do {
2454 seg = min_t(size_t, len, PAGE_SIZE - off);
2455
2456 if (is_vmalloc_or_module_addr((void *)kaddr))
2457 page = vmalloc_to_page((void *)kaddr);
2458 else
2459 page = virt_to_page((void *)kaddr);
2460
2461 if (!smb_set_sge(rdma, page, off, seg))
2462 return -EIO;
2463
2464 ret += seg;
2465 len -= seg;
2466 kaddr += PAGE_SIZE;
2467 off = 0;
2468 } while (len > 0 && rdma->nr_sge < rdma->max_sge);
2469
2470 if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0)
2471 break;
2472 start = 0;
2473 }
2474
2475 if (ret > 0)
2476 iov_iter_advance(iter, ret);
2477 return ret;
2478 }
2479
2480 /*
2481 * Extract folio fragments from a FOLIOQ-class iterator and add them to an RDMA
2482 * list. The folios are not pinned.
2483 */
2484 static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter,
2485 struct smb_extract_to_rdma *rdma,
2486 ssize_t maxsize)
2487 {
2488 const struct folio_queue *folioq = iter->folioq;
2489 unsigned int slot = iter->folioq_slot;
2490 ssize_t ret = 0;
2491 size_t offset = iter->iov_offset;
2492
2493 BUG_ON(!folioq);
2494
2495 if (slot >= folioq_nr_slots(folioq)) {
2496 folioq = folioq->next;
2497 if (WARN_ON_ONCE(!folioq))
2498 return -EIO;
2499 slot = 0;
2500 }
2501
2502 do {
2503 struct folio *folio = folioq_folio(folioq, slot);
2504 size_t fsize = folioq_folio_size(folioq, slot);
2505
2506 if (offset < fsize) {
2507 size_t part = umin(maxsize, fsize - offset);
2508
2509 if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part))
2510 return -EIO;
2511
2512 offset += part;
2513 ret += part;
2514 maxsize -= part;
2515 }
2516
2517 if (offset >= fsize) {
2518 offset = 0;
2519 slot++;
2520 if (slot >= folioq_nr_slots(folioq)) {
2521 if (!folioq->next) {
2522 WARN_ON_ONCE(ret < iter->count);
2523 break;
2524 }
2525 folioq = folioq->next;
2526 slot = 0;
2527 }
2528 }
2529 } while (rdma->nr_sge < rdma->max_sge && maxsize > 0);
2530
2531 iter->folioq = folioq;
2532 iter->folioq_slot = slot;
2533 iter->iov_offset = offset;
2534 iter->count -= ret;
2535 return ret;
2536 }
2537
2538 /*
2539 * Extract page fragments from up to the given amount of the source iterator
2540 * and build up an RDMA list that refers to all of those bits. The RDMA list
2541 * is appended to, up to the maximum number of elements set in the parameter
2542 * block.
2543 *
2544 * The extracted page fragments are not pinned or ref'd in any way; if an
2545 * IOVEC/UBUF-type iterator is to be used, it should be converted to a
2546 * BVEC-type iterator and the pages pinned, ref'd or otherwise held in some
2547 * way.
2548 */
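/*
 * Hypothetical usage sketch (assumed, not copied from the send path): a
 * sender fills in the parameter block and lets this routine append
 * DMA-mapped SGEs after the SMBDirect header already placed in slot 0;
 * SMBDIRECT_MAX_SEND_SGE and request->sge are assumed names here:
 *
 *	struct smb_extract_to_rdma extract = {
 *		.nr_sge		= 1,
 *		.max_sge	= SMBDIRECT_MAX_SEND_SGE,
 *		.sge		= request->sge,
 *		.device		= sc->ib.dev,
 *		.local_dma_lkey	= sc->ib.pd->local_dma_lkey,
 *		.direction	= DMA_TO_DEVICE,
 *	};
 *
 *	rc = smb_extract_iter_to_rdma(iter, remaining_data_length, &extract);
 */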
2549 static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
2550 struct smb_extract_to_rdma *rdma)
2551 {
2552 ssize_t ret;
2553 int before = rdma->nr_sge;
2554
2555 switch (iov_iter_type(iter)) {
2556 case ITER_BVEC:
2557 ret = smb_extract_bvec_to_rdma(iter, rdma, len);
2558 break;
2559 case ITER_KVEC:
2560 ret = smb_extract_kvec_to_rdma(iter, rdma, len);
2561 break;
2562 case ITER_FOLIOQ:
2563 ret = smb_extract_folioq_to_rdma(iter, rdma, len);
2564 break;
2565 default:
2566 WARN_ON_ONCE(1);
2567 return -EIO;
2568 }
2569
2570 if (ret < 0) {
2571 while (rdma->nr_sge > before) {
2572 struct ib_sge *sge = &rdma->sge[--rdma->nr_sge];
2573
2574 ib_dma_unmap_single(rdma->device, sge->addr, sge->length,
2575 rdma->direction);
2576 sge->addr = 0;
2577 }
2578 }
2579
2580 return ret;
2581 }
2582