/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Support for INET connection oriented protocols.
 *
 * Authors:	See the TCP sources
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/jhash.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>
#include <net/tcp.h>

#ifdef INET_CSK_DEBUG
const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
EXPORT_SYMBOL(inet_csk_timer_bug_msg);
#endif

void inet_get_local_port_range(struct net *net, int *low, int *high)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);

		*low = net->ipv4.ip_local_ports.range[0];
		*high = net->ipv4.ip_local_ports.range[1];
	} while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
}
EXPORT_SYMBOL(inet_get_local_port_range);

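/* Check whether @sk can share the port represented by bind bucket @tb with
 * the sockets already bound there.  SO_REUSEADDR, SO_REUSEPORT and bound
 * device restrictions are honoured; with @relax false, sockets that merely
 * agree on SO_REUSEADDR are still treated as a conflict, which is what the
 * port-search path wants.  Returns non-zero if a conflicting socket exists.
 */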
int inet_csk_bind_conflict(const struct sock *sk,
			   const struct inet_bind_bucket *tb, bool relax)
{
	struct sock *sk2;
	int reuse = sk->sk_reuse;
	int reuseport = sk->sk_reuseport;
	kuid_t uid = sock_i_uid((struct sock *)sk);

	/*
	 * Unlike other sk lookup places we do not check
	 * for sk_net here, since _all_ the socks listed
	 * in tb->owners list belong to the same net - the
	 * one this bucket belongs to.
	 */

	sk_for_each_bound(sk2, &tb->owners) {
		if (sk != sk2 &&
		    !inet_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if ((!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) &&
			    (!reuseport || !sk2->sk_reuseport ||
			    (sk2->sk_state != TCP_TIME_WAIT &&
			     !uid_eq(uid, sock_i_uid(sk2))))) {

				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
					break;
			}
			if (!relax && reuse && sk2->sk_reuse &&
			    sk2->sk_state != TCP_LISTEN) {

				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
					break;
			}
		}
	}
	return sk2 != NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);

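/* Update tb->fastreuse and tb->fastreuseport when @sk joins bind bucket
 * @tb.  On an empty bucket the flags are seeded from @sk's options;
 * otherwise they can only be cleared, so they remain valid for every owner.
 */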
void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
			       struct sock *sk)
{
	kuid_t uid = sock_i_uid(sk);

	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
		if (sk->sk_reuseport) {
			tb->fastreuseport = 1;
			tb->fastuid = uid;
		} else
			tb->fastreuseport = 0;
	} else {
		if (tb->fastreuse &&
		    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
			tb->fastreuse = 0;
		if (tb->fastreuseport &&
		    (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
			tb->fastreuseport = 0;
	}
}

/* Obtain a reference to a local port for the given sock;
 * if snum is zero, select any available local port.
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret, attempts = 5;
	struct net *net = sock_net(sk);
	int smallest_size = -1, smallest_rover;
	kuid_t uid = sock_i_uid(sk);
	int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;

	local_bh_disable();
	if (!snum) {
		int remaining, rover, low, high;

again:
		inet_get_local_port_range(net, &low, &high);
		if (attempt_half) {
			int half = low + ((high - low) >> 1);

			if (attempt_half == 1)
				high = half;
			else
				low = half;
		}
		remaining = (high - low) + 1;
		smallest_rover = rover = prandom_u32() % remaining + low;

		smallest_size = -1;
		do {
			if (inet_is_local_reserved_port(net, rover))
				goto next_nolock;
			head = &hashinfo->bhash[inet_bhashfn(net, rover,
					hashinfo->bhash_size)];
			spin_lock(&head->lock);
			inet_bind_bucket_for_each(tb, &head->chain)
				if (net_eq(ib_net(tb), net) && tb->port == rover) {
					if (((tb->fastreuse > 0 &&
					      sk->sk_reuse &&
					      sk->sk_state != TCP_LISTEN) ||
					     (tb->fastreuseport > 0 &&
					      sk->sk_reuseport &&
					      uid_eq(tb->fastuid, uid))) &&
					    (tb->num_owners < smallest_size || smallest_size == -1)) {
						smallest_size = tb->num_owners;
						smallest_rover = rover;
					}
					if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
						snum = rover;
						goto tb_found;
					}
					goto next;
				}
			break;
		next:
			spin_unlock(&head->lock);
		next_nolock:
			if (++rover > high)
				rover = low;
		} while (--remaining > 0);

		/* Exhausted local port range during search?  It is not
		 * possible for us to be holding one of the bind hash
		 * locks if this test triggers, because if 'remaining'
		 * drops to zero, we broke out of the do/while loop at
		 * the top level, not from the 'break;' statement.
		 */
		ret = 1;
		if (remaining <= 0) {
			if (smallest_size != -1) {
				snum = smallest_rover;
				goto have_snum;
			}
			if (attempt_half == 1) {
				/* OK, we now try the upper half of the range */
				attempt_half = 2;
				goto again;
			}
			goto fail;
		}
		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its lock.
		 */
		snum = rover;
	} else {
have_snum:
		head = &hashinfo->bhash[inet_bhashfn(net, snum,
				hashinfo->bhash_size)];
		spin_lock(&head->lock);
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse == SK_FORCE_REUSE)
			goto success;

		if (((tb->fastreuse > 0 &&
		      sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
		     (tb->fastreuseport > 0 &&
		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
		    smallest_size == -1) {
			goto success;
		} else {
			ret = 1;
			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
				if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
				     (tb->fastreuseport > 0 &&
				      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
				    smallest_size != -1 && --attempts >= 0) {
					spin_unlock(&head->lock);
					goto again;
				}

				goto fail_unlock;
			}
		}
	}
tb_not_found:
	ret = 1;
	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
					net, head, snum)) == NULL)
		goto fail_unlock;

	inet_csk_update_fastreuse(tb, sk);

success:
	if (!inet_csk(sk)->icsk_bind_hash)
		inet_bind_hash(sk, tb, snum);
	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(inet_csk_get_port);

/*
 * Wait for an incoming connection, avoid race conditions. This must be called
 * with the socket locked.
 */
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	DEFINE_WAIT(wait);
	int err;

	/*
	 * True wake-one mechanism for incoming connections: only
	 * one process gets woken up, not the 'whole herd'.
	 * Since we do not 'race & poll' for established sockets
	 * anymore, the common case will execute the loop only once.
	 *
	 * Subtle issue: "add_wait_queue_exclusive()" will be added
	 * after any current non-exclusive waiters, and we know that
	 * it will always _stay_ after any new non-exclusive waiters
	 * because all non-exclusive waiters are added at the
	 * beginning of the wait-queue. As such, it's ok to "drop"
	 * our exclusiveness temporarily when we get woken up without
	 * having to remove and re-insert us on the wait queue.
	 */
	for (;;) {
		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
					  TASK_INTERRUPTIBLE);
		release_sock(sk);
		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
			timeo = schedule_timeout(timeo);
		sched_annotate_sleep();
		lock_sock(sk);
		err = 0;
		if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
			break;
		err = -EINVAL;
		if (sk->sk_state != TCP_LISTEN)
			break;
		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			break;
		err = -EAGAIN;
		if (!timeo)
			break;
	}
	finish_wait(sk_sleep(sk), &wait);
	return err;
}

/*
 * This will accept the next outstanding connection.
 */
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	struct request_sock *req;
	struct sock *newsk;
	int error;

	lock_sock(sk);

	/* We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */
	error = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out_err;

	/* Find an already established connection */
	if (reqsk_queue_empty(queue)) {
		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);

		/* If this is a non-blocking socket, don't sleep */
		error = -EAGAIN;
		if (!timeo)
			goto out_err;

		error = inet_csk_wait_for_connect(sk, timeo);
		if (error)
			goto out_err;
	}
	req = reqsk_queue_remove(queue, sk);
	newsk = req->sk;

	if (sk->sk_protocol == IPPROTO_TCP &&
	    tcp_rsk(req)->tfo_listener) {
		spin_lock_bh(&queue->fastopenq.lock);
		if (tcp_rsk(req)->tfo_listener) {
			/* We are still waiting for the final ACK from 3WHS
			 * so can't free req now. Instead, we set req->sk to
			 * NULL to signify that the child socket is taken
			 * so reqsk_fastopen_remove() will free the req
			 * when 3WHS finishes (or is aborted).
			 */
			req->sk = NULL;
			req = NULL;
		}
		spin_unlock_bh(&queue->fastopenq.lock);
	}
out:
	release_sock(sk);
	if (req)
		reqsk_put(req);
	return newsk;
out_err:
	newsk = NULL;
	req = NULL;
	*err = error;
	goto out;
}
EXPORT_SYMBOL(inet_csk_accept);

/*
 * Using different timers for retransmit, delayed acks and probes.
 * We may wish to use just one timer maintaining a list of expire jiffies
 * to optimize.
 */
void inet_csk_init_xmit_timers(struct sock *sk,
			       void (*retransmit_handler)(unsigned long),
			       void (*delack_handler)(unsigned long),
			       void (*keepalive_handler)(unsigned long))
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler,
			(unsigned long)sk);
	setup_timer(&icsk->icsk_delack_timer, delack_handler,
			(unsigned long)sk);
	setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
EXPORT_SYMBOL(inet_csk_init_xmit_timers);

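/* Stop the retransmit, delayed-ACK and keepalive timers and clear the
 * corresponding pending flags.
 */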
void inet_csk_clear_xmit_timers(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;

	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
	sk_stop_timer(sk, &icsk->icsk_delack_timer);
	sk_stop_timer(sk, &sk->sk_timer);
}
EXPORT_SYMBOL(inet_csk_clear_xmit_timers);

void inet_csk_delete_keepalive_timer(struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);
}
EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);

void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
}
EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);

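/* Build the IPv4 route used to answer a connection request.  The flow key
 * is derived from the request socket (honouring any source-route option);
 * returns the route's dst_entry, or NULL if no usable route exists.
 */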
struct dst_entry *inet_csk_route_req(const struct sock *sk,
				     struct flowi4 *fl4,
				     const struct request_sock *req)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = read_pnet(&ireq->ireq_net);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	opt = ireq_opt_deref(ireq);

	flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   sk->sk_protocol, inet_sk_flowi_flags(sk),
			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
			   ireq->ir_loc_addr, ireq->ir_rmt_port,
			   htons(ireq->ir_num), sk->sk_uid);
	security_req_classify_flow(req, flowi4_to_flowi(fl4));
	rt = ip_route_output_flow(net, fl4, sk);
	if (IS_ERR(rt))
		goto no_route;
	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
		goto route_err;
	return &rt->dst;

route_err:
	ip_rt_put(rt);
no_route:
	IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
	return NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_route_req);

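/* Like inet_csk_route_req(), but the route is looked up for the newly
 * created child socket and the flow key is kept in the child's inet cork
 * for later use.
 */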
struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
					    struct sock *newsk,
					    const struct request_sock *req)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = read_pnet(&ireq->ireq_net);
	struct inet_sock *newinet = inet_sk(newsk);
	struct ip_options_rcu *opt;
	struct flowi4 *fl4;
	struct rtable *rt;

	opt = rcu_dereference(ireq->ireq_opt);
	fl4 = &newinet->cork.fl.u.ip4;

	flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   sk->sk_protocol, inet_sk_flowi_flags(sk),
			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
			   ireq->ir_loc_addr, ireq->ir_rmt_port,
			   htons(ireq->ir_num), sk->sk_uid);
	security_req_classify_flow(req, flowi4_to_flowi(fl4));
	rt = ip_route_output_flow(net, fl4, sk);
	if (IS_ERR(rt))
		goto no_route;
	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
		goto route_err;
	return &rt->dst;

route_err:
	ip_rt_put(rt);
no_route:
	IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
	return NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);

#if IS_ENABLED(CONFIG_IPV6)
#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
#else
#define AF_INET_FAMILY(fam) true
#endif

/* Only thing we need from tcp.h */
extern int sysctl_tcp_synack_retries;


/* Decide when to expire the request and when to resend SYN-ACK */
static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
				  const int max_retries,
				  const u8 rskq_defer_accept,
				  int *expire, int *resend)
{
	if (!rskq_defer_accept) {
		*expire = req->num_timeout >= thresh;
		*resend = 1;
		return;
	}
	*expire = req->num_timeout >= thresh &&
		  (!inet_rsk(req)->acked || req->num_timeout >= max_retries);
	/*
	 * Do not resend while waiting for data after ACK,
	 * start to resend on end of deferring period to give
	 * last chance for data or ACK to create established socket.
	 */
	*resend = !inet_rsk(req)->acked ||
		  req->num_timeout >= rskq_defer_accept - 1;
}

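/* Retransmit the SYN-ACK for @req through the protocol's rtx_syn_ack()
 * hook, counting the retransmission on success.
 */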
int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
{
	int err = req->rsk_ops->rtx_syn_ack(parent, req);

	if (!err)
		req->num_retrans++;
	return err;
}
EXPORT_SYMBOL(inet_rtx_syn_ack);

/* return true if req was found in the ehash table */
static bool reqsk_queue_unlink(struct request_sock_queue *queue,
			       struct request_sock *req)
{
	struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
	bool found = false;

	if (sk_hashed(req_to_sk(req))) {
		spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);

		spin_lock(lock);
		found = __sk_nulls_del_node_init_rcu(req_to_sk(req));
		spin_unlock(lock);
	}
	if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))
		reqsk_put(req);
	return found;
}

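/* Unlink @req from the ehash table and, if it was found there, drop it
 * from the listener's accept-queue accounting and release the queue's
 * reference.
 */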
void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
{
	if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) {
		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
		reqsk_put(req);
	}
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);

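/* As above, but also release the caller's reference on @req. */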
void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
{
	inet_csk_reqsk_queue_drop(sk, req);
	reqsk_put(req);
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);

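/* Per-request SYN-ACK timer: decides whether the pending request is
 * retransmitted, rearmed for another period, or dropped, based on the
 * listener state and the heuristic described below.
 */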
static void reqsk_timer_handler(unsigned long data)
{
	struct request_sock *req = (struct request_sock *)data;
	struct sock *sk_listener = req->rsk_listener;
	struct inet_connection_sock *icsk = inet_csk(sk_listener);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	int qlen, expire = 0, resend = 0;
	int max_retries, thresh;
	u8 defer_accept;

	if (sk_state_load(sk_listener) != TCP_LISTEN)
		goto drop;

	max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
	thresh = max_retries;
	/* Normally all the openreqs are young and become mature
	 * (i.e. converted to an established socket) within the first timeout.
	 * If the synack was not acknowledged for 1 second, it means
	 * one of the following things: the synack was lost, the ack was lost,
	 * rtt is high or nobody planned to ack (i.e. synflood).
	 * When the server is a bit loaded, the queue is populated with old
	 * open requests, reducing the effective size of the queue.
	 * When the server is well loaded, the queue size reduces to zero
	 * after several minutes of work. It is not synflood,
	 * it is normal operation. The solution is pruning
	 * too old entries overriding the normal timeout, when
	 * the situation becomes dangerous.
	 *
	 * Essentially, we reserve half of the room for young
	 * embryos; and abort old ones without pity, if old
	 * ones are about to clog our table.
	 */
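	/* Illustration only: with sk_max_ack_backlog == 8, six queued
	 * requests and a single young one, the loop below lowers thresh
	 * from e.g. 5 to 3, so stale requests get fewer SYN-ACK retries
	 * before being dropped.
	 */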
	qlen = reqsk_queue_len(queue);
	if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) {
		int young = reqsk_queue_len_young(queue) << 1;

		while (thresh > 2) {
			if (qlen < young)
				break;
			thresh--;
			young <<= 1;
		}
	}
	defer_accept = READ_ONCE(queue->rskq_defer_accept);
	if (defer_accept)
		max_retries = defer_accept;
	syn_ack_recalc(req, thresh, max_retries, defer_accept,
		       &expire, &resend);
	req->rsk_ops->syn_ack_timeout(req);
	if (!expire &&
	    (!resend ||
	     !inet_rtx_syn_ack(sk_listener, req) ||
	     inet_rsk(req)->acked)) {
		unsigned long timeo;

		if (req->num_timeout++ == 0)
			atomic_dec(&queue->young);
		timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
		mod_timer_pinned(&req->rsk_timer, jiffies + timeo);
		return;
	}
drop:
	inet_csk_reqsk_queue_drop_and_put(sk_listener, req);
}

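/* Arm the per-request timer and insert the request into the ehash table
 * so incoming packets can find it; the refcount covers the hash table,
 * the timer and the caller.
 */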
static void reqsk_queue_hash_req(struct request_sock *req,
				 unsigned long timeout)
{
	req->num_retrans = 0;
	req->num_timeout = 0;
	req->sk = NULL;

	setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
	mod_timer_pinned(&req->rsk_timer, jiffies + timeout);

	inet_ehash_insert(req_to_sk(req), NULL);
	/* before letting lookups find us, make sure all req fields
	 * are committed to memory and refcnt initialized.
	 */
	smp_wmb();
	atomic_set(&req->rsk_refcnt, 2 + 1);
}

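/* Hash a new request socket and account for it on the listener's accept
 * queue.
 */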
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
				   unsigned long timeout)
{
	reqsk_queue_hash_req(req, timeout);
	inet_csk_reqsk_queue_added(sk);
}
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);

/**
 *	inet_csk_clone_lock - clone an inet socket, and lock its clone
 *	@sk: the socket to clone
 *	@req: request_sock
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *inet_csk_clone_lock(const struct sock *sk,
				 const struct request_sock *req,
				 const gfp_t priority)
{
	struct sock *newsk = sk_clone_lock(sk, priority);

	if (newsk) {
		struct inet_connection_sock *newicsk = inet_csk(newsk);

		newsk->sk_state = TCP_SYN_RECV;
		newicsk->icsk_bind_hash = NULL;

		inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
		inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
		inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
		newsk->sk_write_space = sk_stream_write_space;

		inet_sk(newsk)->mc_list = NULL;

		newsk->sk_mark = inet_rsk(req)->ir_mark;
		atomic64_set(&newsk->sk_cookie,
			     atomic64_read(&inet_rsk(req)->ir_cookie));

		newicsk->icsk_retransmits = 0;
		newicsk->icsk_backoff	  = 0;
		newicsk->icsk_probes_out  = 0;

		/* Deinitialize accept_queue to trap illegal accesses. */
		memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));

		security_inet_csk_clone(newsk, req);
	}
	return newsk;
}
EXPORT_SYMBOL_GPL(inet_csk_clone_lock);

/*
 * At this point, there should be no process reference to this
 * socket, and thus no user references at all.  Therefore we
 * can assume the socket waitqueue is inactive and nobody will
 * try to jump onto it.
 */
void inet_csk_destroy_sock(struct sock *sk)
{
	WARN_ON(sk->sk_state != TCP_CLOSE);
	WARN_ON(!sock_flag(sk, SOCK_DEAD));

	/* It cannot be in hash table! */
	WARN_ON(!sk_unhashed(sk));

	/* If inet_sk(sk)->inet_num is non-zero, it must be bound */
	WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);

	sk->sk_prot->destroy(sk);

	sk_stream_kill_queues(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	percpu_counter_dec(sk->sk_prot->orphan_count);
	sock_put(sk);
}
EXPORT_SYMBOL(inet_csk_destroy_sock);

/* This function allows us to force a closure of a socket after the call to
 * tcp/dccp_create_openreq_child().
 */
void inet_csk_prepare_forced_close(struct sock *sk)
	__releases(&sk->sk_lock.slock)
{
	/* sk_clone_lock locked the socket and set refcnt to 2 */
	bh_unlock_sock(sk);
	sock_put(sk);

	/* The below has to be done to allow calling inet_csk_destroy_sock */
	sock_set_flag(sk, SOCK_DEAD);
	percpu_counter_inc(sk->sk_prot->orphan_count);
	inet_sk(sk)->inet_num = 0;
}
EXPORT_SYMBOL(inet_csk_prepare_forced_close);

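/* Switch a bound socket to TCP_LISTEN: allocate the accept queue,
 * re-validate the local port and hash the socket so it can receive SYNs.
 * Returns 0 on success or -EADDRINUSE if the port check fails.
 */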
int inet_csk_listen_start(struct sock *sk, int backlog)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);

	reqsk_queue_alloc(&icsk->icsk_accept_queue);

	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	inet_csk_delack_init(sk);

	/* There is a race window here: we announce ourselves listening,
	 * but this transition is still not validated by get_port().
	 * It is OK, because this socket enters the hash table only
	 * after validation is complete.
	 */
	sk_state_store(sk, TCP_LISTEN);
	if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
		inet->inet_sport = htons(inet->inet_num);

		sk_dst_reset(sk);
		sk->sk_prot->hash(sk);

		return 0;
	}

	sk->sk_state = TCP_CLOSE;
	return -EADDRINUSE;
}
EXPORT_SYMBOL_GPL(inet_csk_listen_start);

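/* Dispose of a child socket that will never be handed to userspace:
 * disconnect it, orphan it and destroy it.  For a TFO child the back
 * pointer to the request is cleared first, as explained below.
 */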
static void inet_child_forget(struct sock *sk, struct request_sock *req,
			      struct sock *child)
{
	sk->sk_prot->disconnect(child, O_NONBLOCK);

	sock_orphan(child);

	percpu_counter_inc(sk->sk_prot->orphan_count);

	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
		BUG_ON(tcp_sk(child)->fastopen_rsk != req);
		BUG_ON(sk != req->rsk_listener);

		/* Paranoid, to prevent race condition if
		 * an inbound pkt destined for child is
		 * blocked by sock lock in tcp_v4_rcv().
		 * Also to satisfy an assertion in
		 * tcp_v4_destroy_sock().
		 */
		tcp_sk(child)->fastopen_rsk = NULL;
	}
	inet_csk_destroy_sock(child);
}

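/* Append an established child socket to the listener's accept queue, or
 * dispose of it (returning NULL) if the listener is no longer listening.
 */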
struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
				      struct request_sock *req,
				      struct sock *child)
{
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;

	spin_lock(&queue->rskq_lock);
	if (unlikely(sk->sk_state != TCP_LISTEN)) {
		inet_child_forget(sk, req, child);
		child = NULL;
	} else {
		req->sk = child;
		req->dl_next = NULL;
		if (queue->rskq_accept_head == NULL)
			queue->rskq_accept_head = req;
		else
			queue->rskq_accept_tail->dl_next = req;
		queue->rskq_accept_tail = req;
		sk_acceptq_added(sk);
	}
	spin_unlock(&queue->rskq_lock);
	return child;
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_add);

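/* Last step of the handshake for a passively opened socket: if we own the
 * request, remove it from the SYN table and queue the child for accept();
 * otherwise release the child and return NULL.
 */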
struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
					 struct request_sock *req, bool own_req)
{
	if (own_req) {
		inet_csk_reqsk_queue_drop(sk, req);
		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
		if (inet_csk_reqsk_queue_add(sk, req, child))
			return child;
	}
	/* Too bad, another child took ownership of the request, undo. */
	bh_unlock_sock(child);
	sock_put(child);
	return NULL;
}
EXPORT_SYMBOL(inet_csk_complete_hashdance);

/*
 *	This routine closes sockets which have been at least partially
 *	opened, but not yet accepted.
 */
void inet_csk_listen_stop(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	struct request_sock *next, *req;

	/* Following specs, it would be better either to send FIN
	 * (and enter FIN-WAIT-1, it is normal close)
	 * or to send active reset (abort).
	 * Certainly, it is pretty dangerous during a synflood, but that
	 * is a bad justification for our negligence 8)
	 * To be honest, we are not able to make either
	 * of the variants now.			--ANK
	 */
	while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
		struct sock *child = req->sk;

		local_bh_disable();
		bh_lock_sock(child);
		WARN_ON(sock_owned_by_user(child));
		sock_hold(child);

		inet_child_forget(sk, req, child);
		reqsk_put(req);
		bh_unlock_sock(child);
		local_bh_enable();
		sock_put(child);

		cond_resched();
	}
	if (queue->fastopenq.rskq_rst_head) {
		/* Free all the reqs queued in rskq_rst_head. */
		spin_lock_bh(&queue->fastopenq.lock);
		req = queue->fastopenq.rskq_rst_head;
		queue->fastopenq.rskq_rst_head = NULL;
		spin_unlock_bh(&queue->fastopenq.lock);
		while (req != NULL) {
			next = req->dl_next;
			reqsk_put(req);
			req = next;
		}
	}
	WARN_ON_ONCE(sk->sk_ack_backlog);
}
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);

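/* Report the IPv4 peer address and port of @sk through @uaddr. */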
void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
	const struct inet_sock *inet = inet_sk(sk);

	sin->sin_family		= AF_INET;
	sin->sin_addr.s_addr	= inet->inet_daddr;
	sin->sin_port		= inet->inet_dport;
}
EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);

#ifdef CONFIG_COMPAT
int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
			       char __user *optval, int __user *optlen)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_af_ops->compat_getsockopt)
		return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname,
							    optval, optlen);
	return icsk->icsk_af_ops->getsockopt(sk, level, optname,
					     optval, optlen);
}
EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);

int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
			       char __user *optval, unsigned int optlen)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_af_ops->compat_setsockopt)
		return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname,
							    optval, optlen);
	return icsk->icsk_af_ops->setsockopt(sk, level, optname,
					     optval, optlen);
}
EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
#endif

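/* Recompute the route for a connected socket whose cached dst is gone,
 * honouring any IP source-route option, and refresh the socket's caps.
 */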
static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;
	struct flowi4 *fl4;
	struct rtable *rt;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	fl4 = &fl->u.ip4;
	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
				   inet->inet_saddr, inet->inet_dport,
				   inet->inet_sport, sk->sk_protocol,
				   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
	if (IS_ERR(rt))
		rt = NULL;
	if (rt)
		sk_setup_caps(sk, &rt->dst);
	rcu_read_unlock();

	return &rt->dst;
}

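/* Propagate a new path MTU to the socket's route, rebuilding the route
 * when the cached dst is missing or becomes invalid after the update.
 */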
struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);
	struct inet_sock *inet = inet_sk(sk);

	if (!dst) {
		dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
		if (!dst)
			goto out;
	}
	dst->ops->update_pmtu(dst, sk, NULL, mtu);

	dst = __sk_dst_check(sk, 0);
	if (!dst)
		dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
out:
	return dst;
}
EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);