1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 
80 #include <crypto/hash.h>
81 #include <linux/scatterlist.h>
82 
83 #include <trace/events/tcp.h>
84 
85 #ifdef CONFIG_TCP_MD5SIG
86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
87 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
88 #endif
89 
90 struct inet_hashinfo tcp_hashinfo;
91 EXPORT_SYMBOL(tcp_hashinfo);
92 
93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
94 {
95 	return secure_tcp_seq(ip_hdr(skb)->daddr,
96 			      ip_hdr(skb)->saddr,
97 			      tcp_hdr(skb)->dest,
98 			      tcp_hdr(skb)->source);
99 }
100 
101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
102 {
103 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
104 }
105 
106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
107 {
108 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
109 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 	struct tcp_sock *tp = tcp_sk(sk);
112 
113 	if (reuse == 2) {
114 		/* Still does not detect *everything* that goes through
115 		 * lo, since we require a loopback src or dst address
116 		 * or direct binding to 'lo' interface.
117 		 */
118 		bool loopback = false;
119 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
120 			loopback = true;
121 #if IS_ENABLED(CONFIG_IPV6)
122 		if (tw->tw_family == AF_INET6) {
123 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
124 			    (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
125 			     (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
126 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 			    (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
128 			     (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
129 				loopback = true;
130 		} else
131 #endif
132 		{
133 			if (ipv4_is_loopback(tw->tw_daddr) ||
134 			    ipv4_is_loopback(tw->tw_rcv_saddr))
135 				loopback = true;
136 		}
137 		if (!loopback)
138 			reuse = 0;
139 	}
140 
141 	/* With PAWS, it is safe from the viewpoint
142 	   of data integrity. Even without PAWS it is safe provided sequence
143 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
144 
145 	   Actually, the idea is close to VJ's one, only timestamp cache is
146 	   held not per host, but per port pair and TW bucket is used as state
147 	   holder.
148 
149 	   If TW bucket has been already destroyed we fall back to VJ's scheme
150 	   and use initial timestamp retrieved from peer table.
151 	 */
152 	if (tcptw->tw_ts_recent_stamp &&
153 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
154 					    tcptw->tw_ts_recent_stamp)))) {
155 		/* In case of repair and re-using TIME-WAIT sockets we still
156 		 * want to be sure that it is safe as above but honor the
157 		 * sequence numbers and time stamps set as part of the repair
158 		 * process.
159 		 *
160 		 * Without this check re-using a TIME-WAIT socket with TCP
161 		 * repair would accumulate a -1 on the repair assigned
162 		 * sequence number. The first time it is reused the sequence
163 		 * is -1, the second time -2, etc. This fixes that issue
164 		 * without appearing to create any others.
165 		 */
166 		if (likely(!tp->repair)) {
167 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
168 
169 			if (!seq)
170 				seq = 1;
171 			WRITE_ONCE(tp->write_seq, seq);
172 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
173 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
174 		}
175 		sock_hold(sktw);
176 		return 1;
177 	}
178 
179 	return 0;
180 }
181 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
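/*
 * Editor's note (illustrative, not part of the original source): the 'reuse'
 * value read at the top of tcp_twsk_unique() comes from the
 * net.ipv4.tcp_tw_reuse sysctl. A value of 0 disables TIME-WAIT reuse for
 * new outgoing connections, 1 enables it, and 2 enables it only for traffic
 * known to be loopback, which is exactly what the loopback checks above
 * enforce before clearing 'reuse'.
 */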
182 
183 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
184 			      int addr_len)
185 {
186 	/* This check is replicated from tcp_v4_connect() and intended to
187 	 * prevent BPF program called below from accessing bytes that are out
188 	 * of the bound specified by user in addr_len.
189 	 */
190 	if (addr_len < sizeof(struct sockaddr_in))
191 		return -EINVAL;
192 
193 	sock_owned_by_me(sk);
194 
195 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
196 }
197 
198 /* This will initiate an outgoing connection. */
199 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
200 {
201 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
202 	struct inet_sock *inet = inet_sk(sk);
203 	struct tcp_sock *tp = tcp_sk(sk);
204 	__be16 orig_sport, orig_dport;
205 	__be32 daddr, nexthop;
206 	struct flowi4 *fl4;
207 	struct rtable *rt;
208 	int err;
209 	struct ip_options_rcu *inet_opt;
210 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
211 
212 	if (addr_len < sizeof(struct sockaddr_in))
213 		return -EINVAL;
214 
215 	if (usin->sin_family != AF_INET)
216 		return -EAFNOSUPPORT;
217 
218 	nexthop = daddr = usin->sin_addr.s_addr;
219 	inet_opt = rcu_dereference_protected(inet->inet_opt,
220 					     lockdep_sock_is_held(sk));
221 	if (inet_opt && inet_opt->opt.srr) {
222 		if (!daddr)
223 			return -EINVAL;
224 		nexthop = inet_opt->opt.faddr;
225 	}
226 
227 	orig_sport = inet->inet_sport;
228 	orig_dport = usin->sin_port;
229 	fl4 = &inet->cork.fl.u.ip4;
230 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
231 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
232 			      IPPROTO_TCP,
233 			      orig_sport, orig_dport, sk);
234 	if (IS_ERR(rt)) {
235 		err = PTR_ERR(rt);
236 		if (err == -ENETUNREACH)
237 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
238 		return err;
239 	}
240 
241 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
242 		ip_rt_put(rt);
243 		return -ENETUNREACH;
244 	}
245 
246 	if (!inet_opt || !inet_opt->opt.srr)
247 		daddr = fl4->daddr;
248 
249 	if (!inet->inet_saddr)
250 		inet->inet_saddr = fl4->saddr;
251 	sk_rcv_saddr_set(sk, inet->inet_saddr);
252 
253 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
254 		/* Reset inherited state */
255 		tp->rx_opt.ts_recent	   = 0;
256 		tp->rx_opt.ts_recent_stamp = 0;
257 		if (likely(!tp->repair))
258 			WRITE_ONCE(tp->write_seq, 0);
259 	}
260 
261 	inet->inet_dport = usin->sin_port;
262 	sk_daddr_set(sk, daddr);
263 
264 	inet_csk(sk)->icsk_ext_hdr_len = 0;
265 	if (inet_opt)
266 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
267 
268 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
269 
270 	/* Socket identity is still unknown (sport may be zero).
271 	 * However we set state to SYN-SENT and, without releasing the socket
272 	 * lock, select a source port, enter ourselves into the hash tables and
273 	 * complete initialization after this.
274 	 */
275 	tcp_set_state(sk, TCP_SYN_SENT);
276 	err = inet_hash_connect(tcp_death_row, sk);
277 	if (err)
278 		goto failure;
279 
280 	sk_set_txhash(sk);
281 
282 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
283 			       inet->inet_sport, inet->inet_dport, sk);
284 	if (IS_ERR(rt)) {
285 		err = PTR_ERR(rt);
286 		rt = NULL;
287 		goto failure;
288 	}
289 	/* OK, now commit destination to socket.  */
290 	sk->sk_gso_type = SKB_GSO_TCPV4;
291 	sk_setup_caps(sk, &rt->dst);
292 	rt = NULL;
293 
294 	if (likely(!tp->repair)) {
295 		if (!tp->write_seq)
296 			WRITE_ONCE(tp->write_seq,
297 				   secure_tcp_seq(inet->inet_saddr,
298 						  inet->inet_daddr,
299 						  inet->inet_sport,
300 						  usin->sin_port));
301 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
302 						 inet->inet_saddr,
303 						 inet->inet_daddr);
304 	}
305 
306 	inet->inet_id = prandom_u32();
307 
308 	if (tcp_fastopen_defer_connect(sk, &err))
309 		return err;
310 	if (err)
311 		goto failure;
312 
313 	err = tcp_connect(sk);
314 
315 	if (err)
316 		goto failure;
317 
318 	return 0;
319 
320 failure:
321 	/*
322 	 * This unhashes the socket and releases the local port,
323 	 * if necessary.
324 	 */
325 	tcp_set_state(sk, TCP_CLOSE);
326 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
327 		inet_reset_saddr(sk);
328 	ip_rt_put(rt);
329 	sk->sk_route_caps = 0;
330 	inet->inet_dport = 0;
331 	return err;
332 }
333 EXPORT_SYMBOL(tcp_v4_connect);
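/*
 * Editor's note (illustrative, not part of the original source): a minimal
 * userspace sketch of the call that ends up in tcp_v4_connect(). The
 * addr_len and sin_family checks at the top of the function correspond to
 * passing a full struct sockaddr_in with AF_INET; 'fd' is assumed to be a
 * SOCK_STREAM socket and error handling is omitted:
 *
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */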
334 
335 /*
336  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
337  * It can be called through tcp_release_cb() if socket was owned by user
338  * at the time tcp_v4_err() was called to handle ICMP message.
339  */
340 void tcp_v4_mtu_reduced(struct sock *sk)
341 {
342 	struct inet_sock *inet = inet_sk(sk);
343 	struct dst_entry *dst;
344 	u32 mtu;
345 
346 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
347 		return;
348 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
349 	dst = inet_csk_update_pmtu(sk, mtu);
350 	if (!dst)
351 		return;
352 
353 	/* Something is about to go wrong... Remember the soft error
354 	 * in case this connection is not able to recover.
355 	 */
356 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
357 		sk->sk_err_soft = EMSGSIZE;
358 
359 	mtu = dst_mtu(dst);
360 
361 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
362 	    ip_sk_accept_pmtu(sk) &&
363 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
364 		tcp_sync_mss(sk, mtu);
365 
366 		/* Resend the TCP packet because it's
367 		 * clear that the old packet has been
368 		 * dropped. This is the new "fast" path mtu
369 		 * discovery.
370 		 */
371 		tcp_simple_retransmit(sk);
372 	} /* else let the usual retransmit timer handle it */
373 }
374 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
375 
376 static void do_redirect(struct sk_buff *skb, struct sock *sk)
377 {
378 	struct dst_entry *dst = __sk_dst_check(sk, 0);
379 
380 	if (dst)
381 		dst->ops->redirect(dst, sk, skb);
382 }
383 
384 
385 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
386 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
387 {
388 	struct request_sock *req = inet_reqsk(sk);
389 	struct net *net = sock_net(sk);
390 
391 	/* ICMPs are not backlogged, hence we cannot get
392 	 * an established socket here.
393 	 */
394 	if (seq != tcp_rsk(req)->snt_isn) {
395 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
396 	} else if (abort) {
397 		/*
398 		 * Still in SYN_RECV, just remove it silently.
399 		 * There is no good way to pass the error to the newly
400 		 * created socket, and POSIX does not want network
401 		 * errors returned from accept().
402 		 */
403 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
404 		tcp_listendrop(req->rsk_listener);
405 	}
406 	reqsk_put(req);
407 }
408 EXPORT_SYMBOL(tcp_req_err);
409 
410 /*
411  * This routine is called by the ICMP module when it gets some
412  * sort of error condition.  If err < 0 then the socket should
413  * be closed and the error returned to the user.  If err > 0
414  * it's just the icmp type << 8 | icmp code.  After adjustment
415  * header points to the first 8 bytes of the tcp header.  We need
416  * to find the appropriate port.
417  *
418  * The locking strategy used here is very "optimistic". When
419  * someone else accesses the socket the ICMP is just dropped
420  * and for some paths there is no check at all.
421  * A more general error queue to queue errors for later handling
422  * is probably better.
423  *
424  */
425 
426 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
427 {
428 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
429 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
430 	struct inet_connection_sock *icsk;
431 	struct tcp_sock *tp;
432 	struct inet_sock *inet;
433 	const int type = icmp_hdr(icmp_skb)->type;
434 	const int code = icmp_hdr(icmp_skb)->code;
435 	struct sock *sk;
436 	struct sk_buff *skb;
437 	struct request_sock *fastopen;
438 	u32 seq, snd_una;
439 	s32 remaining;
440 	u32 delta_us;
441 	int err;
442 	struct net *net = dev_net(icmp_skb->dev);
443 
444 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
445 				       th->dest, iph->saddr, ntohs(th->source),
446 				       inet_iif(icmp_skb), 0);
447 	if (!sk) {
448 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
449 		return -ENOENT;
450 	}
451 	if (sk->sk_state == TCP_TIME_WAIT) {
452 		inet_twsk_put(inet_twsk(sk));
453 		return 0;
454 	}
455 	seq = ntohl(th->seq);
456 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
457 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
458 				     type == ICMP_TIME_EXCEEDED ||
459 				     (type == ICMP_DEST_UNREACH &&
460 				      (code == ICMP_NET_UNREACH ||
461 				       code == ICMP_HOST_UNREACH)));
462 		return 0;
463 	}
464 
465 	bh_lock_sock(sk);
466 	/* If too many ICMPs get dropped on busy
467 	 * servers this needs to be solved differently.
468 	 * We do take care of PMTU discovery (RFC1191) special case :
469 	 * we can receive locally generated ICMP messages while socket is held.
470 	 */
471 	if (sock_owned_by_user(sk)) {
472 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
473 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
474 	}
475 	if (sk->sk_state == TCP_CLOSE)
476 		goto out;
477 
478 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
479 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
480 		goto out;
481 	}
482 
483 	icsk = inet_csk(sk);
484 	tp = tcp_sk(sk);
485 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
486 	fastopen = rcu_dereference(tp->fastopen_rsk);
487 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
488 	if (sk->sk_state != TCP_LISTEN &&
489 	    !between(seq, snd_una, tp->snd_nxt)) {
490 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
491 		goto out;
492 	}
493 
494 	switch (type) {
495 	case ICMP_REDIRECT:
496 		if (!sock_owned_by_user(sk))
497 			do_redirect(icmp_skb, sk);
498 		goto out;
499 	case ICMP_SOURCE_QUENCH:
500 		/* Just silently ignore these. */
501 		goto out;
502 	case ICMP_PARAMETERPROB:
503 		err = EPROTO;
504 		break;
505 	case ICMP_DEST_UNREACH:
506 		if (code > NR_ICMP_UNREACH)
507 			goto out;
508 
509 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
510 			/* We are not interested in TCP_LISTEN and open_requests
511 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
512 			 * they should go through unfragmented).
513 			 */
514 			if (sk->sk_state == TCP_LISTEN)
515 				goto out;
516 
517 			WRITE_ONCE(tp->mtu_info, info);
518 			if (!sock_owned_by_user(sk)) {
519 				tcp_v4_mtu_reduced(sk);
520 			} else {
521 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
522 					sock_hold(sk);
523 			}
524 			goto out;
525 		}
526 
527 		err = icmp_err_convert[code].errno;
528 		/* check if icmp_skb allows revert of backoff
529 		 * (see draft-zimmermann-tcp-lcd) */
530 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
531 			break;
532 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
533 		    !icsk->icsk_backoff || fastopen)
534 			break;
535 
536 		if (sock_owned_by_user(sk))
537 			break;
538 
539 		skb = tcp_rtx_queue_head(sk);
540 		if (WARN_ON_ONCE(!skb))
541 			break;
542 
543 		icsk->icsk_backoff--;
544 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
545 					       TCP_TIMEOUT_INIT;
546 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
547 
548 
549 		tcp_mstamp_refresh(tp);
550 		delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
551 		remaining = icsk->icsk_rto -
552 			    usecs_to_jiffies(delta_us);
553 
554 		if (remaining > 0) {
555 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
556 						  remaining, TCP_RTO_MAX);
557 		} else {
558 			/* RTO revert clocked out retransmission.
559 			 * Will retransmit now */
560 			tcp_retransmit_timer(sk);
561 		}
562 
563 		break;
564 	case ICMP_TIME_EXCEEDED:
565 		err = EHOSTUNREACH;
566 		break;
567 	default:
568 		goto out;
569 	}
570 
571 	switch (sk->sk_state) {
572 	case TCP_SYN_SENT:
573 	case TCP_SYN_RECV:
574 		/* Only in fast or simultaneous open. If a fast open socket
575 		 * is already accepted it is treated as a connected one below.
576 		 */
577 		if (fastopen && !fastopen->sk)
578 			break;
579 
580 		if (!sock_owned_by_user(sk)) {
581 			sk->sk_err = err;
582 
583 			sk->sk_error_report(sk);
584 
585 			tcp_done(sk);
586 		} else {
587 			sk->sk_err_soft = err;
588 		}
589 		goto out;
590 	}
591 
592 	/* If we've already connected we will keep trying
593 	 * until we time out, or the user gives up.
594 	 *
595 	 * rfc1122 4.2.3.9 allows to consider as hard errors
596 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
597 	 * but it is obsoleted by pmtu discovery).
598 	 *
599 	 * Note, that in modern internet, where routing is unreliable
600 	 * and in each dark corner broken firewalls sit, sending random
601 	 * errors ordered by their masters even this two messages finally lose
602 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
603 	 *
604 	 * Now we are in compliance with RFCs.
605 	 *							--ANK (980905)
606 	 */
607 
608 	inet = inet_sk(sk);
609 	if (!sock_owned_by_user(sk) && inet->recverr) {
610 		sk->sk_err = err;
611 		sk->sk_error_report(sk);
612 	} else	{ /* Only an error on timeout */
613 		sk->sk_err_soft = err;
614 	}
615 
616 out:
617 	bh_unlock_sock(sk);
618 	sock_put(sk);
619 	return 0;
620 }
621 
622 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
623 {
624 	struct tcphdr *th = tcp_hdr(skb);
625 
626 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
627 	skb->csum_start = skb_transport_header(skb) - skb->head;
628 	skb->csum_offset = offsetof(struct tcphdr, check);
629 }
630 
631 /* This routine computes an IPv4 TCP checksum. */
632 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
633 {
634 	const struct inet_sock *inet = inet_sk(sk);
635 
636 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
637 }
638 EXPORT_SYMBOL(tcp_v4_send_check);
639 
640 /*
641  *	This routine will send an RST to the other tcp.
642  *
643  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
644  *		      for reset.
645  *	Answer: if a packet caused RST, it is not for a socket
646  *		existing in our system, if it is matched to a socket,
647  *		it is just duplicate segment or bug in other side's TCP.
648  *		So that we build reply only basing on parameters
649  *		arrived with segment.
650  *	Exception: precedence violation. We do not implement it in any case.
651  */
652 
653 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
654 {
655 	const struct tcphdr *th = tcp_hdr(skb);
656 	struct {
657 		struct tcphdr th;
658 #ifdef CONFIG_TCP_MD5SIG
659 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
660 #endif
661 	} rep;
662 	struct ip_reply_arg arg;
663 #ifdef CONFIG_TCP_MD5SIG
664 	struct tcp_md5sig_key *key = NULL;
665 	const __u8 *hash_location = NULL;
666 	unsigned char newhash[16];
667 	int genhash;
668 	struct sock *sk1 = NULL;
669 #endif
670 	u64 transmit_time = 0;
671 	struct sock *ctl_sk;
672 	struct net *net;
673 
674 	/* Never send a reset in response to a reset. */
675 	if (th->rst)
676 		return;
677 
678 	/* If sk not NULL, it means we did a successful lookup and incoming
679 	 * route had to be correct. prequeue might have dropped our dst.
680 	 */
681 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
682 		return;
683 
684 	/* Swap the send and the receive. */
685 	memset(&rep, 0, sizeof(rep));
686 	rep.th.dest   = th->source;
687 	rep.th.source = th->dest;
688 	rep.th.doff   = sizeof(struct tcphdr) / 4;
689 	rep.th.rst    = 1;
690 
691 	if (th->ack) {
692 		rep.th.seq = th->ack_seq;
693 	} else {
694 		rep.th.ack = 1;
695 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
696 				       skb->len - (th->doff << 2));
697 	}
698 
699 	memset(&arg, 0, sizeof(arg));
700 	arg.iov[0].iov_base = (unsigned char *)&rep;
701 	arg.iov[0].iov_len  = sizeof(rep.th);
702 
703 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
704 #ifdef CONFIG_TCP_MD5SIG
705 	rcu_read_lock();
706 	hash_location = tcp_parse_md5sig_option(th);
707 	if (sk && sk_fullsock(sk)) {
708 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
709 					&ip_hdr(skb)->saddr, AF_INET);
710 	} else if (hash_location) {
711 		/*
712 		 * active side is lost. Try to find listening socket through
713 		 * source port, and then find md5 key through listening socket.
714 		 * We do not lose security here:
715 		 * the incoming packet is checked against the md5 hash of the found key,
716 		 * and no RST is generated if the md5 hash doesn't match.
717 		 */
718 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
719 					     ip_hdr(skb)->saddr,
720 					     th->source, ip_hdr(skb)->daddr,
721 					     ntohs(th->source), inet_iif(skb),
722 					     tcp_v4_sdif(skb));
723 		/* don't send rst if it can't find key */
724 		if (!sk1)
725 			goto out;
726 
727 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
728 					&ip_hdr(skb)->saddr, AF_INET);
729 		if (!key)
730 			goto out;
731 
732 
733 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
734 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
735 			goto out;
736 
737 	}
738 
739 	if (key) {
740 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
741 				   (TCPOPT_NOP << 16) |
742 				   (TCPOPT_MD5SIG << 8) |
743 				   TCPOLEN_MD5SIG);
744 		/* Update length and the length the header thinks exists */
745 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
746 		rep.th.doff = arg.iov[0].iov_len / 4;
747 
748 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
749 				     key, ip_hdr(skb)->saddr,
750 				     ip_hdr(skb)->daddr, &rep.th);
751 	}
752 #endif
753 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
754 				      ip_hdr(skb)->saddr, /* XXX */
755 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
756 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
757 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
758 
759 	/* When socket is gone, all binding information is lost.
760 	 * routing might fail in this case. No choice here, if we choose to force
761 	 * input interface, we will misroute in case of asymmetric route.
762 	 */
763 	if (sk) {
764 		arg.bound_dev_if = sk->sk_bound_dev_if;
765 		if (sk_fullsock(sk))
766 			trace_tcp_send_reset(sk, skb);
767 	}
768 
769 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
770 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
771 
772 	arg.tos = ip_hdr(skb)->tos;
773 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
774 	local_bh_disable();
775 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
776 	if (sk) {
777 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
778 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
779 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
780 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
781 		transmit_time = tcp_transmit_time(sk);
782 	}
783 	ip_send_unicast_reply(ctl_sk,
784 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
785 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
786 			      &arg, arg.iov[0].iov_len,
787 			      transmit_time);
788 
789 	ctl_sk->sk_mark = 0;
790 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
791 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
792 	local_bh_enable();
793 
794 #ifdef CONFIG_TCP_MD5SIG
795 out:
796 	rcu_read_unlock();
797 #endif
798 }
799 
800 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
801    outside socket context, is certainly ugly. What can I do?
802  */
803 
804 static void tcp_v4_send_ack(const struct sock *sk,
805 			    struct sk_buff *skb, u32 seq, u32 ack,
806 			    u32 win, u32 tsval, u32 tsecr, int oif,
807 			    struct tcp_md5sig_key *key,
808 			    int reply_flags, u8 tos)
809 {
810 	const struct tcphdr *th = tcp_hdr(skb);
811 	struct {
812 		struct tcphdr th;
813 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
814 #ifdef CONFIG_TCP_MD5SIG
815 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
816 #endif
817 			];
818 	} rep;
819 	struct net *net = sock_net(sk);
820 	struct ip_reply_arg arg;
821 	struct sock *ctl_sk;
822 	u64 transmit_time;
823 
824 	memset(&rep.th, 0, sizeof(struct tcphdr));
825 	memset(&arg, 0, sizeof(arg));
826 
827 	arg.iov[0].iov_base = (unsigned char *)&rep;
828 	arg.iov[0].iov_len  = sizeof(rep.th);
829 	if (tsecr) {
830 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
831 				   (TCPOPT_TIMESTAMP << 8) |
832 				   TCPOLEN_TIMESTAMP);
833 		rep.opt[1] = htonl(tsval);
834 		rep.opt[2] = htonl(tsecr);
835 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
836 	}
837 
838 	/* Swap the send and the receive. */
839 	rep.th.dest    = th->source;
840 	rep.th.source  = th->dest;
841 	rep.th.doff    = arg.iov[0].iov_len / 4;
842 	rep.th.seq     = htonl(seq);
843 	rep.th.ack_seq = htonl(ack);
844 	rep.th.ack     = 1;
845 	rep.th.window  = htons(win);
846 
847 #ifdef CONFIG_TCP_MD5SIG
848 	if (key) {
849 		int offset = (tsecr) ? 3 : 0;
850 
851 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
852 					  (TCPOPT_NOP << 16) |
853 					  (TCPOPT_MD5SIG << 8) |
854 					  TCPOLEN_MD5SIG);
855 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
856 		rep.th.doff = arg.iov[0].iov_len/4;
857 
858 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
859 				    key, ip_hdr(skb)->saddr,
860 				    ip_hdr(skb)->daddr, &rep.th);
861 	}
862 #endif
863 	arg.flags = reply_flags;
864 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
865 				      ip_hdr(skb)->saddr, /* XXX */
866 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
867 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
868 	if (oif)
869 		arg.bound_dev_if = oif;
870 	arg.tos = tos;
871 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
872 	local_bh_disable();
873 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
874 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
875 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
876 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
877 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
878 	transmit_time = tcp_transmit_time(sk);
879 	ip_send_unicast_reply(ctl_sk,
880 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
881 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
882 			      &arg, arg.iov[0].iov_len,
883 			      transmit_time);
884 
885 	ctl_sk->sk_mark = 0;
886 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
887 	local_bh_enable();
888 }
889 
890 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
891 {
892 	struct inet_timewait_sock *tw = inet_twsk(sk);
893 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
894 
895 	tcp_v4_send_ack(sk, skb,
896 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
897 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
898 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
899 			tcptw->tw_ts_recent,
900 			tw->tw_bound_dev_if,
901 			tcp_twsk_md5_key(tcptw),
902 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
903 			tw->tw_tos
904 			);
905 
906 	inet_twsk_put(tw);
907 }
908 
909 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
910 				  struct request_sock *req)
911 {
912 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
913 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
914 	 */
915 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
916 					     tcp_sk(sk)->snd_nxt;
917 
918 	/* RFC 7323 2.3
919 	 * The window field (SEG.WND) of every outgoing segment, with the
920 	 * exception of <SYN> segments, MUST be right-shifted by
921 	 * Rcv.Wind.Shift bits:
922 	 */
923 	tcp_v4_send_ack(sk, skb, seq,
924 			tcp_rsk(req)->rcv_nxt,
925 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
926 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
927 			req->ts_recent,
928 			0,
929 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
930 					  AF_INET),
931 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
932 			ip_hdr(skb)->tos);
933 }
934 
935 /*
936  *	Send a SYN-ACK after having received a SYN.
937  *	This still operates on a request_sock only, not on a big
938  *	socket.
939  */
940 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
941 			      struct flowi *fl,
942 			      struct request_sock *req,
943 			      struct tcp_fastopen_cookie *foc,
944 			      enum tcp_synack_type synack_type)
945 {
946 	const struct inet_request_sock *ireq = inet_rsk(req);
947 	struct flowi4 fl4;
948 	int err = -1;
949 	struct sk_buff *skb;
950 
951 	/* First, grab a route. */
952 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
953 		return -1;
954 
955 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
956 
957 	if (skb) {
958 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
959 
960 		rcu_read_lock();
961 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
962 					    ireq->ir_rmt_addr,
963 					    rcu_dereference(ireq->ireq_opt));
964 		rcu_read_unlock();
965 		err = net_xmit_eval(err);
966 	}
967 
968 	return err;
969 }
970 
971 /*
972  *	IPv4 request_sock destructor.
973  */
974 static void tcp_v4_reqsk_destructor(struct request_sock *req)
975 {
976 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
977 }
978 
979 #ifdef CONFIG_TCP_MD5SIG
980 /*
981  * RFC2385 MD5 checksumming requires a mapping of
982  * IP address->MD5 Key.
983  * We need to maintain these in the sk structure.
984  */
985 
986 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
987 EXPORT_SYMBOL(tcp_md5_needed);
988 
989 /* Find the Key structure for an address.  */
990 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
991 					   const union tcp_md5_addr *addr,
992 					   int family)
993 {
994 	const struct tcp_sock *tp = tcp_sk(sk);
995 	struct tcp_md5sig_key *key;
996 	const struct tcp_md5sig_info *md5sig;
997 	__be32 mask;
998 	struct tcp_md5sig_key *best_match = NULL;
999 	bool match;
1000 
1001 	/* caller either holds rcu_read_lock() or socket lock */
1002 	md5sig = rcu_dereference_check(tp->md5sig_info,
1003 				       lockdep_sock_is_held(sk));
1004 	if (!md5sig)
1005 		return NULL;
1006 
1007 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1008 		if (key->family != family)
1009 			continue;
1010 
1011 		if (family == AF_INET) {
1012 			mask = inet_make_mask(key->prefixlen);
1013 			match = (key->addr.a4.s_addr & mask) ==
1014 				(addr->a4.s_addr & mask);
1015 #if IS_ENABLED(CONFIG_IPV6)
1016 		} else if (family == AF_INET6) {
1017 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1018 						  key->prefixlen);
1019 #endif
1020 		} else {
1021 			match = false;
1022 		}
1023 
1024 		if (match && (!best_match ||
1025 			      key->prefixlen > best_match->prefixlen))
1026 			best_match = key;
1027 	}
1028 	return best_match;
1029 }
1030 EXPORT_SYMBOL(__tcp_md5_do_lookup);
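/*
 * Editor's note (illustrative, not part of the original source): the loop
 * above implements a longest-prefix match. For example, with two configured
 * keys, one for 10.0.0.0/8 and one for 10.1.2.3/32, a lookup for peer
 * 10.1.2.3 matches both and the /32 key is returned because its prefixlen
 * is larger; a lookup for 10.9.9.9 matches only the /8 key.
 */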
1031 
1032 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1033 						      const union tcp_md5_addr *addr,
1034 						      int family, u8 prefixlen)
1035 {
1036 	const struct tcp_sock *tp = tcp_sk(sk);
1037 	struct tcp_md5sig_key *key;
1038 	unsigned int size = sizeof(struct in_addr);
1039 	const struct tcp_md5sig_info *md5sig;
1040 
1041 	/* caller either holds rcu_read_lock() or socket lock */
1042 	md5sig = rcu_dereference_check(tp->md5sig_info,
1043 				       lockdep_sock_is_held(sk));
1044 	if (!md5sig)
1045 		return NULL;
1046 #if IS_ENABLED(CONFIG_IPV6)
1047 	if (family == AF_INET6)
1048 		size = sizeof(struct in6_addr);
1049 #endif
1050 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1051 		if (key->family != family)
1052 			continue;
1053 		if (!memcmp(&key->addr, addr, size) &&
1054 		    key->prefixlen == prefixlen)
1055 			return key;
1056 	}
1057 	return NULL;
1058 }
1059 
1060 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1061 					 const struct sock *addr_sk)
1062 {
1063 	const union tcp_md5_addr *addr;
1064 
1065 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1066 	return tcp_md5_do_lookup(sk, addr, AF_INET);
1067 }
1068 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1069 
1070 /* This can be called on a newly created socket, from other files */
1071 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1072 		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1073 		   gfp_t gfp)
1074 {
1075 	/* Add Key to the list */
1076 	struct tcp_md5sig_key *key;
1077 	struct tcp_sock *tp = tcp_sk(sk);
1078 	struct tcp_md5sig_info *md5sig;
1079 
1080 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1081 	if (key) {
1082 		/* Pre-existing entry - just update that one.
1083 		 * Note that the key might be used concurrently.
1084 		 */
1085 		memcpy(key->key, newkey, newkeylen);
1086 
1087 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1088 		 * Also note that a reader could catch new key->keylen value
1089 		 * but old key->key[], this is the reason we use __GFP_ZERO
1090 		 * at sock_kmalloc() time below these lines.
1091 		 */
1092 		WRITE_ONCE(key->keylen, newkeylen);
1093 
1094 		return 0;
1095 	}
1096 
1097 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1098 					   lockdep_sock_is_held(sk));
1099 	if (!md5sig) {
1100 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1101 		if (!md5sig)
1102 			return -ENOMEM;
1103 
1104 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1105 		INIT_HLIST_HEAD(&md5sig->head);
1106 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1107 	}
1108 
1109 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1110 	if (!key)
1111 		return -ENOMEM;
1112 	if (!tcp_alloc_md5sig_pool()) {
1113 		sock_kfree_s(sk, key, sizeof(*key));
1114 		return -ENOMEM;
1115 	}
1116 
1117 	memcpy(key->key, newkey, newkeylen);
1118 	key->keylen = newkeylen;
1119 	key->family = family;
1120 	key->prefixlen = prefixlen;
1121 	memcpy(&key->addr, addr,
1122 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1123 				      sizeof(struct in_addr));
1124 	hlist_add_head_rcu(&key->node, &md5sig->head);
1125 	return 0;
1126 }
1127 EXPORT_SYMBOL(tcp_md5_do_add);
1128 
1129 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1130 		   u8 prefixlen)
1131 {
1132 	struct tcp_md5sig_key *key;
1133 
1134 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1135 	if (!key)
1136 		return -ENOENT;
1137 	hlist_del_rcu(&key->node);
1138 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1139 	kfree_rcu(key, rcu);
1140 	return 0;
1141 }
1142 EXPORT_SYMBOL(tcp_md5_do_del);
1143 
1144 static void tcp_clear_md5_list(struct sock *sk)
1145 {
1146 	struct tcp_sock *tp = tcp_sk(sk);
1147 	struct tcp_md5sig_key *key;
1148 	struct hlist_node *n;
1149 	struct tcp_md5sig_info *md5sig;
1150 
1151 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1152 
1153 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1154 		hlist_del_rcu(&key->node);
1155 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1156 		kfree_rcu(key, rcu);
1157 	}
1158 }
1159 
1160 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1161 				 char __user *optval, int optlen)
1162 {
1163 	struct tcp_md5sig cmd;
1164 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1165 	u8 prefixlen = 32;
1166 
1167 	if (optlen < sizeof(cmd))
1168 		return -EINVAL;
1169 
1170 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1171 		return -EFAULT;
1172 
1173 	if (sin->sin_family != AF_INET)
1174 		return -EINVAL;
1175 
1176 	if (optname == TCP_MD5SIG_EXT &&
1177 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1178 		prefixlen = cmd.tcpm_prefixlen;
1179 		if (prefixlen > 32)
1180 			return -EINVAL;
1181 	}
1182 
1183 	if (!cmd.tcpm_keylen)
1184 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1185 				      AF_INET, prefixlen);
1186 
1187 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1188 		return -EINVAL;
1189 
1190 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1191 			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1192 			      GFP_KERNEL);
1193 }
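/*
 * Editor's note (illustrative, not part of the original source): a minimal
 * userspace sketch of the setsockopt() call handled above. 'fd' is assumed
 * to be a TCP socket and error handling is omitted; TCP_MD5SIG_EXT together
 * with tcpm_prefixlen would be used instead for a prefix-based key:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 16 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "example-md5-key!", 16);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */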
1194 
1195 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1196 				   __be32 daddr, __be32 saddr,
1197 				   const struct tcphdr *th, int nbytes)
1198 {
1199 	struct tcp4_pseudohdr *bp;
1200 	struct scatterlist sg;
1201 	struct tcphdr *_th;
1202 
1203 	bp = hp->scratch;
1204 	bp->saddr = saddr;
1205 	bp->daddr = daddr;
1206 	bp->pad = 0;
1207 	bp->protocol = IPPROTO_TCP;
1208 	bp->len = cpu_to_be16(nbytes);
1209 
1210 	_th = (struct tcphdr *)(bp + 1);
1211 	memcpy(_th, th, sizeof(*th));
1212 	_th->check = 0;
1213 
1214 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1215 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1216 				sizeof(*bp) + sizeof(*th));
1217 	return crypto_ahash_update(hp->md5_req);
1218 }
1219 
1220 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1221 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1222 {
1223 	struct tcp_md5sig_pool *hp;
1224 	struct ahash_request *req;
1225 
1226 	hp = tcp_get_md5sig_pool();
1227 	if (!hp)
1228 		goto clear_hash_noput;
1229 	req = hp->md5_req;
1230 
1231 	if (crypto_ahash_init(req))
1232 		goto clear_hash;
1233 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1234 		goto clear_hash;
1235 	if (tcp_md5_hash_key(hp, key))
1236 		goto clear_hash;
1237 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1238 	if (crypto_ahash_final(req))
1239 		goto clear_hash;
1240 
1241 	tcp_put_md5sig_pool();
1242 	return 0;
1243 
1244 clear_hash:
1245 	tcp_put_md5sig_pool();
1246 clear_hash_noput:
1247 	memset(md5_hash, 0, 16);
1248 	return 1;
1249 }
1250 
1251 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1252 			const struct sock *sk,
1253 			const struct sk_buff *skb)
1254 {
1255 	struct tcp_md5sig_pool *hp;
1256 	struct ahash_request *req;
1257 	const struct tcphdr *th = tcp_hdr(skb);
1258 	__be32 saddr, daddr;
1259 
1260 	if (sk) { /* valid for establish/request sockets */
1261 		saddr = sk->sk_rcv_saddr;
1262 		daddr = sk->sk_daddr;
1263 	} else {
1264 		const struct iphdr *iph = ip_hdr(skb);
1265 		saddr = iph->saddr;
1266 		daddr = iph->daddr;
1267 	}
1268 
1269 	hp = tcp_get_md5sig_pool();
1270 	if (!hp)
1271 		goto clear_hash_noput;
1272 	req = hp->md5_req;
1273 
1274 	if (crypto_ahash_init(req))
1275 		goto clear_hash;
1276 
1277 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1278 		goto clear_hash;
1279 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1280 		goto clear_hash;
1281 	if (tcp_md5_hash_key(hp, key))
1282 		goto clear_hash;
1283 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1284 	if (crypto_ahash_final(req))
1285 		goto clear_hash;
1286 
1287 	tcp_put_md5sig_pool();
1288 	return 0;
1289 
1290 clear_hash:
1291 	tcp_put_md5sig_pool();
1292 clear_hash_noput:
1293 	memset(md5_hash, 0, 16);
1294 	return 1;
1295 }
1296 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1297 
1298 #endif
1299 
1300 /* Called with rcu_read_lock() */
1301 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1302 				    const struct sk_buff *skb)
1303 {
1304 #ifdef CONFIG_TCP_MD5SIG
1305 	/*
1306 	 * This gets called for each TCP segment that arrives
1307 	 * so we want to be efficient.
1308 	 * We have 3 drop cases:
1309 	 * o No MD5 hash and one expected.
1310 	 * o MD5 hash and we're not expecting one.
1311 	 * o MD5 hash and it's wrong.
1312 	 */
1313 	const __u8 *hash_location = NULL;
1314 	struct tcp_md5sig_key *hash_expected;
1315 	const struct iphdr *iph = ip_hdr(skb);
1316 	const struct tcphdr *th = tcp_hdr(skb);
1317 	int genhash;
1318 	unsigned char newhash[16];
1319 
1320 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1321 					  AF_INET);
1322 	hash_location = tcp_parse_md5sig_option(th);
1323 
1324 	/* We've parsed the options - do we have a hash? */
1325 	if (!hash_expected && !hash_location)
1326 		return false;
1327 
1328 	if (hash_expected && !hash_location) {
1329 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1330 		return true;
1331 	}
1332 
1333 	if (!hash_expected && hash_location) {
1334 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1335 		return true;
1336 	}
1337 
1338 	/* Okay, so this is hash_expected and hash_location -
1339 	 * so we need to calculate the checksum.
1340 	 */
1341 	genhash = tcp_v4_md5_hash_skb(newhash,
1342 				      hash_expected,
1343 				      NULL, skb);
1344 
1345 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1346 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1347 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1348 				     &iph->saddr, ntohs(th->source),
1349 				     &iph->daddr, ntohs(th->dest),
1350 				     genhash ? " tcp_v4_calc_md5_hash failed"
1351 				     : "");
1352 		return true;
1353 	}
1354 	return false;
1355 #endif
1356 	return false;
1357 }
1358 
1359 static void tcp_v4_init_req(struct request_sock *req,
1360 			    const struct sock *sk_listener,
1361 			    struct sk_buff *skb)
1362 {
1363 	struct inet_request_sock *ireq = inet_rsk(req);
1364 	struct net *net = sock_net(sk_listener);
1365 
1366 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1367 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1368 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1369 }
1370 
1371 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1372 					  struct flowi *fl,
1373 					  const struct request_sock *req)
1374 {
1375 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1376 }
1377 
1378 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1379 	.family		=	PF_INET,
1380 	.obj_size	=	sizeof(struct tcp_request_sock),
1381 	.rtx_syn_ack	=	tcp_rtx_synack,
1382 	.send_ack	=	tcp_v4_reqsk_send_ack,
1383 	.destructor	=	tcp_v4_reqsk_destructor,
1384 	.send_reset	=	tcp_v4_send_reset,
1385 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1386 };
1387 
1388 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1389 	.mss_clamp	=	TCP_MSS_DEFAULT,
1390 #ifdef CONFIG_TCP_MD5SIG
1391 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1392 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1393 #endif
1394 	.init_req	=	tcp_v4_init_req,
1395 #ifdef CONFIG_SYN_COOKIES
1396 	.cookie_init_seq =	cookie_v4_init_sequence,
1397 #endif
1398 	.route_req	=	tcp_v4_route_req,
1399 	.init_seq	=	tcp_v4_init_seq,
1400 	.init_ts_off	=	tcp_v4_init_ts_off,
1401 	.send_synack	=	tcp_v4_send_synack,
1402 };
1403 
1404 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1405 {
1406 	/* Never answer SYNs sent to broadcast or multicast */
1407 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1408 		goto drop;
1409 
1410 	return tcp_conn_request(&tcp_request_sock_ops,
1411 				&tcp_request_sock_ipv4_ops, sk, skb);
1412 
1413 drop:
1414 	tcp_listendrop(sk);
1415 	return 0;
1416 }
1417 EXPORT_SYMBOL(tcp_v4_conn_request);
1418 
1419 
1420 /*
1421  * The three way handshake has completed - we got a valid synack -
1422  * now create the new socket.
1423  */
1424 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1425 				  struct request_sock *req,
1426 				  struct dst_entry *dst,
1427 				  struct request_sock *req_unhash,
1428 				  bool *own_req)
1429 {
1430 	struct inet_request_sock *ireq;
1431 	bool found_dup_sk = false;
1432 	struct inet_sock *newinet;
1433 	struct tcp_sock *newtp;
1434 	struct sock *newsk;
1435 #ifdef CONFIG_TCP_MD5SIG
1436 	struct tcp_md5sig_key *key;
1437 #endif
1438 	struct ip_options_rcu *inet_opt;
1439 
1440 	if (sk_acceptq_is_full(sk))
1441 		goto exit_overflow;
1442 
1443 	newsk = tcp_create_openreq_child(sk, req, skb);
1444 	if (!newsk)
1445 		goto exit_nonewsk;
1446 
1447 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1448 	inet_sk_rx_dst_set(newsk, skb);
1449 
1450 	newtp		      = tcp_sk(newsk);
1451 	newinet		      = inet_sk(newsk);
1452 	ireq		      = inet_rsk(req);
1453 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1454 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1455 	newsk->sk_bound_dev_if = ireq->ir_iif;
1456 	newinet->inet_saddr   = ireq->ir_loc_addr;
1457 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1458 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1459 	newinet->mc_index     = inet_iif(skb);
1460 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1461 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1462 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1463 	if (inet_opt)
1464 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1465 	newinet->inet_id = prandom_u32();
1466 
1467 	if (!dst) {
1468 		dst = inet_csk_route_child_sock(sk, newsk, req);
1469 		if (!dst)
1470 			goto put_and_exit;
1471 	} else {
1472 		/* syncookie case : see end of cookie_v4_check() */
1473 	}
1474 	sk_setup_caps(newsk, dst);
1475 
1476 	tcp_ca_openreq_child(newsk, dst);
1477 
1478 	tcp_sync_mss(newsk, dst_mtu(dst));
1479 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1480 
1481 	tcp_initialize_rcv_mss(newsk);
1482 
1483 #ifdef CONFIG_TCP_MD5SIG
1484 	/* Copy over the MD5 key from the original socket */
1485 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1486 				AF_INET);
1487 	if (key) {
1488 		/*
1489 		 * We're using one, so create a matching key
1490 		 * on the newsk structure. If we fail to get
1491 		 * memory, then we end up not copying the key
1492 		 * across. Shucks.
1493 		 */
1494 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1495 			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1496 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1497 	}
1498 #endif
1499 
1500 	if (__inet_inherit_port(sk, newsk) < 0)
1501 		goto put_and_exit;
1502 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1503 				       &found_dup_sk);
1504 	if (likely(*own_req)) {
1505 		tcp_move_syn(newtp, req);
1506 		ireq->ireq_opt = NULL;
1507 	} else {
1508 		newinet->inet_opt = NULL;
1509 
1510 		if (!req_unhash && found_dup_sk) {
1511 			/* This code path should only be executed in the
1512 			 * syncookie case
1513 			 */
1514 			bh_unlock_sock(newsk);
1515 			sock_put(newsk);
1516 			newsk = NULL;
1517 		}
1518 	}
1519 	return newsk;
1520 
1521 exit_overflow:
1522 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1523 exit_nonewsk:
1524 	dst_release(dst);
1525 exit:
1526 	tcp_listendrop(sk);
1527 	return NULL;
1528 put_and_exit:
1529 	newinet->inet_opt = NULL;
1530 	inet_csk_prepare_forced_close(newsk);
1531 	tcp_done(newsk);
1532 	goto exit;
1533 }
1534 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1535 
1536 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1537 {
1538 #ifdef CONFIG_SYN_COOKIES
1539 	const struct tcphdr *th = tcp_hdr(skb);
1540 
1541 	if (!th->syn)
1542 		sk = cookie_v4_check(sk, skb);
1543 #endif
1544 	return sk;
1545 }
1546 
1547 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1548 			 struct tcphdr *th, u32 *cookie)
1549 {
1550 	u16 mss = 0;
1551 #ifdef CONFIG_SYN_COOKIES
1552 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1553 				    &tcp_request_sock_ipv4_ops, sk, th);
1554 	if (mss) {
1555 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1556 		tcp_synq_overflow(sk);
1557 	}
1558 #endif
1559 	return mss;
1560 }
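/*
 * Editor's note (illustrative, not part of the original source): the cookie
 * produced here round-trips through the client. __cookie_v4_init_sequence()
 * encodes the clamped MSS into the sequence number carried by the SYN-ACK,
 * and when the final ACK arrives with no matching request sock,
 * cookie_v4_check() (via tcp_v4_cookie_check() above) recovers that MSS and
 * rebuilds the request. tcp_synq_overflow() records the overflow time so
 * only recently issued cookies are accepted.
 */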
1561 
1562 /* The socket must have its spinlock held when we get
1563  * here, unless it is a TCP_LISTEN socket.
1564  *
1565  * We have a potential double-lock case here, so even when
1566  * doing backlog processing we use the BH locking scheme.
1567  * This is because we cannot sleep with the original spinlock
1568  * held.
1569  */
1570 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1571 {
1572 	struct sock *rsk;
1573 
1574 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1575 		struct dst_entry *dst;
1576 
1577 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1578 						lockdep_sock_is_held(sk));
1579 
1580 		sock_rps_save_rxhash(sk, skb);
1581 		sk_mark_napi_id(sk, skb);
1582 		if (dst) {
1583 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1584 			    !dst->ops->check(dst, 0)) {
1585 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1586 				dst_release(dst);
1587 			}
1588 		}
1589 		tcp_rcv_established(sk, skb);
1590 		return 0;
1591 	}
1592 
1593 	if (tcp_checksum_complete(skb))
1594 		goto csum_err;
1595 
1596 	if (sk->sk_state == TCP_LISTEN) {
1597 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1598 
1599 		if (!nsk)
1600 			goto discard;
1601 		if (nsk != sk) {
1602 			if (tcp_child_process(sk, nsk, skb)) {
1603 				rsk = nsk;
1604 				goto reset;
1605 			}
1606 			return 0;
1607 		}
1608 	} else
1609 		sock_rps_save_rxhash(sk, skb);
1610 
1611 	if (tcp_rcv_state_process(sk, skb)) {
1612 		rsk = sk;
1613 		goto reset;
1614 	}
1615 	return 0;
1616 
1617 reset:
1618 	tcp_v4_send_reset(rsk, skb);
1619 discard:
1620 	kfree_skb(skb);
1621 	/* Be careful here. If this function gets more complicated and
1622 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1623 	 * might be destroyed here. This current version compiles correctly,
1624 	 * but you have been warned.
1625 	 */
1626 	return 0;
1627 
1628 csum_err:
1629 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1630 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1631 	goto discard;
1632 }
1633 EXPORT_SYMBOL(tcp_v4_do_rcv);
1634 
1635 int tcp_v4_early_demux(struct sk_buff *skb)
1636 {
1637 	const struct iphdr *iph;
1638 	const struct tcphdr *th;
1639 	struct sock *sk;
1640 
1641 	if (skb->pkt_type != PACKET_HOST)
1642 		return 0;
1643 
1644 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1645 		return 0;
1646 
1647 	iph = ip_hdr(skb);
1648 	th = tcp_hdr(skb);
1649 
1650 	if (th->doff < sizeof(struct tcphdr) / 4)
1651 		return 0;
1652 
1653 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1654 				       iph->saddr, th->source,
1655 				       iph->daddr, ntohs(th->dest),
1656 				       skb->skb_iif, inet_sdif(skb));
1657 	if (sk) {
1658 		skb->sk = sk;
1659 		skb->destructor = sock_edemux;
1660 		if (sk_fullsock(sk)) {
1661 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1662 
1663 			if (dst)
1664 				dst = dst_check(dst, 0);
1665 			if (dst &&
1666 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1667 				skb_dst_set_noref(skb, dst);
1668 		}
1669 	}
1670 	return 0;
1671 }
1672 
1673 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1674 {
1675 	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1676 	u32 tail_gso_size, tail_gso_segs;
1677 	struct skb_shared_info *shinfo;
1678 	const struct tcphdr *th;
1679 	struct tcphdr *thtail;
1680 	struct sk_buff *tail;
1681 	unsigned int hdrlen;
1682 	bool fragstolen;
1683 	u32 gso_segs;
1684 	u32 gso_size;
1685 	int delta;
1686 
1687 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1688 	 * we can fix skb->truesize to its real value to avoid future drops.
1689 	 * This is valid because skb is not yet charged to the socket.
1690 	 * It has been noticed pure SACK packets were sometimes dropped
1691 	 * (if cooked by drivers without copybreak feature).
1692 	 */
1693 	skb_condense(skb);
1694 
1695 	skb_dst_drop(skb);
1696 
1697 	if (unlikely(tcp_checksum_complete(skb))) {
1698 		bh_unlock_sock(sk);
1699 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1700 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1701 		return true;
1702 	}
1703 
1704 	/* Attempt coalescing to last skb in backlog, even if we are
1705 	 * above the limits.
1706 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1707 	 */
1708 	th = (const struct tcphdr *)skb->data;
1709 	hdrlen = th->doff * 4;
1710 
1711 	tail = sk->sk_backlog.tail;
1712 	if (!tail)
1713 		goto no_coalesce;
1714 	thtail = (struct tcphdr *)tail->data;
1715 
1716 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1717 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1718 	    ((TCP_SKB_CB(tail)->tcp_flags |
1719 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1720 	    !((TCP_SKB_CB(tail)->tcp_flags &
1721 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1722 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1723 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1724 #ifdef CONFIG_TLS_DEVICE
1725 	    tail->decrypted != skb->decrypted ||
1726 #endif
1727 	    thtail->doff != th->doff ||
1728 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1729 		goto no_coalesce;
1730 
1731 	__skb_pull(skb, hdrlen);
1732 
1733 	shinfo = skb_shinfo(skb);
1734 	gso_size = shinfo->gso_size ?: skb->len;
1735 	gso_segs = shinfo->gso_segs ?: 1;
1736 
1737 	shinfo = skb_shinfo(tail);
1738 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1739 	tail_gso_segs = shinfo->gso_segs ?: 1;
1740 
1741 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1742 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1743 
1744 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1745 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1746 			thtail->window = th->window;
1747 		}
1748 
1749 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1750 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1751 		 * is not entered if we append a packet with a FIN.
1752 		 * SYN, RST, URG are not present.
1753 		 * ACK is set on both packets.
1754 		 * PSH : we do not really care in the TCP stack,
1755 		 *       at least for 'GRO' packets.
1756 		 */
1757 		thtail->fin |= th->fin;
1758 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1759 
1760 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1761 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1762 			tail->tstamp = skb->tstamp;
1763 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1764 		}
1765 
1766 		/* Not as strict as GRO. We only need to carry the max MSS value. */
1767 		shinfo->gso_size = max(gso_size, tail_gso_size);
1768 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1769 
1770 		sk->sk_backlog.len += delta;
1771 		__NET_INC_STATS(sock_net(sk),
1772 				LINUX_MIB_TCPBACKLOGCOALESCE);
1773 		kfree_skb_partial(skb, fragstolen);
1774 		return false;
1775 	}
1776 	__skb_push(skb, hdrlen);
1777 
1778 no_coalesce:
1779 	/* Only the socket owner can try to collapse/prune rx queues
1780 	 * to reduce memory overhead, so add a little headroom here.
1781 	 * Only a few socket backlogs are likely to be non-empty at the same time.
1782 	 */
1783 	limit += 64*1024;
1784 
1785 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1786 		bh_unlock_sock(sk);
1787 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1788 		return true;
1789 	}
1790 	return false;
1791 }
1792 EXPORT_SYMBOL(tcp_add_backlog);
1793 
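/* Run the socket filter (if any) on a not-yet-queued skb.  The cap passed to
 * sk_filter_trim_cap() is the TCP header length, so a filter may trim payload
 * but can never truncate the packet below its TCP header.
 */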
1794 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1795 {
1796 	struct tcphdr *th = (struct tcphdr *)skb->data;
1797 
1798 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1799 }
1800 EXPORT_SYMBOL(tcp_filter);
1801 
1802 static void tcp_v4_restore_cb(struct sk_buff *skb)
1803 {
1804 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1805 		sizeof(struct inet_skb_parm));
1806 }
1807 
1808 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1809 			   const struct tcphdr *th)
1810 {
1811 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1812 	 * barrier() makes sure the compiler won't play aliasing games.
1813 	 */
1814 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1815 		sizeof(struct inet_skb_parm));
1816 	barrier();
1817 
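	/* The SYN and FIN flags each consume one sequence number, so end_seq
	 * adds them on top of the payload length (skb->len minus the header).
	 */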
1818 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1819 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1820 				    skb->len - th->doff * 4);
1821 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1822 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1823 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1824 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1825 	TCP_SKB_CB(skb)->sacked	 = 0;
1826 	TCP_SKB_CB(skb)->has_rxtstamp =
1827 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1828 }
1829 
1830 /*
1831  *	From tcp_input.c
1832  */
1833 
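/* Main IPv4 TCP receive routine.  In outline:
 *  - validate the header and checksum, then look the socket up by 4-tuple;
 *  - TCP_NEW_SYN_RECV request sockets are matched against their listener
 *    via tcp_check_req(), TIME_WAIT sockets go through
 *    tcp_timewait_state_process();
 *  - for ordinary sockets the segment is either processed directly under
 *    the socket lock (tcp_v4_do_rcv()) or appended to the backlog when the
 *    socket is owned by user context.
 * Packets that match no socket are answered with a RST, unless the checksum
 * is bad.
 */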
1834 int tcp_v4_rcv(struct sk_buff *skb)
1835 {
1836 	struct net *net = dev_net(skb->dev);
1837 	struct sk_buff *skb_to_free;
1838 	int sdif = inet_sdif(skb);
1839 	const struct iphdr *iph;
1840 	const struct tcphdr *th;
1841 	bool refcounted;
1842 	struct sock *sk;
1843 	int ret;
1844 
1845 	if (skb->pkt_type != PACKET_HOST)
1846 		goto discard_it;
1847 
1848 	/* Count it even if it's bad */
1849 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1850 
1851 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1852 		goto discard_it;
1853 
1854 	th = (const struct tcphdr *)skb->data;
1855 
1856 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1857 		goto bad_packet;
1858 	if (!pskb_may_pull(skb, th->doff * 4))
1859 		goto discard_it;
1860 
1861 	/* An explanation is required here, I think.
1862 	 * Packet length and doff are validated by header prediction,
1863 	 * provided the case of th->doff == 0 is eliminated.
1864 	 * So, we defer the checks. */
1865 
1866 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1867 		goto csum_error;
1868 
1869 	th = (const struct tcphdr *)skb->data;
1870 	iph = ip_hdr(skb);
1871 lookup:
1872 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1873 			       th->dest, sdif, &refcounted);
1874 	if (!sk)
1875 		goto no_tcp_socket;
1876 
1877 process:
1878 	if (sk->sk_state == TCP_TIME_WAIT)
1879 		goto do_time_wait;
1880 
1881 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1882 		struct request_sock *req = inet_reqsk(sk);
1883 		bool req_stolen = false;
1884 		struct sock *nsk;
1885 
1886 		sk = req->rsk_listener;
1887 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1888 			sk_drops_add(sk, skb);
1889 			reqsk_put(req);
1890 			goto discard_it;
1891 		}
1892 		if (tcp_checksum_complete(skb)) {
1893 			reqsk_put(req);
1894 			goto csum_error;
1895 		}
1896 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1897 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1898 			goto lookup;
1899 		}
1900 		/* We own a reference on the listener; increase it again,
1901 		 * as we might lose it too soon.
1902 		 */
1903 		sock_hold(sk);
1904 		refcounted = true;
1905 		nsk = NULL;
1906 		if (!tcp_filter(sk, skb)) {
1907 			th = (const struct tcphdr *)skb->data;
1908 			iph = ip_hdr(skb);
1909 			tcp_v4_fill_cb(skb, iph, th);
1910 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1911 		}
1912 		if (!nsk) {
1913 			reqsk_put(req);
1914 			if (req_stolen) {
1915 				/* Another cpu got exclusive access to req
1916 				 * and created a full-blown socket.
1917 				 * Try to feed this packet to this socket
1918 				 * instead of discarding it.
1919 				 */
1920 				tcp_v4_restore_cb(skb);
1921 				sock_put(sk);
1922 				goto lookup;
1923 			}
1924 			goto discard_and_relse;
1925 		}
1926 		if (nsk == sk) {
1927 			reqsk_put(req);
1928 			tcp_v4_restore_cb(skb);
1929 		} else if (tcp_child_process(sk, nsk, skb)) {
1930 			tcp_v4_send_reset(nsk, skb);
1931 			goto discard_and_relse;
1932 		} else {
1933 			sock_put(sk);
1934 			return 0;
1935 		}
1936 	}
1937 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1938 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1939 		goto discard_and_relse;
1940 	}
1941 
1942 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1943 		goto discard_and_relse;
1944 
1945 	if (tcp_v4_inbound_md5_hash(sk, skb))
1946 		goto discard_and_relse;
1947 
1948 	nf_reset_ct(skb);
1949 
1950 	if (tcp_filter(sk, skb))
1951 		goto discard_and_relse;
1952 	th = (const struct tcphdr *)skb->data;
1953 	iph = ip_hdr(skb);
1954 	tcp_v4_fill_cb(skb, iph, th);
1955 
1956 	skb->dev = NULL;
1957 
1958 	if (sk->sk_state == TCP_LISTEN) {
1959 		ret = tcp_v4_do_rcv(sk, skb);
1960 		goto put_and_return;
1961 	}
1962 
1963 	sk_incoming_cpu_update(sk);
1964 
1965 	bh_lock_sock_nested(sk);
1966 	tcp_segs_in(tcp_sk(sk), skb);
1967 	ret = 0;
1968 	if (!sock_owned_by_user(sk)) {
1969 		skb_to_free = sk->sk_rx_skb_cache;
1970 		sk->sk_rx_skb_cache = NULL;
1971 		ret = tcp_v4_do_rcv(sk, skb);
1972 	} else {
1973 		if (tcp_add_backlog(sk, skb))
1974 			goto discard_and_relse;
1975 		skb_to_free = NULL;
1976 	}
1977 	bh_unlock_sock(sk);
1978 	if (skb_to_free)
1979 		__kfree_skb(skb_to_free);
1980 
1981 put_and_return:
1982 	if (refcounted)
1983 		sock_put(sk);
1984 
1985 	return ret;
1986 
1987 no_tcp_socket:
1988 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1989 		goto discard_it;
1990 
1991 	tcp_v4_fill_cb(skb, iph, th);
1992 
1993 	if (tcp_checksum_complete(skb)) {
1994 csum_error:
1995 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1996 bad_packet:
1997 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1998 	} else {
1999 		tcp_v4_send_reset(NULL, skb);
2000 	}
2001 
2002 discard_it:
2003 	/* Discard frame. */
2004 	kfree_skb(skb);
2005 	return 0;
2006 
2007 discard_and_relse:
2008 	sk_drops_add(sk, skb);
2009 	if (refcounted)
2010 		sock_put(sk);
2011 	goto discard_it;
2012 
2013 do_time_wait:
2014 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2015 		inet_twsk_put(inet_twsk(sk));
2016 		goto discard_it;
2017 	}
2018 
2019 	tcp_v4_fill_cb(skb, iph, th);
2020 
2021 	if (tcp_checksum_complete(skb)) {
2022 		inet_twsk_put(inet_twsk(sk));
2023 		goto csum_error;
2024 	}
2025 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2026 	case TCP_TW_SYN: {
2027 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2028 							&tcp_hashinfo, skb,
2029 							__tcp_hdrlen(th),
2030 							iph->saddr, th->source,
2031 							iph->daddr, th->dest,
2032 							inet_iif(skb),
2033 							sdif);
2034 		if (sk2) {
2035 			inet_twsk_deschedule_put(inet_twsk(sk));
2036 			sk = sk2;
2037 			tcp_v4_restore_cb(skb);
2038 			refcounted = false;
2039 			goto process;
2040 		}
2041 	}
2042 		/* to ACK */
2043 		/* fall through */
2044 	case TCP_TW_ACK:
2045 		tcp_v4_timewait_ack(sk, skb);
2046 		break;
2047 	case TCP_TW_RST:
2048 		tcp_v4_send_reset(sk, skb);
2049 		inet_twsk_deschedule_put(inet_twsk(sk));
2050 		goto discard_it;
2051 	case TCP_TW_SUCCESS:;
2052 	}
2053 	goto discard_it;
2054 }
2055 
2056 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2057 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2058 	.twsk_unique	= tcp_twsk_unique,
2059 	.twsk_destructor= tcp_twsk_destructor,
2060 };
2061 
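/* Cache the input route of a validated skb on the socket, along with the
 * incoming interface index, so that tcp_v4_early_demux() can reuse it for
 * later packets of the same flow.
 */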
2062 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2063 {
2064 	struct dst_entry *dst = skb_dst(skb);
2065 
2066 	if (dst && dst_hold_safe(dst)) {
2067 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2068 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2069 	}
2070 }
2071 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2072 
2073 const struct inet_connection_sock_af_ops ipv4_specific = {
2074 	.queue_xmit	   = ip_queue_xmit,
2075 	.send_check	   = tcp_v4_send_check,
2076 	.rebuild_header	   = inet_sk_rebuild_header,
2077 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2078 	.conn_request	   = tcp_v4_conn_request,
2079 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2080 	.net_header_len	   = sizeof(struct iphdr),
2081 	.setsockopt	   = ip_setsockopt,
2082 	.getsockopt	   = ip_getsockopt,
2083 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2084 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2085 #ifdef CONFIG_COMPAT
2086 	.compat_setsockopt = compat_ip_setsockopt,
2087 	.compat_getsockopt = compat_ip_getsockopt,
2088 #endif
2089 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2090 };
2091 EXPORT_SYMBOL(ipv4_specific);
2092 
2093 #ifdef CONFIG_TCP_MD5SIG
2094 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2095 	.md5_lookup		= tcp_v4_md5_lookup,
2096 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2097 	.md5_parse		= tcp_v4_parse_md5_keys,
2098 };
2099 #endif
2100 
2101 /* NOTE: A lot of things are set to zero explicitly by the call to
2102  *       sk_alloc(), so they need not be done here.
2103  */
2104 static int tcp_v4_init_sock(struct sock *sk)
2105 {
2106 	struct inet_connection_sock *icsk = inet_csk(sk);
2107 
2108 	tcp_init_sock(sk);
2109 
2110 	icsk->icsk_af_ops = &ipv4_specific;
2111 
2112 #ifdef CONFIG_TCP_MD5SIG
2113 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2114 #endif
2115 
2116 	return 0;
2117 }
2118 
2119 void tcp_v4_destroy_sock(struct sock *sk)
2120 {
2121 	struct tcp_sock *tp = tcp_sk(sk);
2122 
2123 	trace_tcp_destroy_sock(sk);
2124 
2125 	tcp_clear_xmit_timers(sk);
2126 
2127 	tcp_cleanup_congestion_control(sk);
2128 
2129 	tcp_cleanup_ulp(sk);
2130 
2131 	/* Clean up the write buffer. */
2132 	tcp_write_queue_purge(sk);
2133 
2134 	/* Check if we want to disable active TFO */
2135 	tcp_fastopen_active_disable_ofo_check(sk);
2136 
2137 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2138 	skb_rbtree_purge(&tp->out_of_order_queue);
2139 
2140 #ifdef CONFIG_TCP_MD5SIG
2141 	/* Clean up the MD5 key list, if any */
2142 	if (tp->md5sig_info) {
2143 		tcp_clear_md5_list(sk);
2144 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2145 		tp->md5sig_info = NULL;
2146 	}
2147 #endif
2148 
2149 	/* Clean up a referenced TCP bind bucket. */
2150 	if (inet_csk(sk)->icsk_bind_hash)
2151 		inet_put_port(sk);
2152 
2153 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2154 
2155 	/* If the socket was aborted during the connect operation */
2156 	tcp_free_fastopen_req(tp);
2157 	tcp_fastopen_destroy_cipher(sk);
2158 	tcp_saved_syn_free(tp);
2159 
2160 	sk_sockets_allocated_dec(sk);
2161 }
2162 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2163 
2164 #ifdef CONFIG_PROC_FS
2165 /* Proc filesystem TCP sock list dumping. */
2166 
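/* The iterator below walks the listening hash first, then the established
 * hash (which also holds TIME_WAIT sockets).  st->bucket, st->offset and
 * st->last_pos let tcp_seek_last_pos() resume a partially read dump without
 * rescanning from the start.
 */
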
2167 /*
2168  * Get the next listener socket following cur.  If cur is NULL, get the first
2169  * socket starting from the bucket given in st->bucket; when st->bucket is
2170  * zero, the very first socket in the hash table is returned.
2171  */
2172 static void *listening_get_next(struct seq_file *seq, void *cur)
2173 {
2174 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2175 	struct tcp_iter_state *st = seq->private;
2176 	struct net *net = seq_file_net(seq);
2177 	struct inet_listen_hashbucket *ilb;
2178 	struct hlist_nulls_node *node;
2179 	struct sock *sk = cur;
2180 
2181 	if (!sk) {
2182 get_head:
2183 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2184 		spin_lock(&ilb->lock);
2185 		sk = sk_nulls_head(&ilb->nulls_head);
2186 		st->offset = 0;
2187 		goto get_sk;
2188 	}
2189 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2190 	++st->num;
2191 	++st->offset;
2192 
2193 	sk = sk_nulls_next(sk);
2194 get_sk:
2195 	sk_nulls_for_each_from(sk, node) {
2196 		if (!net_eq(sock_net(sk), net))
2197 			continue;
2198 		if (sk->sk_family == afinfo->family)
2199 			return sk;
2200 	}
2201 	spin_unlock(&ilb->lock);
2202 	st->offset = 0;
2203 	if (++st->bucket < INET_LHTABLE_SIZE)
2204 		goto get_head;
2205 	return NULL;
2206 }
2207 
2208 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2209 {
2210 	struct tcp_iter_state *st = seq->private;
2211 	void *rc;
2212 
2213 	st->bucket = 0;
2214 	st->offset = 0;
2215 	rc = listening_get_next(seq, NULL);
2216 
2217 	while (rc && *pos) {
2218 		rc = listening_get_next(seq, rc);
2219 		--*pos;
2220 	}
2221 	return rc;
2222 }
2223 
2224 static inline bool empty_bucket(const struct tcp_iter_state *st)
2225 {
2226 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2227 }
2228 
2229 /*
2230  * Get the first established socket, starting from the bucket given in st->bucket.
2231  * If st->bucket is zero, the very first socket in the hash is returned.
2232  */
2233 static void *established_get_first(struct seq_file *seq)
2234 {
2235 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2236 	struct tcp_iter_state *st = seq->private;
2237 	struct net *net = seq_file_net(seq);
2238 	void *rc = NULL;
2239 
2240 	st->offset = 0;
2241 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2242 		struct sock *sk;
2243 		struct hlist_nulls_node *node;
2244 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2245 
2246 		/* Lockless fast path for the common case of empty buckets */
2247 		if (empty_bucket(st))
2248 			continue;
2249 
2250 		spin_lock_bh(lock);
2251 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2252 			if (sk->sk_family != afinfo->family ||
2253 			    !net_eq(sock_net(sk), net)) {
2254 				continue;
2255 			}
2256 			rc = sk;
2257 			goto out;
2258 		}
2259 		spin_unlock_bh(lock);
2260 	}
2261 out:
2262 	return rc;
2263 }
2264 
2265 static void *established_get_next(struct seq_file *seq, void *cur)
2266 {
2267 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2268 	struct sock *sk = cur;
2269 	struct hlist_nulls_node *node;
2270 	struct tcp_iter_state *st = seq->private;
2271 	struct net *net = seq_file_net(seq);
2272 
2273 	++st->num;
2274 	++st->offset;
2275 
2276 	sk = sk_nulls_next(sk);
2277 
2278 	sk_nulls_for_each_from(sk, node) {
2279 		if (sk->sk_family == afinfo->family &&
2280 		    net_eq(sock_net(sk), net))
2281 			return sk;
2282 	}
2283 
2284 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2285 	++st->bucket;
2286 	return established_get_first(seq);
2287 }
2288 
2289 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2290 {
2291 	struct tcp_iter_state *st = seq->private;
2292 	void *rc;
2293 
2294 	st->bucket = 0;
2295 	rc = established_get_first(seq);
2296 
2297 	while (rc && pos) {
2298 		rc = established_get_next(seq, rc);
2299 		--pos;
2300 	}
2301 	return rc;
2302 }
2303 
2304 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2305 {
2306 	void *rc;
2307 	struct tcp_iter_state *st = seq->private;
2308 
2309 	st->state = TCP_SEQ_STATE_LISTENING;
2310 	rc	  = listening_get_idx(seq, &pos);
2311 
2312 	if (!rc) {
2313 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2314 		rc	  = established_get_idx(seq, pos);
2315 	}
2316 
2317 	return rc;
2318 }
2319 
2320 static void *tcp_seek_last_pos(struct seq_file *seq)
2321 {
2322 	struct tcp_iter_state *st = seq->private;
2323 	int bucket = st->bucket;
2324 	int offset = st->offset;
2325 	int orig_num = st->num;
2326 	void *rc = NULL;
2327 
2328 	switch (st->state) {
2329 	case TCP_SEQ_STATE_LISTENING:
2330 		if (st->bucket >= INET_LHTABLE_SIZE)
2331 			break;
2332 		st->state = TCP_SEQ_STATE_LISTENING;
2333 		rc = listening_get_next(seq, NULL);
2334 		while (offset-- && rc && bucket == st->bucket)
2335 			rc = listening_get_next(seq, rc);
2336 		if (rc)
2337 			break;
2338 		st->bucket = 0;
2339 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2340 		/* Fallthrough */
2341 	case TCP_SEQ_STATE_ESTABLISHED:
2342 		if (st->bucket > tcp_hashinfo.ehash_mask)
2343 			break;
2344 		rc = established_get_first(seq);
2345 		while (offset-- && rc && bucket == st->bucket)
2346 			rc = established_get_next(seq, rc);
2347 	}
2348 
2349 	st->num = orig_num;
2350 
2351 	return rc;
2352 }
2353 
2354 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2355 {
2356 	struct tcp_iter_state *st = seq->private;
2357 	void *rc;
2358 
2359 	if (*pos && *pos == st->last_pos) {
2360 		rc = tcp_seek_last_pos(seq);
2361 		if (rc)
2362 			goto out;
2363 	}
2364 
2365 	st->state = TCP_SEQ_STATE_LISTENING;
2366 	st->num = 0;
2367 	st->bucket = 0;
2368 	st->offset = 0;
2369 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2370 
2371 out:
2372 	st->last_pos = *pos;
2373 	return rc;
2374 }
2375 EXPORT_SYMBOL(tcp_seq_start);
2376 
2377 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2378 {
2379 	struct tcp_iter_state *st = seq->private;
2380 	void *rc = NULL;
2381 
2382 	if (v == SEQ_START_TOKEN) {
2383 		rc = tcp_get_idx(seq, 0);
2384 		goto out;
2385 	}
2386 
2387 	switch (st->state) {
2388 	case TCP_SEQ_STATE_LISTENING:
2389 		rc = listening_get_next(seq, v);
2390 		if (!rc) {
2391 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2392 			st->bucket = 0;
2393 			st->offset = 0;
2394 			rc	  = established_get_first(seq);
2395 		}
2396 		break;
2397 	case TCP_SEQ_STATE_ESTABLISHED:
2398 		rc = established_get_next(seq, v);
2399 		break;
2400 	}
2401 out:
2402 	++*pos;
2403 	st->last_pos = *pos;
2404 	return rc;
2405 }
2406 EXPORT_SYMBOL(tcp_seq_next);
2407 
2408 void tcp_seq_stop(struct seq_file *seq, void *v)
2409 {
2410 	struct tcp_iter_state *st = seq->private;
2411 
2412 	switch (st->state) {
2413 	case TCP_SEQ_STATE_LISTENING:
2414 		if (v != SEQ_START_TOKEN)
2415 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2416 		break;
2417 	case TCP_SEQ_STATE_ESTABLISHED:
2418 		if (v)
2419 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2420 		break;
2421 	}
2422 }
2423 EXPORT_SYMBOL(tcp_seq_stop);
2424 
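/* The three helpers below each format one /proc/net/tcp record: request
 * sockets (SYN_RECV), full sockets, and TIME_WAIT sockets.  They all follow
 * the column layout announced by the header line in tcp4_seq_show():
 * addresses and ports in hex, then state, queue sizes, timer info, uid,
 * inode and a kernel pointer.
 */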
2425 static void get_openreq4(const struct request_sock *req,
2426 			 struct seq_file *f, int i)
2427 {
2428 	const struct inet_request_sock *ireq = inet_rsk(req);
2429 	long delta = req->rsk_timer.expires - jiffies;
2430 
2431 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2432 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2433 		i,
2434 		ireq->ir_loc_addr,
2435 		ireq->ir_num,
2436 		ireq->ir_rmt_addr,
2437 		ntohs(ireq->ir_rmt_port),
2438 		TCP_SYN_RECV,
2439 		0, 0, /* could print option size, but that is af dependent. */
2440 		1,    /* timers active (only the expire timer) */
2441 		jiffies_delta_to_clock_t(delta),
2442 		req->num_timeout,
2443 		from_kuid_munged(seq_user_ns(f),
2444 				 sock_i_uid(req->rsk_listener)),
2445 		0,  /* non standard timer */
2446 		0, /* open_requests have no inode */
2447 		0,
2448 		req);
2449 }
2450 
2451 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2452 {
2453 	int timer_active;
2454 	unsigned long timer_expires;
2455 	const struct tcp_sock *tp = tcp_sk(sk);
2456 	const struct inet_connection_sock *icsk = inet_csk(sk);
2457 	const struct inet_sock *inet = inet_sk(sk);
2458 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2459 	__be32 dest = inet->inet_daddr;
2460 	__be32 src = inet->inet_rcv_saddr;
2461 	__u16 destp = ntohs(inet->inet_dport);
2462 	__u16 srcp = ntohs(inet->inet_sport);
2463 	int rx_queue;
2464 	int state;
2465 
2466 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2467 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2468 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2469 		timer_active	= 1;
2470 		timer_expires	= icsk->icsk_timeout;
2471 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2472 		timer_active	= 4;
2473 		timer_expires	= icsk->icsk_timeout;
2474 	} else if (timer_pending(&sk->sk_timer)) {
2475 		timer_active	= 2;
2476 		timer_expires	= sk->sk_timer.expires;
2477 	} else {
2478 		timer_active	= 0;
2479 		timer_expires = jiffies;
2480 	}
2481 
2482 	state = inet_sk_state_load(sk);
2483 	if (state == TCP_LISTEN)
2484 		rx_queue = sk->sk_ack_backlog;
2485 	else
2486 		/* Because we don't lock the socket,
2487 		 * we might find a transient negative value.
2488 		 */
2489 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2490 				      READ_ONCE(tp->copied_seq), 0);
2491 
2492 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2493 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2494 		i, src, srcp, dest, destp, state,
2495 		READ_ONCE(tp->write_seq) - tp->snd_una,
2496 		rx_queue,
2497 		timer_active,
2498 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2499 		icsk->icsk_retransmits,
2500 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2501 		icsk->icsk_probes_out,
2502 		sock_i_ino(sk),
2503 		refcount_read(&sk->sk_refcnt), sk,
2504 		jiffies_to_clock_t(icsk->icsk_rto),
2505 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2506 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2507 		tp->snd_cwnd,
2508 		state == TCP_LISTEN ?
2509 		    fastopenq->max_qlen :
2510 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2511 }
2512 
2513 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2514 			       struct seq_file *f, int i)
2515 {
2516 	long delta = tw->tw_timer.expires - jiffies;
2517 	__be32 dest, src;
2518 	__u16 destp, srcp;
2519 
2520 	dest  = tw->tw_daddr;
2521 	src   = tw->tw_rcv_saddr;
2522 	destp = ntohs(tw->tw_dport);
2523 	srcp  = ntohs(tw->tw_sport);
2524 
2525 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2526 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2527 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2528 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2529 		refcount_read(&tw->tw_refcnt), tw);
2530 }
2531 
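/* Every record emitted below is padded to a fixed width (TMPSZ - 1 characters
 * plus the trailing newline) via seq_setwidth()/seq_pad(), preserving the
 * traditional fixed-length line format of /proc/net/tcp.
 */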
2532 #define TMPSZ 150
2533 
2534 static int tcp4_seq_show(struct seq_file *seq, void *v)
2535 {
2536 	struct tcp_iter_state *st;
2537 	struct sock *sk = v;
2538 
2539 	seq_setwidth(seq, TMPSZ - 1);
2540 	if (v == SEQ_START_TOKEN) {
2541 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2542 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2543 			   "inode");
2544 		goto out;
2545 	}
2546 	st = seq->private;
2547 
2548 	if (sk->sk_state == TCP_TIME_WAIT)
2549 		get_timewait4_sock(v, seq, st->num);
2550 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2551 		get_openreq4(v, seq, st->num);
2552 	else
2553 		get_tcp4_sock(v, seq, st->num);
2554 out:
2555 	seq_pad(seq, '\n');
2556 	return 0;
2557 }
2558 
2559 static const struct seq_operations tcp4_seq_ops = {
2560 	.show		= tcp4_seq_show,
2561 	.start		= tcp_seq_start,
2562 	.next		= tcp_seq_next,
2563 	.stop		= tcp_seq_stop,
2564 };
2565 
2566 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2567 	.family		= AF_INET,
2568 };
2569 
2570 static int __net_init tcp4_proc_init_net(struct net *net)
2571 {
2572 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2573 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2574 		return -ENOMEM;
2575 	return 0;
2576 }
2577 
2578 static void __net_exit tcp4_proc_exit_net(struct net *net)
2579 {
2580 	remove_proc_entry("tcp", net->proc_net);
2581 }
2582 
2583 static struct pernet_operations tcp4_net_ops = {
2584 	.init = tcp4_proc_init_net,
2585 	.exit = tcp4_proc_exit_net,
2586 };
2587 
2588 int __init tcp4_proc_init(void)
2589 {
2590 	return register_pernet_subsys(&tcp4_net_ops);
2591 }
2592 
2593 void tcp4_proc_exit(void)
2594 {
2595 	unregister_pernet_subsys(&tcp4_net_ops);
2596 }
2597 #endif /* CONFIG_PROC_FS */
2598 
2599 struct proto tcp_prot = {
2600 	.name			= "TCP",
2601 	.owner			= THIS_MODULE,
2602 	.close			= tcp_close,
2603 	.pre_connect		= tcp_v4_pre_connect,
2604 	.connect		= tcp_v4_connect,
2605 	.disconnect		= tcp_disconnect,
2606 	.accept			= inet_csk_accept,
2607 	.ioctl			= tcp_ioctl,
2608 	.init			= tcp_v4_init_sock,
2609 	.destroy		= tcp_v4_destroy_sock,
2610 	.shutdown		= tcp_shutdown,
2611 	.setsockopt		= tcp_setsockopt,
2612 	.getsockopt		= tcp_getsockopt,
2613 	.keepalive		= tcp_set_keepalive,
2614 	.recvmsg		= tcp_recvmsg,
2615 	.sendmsg		= tcp_sendmsg,
2616 	.sendpage		= tcp_sendpage,
2617 	.backlog_rcv		= tcp_v4_do_rcv,
2618 	.release_cb		= tcp_release_cb,
2619 	.hash			= inet_hash,
2620 	.unhash			= inet_unhash,
2621 	.get_port		= inet_csk_get_port,
2622 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2623 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2624 	.stream_memory_free	= tcp_stream_memory_free,
2625 	.sockets_allocated	= &tcp_sockets_allocated,
2626 	.orphan_count		= &tcp_orphan_count,
2627 	.memory_allocated	= &tcp_memory_allocated,
2628 	.memory_pressure	= &tcp_memory_pressure,
2629 	.sysctl_mem		= sysctl_tcp_mem,
2630 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2631 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2632 	.max_header		= MAX_TCP_HEADER,
2633 	.obj_size		= sizeof(struct tcp_sock),
2634 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2635 	.twsk_prot		= &tcp_timewait_sock_ops,
2636 	.rsk_prot		= &tcp_request_sock_ops,
2637 	.h.hashinfo		= &tcp_hashinfo,
2638 	.no_autobind		= true,
2639 #ifdef CONFIG_COMPAT
2640 	.compat_setsockopt	= compat_tcp_setsockopt,
2641 	.compat_getsockopt	= compat_tcp_getsockopt,
2642 #endif
2643 	.diag_destroy		= tcp_abort,
2644 };
2645 EXPORT_SYMBOL(tcp_prot);
2646 
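/* Each netns owns one per-cpu array of kernel control sockets, created in
 * tcp_sk_init() below and destroyed in tcp_sk_exit().  They are used to send
 * stateless replies such as RSTs and TIME-WAIT/SYN-RECV ACKs.  tcp_sk_init()
 * also fills in the per-netns sysctl defaults.
 */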
2647 static void __net_exit tcp_sk_exit(struct net *net)
2648 {
2649 	int cpu;
2650 
2651 	if (net->ipv4.tcp_congestion_control)
2652 		module_put(net->ipv4.tcp_congestion_control->owner);
2653 
2654 	for_each_possible_cpu(cpu)
2655 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2656 	free_percpu(net->ipv4.tcp_sk);
2657 }
2658 
2659 static int __net_init tcp_sk_init(struct net *net)
2660 {
2661 	int res, cpu, cnt;
2662 
2663 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2664 	if (!net->ipv4.tcp_sk)
2665 		return -ENOMEM;
2666 
2667 	for_each_possible_cpu(cpu) {
2668 		struct sock *sk;
2669 
2670 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2671 					   IPPROTO_TCP, net);
2672 		if (res)
2673 			goto fail;
2674 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2675 
2676 		/* Please enforce IP_DF and IPID==0 for RSTs and
2677 		 * ACKs sent in SYN-RECV and TIME-WAIT state.
2678 		 */
2679 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2680 
2681 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2682 	}
2683 
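	/* tcp_ecn == 2: ECN is accepted when requested by incoming
	 * connections, but is not requested on outgoing connections.
	 */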
2684 	net->ipv4.sysctl_tcp_ecn = 2;
2685 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2686 
2687 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2688 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2689 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2690 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2691 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2692 
2693 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2694 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2695 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2696 
2697 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2698 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2699 	net->ipv4.sysctl_tcp_syncookies = 1;
2700 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2701 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2702 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2703 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2704 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2705 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
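	/* tcp_tw_reuse == 2: allow reuse of TIME-WAIT sockets for new
	 * outgoing connections to loopback addresses only.
	 */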
2706 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2707 
2708 	cnt = tcp_hashinfo.ehash_mask + 1;
2709 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2710 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2711 
2712 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2713 	net->ipv4.sysctl_tcp_sack = 1;
2714 	net->ipv4.sysctl_tcp_window_scaling = 1;
2715 	net->ipv4.sysctl_tcp_timestamps = 1;
2716 	net->ipv4.sysctl_tcp_early_retrans = 3;
2717 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2718 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2719 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2720 	net->ipv4.sysctl_tcp_max_reordering = 300;
2721 	net->ipv4.sysctl_tcp_dsack = 1;
2722 	net->ipv4.sysctl_tcp_app_win = 31;
2723 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2724 	net->ipv4.sysctl_tcp_frto = 2;
2725 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2726 	/* This limits the percentage of the congestion window which we
2727 	 * will allow a single TSO frame to consume.  Building TSO frames
2728 	 * which are too large can cause TCP streams to be bursty.
2729 	 */
2730 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2731 	/* Default TSQ limit of 16 TSO segments */
2732 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2733 	/* RFC 5961 challenge ACK rate limiting */
2734 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2735 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2736 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2737 	net->ipv4.sysctl_tcp_autocorking = 1;
2738 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2739 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2740 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2741 	if (net != &init_net) {
2742 		memcpy(net->ipv4.sysctl_tcp_rmem,
2743 		       init_net.ipv4.sysctl_tcp_rmem,
2744 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2745 		memcpy(net->ipv4.sysctl_tcp_wmem,
2746 		       init_net.ipv4.sysctl_tcp_wmem,
2747 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2748 	}
2749 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2750 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2751 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2752 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2753 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2754 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2755 
2756 	/* Reno is always built in */
2757 	if (!net_eq(net, &init_net) &&
2758 	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2759 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2760 	else
2761 		net->ipv4.tcp_congestion_control = &tcp_reno;
2762 
2763 	return 0;
2764 fail:
2765 	tcp_sk_exit(net);
2766 
2767 	return res;
2768 }
2769 
2770 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2771 {
2772 	struct net *net;
2773 
2774 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2775 
2776 	list_for_each_entry(net, net_exit_list, exit_list)
2777 		tcp_fastopen_ctx_destroy(net);
2778 }
2779 
2780 static struct pernet_operations __net_initdata tcp_sk_ops = {
2781        .init	   = tcp_sk_init,
2782        .exit	   = tcp_sk_exit,
2783        .exit_batch = tcp_sk_exit_batch,
2784 };
2785 
2786 void __init tcp_v4_init(void)
2787 {
2788 	if (register_pernet_subsys(&tcp_sk_ops))
2789 		panic("Failed to create the TCP control socket.\n");
2790 }
2791