1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 
80 #include <crypto/hash.h>
81 #include <linux/scatterlist.h>
82 
83 #include <trace/events/tcp.h>
84 
85 #ifdef CONFIG_TCP_MD5SIG
86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
87 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
88 #endif
89 
90 struct inet_hashinfo tcp_hashinfo;
91 EXPORT_SYMBOL(tcp_hashinfo);
92 
93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
94 {
95 	return secure_tcp_seq(ip_hdr(skb)->daddr,
96 			      ip_hdr(skb)->saddr,
97 			      tcp_hdr(skb)->dest,
98 			      tcp_hdr(skb)->source);
99 }
100 
101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
102 {
103 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
104 }
105 
106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
107 {
108 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
109 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 	struct tcp_sock *tp = tcp_sk(sk);
111 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
112 
113 	if (reuse == 2) {
114 		/* Still does not detect *everything* that goes through
115 		 * lo, since we require a loopback src or dst address
116 		 * or direct binding to 'lo' interface.
117 		 */
118 		bool loopback = false;
119 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
120 			loopback = true;
121 #if IS_ENABLED(CONFIG_IPV6)
122 		if (tw->tw_family == AF_INET6) {
123 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
124 			    (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
125 			     (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
126 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 			    (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
128 			     (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
129 				loopback = true;
130 		} else
131 #endif
132 		{
133 			if (ipv4_is_loopback(tw->tw_daddr) ||
134 			    ipv4_is_loopback(tw->tw_rcv_saddr))
135 				loopback = true;
136 		}
137 		if (!loopback)
138 			reuse = 0;
139 	}
140 
141 	/* With PAWS, it is safe from the viewpoint
142 	   of data integrity. Even without PAWS it is safe provided sequence
143 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
144 
145 	   Actually, the idea is close to VJ's, except that the timestamp cache
146 	   is held not per host but per port pair, and the TW bucket is used as
147 	   the state holder.
148 
149 	   If the TW bucket has already been destroyed we fall back to VJ's scheme
150 	   and use the initial timestamp retrieved from the peer table.
151 	 */
152 	if (tcptw->tw_ts_recent_stamp &&
153 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
154 					    tcptw->tw_ts_recent_stamp)))) {
155 		/* In case of repair and re-using TIME-WAIT sockets we still
156 		 * want to be sure that it is safe as above but honor the
157 		 * sequence numbers and time stamps set as part of the repair
158 		 * process.
159 		 *
160 		 * Without this check re-using a TIME-WAIT socket with TCP
161 		 * repair would accumulate a -1 on the repair assigned
162 		 * sequence number. The first time it is reused the sequence
163 		 * is -1, the second time -2, etc. This fixes that issue
164 		 * without appearing to create any others.
165 		 */
166 		if (likely(!tp->repair)) {
167 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
168 
169 			if (!seq)
170 				seq = 1;
171 			WRITE_ONCE(tp->write_seq, seq);
172 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
173 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
174 		}
175 		sock_hold(sktw);
176 		return 1;
177 	}
178 
179 	return 0;
180 }
181 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
182 
183 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
184 			      int addr_len)
185 {
186 	/* This check is replicated from tcp_v4_connect() and intended to
187 	 * prevent BPF program called below from accessing bytes that are out
188 	 * of the bound specified by user in addr_len.
189 	 */
190 	if (addr_len < sizeof(struct sockaddr_in))
191 		return -EINVAL;
192 
193 	sock_owned_by_me(sk);
194 
195 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
196 }
197 
198 /* This will initiate an outgoing connection. */
199 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
200 {
201 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
202 	struct inet_sock *inet = inet_sk(sk);
203 	struct tcp_sock *tp = tcp_sk(sk);
204 	__be16 orig_sport, orig_dport;
205 	__be32 daddr, nexthop;
206 	struct flowi4 *fl4;
207 	struct rtable *rt;
208 	int err;
209 	struct ip_options_rcu *inet_opt;
210 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
211 
212 	if (addr_len < sizeof(struct sockaddr_in))
213 		return -EINVAL;
214 
215 	if (usin->sin_family != AF_INET)
216 		return -EAFNOSUPPORT;
217 
218 	nexthop = daddr = usin->sin_addr.s_addr;
219 	inet_opt = rcu_dereference_protected(inet->inet_opt,
220 					     lockdep_sock_is_held(sk));
221 	if (inet_opt && inet_opt->opt.srr) {
222 		if (!daddr)
223 			return -EINVAL;
224 		nexthop = inet_opt->opt.faddr;
225 	}
226 
227 	orig_sport = inet->inet_sport;
228 	orig_dport = usin->sin_port;
229 	fl4 = &inet->cork.fl.u.ip4;
230 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
231 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
232 			      IPPROTO_TCP,
233 			      orig_sport, orig_dport, sk);
234 	if (IS_ERR(rt)) {
235 		err = PTR_ERR(rt);
236 		if (err == -ENETUNREACH)
237 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
238 		return err;
239 	}
240 
241 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
242 		ip_rt_put(rt);
243 		return -ENETUNREACH;
244 	}
245 
246 	if (!inet_opt || !inet_opt->opt.srr)
247 		daddr = fl4->daddr;
248 
249 	if (!inet->inet_saddr)
250 		inet->inet_saddr = fl4->saddr;
251 	sk_rcv_saddr_set(sk, inet->inet_saddr);
252 
253 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
254 		/* Reset inherited state */
255 		tp->rx_opt.ts_recent	   = 0;
256 		tp->rx_opt.ts_recent_stamp = 0;
257 		if (likely(!tp->repair))
258 			WRITE_ONCE(tp->write_seq, 0);
259 	}
260 
261 	inet->inet_dport = usin->sin_port;
262 	sk_daddr_set(sk, daddr);
263 
264 	inet_csk(sk)->icsk_ext_hdr_len = 0;
265 	if (inet_opt)
266 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
267 
268 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
269 
270 	/* Socket identity is still unknown (sport may be zero).
271  * However we set state to SYN-SENT and, without releasing the socket
272  * lock, select a source port, enter ourselves into the hash tables and
273 	 * complete initialization after this.
274 	 */
275 	tcp_set_state(sk, TCP_SYN_SENT);
276 	err = inet_hash_connect(tcp_death_row, sk);
277 	if (err)
278 		goto failure;
279 
280 	sk_set_txhash(sk);
281 
282 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
283 			       inet->inet_sport, inet->inet_dport, sk);
284 	if (IS_ERR(rt)) {
285 		err = PTR_ERR(rt);
286 		rt = NULL;
287 		goto failure;
288 	}
289 	/* OK, now commit destination to socket.  */
290 	sk->sk_gso_type = SKB_GSO_TCPV4;
291 	sk_setup_caps(sk, &rt->dst);
292 	rt = NULL;
293 
294 	if (likely(!tp->repair)) {
295 		if (!tp->write_seq)
296 			WRITE_ONCE(tp->write_seq,
297 				   secure_tcp_seq(inet->inet_saddr,
298 						  inet->inet_daddr,
299 						  inet->inet_sport,
300 						  usin->sin_port));
301 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
302 						 inet->inet_saddr,
303 						 inet->inet_daddr);
304 	}
305 
306 	inet->inet_id = prandom_u32();
307 
308 	if (tcp_fastopen_defer_connect(sk, &err))
309 		return err;
310 	if (err)
311 		goto failure;
312 
313 	err = tcp_connect(sk);
314 
315 	if (err)
316 		goto failure;
317 
318 	return 0;
319 
320 failure:
321 	/*
322 	 * This unhashes the socket and releases the local port,
323 	 * if necessary.
324 	 */
325 	tcp_set_state(sk, TCP_CLOSE);
326 	ip_rt_put(rt);
327 	sk->sk_route_caps = 0;
328 	inet->inet_dport = 0;
329 	return err;
330 }
331 EXPORT_SYMBOL(tcp_v4_connect);
332 
333 /*
334  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
335  * It can be called through tcp_release_cb() if socket was owned by user
336  * at the time tcp_v4_err() was called to handle ICMP message.
337  */
338 void tcp_v4_mtu_reduced(struct sock *sk)
339 {
340 	struct inet_sock *inet = inet_sk(sk);
341 	struct dst_entry *dst;
342 	u32 mtu;
343 
344 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
345 		return;
346 	mtu = tcp_sk(sk)->mtu_info;
347 	dst = inet_csk_update_pmtu(sk, mtu);
348 	if (!dst)
349 		return;
350 
351 	/* Something is about to go wrong... Remember the soft error
352 	 * in case this connection is not able to recover.
353 	 */
354 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
355 		sk->sk_err_soft = EMSGSIZE;
356 
357 	mtu = dst_mtu(dst);
358 
359 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
360 	    ip_sk_accept_pmtu(sk) &&
361 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
362 		tcp_sync_mss(sk, mtu);
363 
364 		/* Resend the TCP packet because it's
365 		 * clear that the old packet has been
366 		 * dropped. This is the new "fast" path mtu
367 		 * discovery.
368 		 */
369 		tcp_simple_retransmit(sk);
370 	} /* else let the usual retransmit timer handle it */
371 }
372 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
373 
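/* Handle an ICMP redirect: if the socket still holds a valid cached route,
 * let that route's ->redirect() handler update it from the redirect message.
 */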
374 static void do_redirect(struct sk_buff *skb, struct sock *sk)
375 {
376 	struct dst_entry *dst = __sk_dst_check(sk, 0);
377 
378 	if (dst)
379 		dst->ops->redirect(dst, sk, skb);
380 }
381 
382 
383 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
384 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
385 {
386 	struct request_sock *req = inet_reqsk(sk);
387 	struct net *net = sock_net(sk);
388 
389 	/* ICMPs are not backlogged, hence we cannot get
390 	 * an established socket here.
391 	 */
392 	if (seq != tcp_rsk(req)->snt_isn) {
393 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
394 	} else if (abort) {
395 		/*
396 		 * Still in SYN_RECV, just remove it silently.
397 		 * There is no good way to pass the error to the newly
398 		 * created socket, and POSIX does not want network
399 		 * errors returned from accept().
400 		 */
401 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
402 		tcp_listendrop(req->rsk_listener);
403 	}
404 	reqsk_put(req);
405 }
406 EXPORT_SYMBOL(tcp_req_err);
407 
408 /*
409  * This routine is called by the ICMP module when it gets some
410  * sort of error condition.  If err < 0 then the socket should
411  * be closed and the error returned to the user.  If err > 0
412  * it's just the icmp type << 8 | icmp code.  After adjustment
413  * header points to the first 8 bytes of the tcp header.  We need
414  * to find the appropriate port.
415  *
416  * The locking strategy used here is very "optimistic". When
417  * someone else accesses the socket the ICMP is just dropped
418  * and for some paths there is no check at all.
419  * A more general error queue to queue errors for later handling
420  * is probably better.
421  *
422  */
423 
424 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
425 {
426 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
427 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
428 	struct inet_connection_sock *icsk;
429 	struct tcp_sock *tp;
430 	struct inet_sock *inet;
431 	const int type = icmp_hdr(icmp_skb)->type;
432 	const int code = icmp_hdr(icmp_skb)->code;
433 	struct sock *sk;
434 	struct sk_buff *skb;
435 	struct request_sock *fastopen;
436 	u32 seq, snd_una;
437 	s32 remaining;
438 	u32 delta_us;
439 	int err;
440 	struct net *net = dev_net(icmp_skb->dev);
441 
442 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
443 				       th->dest, iph->saddr, ntohs(th->source),
444 				       inet_iif(icmp_skb), 0);
445 	if (!sk) {
446 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
447 		return -ENOENT;
448 	}
449 	if (sk->sk_state == TCP_TIME_WAIT) {
450 		inet_twsk_put(inet_twsk(sk));
451 		return 0;
452 	}
453 	seq = ntohl(th->seq);
454 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
455 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
456 				     type == ICMP_TIME_EXCEEDED ||
457 				     (type == ICMP_DEST_UNREACH &&
458 				      (code == ICMP_NET_UNREACH ||
459 				       code == ICMP_HOST_UNREACH)));
460 		return 0;
461 	}
462 
463 	bh_lock_sock(sk);
464 	/* If too many ICMPs get dropped on busy
465 	 * servers this needs to be solved differently.
466 	 * We do take care of PMTU discovery (RFC1191) special case :
467 	 * we can receive locally generated ICMP messages while socket is held.
468 	 */
469 	if (sock_owned_by_user(sk)) {
470 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
471 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
472 	}
473 	if (sk->sk_state == TCP_CLOSE)
474 		goto out;
475 
476 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
477 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
478 		goto out;
479 	}
480 
481 	icsk = inet_csk(sk);
482 	tp = tcp_sk(sk);
483 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
484 	fastopen = rcu_dereference(tp->fastopen_rsk);
485 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
486 	if (sk->sk_state != TCP_LISTEN &&
487 	    !between(seq, snd_una, tp->snd_nxt)) {
488 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
489 		goto out;
490 	}
491 
492 	switch (type) {
493 	case ICMP_REDIRECT:
494 		if (!sock_owned_by_user(sk))
495 			do_redirect(icmp_skb, sk);
496 		goto out;
497 	case ICMP_SOURCE_QUENCH:
498 		/* Just silently ignore these. */
499 		goto out;
500 	case ICMP_PARAMETERPROB:
501 		err = EPROTO;
502 		break;
503 	case ICMP_DEST_UNREACH:
504 		if (code > NR_ICMP_UNREACH)
505 			goto out;
506 
507 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
508 			/* We are not interested in TCP_LISTEN and open_requests
509 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
510 			 * they should go through unfragmented).
511 			 */
512 			if (sk->sk_state == TCP_LISTEN)
513 				goto out;
514 
515 			tp->mtu_info = info;
516 			if (!sock_owned_by_user(sk)) {
517 				tcp_v4_mtu_reduced(sk);
518 			} else {
519 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
520 					sock_hold(sk);
521 			}
522 			goto out;
523 		}
524 
525 		err = icmp_err_convert[code].errno;
526 		/* check if icmp_skb allows revert of backoff
527 		 * (see draft-zimmermann-tcp-lcd) */
528 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
529 			break;
530 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
531 		    !icsk->icsk_backoff || fastopen)
532 			break;
533 
534 		if (sock_owned_by_user(sk))
535 			break;
536 
537 		skb = tcp_rtx_queue_head(sk);
538 		if (WARN_ON_ONCE(!skb))
539 			break;
540 
541 		icsk->icsk_backoff--;
542 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
543 					       TCP_TIMEOUT_INIT;
544 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
545 
546 
547 		tcp_mstamp_refresh(tp);
548 		delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
549 		remaining = icsk->icsk_rto -
550 			    usecs_to_jiffies(delta_us);
551 
552 		if (remaining > 0) {
553 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
554 						  remaining, TCP_RTO_MAX);
555 		} else {
556 			/* RTO revert clocked out retransmission.
557 			 * Will retransmit now */
558 			tcp_retransmit_timer(sk);
559 		}
560 
561 		break;
562 	case ICMP_TIME_EXCEEDED:
563 		err = EHOSTUNREACH;
564 		break;
565 	default:
566 		goto out;
567 	}
568 
569 	switch (sk->sk_state) {
570 	case TCP_SYN_SENT:
571 	case TCP_SYN_RECV:
572 		/* Only in fast or simultaneous open. If a fast open socket
573 		 * is already accepted it is treated as a connected one below.
574 		 */
575 		if (fastopen && !fastopen->sk)
576 			break;
577 
578 		if (!sock_owned_by_user(sk)) {
579 			sk->sk_err = err;
580 
581 			sk->sk_error_report(sk);
582 
583 			tcp_done(sk);
584 		} else {
585 			sk->sk_err_soft = err;
586 		}
587 		goto out;
588 	}
589 
590 	/* If we've already connected we will keep trying
591 	 * until we time out, or the user gives up.
592 	 *
593  * rfc1122 4.2.3.9 allows us to consider as hard errors
594  * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
595  * but it is obsoleted by pmtu discovery).
596  *
597  * Note that in the modern internet, where routing is unreliable
598  * and broken firewalls sit in every dark corner sending random
599  * errors ordered by their masters, even these two messages finally lose
600  * their original sense (even Linux sends invalid PORT_UNREACHs).
601 	 *
602 	 * Now we are in compliance with RFCs.
603 	 *							--ANK (980905)
604 	 */
605 
606 	inet = inet_sk(sk);
607 	if (!sock_owned_by_user(sk) && inet->recverr) {
608 		sk->sk_err = err;
609 		sk->sk_error_report(sk);
610 	} else	{ /* Only an error on timeout */
611 		sk->sk_err_soft = err;
612 	}
613 
614 out:
615 	bh_unlock_sock(sk);
616 	sock_put(sk);
617 	return 0;
618 }
619 
620 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
621 {
622 	struct tcphdr *th = tcp_hdr(skb);
623 
624 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
625 	skb->csum_start = skb_transport_header(skb) - skb->head;
626 	skb->csum_offset = offsetof(struct tcphdr, check);
627 }
628 
629 /* This routine computes an IPv4 TCP checksum. */
630 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
631 {
632 	const struct inet_sock *inet = inet_sk(sk);
633 
634 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
635 }
636 EXPORT_SYMBOL(tcp_v4_send_check);
637 
638 /*
639  *	This routine will send an RST to the other tcp.
640  *
641  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
642  *		      for reset.
643  *	Answer: if a packet caused an RST, it is not for a socket
644  *		existing in our system; if it is matched to a socket,
645  *		it is just a duplicate segment or a bug in the other side's TCP.
646  *		So we build the reply based only on the parameters
647  *		that arrived with the segment.
648  *	Exception: precedence violation. We do not implement it in any case.
649  */
650 
651 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
652 {
653 	const struct tcphdr *th = tcp_hdr(skb);
654 	struct {
655 		struct tcphdr th;
656 #ifdef CONFIG_TCP_MD5SIG
657 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
658 #endif
659 	} rep;
660 	struct ip_reply_arg arg;
661 #ifdef CONFIG_TCP_MD5SIG
662 	struct tcp_md5sig_key *key = NULL;
663 	const __u8 *hash_location = NULL;
664 	unsigned char newhash[16];
665 	int genhash;
666 	struct sock *sk1 = NULL;
667 #endif
668 	u64 transmit_time = 0;
669 	struct sock *ctl_sk;
670 	struct net *net;
671 
672 	/* Never send a reset in response to a reset. */
673 	if (th->rst)
674 		return;
675 
676 	/* If sk not NULL, it means we did a successful lookup and incoming
677 	 * route had to be correct. prequeue might have dropped our dst.
678 	 */
679 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
680 		return;
681 
682 	/* Swap the send and the receive. */
683 	memset(&rep, 0, sizeof(rep));
684 	rep.th.dest   = th->source;
685 	rep.th.source = th->dest;
686 	rep.th.doff   = sizeof(struct tcphdr) / 4;
687 	rep.th.rst    = 1;
688 
689 	if (th->ack) {
690 		rep.th.seq = th->ack_seq;
691 	} else {
692 		rep.th.ack = 1;
693 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
694 				       skb->len - (th->doff << 2));
695 	}
696 
697 	memset(&arg, 0, sizeof(arg));
698 	arg.iov[0].iov_base = (unsigned char *)&rep;
699 	arg.iov[0].iov_len  = sizeof(rep.th);
700 
701 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
702 #ifdef CONFIG_TCP_MD5SIG
703 	rcu_read_lock();
704 	hash_location = tcp_parse_md5sig_option(th);
705 	if (sk && sk_fullsock(sk)) {
706 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
707 					&ip_hdr(skb)->saddr, AF_INET);
708 	} else if (hash_location) {
709 		/*
710 		 * active side is lost. Try to find the listening socket through
711 		 * the source port, and then find the md5 key through that socket.
712 		 * We do not lose security here:
713 		 * the incoming packet is checked against the md5 hash of the found
714 		 * key; no RST is generated if the md5 hash doesn't match.
715 		 */
716 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
717 					     ip_hdr(skb)->saddr,
718 					     th->source, ip_hdr(skb)->daddr,
719 					     ntohs(th->source), inet_iif(skb),
720 					     tcp_v4_sdif(skb));
721 		/* don't send rst if it can't find key */
722 		if (!sk1)
723 			goto out;
724 
725 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
726 					&ip_hdr(skb)->saddr, AF_INET);
727 		if (!key)
728 			goto out;
729 
730 
731 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
732 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
733 			goto out;
734 
735 	}
736 
737 	if (key) {
738 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
739 				   (TCPOPT_NOP << 16) |
740 				   (TCPOPT_MD5SIG << 8) |
741 				   TCPOLEN_MD5SIG);
742 		/* Update length and the length the header thinks exists */
743 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
744 		rep.th.doff = arg.iov[0].iov_len / 4;
745 
746 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
747 				     key, ip_hdr(skb)->saddr,
748 				     ip_hdr(skb)->daddr, &rep.th);
749 	}
750 #endif
751 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
752 				      ip_hdr(skb)->saddr, /* XXX */
753 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
754 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
755 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
756 
757 	/* When the socket is gone, all binding information is lost.
758 	 * Routing might fail in this case. No choice here: if we choose to force
759 	 * the input interface, we will misroute in case of an asymmetric route.
760 	 */
761 	if (sk) {
762 		arg.bound_dev_if = sk->sk_bound_dev_if;
763 		if (sk_fullsock(sk))
764 			trace_tcp_send_reset(sk, skb);
765 	}
766 
767 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
768 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
769 
770 	arg.tos = ip_hdr(skb)->tos;
771 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
772 	local_bh_disable();
773 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
774 	if (sk) {
775 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
776 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
777 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
778 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
779 		transmit_time = tcp_transmit_time(sk);
780 	}
781 	ip_send_unicast_reply(ctl_sk,
782 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
783 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
784 			      &arg, arg.iov[0].iov_len,
785 			      transmit_time);
786 
787 	ctl_sk->sk_mark = 0;
788 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
789 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
790 	local_bh_enable();
791 
792 #ifdef CONFIG_TCP_MD5SIG
793 out:
794 	rcu_read_unlock();
795 #endif
796 }
797 
798 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
799    outside of socket context, is certainly ugly. What can I do?
800  */
801 
802 static void tcp_v4_send_ack(const struct sock *sk,
803 			    struct sk_buff *skb, u32 seq, u32 ack,
804 			    u32 win, u32 tsval, u32 tsecr, int oif,
805 			    struct tcp_md5sig_key *key,
806 			    int reply_flags, u8 tos)
807 {
808 	const struct tcphdr *th = tcp_hdr(skb);
809 	struct {
810 		struct tcphdr th;
811 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
812 #ifdef CONFIG_TCP_MD5SIG
813 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
814 #endif
815 			];
816 	} rep;
817 	struct net *net = sock_net(sk);
818 	struct ip_reply_arg arg;
819 	struct sock *ctl_sk;
820 	u64 transmit_time;
821 
822 	memset(&rep.th, 0, sizeof(struct tcphdr));
823 	memset(&arg, 0, sizeof(arg));
824 
825 	arg.iov[0].iov_base = (unsigned char *)&rep;
826 	arg.iov[0].iov_len  = sizeof(rep.th);
827 	if (tsecr) {
828 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
829 				   (TCPOPT_TIMESTAMP << 8) |
830 				   TCPOLEN_TIMESTAMP);
831 		rep.opt[1] = htonl(tsval);
832 		rep.opt[2] = htonl(tsecr);
833 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
834 	}
835 
836 	/* Swap the send and the receive. */
837 	rep.th.dest    = th->source;
838 	rep.th.source  = th->dest;
839 	rep.th.doff    = arg.iov[0].iov_len / 4;
840 	rep.th.seq     = htonl(seq);
841 	rep.th.ack_seq = htonl(ack);
842 	rep.th.ack     = 1;
843 	rep.th.window  = htons(win);
844 
845 #ifdef CONFIG_TCP_MD5SIG
846 	if (key) {
847 		int offset = (tsecr) ? 3 : 0;
848 
849 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
850 					  (TCPOPT_NOP << 16) |
851 					  (TCPOPT_MD5SIG << 8) |
852 					  TCPOLEN_MD5SIG);
853 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
854 		rep.th.doff = arg.iov[0].iov_len/4;
855 
856 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
857 				    key, ip_hdr(skb)->saddr,
858 				    ip_hdr(skb)->daddr, &rep.th);
859 	}
860 #endif
861 	arg.flags = reply_flags;
862 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
863 				      ip_hdr(skb)->saddr, /* XXX */
864 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
865 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
866 	if (oif)
867 		arg.bound_dev_if = oif;
868 	arg.tos = tos;
869 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
870 	local_bh_disable();
871 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
872 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
873 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
874 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
875 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
876 	transmit_time = tcp_transmit_time(sk);
877 	ip_send_unicast_reply(ctl_sk,
878 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
879 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
880 			      &arg, arg.iov[0].iov_len,
881 			      transmit_time);
882 
883 	ctl_sk->sk_mark = 0;
884 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
885 	local_bh_enable();
886 }
887 
888 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
889 {
890 	struct inet_timewait_sock *tw = inet_twsk(sk);
891 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
892 
893 	tcp_v4_send_ack(sk, skb,
894 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
895 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
896 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
897 			tcptw->tw_ts_recent,
898 			tw->tw_bound_dev_if,
899 			tcp_twsk_md5_key(tcptw),
900 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
901 			tw->tw_tos
902 			);
903 
904 	inet_twsk_put(tw);
905 }
906 
907 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
908 				  struct request_sock *req)
909 {
910 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
911 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
912 	 */
913 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
914 					     tcp_sk(sk)->snd_nxt;
915 
916 	/* RFC 7323 2.3
917 	 * The window field (SEG.WND) of every outgoing segment, with the
918 	 * exception of <SYN> segments, MUST be right-shifted by
919 	 * Rcv.Wind.Shift bits:
920 	 */
921 	tcp_v4_send_ack(sk, skb, seq,
922 			tcp_rsk(req)->rcv_nxt,
923 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
924 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
925 			req->ts_recent,
926 			0,
927 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
928 					  AF_INET),
929 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
930 			ip_hdr(skb)->tos);
931 }
932 
933 /*
934  *	Send a SYN-ACK after having received a SYN.
935  *	This still operates on a request_sock only, not on a big
936  *	socket.
937  */
938 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
939 			      struct flowi *fl,
940 			      struct request_sock *req,
941 			      struct tcp_fastopen_cookie *foc,
942 			      enum tcp_synack_type synack_type)
943 {
944 	const struct inet_request_sock *ireq = inet_rsk(req);
945 	struct flowi4 fl4;
946 	int err = -1;
947 	struct sk_buff *skb;
948 
949 	/* First, grab a route. */
950 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
951 		return -1;
952 
953 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
954 
955 	if (skb) {
956 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
957 
958 		rcu_read_lock();
959 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
960 					    ireq->ir_rmt_addr,
961 					    rcu_dereference(ireq->ireq_opt));
962 		rcu_read_unlock();
963 		err = net_xmit_eval(err);
964 	}
965 
966 	return err;
967 }
968 
969 /*
970  *	IPv4 request_sock destructor.
971  */
972 static void tcp_v4_reqsk_destructor(struct request_sock *req)
973 {
974 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
975 }
976 
977 #ifdef CONFIG_TCP_MD5SIG
978 /*
979  * RFC2385 MD5 checksumming requires a mapping of
980  * IP address->MD5 Key.
981  * We need to maintain these in the sk structure.
982  */
983 
984 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
985 EXPORT_SYMBOL(tcp_md5_needed);
986 
987 /* Find the Key structure for an address.  */
988 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
989 					   const union tcp_md5_addr *addr,
990 					   int family)
991 {
992 	const struct tcp_sock *tp = tcp_sk(sk);
993 	struct tcp_md5sig_key *key;
994 	const struct tcp_md5sig_info *md5sig;
995 	__be32 mask;
996 	struct tcp_md5sig_key *best_match = NULL;
997 	bool match;
998 
999 	/* caller either holds rcu_read_lock() or socket lock */
1000 	md5sig = rcu_dereference_check(tp->md5sig_info,
1001 				       lockdep_sock_is_held(sk));
1002 	if (!md5sig)
1003 		return NULL;
1004 
1005 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1006 		if (key->family != family)
1007 			continue;
1008 
1009 		if (family == AF_INET) {
1010 			mask = inet_make_mask(key->prefixlen);
1011 			match = (key->addr.a4.s_addr & mask) ==
1012 				(addr->a4.s_addr & mask);
1013 #if IS_ENABLED(CONFIG_IPV6)
1014 		} else if (family == AF_INET6) {
1015 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1016 						  key->prefixlen);
1017 #endif
1018 		} else {
1019 			match = false;
1020 		}
1021 
1022 		if (match && (!best_match ||
1023 			      key->prefixlen > best_match->prefixlen))
1024 			best_match = key;
1025 	}
1026 	return best_match;
1027 }
1028 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1029 
1030 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1031 						      const union tcp_md5_addr *addr,
1032 						      int family, u8 prefixlen)
1033 {
1034 	const struct tcp_sock *tp = tcp_sk(sk);
1035 	struct tcp_md5sig_key *key;
1036 	unsigned int size = sizeof(struct in_addr);
1037 	const struct tcp_md5sig_info *md5sig;
1038 
1039 	/* caller either holds rcu_read_lock() or socket lock */
1040 	md5sig = rcu_dereference_check(tp->md5sig_info,
1041 				       lockdep_sock_is_held(sk));
1042 	if (!md5sig)
1043 		return NULL;
1044 #if IS_ENABLED(CONFIG_IPV6)
1045 	if (family == AF_INET6)
1046 		size = sizeof(struct in6_addr);
1047 #endif
1048 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1049 		if (key->family != family)
1050 			continue;
1051 		if (!memcmp(&key->addr, addr, size) &&
1052 		    key->prefixlen == prefixlen)
1053 			return key;
1054 	}
1055 	return NULL;
1056 }
1057 
1058 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1059 					 const struct sock *addr_sk)
1060 {
1061 	const union tcp_md5_addr *addr;
1062 
1063 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1064 	return tcp_md5_do_lookup(sk, addr, AF_INET);
1065 }
1066 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1067 
1068 /* This can be called on a newly created socket, from other files */
1069 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1070 		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1071 		   gfp_t gfp)
1072 {
1073 	/* Add Key to the list */
1074 	struct tcp_md5sig_key *key;
1075 	struct tcp_sock *tp = tcp_sk(sk);
1076 	struct tcp_md5sig_info *md5sig;
1077 
1078 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1079 	if (key) {
1080 		/* Pre-existing entry - just update that one. */
1081 		memcpy(key->key, newkey, newkeylen);
1082 		key->keylen = newkeylen;
1083 		return 0;
1084 	}
1085 
1086 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1087 					   lockdep_sock_is_held(sk));
1088 	if (!md5sig) {
1089 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1090 		if (!md5sig)
1091 			return -ENOMEM;
1092 
1093 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1094 		INIT_HLIST_HEAD(&md5sig->head);
1095 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1096 	}
1097 
1098 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1099 	if (!key)
1100 		return -ENOMEM;
1101 	if (!tcp_alloc_md5sig_pool()) {
1102 		sock_kfree_s(sk, key, sizeof(*key));
1103 		return -ENOMEM;
1104 	}
1105 
1106 	memcpy(key->key, newkey, newkeylen);
1107 	key->keylen = newkeylen;
1108 	key->family = family;
1109 	key->prefixlen = prefixlen;
1110 	memcpy(&key->addr, addr,
1111 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1112 				      sizeof(struct in_addr));
1113 	hlist_add_head_rcu(&key->node, &md5sig->head);
1114 	return 0;
1115 }
1116 EXPORT_SYMBOL(tcp_md5_do_add);
1117 
1118 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1119 		   u8 prefixlen)
1120 {
1121 	struct tcp_md5sig_key *key;
1122 
1123 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1124 	if (!key)
1125 		return -ENOENT;
1126 	hlist_del_rcu(&key->node);
1127 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1128 	kfree_rcu(key, rcu);
1129 	return 0;
1130 }
1131 EXPORT_SYMBOL(tcp_md5_do_del);
1132 
1133 static void tcp_clear_md5_list(struct sock *sk)
1134 {
1135 	struct tcp_sock *tp = tcp_sk(sk);
1136 	struct tcp_md5sig_key *key;
1137 	struct hlist_node *n;
1138 	struct tcp_md5sig_info *md5sig;
1139 
1140 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1141 
1142 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1143 		hlist_del_rcu(&key->node);
1144 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1145 		kfree_rcu(key, rcu);
1146 	}
1147 }
1148 
1149 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1150 				 char __user *optval, int optlen)
1151 {
1152 	struct tcp_md5sig cmd;
1153 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1154 	u8 prefixlen = 32;
1155 
1156 	if (optlen < sizeof(cmd))
1157 		return -EINVAL;
1158 
1159 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1160 		return -EFAULT;
1161 
1162 	if (sin->sin_family != AF_INET)
1163 		return -EINVAL;
1164 
1165 	if (optname == TCP_MD5SIG_EXT &&
1166 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1167 		prefixlen = cmd.tcpm_prefixlen;
1168 		if (prefixlen > 32)
1169 			return -EINVAL;
1170 	}
1171 
1172 	if (!cmd.tcpm_keylen)
1173 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1174 				      AF_INET, prefixlen);
1175 
1176 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1177 		return -EINVAL;
1178 
1179 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1180 			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1181 			      GFP_KERNEL);
1182 }
1183 
1184 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1185 				   __be32 daddr, __be32 saddr,
1186 				   const struct tcphdr *th, int nbytes)
1187 {
1188 	struct tcp4_pseudohdr *bp;
1189 	struct scatterlist sg;
1190 	struct tcphdr *_th;
1191 
1192 	bp = hp->scratch;
1193 	bp->saddr = saddr;
1194 	bp->daddr = daddr;
1195 	bp->pad = 0;
1196 	bp->protocol = IPPROTO_TCP;
1197 	bp->len = cpu_to_be16(nbytes);
1198 
1199 	_th = (struct tcphdr *)(bp + 1);
1200 	memcpy(_th, th, sizeof(*th));
1201 	_th->check = 0;
1202 
1203 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1204 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1205 				sizeof(*bp) + sizeof(*th));
1206 	return crypto_ahash_update(hp->md5_req);
1207 }
1208 
1209 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1210 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1211 {
1212 	struct tcp_md5sig_pool *hp;
1213 	struct ahash_request *req;
1214 
1215 	hp = tcp_get_md5sig_pool();
1216 	if (!hp)
1217 		goto clear_hash_noput;
1218 	req = hp->md5_req;
1219 
1220 	if (crypto_ahash_init(req))
1221 		goto clear_hash;
1222 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1223 		goto clear_hash;
1224 	if (tcp_md5_hash_key(hp, key))
1225 		goto clear_hash;
1226 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1227 	if (crypto_ahash_final(req))
1228 		goto clear_hash;
1229 
1230 	tcp_put_md5sig_pool();
1231 	return 0;
1232 
1233 clear_hash:
1234 	tcp_put_md5sig_pool();
1235 clear_hash_noput:
1236 	memset(md5_hash, 0, 16);
1237 	return 1;
1238 }
1239 
1240 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1241 			const struct sock *sk,
1242 			const struct sk_buff *skb)
1243 {
1244 	struct tcp_md5sig_pool *hp;
1245 	struct ahash_request *req;
1246 	const struct tcphdr *th = tcp_hdr(skb);
1247 	__be32 saddr, daddr;
1248 
1249 	if (sk) { /* valid for establish/request sockets */
1250 		saddr = sk->sk_rcv_saddr;
1251 		daddr = sk->sk_daddr;
1252 	} else {
1253 		const struct iphdr *iph = ip_hdr(skb);
1254 		saddr = iph->saddr;
1255 		daddr = iph->daddr;
1256 	}
1257 
1258 	hp = tcp_get_md5sig_pool();
1259 	if (!hp)
1260 		goto clear_hash_noput;
1261 	req = hp->md5_req;
1262 
1263 	if (crypto_ahash_init(req))
1264 		goto clear_hash;
1265 
1266 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1267 		goto clear_hash;
1268 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1269 		goto clear_hash;
1270 	if (tcp_md5_hash_key(hp, key))
1271 		goto clear_hash;
1272 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1273 	if (crypto_ahash_final(req))
1274 		goto clear_hash;
1275 
1276 	tcp_put_md5sig_pool();
1277 	return 0;
1278 
1279 clear_hash:
1280 	tcp_put_md5sig_pool();
1281 clear_hash_noput:
1282 	memset(md5_hash, 0, 16);
1283 	return 1;
1284 }
1285 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1286 
1287 #endif
1288 
1289 /* Called with rcu_read_lock() */
1290 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1291 				    const struct sk_buff *skb)
1292 {
1293 #ifdef CONFIG_TCP_MD5SIG
1294 	/*
1295 	 * This gets called for each TCP segment that arrives
1296 	 * so we want to be efficient.
1297 	 * We have 3 drop cases:
1298 	 * o No MD5 hash and one expected.
1299 	 * o MD5 hash and we're not expecting one.
1300 	 * o MD5 hash and it's wrong.
1301 	 */
1302 	const __u8 *hash_location = NULL;
1303 	struct tcp_md5sig_key *hash_expected;
1304 	const struct iphdr *iph = ip_hdr(skb);
1305 	const struct tcphdr *th = tcp_hdr(skb);
1306 	int genhash;
1307 	unsigned char newhash[16];
1308 
1309 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1310 					  AF_INET);
1311 	hash_location = tcp_parse_md5sig_option(th);
1312 
1313 	/* We've parsed the options - do we have a hash? */
1314 	if (!hash_expected && !hash_location)
1315 		return false;
1316 
1317 	if (hash_expected && !hash_location) {
1318 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1319 		return true;
1320 	}
1321 
1322 	if (!hash_expected && hash_location) {
1323 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1324 		return true;
1325 	}
1326 
1327 	/* Okay, so this is hash_expected and hash_location -
1328 	 * so we need to calculate the checksum.
1329 	 */
1330 	genhash = tcp_v4_md5_hash_skb(newhash,
1331 				      hash_expected,
1332 				      NULL, skb);
1333 
1334 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1335 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1336 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1337 				     &iph->saddr, ntohs(th->source),
1338 				     &iph->daddr, ntohs(th->dest),
1339 				     genhash ? " tcp_v4_calc_md5_hash failed"
1340 				     : "");
1341 		return true;
1342 	}
1343 	return false;
1344 #endif
1345 	return false;
1346 }
1347 
1348 static void tcp_v4_init_req(struct request_sock *req,
1349 			    const struct sock *sk_listener,
1350 			    struct sk_buff *skb)
1351 {
1352 	struct inet_request_sock *ireq = inet_rsk(req);
1353 	struct net *net = sock_net(sk_listener);
1354 
1355 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1356 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1357 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1358 }
1359 
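/* Per-address-family route_req hook: build the route for this connection
 * request via inet_csk_route_req(), filling in the request's IPv4 flow.
 */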
1360 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1361 					  struct flowi *fl,
1362 					  const struct request_sock *req)
1363 {
1364 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1365 }
1366 
1367 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1368 	.family		=	PF_INET,
1369 	.obj_size	=	sizeof(struct tcp_request_sock),
1370 	.rtx_syn_ack	=	tcp_rtx_synack,
1371 	.send_ack	=	tcp_v4_reqsk_send_ack,
1372 	.destructor	=	tcp_v4_reqsk_destructor,
1373 	.send_reset	=	tcp_v4_send_reset,
1374 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1375 };
1376 
1377 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1378 	.mss_clamp	=	TCP_MSS_DEFAULT,
1379 #ifdef CONFIG_TCP_MD5SIG
1380 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1381 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1382 #endif
1383 	.init_req	=	tcp_v4_init_req,
1384 #ifdef CONFIG_SYN_COOKIES
1385 	.cookie_init_seq =	cookie_v4_init_sequence,
1386 #endif
1387 	.route_req	=	tcp_v4_route_req,
1388 	.init_seq	=	tcp_v4_init_seq,
1389 	.init_ts_off	=	tcp_v4_init_ts_off,
1390 	.send_synack	=	tcp_v4_send_synack,
1391 };
1392 
1393 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1394 {
1395 	/* Never answer SYNs sent to broadcast or multicast addresses */
1396 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1397 		goto drop;
1398 
1399 	return tcp_conn_request(&tcp_request_sock_ops,
1400 				&tcp_request_sock_ipv4_ops, sk, skb);
1401 
1402 drop:
1403 	tcp_listendrop(sk);
1404 	return 0;
1405 }
1406 EXPORT_SYMBOL(tcp_v4_conn_request);
1407 
1408 
1409 /*
1410  * The three way handshake has completed - we got a valid synack -
1411  * now create the new socket.
1412  */
1413 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1414 				  struct request_sock *req,
1415 				  struct dst_entry *dst,
1416 				  struct request_sock *req_unhash,
1417 				  bool *own_req)
1418 {
1419 	struct inet_request_sock *ireq;
1420 	struct inet_sock *newinet;
1421 	struct tcp_sock *newtp;
1422 	struct sock *newsk;
1423 #ifdef CONFIG_TCP_MD5SIG
1424 	struct tcp_md5sig_key *key;
1425 #endif
1426 	struct ip_options_rcu *inet_opt;
1427 
1428 	if (sk_acceptq_is_full(sk))
1429 		goto exit_overflow;
1430 
1431 	newsk = tcp_create_openreq_child(sk, req, skb);
1432 	if (!newsk)
1433 		goto exit_nonewsk;
1434 
1435 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1436 	inet_sk_rx_dst_set(newsk, skb);
1437 
1438 	newtp		      = tcp_sk(newsk);
1439 	newinet		      = inet_sk(newsk);
1440 	ireq		      = inet_rsk(req);
1441 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1442 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1443 	newsk->sk_bound_dev_if = ireq->ir_iif;
1444 	newinet->inet_saddr   = ireq->ir_loc_addr;
1445 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1446 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1447 	newinet->mc_index     = inet_iif(skb);
1448 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1449 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1450 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1451 	if (inet_opt)
1452 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1453 	newinet->inet_id = prandom_u32();
1454 
1455 	if (!dst) {
1456 		dst = inet_csk_route_child_sock(sk, newsk, req);
1457 		if (!dst)
1458 			goto put_and_exit;
1459 	} else {
1460 		/* syncookie case : see end of cookie_v4_check() */
1461 	}
1462 	sk_setup_caps(newsk, dst);
1463 
1464 	tcp_ca_openreq_child(newsk, dst);
1465 
1466 	tcp_sync_mss(newsk, dst_mtu(dst));
1467 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1468 
1469 	tcp_initialize_rcv_mss(newsk);
1470 
1471 #ifdef CONFIG_TCP_MD5SIG
1472 	/* Copy over the MD5 key from the original socket */
1473 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1474 				AF_INET);
1475 	if (key) {
1476 		/*
1477 		 * We're using one, so create a matching key
1478 		 * on the newsk structure. If we fail to get
1479 		 * memory, then we end up not copying the key
1480 		 * across. Shucks.
1481 		 */
1482 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1483 			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1484 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1485 	}
1486 #endif
1487 
1488 	if (__inet_inherit_port(sk, newsk) < 0)
1489 		goto put_and_exit;
1490 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1491 	if (likely(*own_req)) {
1492 		tcp_move_syn(newtp, req);
1493 		ireq->ireq_opt = NULL;
1494 	} else {
1495 		newinet->inet_opt = NULL;
1496 	}
1497 	return newsk;
1498 
1499 exit_overflow:
1500 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1501 exit_nonewsk:
1502 	dst_release(dst);
1503 exit:
1504 	tcp_listendrop(sk);
1505 	return NULL;
1506 put_and_exit:
1507 	newinet->inet_opt = NULL;
1508 	inet_csk_prepare_forced_close(newsk);
1509 	tcp_done(newsk);
1510 	goto exit;
1511 }
1512 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1513 
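/* On a listening socket, a non-SYN segment may be the ACK that completes a
 * SYN-cookie handshake; cookie_v4_check() validates it and, when the cookie
 * is good, returns the newly created child socket.
 */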
1514 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1515 {
1516 #ifdef CONFIG_SYN_COOKIES
1517 	const struct tcphdr *th = tcp_hdr(skb);
1518 
1519 	if (!th->syn)
1520 		sk = cookie_v4_check(sk, skb);
1521 #endif
1522 	return sk;
1523 }
1524 
1525 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1526 			 struct tcphdr *th, u32 *cookie)
1527 {
1528 	u16 mss = 0;
1529 #ifdef CONFIG_SYN_COOKIES
1530 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1531 				    &tcp_request_sock_ipv4_ops, sk, th);
1532 	if (mss) {
1533 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1534 		tcp_synq_overflow(sk);
1535 	}
1536 #endif
1537 	return mss;
1538 }
1539 
1540 /* The socket must have its spinlock held when we get
1541  * here, unless it is a TCP_LISTEN socket.
1542  *
1543  * We have a potential double-lock case here, so even when
1544  * doing backlog processing we use the BH locking scheme.
1545  * This is because we cannot sleep with the original spinlock
1546  * held.
1547  */
1548 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1549 {
1550 	struct sock *rsk;
1551 
1552 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1553 		struct dst_entry *dst = sk->sk_rx_dst;
1554 
1555 		sock_rps_save_rxhash(sk, skb);
1556 		sk_mark_napi_id(sk, skb);
1557 		if (dst) {
1558 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1559 			    !dst->ops->check(dst, 0)) {
1560 				dst_release(dst);
1561 				sk->sk_rx_dst = NULL;
1562 			}
1563 		}
1564 		tcp_rcv_established(sk, skb);
1565 		return 0;
1566 	}
1567 
1568 	if (tcp_checksum_complete(skb))
1569 		goto csum_err;
1570 
1571 	if (sk->sk_state == TCP_LISTEN) {
1572 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1573 
1574 		if (!nsk)
1575 			goto discard;
1576 		if (nsk != sk) {
1577 			if (tcp_child_process(sk, nsk, skb)) {
1578 				rsk = nsk;
1579 				goto reset;
1580 			}
1581 			return 0;
1582 		}
1583 	} else
1584 		sock_rps_save_rxhash(sk, skb);
1585 
1586 	if (tcp_rcv_state_process(sk, skb)) {
1587 		rsk = sk;
1588 		goto reset;
1589 	}
1590 	return 0;
1591 
1592 reset:
1593 	tcp_v4_send_reset(rsk, skb);
1594 discard:
1595 	kfree_skb(skb);
1596 	/* Be careful here. If this function gets more complicated and
1597 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1598 	 * might be destroyed here. This current version compiles correctly,
1599 	 * but you have been warned.
1600 	 */
1601 	return 0;
1602 
1603 csum_err:
1604 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1605 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1606 	goto discard;
1607 }
1608 EXPORT_SYMBOL(tcp_v4_do_rcv);
1609 
1610 int tcp_v4_early_demux(struct sk_buff *skb)
1611 {
1612 	const struct iphdr *iph;
1613 	const struct tcphdr *th;
1614 	struct sock *sk;
1615 
1616 	if (skb->pkt_type != PACKET_HOST)
1617 		return 0;
1618 
1619 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1620 		return 0;
1621 
1622 	iph = ip_hdr(skb);
1623 	th = tcp_hdr(skb);
1624 
1625 	if (th->doff < sizeof(struct tcphdr) / 4)
1626 		return 0;
1627 
1628 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1629 				       iph->saddr, th->source,
1630 				       iph->daddr, ntohs(th->dest),
1631 				       skb->skb_iif, inet_sdif(skb));
1632 	if (sk) {
1633 		skb->sk = sk;
1634 		skb->destructor = sock_edemux;
1635 		if (sk_fullsock(sk)) {
1636 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1637 
1638 			if (dst)
1639 				dst = dst_check(dst, 0);
1640 			if (dst &&
1641 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1642 				skb_dst_set_noref(skb, dst);
1643 		}
1644 	}
1645 	return 0;
1646 }
1647 
1648 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1649 {
1650 	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1651 	struct skb_shared_info *shinfo;
1652 	const struct tcphdr *th;
1653 	struct tcphdr *thtail;
1654 	struct sk_buff *tail;
1655 	unsigned int hdrlen;
1656 	bool fragstolen;
1657 	u32 gso_segs;
1658 	int delta;
1659 
1660 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1661 	 * we can fix skb->truesize to its real value to avoid future drops.
1662 	 * This is valid because skb is not yet charged to the socket.
1663 	 * It has been noticed that pure SACK packets were sometimes dropped
1664 	 * (if cooked by drivers without copybreak feature).
1665 	 */
1666 	skb_condense(skb);
1667 
1668 	skb_dst_drop(skb);
1669 
1670 	if (unlikely(tcp_checksum_complete(skb))) {
1671 		bh_unlock_sock(sk);
1672 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1673 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1674 		return true;
1675 	}
1676 
1677 	/* Attempt coalescing to last skb in backlog, even if we are
1678 	 * above the limits.
1679 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1680 	 */
1681 	th = (const struct tcphdr *)skb->data;
1682 	hdrlen = th->doff * 4;
1683 	shinfo = skb_shinfo(skb);
1684 
1685 	if (!shinfo->gso_size)
1686 		shinfo->gso_size = skb->len - hdrlen;
1687 
1688 	if (!shinfo->gso_segs)
1689 		shinfo->gso_segs = 1;
1690 
1691 	tail = sk->sk_backlog.tail;
1692 	if (!tail)
1693 		goto no_coalesce;
1694 	thtail = (struct tcphdr *)tail->data;
1695 
1696 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1697 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1698 	    ((TCP_SKB_CB(tail)->tcp_flags |
1699 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1700 	    !((TCP_SKB_CB(tail)->tcp_flags &
1701 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1702 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1703 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1704 #ifdef CONFIG_TLS_DEVICE
1705 	    tail->decrypted != skb->decrypted ||
1706 #endif
1707 	    thtail->doff != th->doff ||
1708 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1709 		goto no_coalesce;
1710 
1711 	__skb_pull(skb, hdrlen);
1712 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1713 		thtail->window = th->window;
1714 
1715 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1716 
1717 		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1718 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1719 
1720 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1721 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1722 		 * is not entered if we append a packet with a FIN.
1723 		 * SYN, RST, URG are not present.
1724 		 * ACK is set on both packets.
1725 		 * PSH : we do not really care in TCP stack,
1726 		 *       at least for 'GRO' packets.
1727 		 */
1728 		thtail->fin |= th->fin;
1729 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1730 
1731 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1732 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1733 			tail->tstamp = skb->tstamp;
1734 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1735 		}
1736 
1737 		/* Not as strict as GRO. We only need to carry the max mss value */
1738 		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1739 						 skb_shinfo(tail)->gso_size);
1740 
1741 		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1742 		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1743 
1744 		sk->sk_backlog.len += delta;
1745 		__NET_INC_STATS(sock_net(sk),
1746 				LINUX_MIB_TCPBACKLOGCOALESCE);
1747 		kfree_skb_partial(skb, fragstolen);
1748 		return false;
1749 	}
1750 	__skb_push(skb, hdrlen);
1751 
1752 no_coalesce:
1753 	/* Only socket owner can try to collapse/prune rx queues
1754 	 * to reduce memory overhead, so add a little headroom here.
1755 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1756 	 */
1757 	limit += 64*1024;
1758 
1759 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1760 		bh_unlock_sock(sk);
1761 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1762 		return true;
1763 	}
1764 	return false;
1765 }
1766 EXPORT_SYMBOL(tcp_add_backlog);
1767 
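/* Run the attached socket filter on the segment; the skb is never trimmed
 * below the TCP header length (th->doff * 4).
 */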
1768 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1769 {
1770 	struct tcphdr *th = (struct tcphdr *)skb->data;
1771 
1772 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1773 }
1774 EXPORT_SYMBOL(tcp_filter);
1775 
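/* Undo tcp_v4_fill_cb(): move the IP control block back to its original
 * location before the skb is handed to another socket or looked up again.
 */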
1776 static void tcp_v4_restore_cb(struct sk_buff *skb)
1777 {
1778 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1779 		sizeof(struct inet_skb_parm));
1780 }
1781 
1782 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1783 			   const struct tcphdr *th)
1784 {
1785 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1786 	 * barrier() makes sure the compiler won't play aliasing games.
1787 	 */
1788 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1789 		sizeof(struct inet_skb_parm));
1790 	barrier();
1791 
1792 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1793 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1794 				    skb->len - th->doff * 4);
1795 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1796 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1797 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1798 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1799 	TCP_SKB_CB(skb)->sacked	 = 0;
1800 	TCP_SKB_CB(skb)->has_rxtstamp =
1801 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1802 }
1803 
1804 /*
1805  *	From tcp_input.c
1806  */
1807 
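/* Main IPv4 TCP receive routine: validate the header and checksum, look up
 * the owning socket and either process the segment directly, hand it to a
 * listener or request socket, or queue it on the socket backlog.
 */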
1808 int tcp_v4_rcv(struct sk_buff *skb)
1809 {
1810 	struct net *net = dev_net(skb->dev);
1811 	struct sk_buff *skb_to_free;
1812 	int sdif = inet_sdif(skb);
1813 	const struct iphdr *iph;
1814 	const struct tcphdr *th;
1815 	bool refcounted;
1816 	struct sock *sk;
1817 	int ret;
1818 
1819 	if (skb->pkt_type != PACKET_HOST)
1820 		goto discard_it;
1821 
1822 	/* Count it even if it's bad */
1823 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1824 
1825 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1826 		goto discard_it;
1827 
1828 	th = (const struct tcphdr *)skb->data;
1829 
1830 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1831 		goto bad_packet;
1832 	if (!pskb_may_pull(skb, th->doff * 4))
1833 		goto discard_it;
1834 
1835 	/* An explanation is required here: packet length and doff are
1836 	 * validated later by header prediction, provided the case of
1837 	 * th->doff == 0 has been eliminated above.
1838 	 * So, we defer the checks. */
1839 
1840 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1841 		goto csum_error;
1842 
1843 	th = (const struct tcphdr *)skb->data;
1844 	iph = ip_hdr(skb);
1845 lookup:
1846 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1847 			       th->dest, sdif, &refcounted);
1848 	if (!sk)
1849 		goto no_tcp_socket;
1850 
1851 process:
1852 	if (sk->sk_state == TCP_TIME_WAIT)
1853 		goto do_time_wait;
1854 
1855 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1856 		struct request_sock *req = inet_reqsk(sk);
1857 		bool req_stolen = false;
1858 		struct sock *nsk;
1859 
1860 		sk = req->rsk_listener;
1861 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1862 			sk_drops_add(sk, skb);
1863 			reqsk_put(req);
1864 			goto discard_it;
1865 		}
1866 		if (tcp_checksum_complete(skb)) {
1867 			reqsk_put(req);
1868 			goto csum_error;
1869 		}
1870 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1871 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1872 			goto lookup;
1873 		}
1874 		/* We own a reference on the listener, increase it again
1875 		 * as we might lose it too soon.
1876 		 */
1877 		sock_hold(sk);
1878 		refcounted = true;
1879 		nsk = NULL;
1880 		if (!tcp_filter(sk, skb)) {
1881 			th = (const struct tcphdr *)skb->data;
1882 			iph = ip_hdr(skb);
1883 			tcp_v4_fill_cb(skb, iph, th);
1884 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1885 		}
1886 		if (!nsk) {
1887 			reqsk_put(req);
1888 			if (req_stolen) {
1889 				/* Another cpu got exclusive access to req
1890 				 * and created a full blown socket.
1891 				 * Try to feed this packet to this socket
1892 				 * instead of discarding it.
1893 				 */
1894 				tcp_v4_restore_cb(skb);
1895 				sock_put(sk);
1896 				goto lookup;
1897 			}
1898 			goto discard_and_relse;
1899 		}
1900 		if (nsk == sk) {
1901 			reqsk_put(req);
1902 			tcp_v4_restore_cb(skb);
1903 		} else if (tcp_child_process(sk, nsk, skb)) {
1904 			tcp_v4_send_reset(nsk, skb);
1905 			goto discard_and_relse;
1906 		} else {
1907 			sock_put(sk);
1908 			return 0;
1909 		}
1910 	}
1911 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1912 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1913 		goto discard_and_relse;
1914 	}
1915 
1916 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1917 		goto discard_and_relse;
1918 
1919 	if (tcp_v4_inbound_md5_hash(sk, skb))
1920 		goto discard_and_relse;
1921 
1922 	nf_reset_ct(skb);
1923 
1924 	if (tcp_filter(sk, skb))
1925 		goto discard_and_relse;
1926 	th = (const struct tcphdr *)skb->data;
1927 	iph = ip_hdr(skb);
1928 	tcp_v4_fill_cb(skb, iph, th);
1929 
1930 	skb->dev = NULL;
1931 
1932 	if (sk->sk_state == TCP_LISTEN) {
1933 		ret = tcp_v4_do_rcv(sk, skb);
1934 		goto put_and_return;
1935 	}
1936 
1937 	sk_incoming_cpu_update(sk);
1938 
1939 	bh_lock_sock_nested(sk);
1940 	tcp_segs_in(tcp_sk(sk), skb);
1941 	ret = 0;
1942 	if (!sock_owned_by_user(sk)) {
1943 		skb_to_free = sk->sk_rx_skb_cache;
1944 		sk->sk_rx_skb_cache = NULL;
1945 		ret = tcp_v4_do_rcv(sk, skb);
1946 	} else {
1947 		if (tcp_add_backlog(sk, skb))
1948 			goto discard_and_relse;
1949 		skb_to_free = NULL;
1950 	}
1951 	bh_unlock_sock(sk);
1952 	if (skb_to_free)
1953 		__kfree_skb(skb_to_free);
1954 
1955 put_and_return:
1956 	if (refcounted)
1957 		sock_put(sk);
1958 
1959 	return ret;
1960 
1961 no_tcp_socket:
1962 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1963 		goto discard_it;
1964 
1965 	tcp_v4_fill_cb(skb, iph, th);
1966 
1967 	if (tcp_checksum_complete(skb)) {
1968 csum_error:
1969 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1970 bad_packet:
1971 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1972 	} else {
1973 		tcp_v4_send_reset(NULL, skb);
1974 	}
1975 
1976 discard_it:
1977 	/* Discard frame. */
1978 	kfree_skb(skb);
1979 	return 0;
1980 
1981 discard_and_relse:
1982 	sk_drops_add(sk, skb);
1983 	if (refcounted)
1984 		sock_put(sk);
1985 	goto discard_it;
1986 
1987 do_time_wait:
1988 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1989 		inet_twsk_put(inet_twsk(sk));
1990 		goto discard_it;
1991 	}
1992 
1993 	tcp_v4_fill_cb(skb, iph, th);
1994 
1995 	if (tcp_checksum_complete(skb)) {
1996 		inet_twsk_put(inet_twsk(sk));
1997 		goto csum_error;
1998 	}
1999 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2000 	case TCP_TW_SYN: {
2001 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2002 							&tcp_hashinfo, skb,
2003 							__tcp_hdrlen(th),
2004 							iph->saddr, th->source,
2005 							iph->daddr, th->dest,
2006 							inet_iif(skb),
2007 							sdif);
2008 		if (sk2) {
2009 			inet_twsk_deschedule_put(inet_twsk(sk));
2010 			sk = sk2;
2011 			tcp_v4_restore_cb(skb);
2012 			refcounted = false;
2013 			goto process;
2014 		}
2015 	}
2016 		/* to ACK */
2017 		/* fall through */
2018 	case TCP_TW_ACK:
2019 		tcp_v4_timewait_ack(sk, skb);
2020 		break;
2021 	case TCP_TW_RST:
2022 		tcp_v4_send_reset(sk, skb);
2023 		inet_twsk_deschedule_put(inet_twsk(sk));
2024 		goto discard_it;
2025 	case TCP_TW_SUCCESS:;
2026 	}
2027 	goto discard_it;
2028 }
2029 
2030 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2031 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2032 	.twsk_unique	= tcp_twsk_unique,
2033 	.twsk_destructor= tcp_twsk_destructor,
2034 };
2035 
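/* Cache the input route and incoming interface index in the socket so that
 * early demux can reuse them for later segments.
 */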
2036 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2037 {
2038 	struct dst_entry *dst = skb_dst(skb);
2039 
2040 	if (dst && dst_hold_safe(dst)) {
2041 		sk->sk_rx_dst = dst;
2042 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2043 	}
2044 }
2045 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2046 
2047 const struct inet_connection_sock_af_ops ipv4_specific = {
2048 	.queue_xmit	   = ip_queue_xmit,
2049 	.send_check	   = tcp_v4_send_check,
2050 	.rebuild_header	   = inet_sk_rebuild_header,
2051 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2052 	.conn_request	   = tcp_v4_conn_request,
2053 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2054 	.net_header_len	   = sizeof(struct iphdr),
2055 	.setsockopt	   = ip_setsockopt,
2056 	.getsockopt	   = ip_getsockopt,
2057 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2058 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2059 #ifdef CONFIG_COMPAT
2060 	.compat_setsockopt = compat_ip_setsockopt,
2061 	.compat_getsockopt = compat_ip_getsockopt,
2062 #endif
2063 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2064 };
2065 EXPORT_SYMBOL(ipv4_specific);
2066 
2067 #ifdef CONFIG_TCP_MD5SIG
2068 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2069 	.md5_lookup		= tcp_v4_md5_lookup,
2070 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2071 	.md5_parse		= tcp_v4_parse_md5_keys,
2072 };
2073 #endif
2074 
2075 /* NOTE: A lot of fields are zeroed explicitly by the call to sk_alloc(),
2076  *       so they need not be initialized here.
2077  */
2078 static int tcp_v4_init_sock(struct sock *sk)
2079 {
2080 	struct inet_connection_sock *icsk = inet_csk(sk);
2081 
2082 	tcp_init_sock(sk);
2083 
2084 	icsk->icsk_af_ops = &ipv4_specific;
2085 
2086 #ifdef CONFIG_TCP_MD5SIG
2087 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2088 #endif
2089 
2090 	return 0;
2091 }
2092 
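/* Release per-socket TCP state: timers, congestion control, ULP, the write
 * and out-of-order queues, MD5 keys, Fast Open state and the bind bucket.
 */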
2093 void tcp_v4_destroy_sock(struct sock *sk)
2094 {
2095 	struct tcp_sock *tp = tcp_sk(sk);
2096 
2097 	trace_tcp_destroy_sock(sk);
2098 
2099 	tcp_clear_xmit_timers(sk);
2100 
2101 	tcp_cleanup_congestion_control(sk);
2102 
2103 	tcp_cleanup_ulp(sk);
2104 
2105 	/* Clean up the write buffer. */
2106 	tcp_write_queue_purge(sk);
2107 
2108 	/* Check if we want to disable active TFO */
2109 	tcp_fastopen_active_disable_ofo_check(sk);
2110 
2111 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2112 	skb_rbtree_purge(&tp->out_of_order_queue);
2113 
2114 #ifdef CONFIG_TCP_MD5SIG
2115 	/* Clean up the MD5 key list, if any */
2116 	if (tp->md5sig_info) {
2117 		tcp_clear_md5_list(sk);
2118 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2119 		tp->md5sig_info = NULL;
2120 	}
2121 #endif
2122 
2123 	/* Clean up a referenced TCP bind bucket. */
2124 	if (inet_csk(sk)->icsk_bind_hash)
2125 		inet_put_port(sk);
2126 
2127 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2128 
2129 	/* If socket is aborted during connect operation */
2130 	tcp_free_fastopen_req(tp);
2131 	tcp_fastopen_destroy_cipher(sk);
2132 	tcp_saved_syn_free(tp);
2133 
2134 	sk_sockets_allocated_dec(sk);
2135 }
2136 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2137 
2138 #ifdef CONFIG_PROC_FS
2139 /* Proc filesystem TCP sock list dumping. */
2140 
2141 /*
2142  * Get the next listening socket after cur.  If cur is NULL, get the first
2143  * socket starting from the bucket given in st->bucket; when st->bucket is
2144  * zero, the very first socket in the hash table is returned.
2145  */
2146 static void *listening_get_next(struct seq_file *seq, void *cur)
2147 {
2148 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2149 	struct tcp_iter_state *st = seq->private;
2150 	struct net *net = seq_file_net(seq);
2151 	struct inet_listen_hashbucket *ilb;
2152 	struct hlist_nulls_node *node;
2153 	struct sock *sk = cur;
2154 
2155 	if (!sk) {
2156 get_head:
2157 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2158 		spin_lock(&ilb->lock);
2159 		sk = sk_nulls_head(&ilb->nulls_head);
2160 		st->offset = 0;
2161 		goto get_sk;
2162 	}
2163 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2164 	++st->num;
2165 	++st->offset;
2166 
2167 	sk = sk_nulls_next(sk);
2168 get_sk:
2169 	sk_nulls_for_each_from(sk, node) {
2170 		if (!net_eq(sock_net(sk), net))
2171 			continue;
2172 		if (sk->sk_family == afinfo->family)
2173 			return sk;
2174 	}
2175 	spin_unlock(&ilb->lock);
2176 	st->offset = 0;
2177 	if (++st->bucket < INET_LHTABLE_SIZE)
2178 		goto get_head;
2179 	return NULL;
2180 }
2181 
2182 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2183 {
2184 	struct tcp_iter_state *st = seq->private;
2185 	void *rc;
2186 
2187 	st->bucket = 0;
2188 	st->offset = 0;
2189 	rc = listening_get_next(seq, NULL);
2190 
2191 	while (rc && *pos) {
2192 		rc = listening_get_next(seq, rc);
2193 		--*pos;
2194 	}
2195 	return rc;
2196 }
2197 
2198 static inline bool empty_bucket(const struct tcp_iter_state *st)
2199 {
2200 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2201 }
2202 
2203 /*
2204  * Get first established socket starting from bucket given in st->bucket.
2205  * If st->bucket is zero, the very first socket in the hash is returned.
2206  */
2207 static void *established_get_first(struct seq_file *seq)
2208 {
2209 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2210 	struct tcp_iter_state *st = seq->private;
2211 	struct net *net = seq_file_net(seq);
2212 	void *rc = NULL;
2213 
2214 	st->offset = 0;
2215 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2216 		struct sock *sk;
2217 		struct hlist_nulls_node *node;
2218 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2219 
2220 		/* Lockless fast path for the common case of empty buckets */
2221 		if (empty_bucket(st))
2222 			continue;
2223 
2224 		spin_lock_bh(lock);
2225 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2226 			if (sk->sk_family != afinfo->family ||
2227 			    !net_eq(sock_net(sk), net)) {
2228 				continue;
2229 			}
2230 			rc = sk;
2231 			goto out;
2232 		}
2233 		spin_unlock_bh(lock);
2234 	}
2235 out:
2236 	return rc;
2237 }
2238 
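/* Get the established socket following cur in the current ehash chain,
 * moving on to the next non-empty bucket when the chain is exhausted.
 */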
2239 static void *established_get_next(struct seq_file *seq, void *cur)
2240 {
2241 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2242 	struct sock *sk = cur;
2243 	struct hlist_nulls_node *node;
2244 	struct tcp_iter_state *st = seq->private;
2245 	struct net *net = seq_file_net(seq);
2246 
2247 	++st->num;
2248 	++st->offset;
2249 
2250 	sk = sk_nulls_next(sk);
2251 
2252 	sk_nulls_for_each_from(sk, node) {
2253 		if (sk->sk_family == afinfo->family &&
2254 		    net_eq(sock_net(sk), net))
2255 			return sk;
2256 	}
2257 
2258 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2259 	++st->bucket;
2260 	return established_get_first(seq);
2261 }
2262 
2263 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2264 {
2265 	struct tcp_iter_state *st = seq->private;
2266 	void *rc;
2267 
2268 	st->bucket = 0;
2269 	rc = established_get_first(seq);
2270 
2271 	while (rc && pos) {
2272 		rc = established_get_next(seq, rc);
2273 		--pos;
2274 	}
2275 	return rc;
2276 }
2277 
2278 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2279 {
2280 	void *rc;
2281 	struct tcp_iter_state *st = seq->private;
2282 
2283 	st->state = TCP_SEQ_STATE_LISTENING;
2284 	rc	  = listening_get_idx(seq, &pos);
2285 
2286 	if (!rc) {
2287 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2288 		rc	  = established_get_idx(seq, pos);
2289 	}
2290 
2291 	return rc;
2292 }
2293 
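/* Resume iteration at the bucket and offset saved in the iterator state,
 * so a partially read seq_file does not restart from the first bucket.
 */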
2294 static void *tcp_seek_last_pos(struct seq_file *seq)
2295 {
2296 	struct tcp_iter_state *st = seq->private;
2297 	int offset = st->offset;
2298 	int orig_num = st->num;
2299 	void *rc = NULL;
2300 
2301 	switch (st->state) {
2302 	case TCP_SEQ_STATE_LISTENING:
2303 		if (st->bucket >= INET_LHTABLE_SIZE)
2304 			break;
2305 		st->state = TCP_SEQ_STATE_LISTENING;
2306 		rc = listening_get_next(seq, NULL);
2307 		while (offset-- && rc)
2308 			rc = listening_get_next(seq, rc);
2309 		if (rc)
2310 			break;
2311 		st->bucket = 0;
2312 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2313 		/* Fallthrough */
2314 	case TCP_SEQ_STATE_ESTABLISHED:
2315 		if (st->bucket > tcp_hashinfo.ehash_mask)
2316 			break;
2317 		rc = established_get_first(seq);
2318 		while (offset-- && rc)
2319 			rc = established_get_next(seq, rc);
2320 	}
2321 
2322 	st->num = orig_num;
2323 
2324 	return rc;
2325 }
2326 
2327 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2328 {
2329 	struct tcp_iter_state *st = seq->private;
2330 	void *rc;
2331 
2332 	if (*pos && *pos == st->last_pos) {
2333 		rc = tcp_seek_last_pos(seq);
2334 		if (rc)
2335 			goto out;
2336 	}
2337 
2338 	st->state = TCP_SEQ_STATE_LISTENING;
2339 	st->num = 0;
2340 	st->bucket = 0;
2341 	st->offset = 0;
2342 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2343 
2344 out:
2345 	st->last_pos = *pos;
2346 	return rc;
2347 }
2348 EXPORT_SYMBOL(tcp_seq_start);
2349 
2350 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2351 {
2352 	struct tcp_iter_state *st = seq->private;
2353 	void *rc = NULL;
2354 
2355 	if (v == SEQ_START_TOKEN) {
2356 		rc = tcp_get_idx(seq, 0);
2357 		goto out;
2358 	}
2359 
2360 	switch (st->state) {
2361 	case TCP_SEQ_STATE_LISTENING:
2362 		rc = listening_get_next(seq, v);
2363 		if (!rc) {
2364 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2365 			st->bucket = 0;
2366 			st->offset = 0;
2367 			rc	  = established_get_first(seq);
2368 		}
2369 		break;
2370 	case TCP_SEQ_STATE_ESTABLISHED:
2371 		rc = established_get_next(seq, v);
2372 		break;
2373 	}
2374 out:
2375 	++*pos;
2376 	st->last_pos = *pos;
2377 	return rc;
2378 }
2379 EXPORT_SYMBOL(tcp_seq_next);
2380 
2381 void tcp_seq_stop(struct seq_file *seq, void *v)
2382 {
2383 	struct tcp_iter_state *st = seq->private;
2384 
2385 	switch (st->state) {
2386 	case TCP_SEQ_STATE_LISTENING:
2387 		if (v != SEQ_START_TOKEN)
2388 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2389 		break;
2390 	case TCP_SEQ_STATE_ESTABLISHED:
2391 		if (v)
2392 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2393 		break;
2394 	}
2395 }
2396 EXPORT_SYMBOL(tcp_seq_stop);
2397 
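/* Format one /proc/net/tcp line for a request socket (SYN_RECV). */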
2398 static void get_openreq4(const struct request_sock *req,
2399 			 struct seq_file *f, int i)
2400 {
2401 	const struct inet_request_sock *ireq = inet_rsk(req);
2402 	long delta = req->rsk_timer.expires - jiffies;
2403 
2404 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2405 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2406 		i,
2407 		ireq->ir_loc_addr,
2408 		ireq->ir_num,
2409 		ireq->ir_rmt_addr,
2410 		ntohs(ireq->ir_rmt_port),
2411 		TCP_SYN_RECV,
2412 		0, 0, /* could print option size, but that is af dependent. */
2413 		1,    /* timers active (only the expire timer) */
2414 		jiffies_delta_to_clock_t(delta),
2415 		req->num_timeout,
2416 		from_kuid_munged(seq_user_ns(f),
2417 				 sock_i_uid(req->rsk_listener)),
2418 		0,  /* non standard timer */
2419 		0, /* open_requests have no inode */
2420 		0,
2421 		req);
2422 }
2423 
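/* Format one /proc/net/tcp line for a full socket, including timer state,
 * queue sizes and congestion state.
 */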
2424 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2425 {
2426 	int timer_active;
2427 	unsigned long timer_expires;
2428 	const struct tcp_sock *tp = tcp_sk(sk);
2429 	const struct inet_connection_sock *icsk = inet_csk(sk);
2430 	const struct inet_sock *inet = inet_sk(sk);
2431 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2432 	__be32 dest = inet->inet_daddr;
2433 	__be32 src = inet->inet_rcv_saddr;
2434 	__u16 destp = ntohs(inet->inet_dport);
2435 	__u16 srcp = ntohs(inet->inet_sport);
2436 	int rx_queue;
2437 	int state;
2438 
2439 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2440 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2441 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2442 		timer_active	= 1;
2443 		timer_expires	= icsk->icsk_timeout;
2444 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2445 		timer_active	= 4;
2446 		timer_expires	= icsk->icsk_timeout;
2447 	} else if (timer_pending(&sk->sk_timer)) {
2448 		timer_active	= 2;
2449 		timer_expires	= sk->sk_timer.expires;
2450 	} else {
2451 		timer_active	= 0;
2452 		timer_expires = jiffies;
2453 	}
2454 
2455 	state = inet_sk_state_load(sk);
2456 	if (state == TCP_LISTEN)
2457 		rx_queue = sk->sk_ack_backlog;
2458 	else
2459 		/* Because we don't lock the socket,
2460 		 * we might find a transient negative value.
2461 		 */
2462 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2463 				      READ_ONCE(tp->copied_seq), 0);
2464 
2465 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2466 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2467 		i, src, srcp, dest, destp, state,
2468 		READ_ONCE(tp->write_seq) - tp->snd_una,
2469 		rx_queue,
2470 		timer_active,
2471 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2472 		icsk->icsk_retransmits,
2473 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2474 		icsk->icsk_probes_out,
2475 		sock_i_ino(sk),
2476 		refcount_read(&sk->sk_refcnt), sk,
2477 		jiffies_to_clock_t(icsk->icsk_rto),
2478 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2479 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2480 		tp->snd_cwnd,
2481 		state == TCP_LISTEN ?
2482 		    fastopenq->max_qlen :
2483 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2484 }
2485 
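/* Format one /proc/net/tcp line for a TIME_WAIT socket. */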
2486 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2487 			       struct seq_file *f, int i)
2488 {
2489 	long delta = tw->tw_timer.expires - jiffies;
2490 	__be32 dest, src;
2491 	__u16 destp, srcp;
2492 
2493 	dest  = tw->tw_daddr;
2494 	src   = tw->tw_rcv_saddr;
2495 	destp = ntohs(tw->tw_dport);
2496 	srcp  = ntohs(tw->tw_sport);
2497 
2498 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2499 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2500 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2501 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2502 		refcount_read(&tw->tw_refcnt), tw);
2503 }
2504 
2505 #define TMPSZ 150
2506 
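/* Show one /proc/net/tcp row, dispatching on the socket state. */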
2507 static int tcp4_seq_show(struct seq_file *seq, void *v)
2508 {
2509 	struct tcp_iter_state *st;
2510 	struct sock *sk = v;
2511 
2512 	seq_setwidth(seq, TMPSZ - 1);
2513 	if (v == SEQ_START_TOKEN) {
2514 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2515 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2516 			   "inode");
2517 		goto out;
2518 	}
2519 	st = seq->private;
2520 
2521 	if (sk->sk_state == TCP_TIME_WAIT)
2522 		get_timewait4_sock(v, seq, st->num);
2523 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2524 		get_openreq4(v, seq, st->num);
2525 	else
2526 		get_tcp4_sock(v, seq, st->num);
2527 out:
2528 	seq_pad(seq, '\n');
2529 	return 0;
2530 }
2531 
2532 static const struct seq_operations tcp4_seq_ops = {
2533 	.show		= tcp4_seq_show,
2534 	.start		= tcp_seq_start,
2535 	.next		= tcp_seq_next,
2536 	.stop		= tcp_seq_stop,
2537 };
2538 
2539 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2540 	.family		= AF_INET,
2541 };
2542 
2543 static int __net_init tcp4_proc_init_net(struct net *net)
2544 {
2545 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2546 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2547 		return -ENOMEM;
2548 	return 0;
2549 }
2550 
2551 static void __net_exit tcp4_proc_exit_net(struct net *net)
2552 {
2553 	remove_proc_entry("tcp", net->proc_net);
2554 }
2555 
2556 static struct pernet_operations tcp4_net_ops = {
2557 	.init = tcp4_proc_init_net,
2558 	.exit = tcp4_proc_exit_net,
2559 };
2560 
2561 int __init tcp4_proc_init(void)
2562 {
2563 	return register_pernet_subsys(&tcp4_net_ops);
2564 }
2565 
2566 void tcp4_proc_exit(void)
2567 {
2568 	unregister_pernet_subsys(&tcp4_net_ops);
2569 }
2570 #endif /* CONFIG_PROC_FS */
2571 
2572 struct proto tcp_prot = {
2573 	.name			= "TCP",
2574 	.owner			= THIS_MODULE,
2575 	.close			= tcp_close,
2576 	.pre_connect		= tcp_v4_pre_connect,
2577 	.connect		= tcp_v4_connect,
2578 	.disconnect		= tcp_disconnect,
2579 	.accept			= inet_csk_accept,
2580 	.ioctl			= tcp_ioctl,
2581 	.init			= tcp_v4_init_sock,
2582 	.destroy		= tcp_v4_destroy_sock,
2583 	.shutdown		= tcp_shutdown,
2584 	.setsockopt		= tcp_setsockopt,
2585 	.getsockopt		= tcp_getsockopt,
2586 	.keepalive		= tcp_set_keepalive,
2587 	.recvmsg		= tcp_recvmsg,
2588 	.sendmsg		= tcp_sendmsg,
2589 	.sendpage		= tcp_sendpage,
2590 	.backlog_rcv		= tcp_v4_do_rcv,
2591 	.release_cb		= tcp_release_cb,
2592 	.hash			= inet_hash,
2593 	.unhash			= inet_unhash,
2594 	.get_port		= inet_csk_get_port,
2595 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2596 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2597 	.stream_memory_free	= tcp_stream_memory_free,
2598 	.sockets_allocated	= &tcp_sockets_allocated,
2599 	.orphan_count		= &tcp_orphan_count,
2600 	.memory_allocated	= &tcp_memory_allocated,
2601 	.memory_pressure	= &tcp_memory_pressure,
2602 	.sysctl_mem		= sysctl_tcp_mem,
2603 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2604 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2605 	.max_header		= MAX_TCP_HEADER,
2606 	.obj_size		= sizeof(struct tcp_sock),
2607 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2608 	.twsk_prot		= &tcp_timewait_sock_ops,
2609 	.rsk_prot		= &tcp_request_sock_ops,
2610 	.h.hashinfo		= &tcp_hashinfo,
2611 	.no_autobind		= true,
2612 #ifdef CONFIG_COMPAT
2613 	.compat_setsockopt	= compat_tcp_setsockopt,
2614 	.compat_getsockopt	= compat_tcp_getsockopt,
2615 #endif
2616 	.diag_destroy		= tcp_abort,
2617 };
2618 EXPORT_SYMBOL(tcp_prot);
2619 
2620 static void __net_exit tcp_sk_exit(struct net *net)
2621 {
2622 	int cpu;
2623 
2624 	if (net->ipv4.tcp_congestion_control)
2625 		module_put(net->ipv4.tcp_congestion_control->owner);
2626 
2627 	for_each_possible_cpu(cpu)
2628 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2629 	free_percpu(net->ipv4.tcp_sk);
2630 }
2631 
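/* Per-netns TCP initialization: create the per-cpu control sockets used to
 * send resets and ACKs, and set this namespace's sysctl defaults.
 */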
2632 static int __net_init tcp_sk_init(struct net *net)
2633 {
2634 	int res, cpu, cnt;
2635 
2636 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2637 	if (!net->ipv4.tcp_sk)
2638 		return -ENOMEM;
2639 
2640 	for_each_possible_cpu(cpu) {
2641 		struct sock *sk;
2642 
2643 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2644 					   IPPROTO_TCP, net);
2645 		if (res)
2646 			goto fail;
2647 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2648 
2649 		/* Please enforce IP_DF and IPID==0 for RST and
2650 		 * ACK sent in SYN-RECV and TIME-WAIT state.
2651 		 */
2652 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2653 
2654 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2655 	}
2656 
2657 	net->ipv4.sysctl_tcp_ecn = 2;
2658 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2659 
2660 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2661 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2662 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2663 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2664 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2665 
2666 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2667 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2668 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2669 
2670 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2671 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2672 	net->ipv4.sysctl_tcp_syncookies = 1;
2673 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2674 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2675 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2676 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2677 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2678 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2679 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2680 
2681 	cnt = tcp_hashinfo.ehash_mask + 1;
2682 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2683 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2684 
2685 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2686 	net->ipv4.sysctl_tcp_sack = 1;
2687 	net->ipv4.sysctl_tcp_window_scaling = 1;
2688 	net->ipv4.sysctl_tcp_timestamps = 1;
2689 	net->ipv4.sysctl_tcp_early_retrans = 3;
2690 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2691 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2692 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2693 	net->ipv4.sysctl_tcp_max_reordering = 300;
2694 	net->ipv4.sysctl_tcp_dsack = 1;
2695 	net->ipv4.sysctl_tcp_app_win = 31;
2696 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2697 	net->ipv4.sysctl_tcp_frto = 2;
2698 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2699 	/* This limits the percentage of the congestion window which we
2700 	 * will allow a single TSO frame to consume.  Building TSO frames
2701 	 * which are too large can cause TCP streams to be bursty.
2702 	 */
2703 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2704 	/* Default TSQ limit of 16 TSO segments */
2705 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2706 	/* rfc5961 challenge ack rate limiting */
2707 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2708 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2709 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2710 	net->ipv4.sysctl_tcp_autocorking = 1;
2711 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2712 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2713 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2714 	if (net != &init_net) {
2715 		memcpy(net->ipv4.sysctl_tcp_rmem,
2716 		       init_net.ipv4.sysctl_tcp_rmem,
2717 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2718 		memcpy(net->ipv4.sysctl_tcp_wmem,
2719 		       init_net.ipv4.sysctl_tcp_wmem,
2720 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2721 	}
2722 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2723 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2724 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2725 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2726 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2727 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2728 
2729 	/* Reno is always built in */
2730 	if (!net_eq(net, &init_net) &&
2731 	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2732 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2733 	else
2734 		net->ipv4.tcp_congestion_control = &tcp_reno;
2735 
2736 	return 0;
2737 fail:
2738 	tcp_sk_exit(net);
2739 
2740 	return res;
2741 }
2742 
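/* Batched netns teardown: purge IPv4 TIME_WAIT sockets and destroy each
 * exiting namespace's TCP Fast Open context.
 */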
2743 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2744 {
2745 	struct net *net;
2746 
2747 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2748 
2749 	list_for_each_entry(net, net_exit_list, exit_list)
2750 		tcp_fastopen_ctx_destroy(net);
2751 }
2752 
2753 static struct pernet_operations __net_initdata tcp_sk_ops = {
2754        .init	   = tcp_sk_init,
2755        .exit	   = tcp_sk_exit,
2756        .exit_batch = tcp_sk_exit_batch,
2757 };
2758 
2759 void __init tcp_v4_init(void)
2760 {
2761 	if (register_pernet_subsys(&tcp_sk_ops))
2762 		panic("Failed to create the TCP control socket.\n");
2763 }
2764