1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93 
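/* Compute the initial sequence number for a passively opened connection
 * from the incoming SYN's address/port 4-tuple.
 */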
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96 	return secure_tcp_seq(ip_hdr(skb)->daddr,
97 			      ip_hdr(skb)->saddr,
98 			      tcp_hdr(skb)->dest,
99 			      tcp_hdr(skb)->source);
100 }
101 
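/* Per-connection timestamp offset, derived from the peer addresses. */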
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106 
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 	struct tcp_sock *tp = tcp_sk(sk);
112 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113 
114 	if (reuse == 2) {
115 		/* Still does not detect *everything* that goes through
116 		 * lo, since we require a loopback src or dst address
117 		 * or direct binding to 'lo' interface.
118 		 */
119 		bool loopback = false;
120 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 			loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123 		if (tw->tw_family == AF_INET6) {
124 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128 				loopback = true;
129 		} else
130 #endif
131 		{
132 			if (ipv4_is_loopback(tw->tw_daddr) ||
133 			    ipv4_is_loopback(tw->tw_rcv_saddr))
134 				loopback = true;
135 		}
136 		if (!loopback)
137 			reuse = 0;
138 	}
139 
140 	/* With PAWS, it is safe from the viewpoint
141 	   of data integrity. Even without PAWS it is safe provided sequence
142 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143 
144 	   Actually, the idea is close to VJ's one, only timestamp cache is
145 	   held not per host, but per port pair and TW bucket is used as state
146 	   holder.
147 
148 	   If TW bucket has been already destroyed we fall back to VJ's scheme
149 	   and use initial timestamp retrieved from peer table.
150 	 */
151 	if (tcptw->tw_ts_recent_stamp &&
152 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
153 					    tcptw->tw_ts_recent_stamp)))) {
154 		/* In case of repair and re-using TIME-WAIT sockets we still
155 		 * want to be sure that it is safe as above but honor the
156 		 * sequence numbers and time stamps set as part of the repair
157 		 * process.
158 		 *
159 		 * Without this check re-using a TIME-WAIT socket with TCP
160 		 * repair would accumulate a -1 on the repair assigned
161 		 * sequence number. The first time it is reused the sequence
162 		 * is -1, the second time -2, etc. This fixes that issue
163 		 * without appearing to create any others.
164 		 */
165 		if (likely(!tp->repair)) {
166 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167 
168 			if (!seq)
169 				seq = 1;
170 			WRITE_ONCE(tp->write_seq, seq);
171 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
172 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 		}
174 		sock_hold(sktw);
175 		return 1;
176 	}
177 
178 	return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181 
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 			      int addr_len)
184 {
185 	/* This check is replicated from tcp_v4_connect() and intended to
186 	 * prevent the BPF program called below from accessing bytes that are
187 	 * outside the bound specified by the user in addr_len.
188 	 */
189 	if (addr_len < sizeof(struct sockaddr_in))
190 		return -EINVAL;
191 
192 	sock_owned_by_me(sk);
193 
194 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196 
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 	struct inet_sock *inet = inet_sk(sk);
202 	struct tcp_sock *tp = tcp_sk(sk);
203 	__be16 orig_sport, orig_dport;
204 	__be32 daddr, nexthop;
205 	struct flowi4 *fl4;
206 	struct rtable *rt;
207 	int err;
208 	struct ip_options_rcu *inet_opt;
209 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210 
211 	if (addr_len < sizeof(struct sockaddr_in))
212 		return -EINVAL;
213 
214 	if (usin->sin_family != AF_INET)
215 		return -EAFNOSUPPORT;
216 
217 	nexthop = daddr = usin->sin_addr.s_addr;
218 	inet_opt = rcu_dereference_protected(inet->inet_opt,
219 					     lockdep_sock_is_held(sk));
220 	if (inet_opt && inet_opt->opt.srr) {
221 		if (!daddr)
222 			return -EINVAL;
223 		nexthop = inet_opt->opt.faddr;
224 	}
225 
226 	orig_sport = inet->inet_sport;
227 	orig_dport = usin->sin_port;
228 	fl4 = &inet->cork.fl.u.ip4;
229 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231 			      IPPROTO_TCP,
232 			      orig_sport, orig_dport, sk);
233 	if (IS_ERR(rt)) {
234 		err = PTR_ERR(rt);
235 		if (err == -ENETUNREACH)
236 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237 		return err;
238 	}
239 
240 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241 		ip_rt_put(rt);
242 		return -ENETUNREACH;
243 	}
244 
245 	if (!inet_opt || !inet_opt->opt.srr)
246 		daddr = fl4->daddr;
247 
248 	if (!inet->inet_saddr)
249 		inet->inet_saddr = fl4->saddr;
250 	sk_rcv_saddr_set(sk, inet->inet_saddr);
251 
252 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 		/* Reset inherited state */
254 		tp->rx_opt.ts_recent	   = 0;
255 		tp->rx_opt.ts_recent_stamp = 0;
256 		if (likely(!tp->repair))
257 			WRITE_ONCE(tp->write_seq, 0);
258 	}
259 
260 	inet->inet_dport = usin->sin_port;
261 	sk_daddr_set(sk, daddr);
262 
263 	inet_csk(sk)->icsk_ext_hdr_len = 0;
264 	if (inet_opt)
265 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266 
267 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268 
269 	/* Socket identity is still unknown (sport may be zero).
270 	 * However we set state to SYN-SENT and, without releasing the socket
271 	 * lock, select a source port, enter ourselves into the hash tables and
272 	 * complete initialization after this.
273 	 */
274 	tcp_set_state(sk, TCP_SYN_SENT);
275 	err = inet_hash_connect(tcp_death_row, sk);
276 	if (err)
277 		goto failure;
278 
279 	sk_set_txhash(sk);
280 
281 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 			       inet->inet_sport, inet->inet_dport, sk);
283 	if (IS_ERR(rt)) {
284 		err = PTR_ERR(rt);
285 		rt = NULL;
286 		goto failure;
287 	}
288 	/* OK, now commit destination to socket.  */
289 	sk->sk_gso_type = SKB_GSO_TCPV4;
290 	sk_setup_caps(sk, &rt->dst);
291 	rt = NULL;
292 
293 	if (likely(!tp->repair)) {
294 		if (!tp->write_seq)
295 			WRITE_ONCE(tp->write_seq,
296 				   secure_tcp_seq(inet->inet_saddr,
297 						  inet->inet_daddr,
298 						  inet->inet_sport,
299 						  usin->sin_port));
300 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301 						 inet->inet_saddr,
302 						 inet->inet_daddr);
303 	}
304 
305 	inet->inet_id = prandom_u32();
306 
307 	if (tcp_fastopen_defer_connect(sk, &err))
308 		return err;
309 	if (err)
310 		goto failure;
311 
312 	err = tcp_connect(sk);
313 
314 	if (err)
315 		goto failure;
316 
317 	return 0;
318 
319 failure:
320 	/*
321 	 * This unhashes the socket and releases the local port,
322 	 * if necessary.
323 	 */
324 	tcp_set_state(sk, TCP_CLOSE);
325 	ip_rt_put(rt);
326 	sk->sk_route_caps = 0;
327 	inet->inet_dport = 0;
328 	return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331 
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if socket was owned by user
335  * at the time tcp_v4_err() was called to handle ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339 	struct inet_sock *inet = inet_sk(sk);
340 	struct dst_entry *dst;
341 	u32 mtu;
342 
343 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344 		return;
345 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
346 	dst = inet_csk_update_pmtu(sk, mtu);
347 	if (!dst)
348 		return;
349 
350 	/* Something is about to go wrong... Remember the soft error
351 	 * in case this connection is not able to recover.
352 	 */
353 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 		sk->sk_err_soft = EMSGSIZE;
355 
356 	mtu = dst_mtu(dst);
357 
358 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 	    ip_sk_accept_pmtu(sk) &&
360 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 		tcp_sync_mss(sk, mtu);
362 
363 		/* Resend the TCP packet because it's
364 		 * clear that the old packet has been
365 		 * dropped. This is the new "fast" path mtu
366 		 * discovery.
367 		 */
368 		tcp_simple_retransmit(sk);
369 	} /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372 
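/* Hand an ICMP redirect to the socket's cached route, if one is still set. */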
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375 	struct dst_entry *dst = __sk_dst_check(sk, 0);
376 
377 	if (dst)
378 		dst->ops->redirect(dst, sk, skb);
379 }
380 
381 
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385 	struct request_sock *req = inet_reqsk(sk);
386 	struct net *net = sock_net(sk);
387 
388 	/* ICMPs are not backlogged, hence we cannot get
389 	 * an established socket here.
390 	 */
391 	if (seq != tcp_rsk(req)->snt_isn) {
392 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393 	} else if (abort) {
394 		/*
395 		 * Still in SYN_RECV, just remove it silently.
396 		 * There is no good way to pass the error to the newly
397 		 * created socket, and POSIX does not want network
398 		 * errors returned from accept().
399 		 */
400 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 		tcp_listendrop(req->rsk_listener);
402 	}
403 	reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406 
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410 	struct inet_connection_sock *icsk = inet_csk(sk);
411 	struct tcp_sock *tp = tcp_sk(sk);
412 	struct sk_buff *skb;
413 	s32 remaining;
414 	u32 delta_us;
415 
416 	if (sock_owned_by_user(sk))
417 		return;
418 
419 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420 	    !icsk->icsk_backoff)
421 		return;
422 
423 	skb = tcp_rtx_queue_head(sk);
424 	if (WARN_ON_ONCE(!skb))
425 		return;
426 
427 	icsk->icsk_backoff--;
428 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430 
431 	tcp_mstamp_refresh(tp);
432 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434 
435 	if (remaining > 0) {
436 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 					  remaining, TCP_RTO_MAX);
438 	} else {
439 		/* RTO revert clocked out retransmission.
440 		 * Will retransmit now.
441 		 */
442 		tcp_retransmit_timer(sk);
443 	}
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446 
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
458  * A more general error queue to queue errors for later handling
459  * is probably better.
460  *
461  */
462 
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465 	const struct iphdr *iph = (const struct iphdr *)skb->data;
466 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467 	struct tcp_sock *tp;
468 	struct inet_sock *inet;
469 	const int type = icmp_hdr(skb)->type;
470 	const int code = icmp_hdr(skb)->code;
471 	struct sock *sk;
472 	struct request_sock *fastopen;
473 	u32 seq, snd_una;
474 	int err;
475 	struct net *net = dev_net(skb->dev);
476 
477 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 				       th->dest, iph->saddr, ntohs(th->source),
479 				       inet_iif(skb), 0);
480 	if (!sk) {
481 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482 		return -ENOENT;
483 	}
484 	if (sk->sk_state == TCP_TIME_WAIT) {
485 		inet_twsk_put(inet_twsk(sk));
486 		return 0;
487 	}
488 	seq = ntohl(th->seq);
489 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 				     type == ICMP_TIME_EXCEEDED ||
492 				     (type == ICMP_DEST_UNREACH &&
493 				      (code == ICMP_NET_UNREACH ||
494 				       code == ICMP_HOST_UNREACH)));
495 		return 0;
496 	}
497 
498 	bh_lock_sock(sk);
499 	/* If too many ICMPs get dropped on busy
500 	 * servers this needs to be solved differently.
501 	 * We do take care of PMTU discovery (RFC1191) special case :
502 	 * we can receive locally generated ICMP messages while socket is held.
503 	 */
504 	if (sock_owned_by_user(sk)) {
505 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507 	}
508 	if (sk->sk_state == TCP_CLOSE)
509 		goto out;
510 
511 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513 		goto out;
514 	}
515 
516 	tp = tcp_sk(sk);
517 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
518 	fastopen = rcu_dereference(tp->fastopen_rsk);
519 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520 	if (sk->sk_state != TCP_LISTEN &&
521 	    !between(seq, snd_una, tp->snd_nxt)) {
522 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523 		goto out;
524 	}
525 
526 	switch (type) {
527 	case ICMP_REDIRECT:
528 		if (!sock_owned_by_user(sk))
529 			do_redirect(skb, sk);
530 		goto out;
531 	case ICMP_SOURCE_QUENCH:
532 		/* Just silently ignore these. */
533 		goto out;
534 	case ICMP_PARAMETERPROB:
535 		err = EPROTO;
536 		break;
537 	case ICMP_DEST_UNREACH:
538 		if (code > NR_ICMP_UNREACH)
539 			goto out;
540 
541 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542 			/* We are not interested in TCP_LISTEN and open_requests
543 			 * (SYN-ACKs sent out by Linux are always <576 bytes so
544 			 * they should go through unfragmented).
545 			 */
546 			if (sk->sk_state == TCP_LISTEN)
547 				goto out;
548 
549 			WRITE_ONCE(tp->mtu_info, info);
550 			if (!sock_owned_by_user(sk)) {
551 				tcp_v4_mtu_reduced(sk);
552 			} else {
553 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554 					sock_hold(sk);
555 			}
556 			goto out;
557 		}
558 
559 		err = icmp_err_convert[code].errno;
560 		/* check if this ICMP message allows revert of backoff.
561 		 * (see RFC 6069)
562 		 */
563 		if (!fastopen &&
564 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565 			tcp_ld_RTO_revert(sk, seq);
566 		break;
567 	case ICMP_TIME_EXCEEDED:
568 		err = EHOSTUNREACH;
569 		break;
570 	default:
571 		goto out;
572 	}
573 
574 	switch (sk->sk_state) {
575 	case TCP_SYN_SENT:
576 	case TCP_SYN_RECV:
577 		/* Only in fast or simultaneous open. If a fast open socket is
578 		 * already accepted it is treated as a connected one below.
579 		 */
580 		if (fastopen && !fastopen->sk)
581 			break;
582 
583 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584 
585 		if (!sock_owned_by_user(sk)) {
586 			sk->sk_err = err;
587 
588 			sk->sk_error_report(sk);
589 
590 			tcp_done(sk);
591 		} else {
592 			sk->sk_err_soft = err;
593 		}
594 		goto out;
595 	}
596 
597 	/* If we've already connected we will keep trying
598 	 * until we time out, or the user gives up.
599 	 *
600 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
601 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602 	 * but it is obsoleted by pmtu discovery).
603 	 *
604 	 * Note that in the modern internet, where routing is unreliable
605 	 * and broken firewalls sit in every dark corner sending random
606 	 * errors ordered by their masters, even these two messages finally lose
607 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
608 	 *
609 	 * Now we are in compliance with RFCs.
610 	 *							--ANK (980905)
611 	 */
612 
613 	inet = inet_sk(sk);
614 	if (!sock_owned_by_user(sk) && inet->recverr) {
615 		sk->sk_err = err;
616 		sk->sk_error_report(sk);
617 	} else	{ /* Only an error on timeout */
618 		sk->sk_err_soft = err;
619 	}
620 
621 out:
622 	bh_unlock_sock(sk);
623 	sock_put(sk);
624 	return 0;
625 }
626 
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629 	struct tcphdr *th = tcp_hdr(skb);
630 
631 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632 	skb->csum_start = skb_transport_header(skb) - skb->head;
633 	skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635 
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639 	const struct inet_sock *inet = inet_sk(sk);
640 
641 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
644 
645 /*
646  *	This routine will send an RST to the other tcp.
647  *
648  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
649  *		      for reset.
650  *	Answer: if a packet caused an RST, it is not for a socket
651  *		existing in our system; if it does match a socket,
652  *		it is just a duplicate segment or a bug in the other side's TCP.
653  *		So we build the reply based only on the parameters
654  *		that arrived with the segment.
655  *	Exception: precedence violation. We do not implement it in any case.
656  */
657 
658 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
659 {
660 	const struct tcphdr *th = tcp_hdr(skb);
661 	struct {
662 		struct tcphdr th;
663 #ifdef CONFIG_TCP_MD5SIG
664 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
665 #endif
666 	} rep;
667 	struct ip_reply_arg arg;
668 #ifdef CONFIG_TCP_MD5SIG
669 	struct tcp_md5sig_key *key = NULL;
670 	const __u8 *hash_location = NULL;
671 	unsigned char newhash[16];
672 	int genhash;
673 	struct sock *sk1 = NULL;
674 #endif
675 	u64 transmit_time = 0;
676 	struct sock *ctl_sk;
677 	struct net *net;
678 
679 	/* Never send a reset in response to a reset. */
680 	if (th->rst)
681 		return;
682 
683 	/* If sk not NULL, it means we did a successful lookup and incoming
684 	 * route had to be correct. prequeue might have dropped our dst.
685 	 */
686 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
687 		return;
688 
689 	/* Swap the send and the receive. */
690 	memset(&rep, 0, sizeof(rep));
691 	rep.th.dest   = th->source;
692 	rep.th.source = th->dest;
693 	rep.th.doff   = sizeof(struct tcphdr) / 4;
694 	rep.th.rst    = 1;
695 
696 	if (th->ack) {
697 		rep.th.seq = th->ack_seq;
698 	} else {
699 		rep.th.ack = 1;
700 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
701 				       skb->len - (th->doff << 2));
702 	}
703 
704 	memset(&arg, 0, sizeof(arg));
705 	arg.iov[0].iov_base = (unsigned char *)&rep;
706 	arg.iov[0].iov_len  = sizeof(rep.th);
707 
708 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
709 #ifdef CONFIG_TCP_MD5SIG
710 	rcu_read_lock();
711 	hash_location = tcp_parse_md5sig_option(th);
712 	if (sk && sk_fullsock(sk)) {
713 		const union tcp_md5_addr *addr;
714 		int l3index;
715 
716 		/* sdif set, means packet ingressed via a device
717 		 * in an L3 domain and inet_iif is set to it.
718 		 */
719 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
720 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
721 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
722 	} else if (hash_location) {
723 		const union tcp_md5_addr *addr;
724 		int sdif = tcp_v4_sdif(skb);
725 		int dif = inet_iif(skb);
726 		int l3index;
727 
728 		/*
729 		 * active side is lost. Try to find listening socket through
730 		 * source port, and then find md5 key through listening socket.
731 		 * We are not losing security here:
732 		 * the incoming packet is checked against the md5 hash of the
733 		 * found key, and no RST is generated if the hash doesn't match.
734 		 */
735 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
736 					     ip_hdr(skb)->saddr,
737 					     th->source, ip_hdr(skb)->daddr,
738 					     ntohs(th->source), dif, sdif);
739 		/* don't send rst if it can't find key */
740 		if (!sk1)
741 			goto out;
742 
743 		/* sdif set, means packet ingressed via a device
744 		 * in an L3 domain and dif is set to it.
745 		 */
746 		l3index = sdif ? dif : 0;
747 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
748 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
749 		if (!key)
750 			goto out;
751 
752 
753 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
754 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
755 			goto out;
756 
757 	}
758 
759 	if (key) {
760 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
761 				   (TCPOPT_NOP << 16) |
762 				   (TCPOPT_MD5SIG << 8) |
763 				   TCPOLEN_MD5SIG);
764 		/* Update length and the length the header thinks exists */
765 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766 		rep.th.doff = arg.iov[0].iov_len / 4;
767 
768 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
769 				     key, ip_hdr(skb)->saddr,
770 				     ip_hdr(skb)->daddr, &rep.th);
771 	}
772 #endif
773 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
774 				      ip_hdr(skb)->saddr, /* XXX */
775 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
776 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
777 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
778 
779 	/* When the socket is gone, all binding information is lost, and
780 	 * routing might fail in this case. No choice here: if we choose to force
781 	 * the input interface, we will misroute in case of an asymmetric route.
782 	 */
783 	if (sk) {
784 		arg.bound_dev_if = sk->sk_bound_dev_if;
785 		if (sk_fullsock(sk))
786 			trace_tcp_send_reset(sk, skb);
787 	}
788 
789 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
790 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
791 
792 	arg.tos = ip_hdr(skb)->tos;
793 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
794 	local_bh_disable();
795 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
796 	if (sk) {
797 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
798 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
799 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
800 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
801 		transmit_time = tcp_transmit_time(sk);
802 	}
803 	ip_send_unicast_reply(ctl_sk,
804 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
805 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
806 			      &arg, arg.iov[0].iov_len,
807 			      transmit_time);
808 
809 	ctl_sk->sk_mark = 0;
810 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
811 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
812 	local_bh_enable();
813 
814 #ifdef CONFIG_TCP_MD5SIG
815 out:
816 	rcu_read_unlock();
817 #endif
818 }
819 
820 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
821    outside socket context, is certainly ugly. What can I do?
822  */
823 
824 static void tcp_v4_send_ack(const struct sock *sk,
825 			    struct sk_buff *skb, u32 seq, u32 ack,
826 			    u32 win, u32 tsval, u32 tsecr, int oif,
827 			    struct tcp_md5sig_key *key,
828 			    int reply_flags, u8 tos)
829 {
830 	const struct tcphdr *th = tcp_hdr(skb);
831 	struct {
832 		struct tcphdr th;
833 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
834 #ifdef CONFIG_TCP_MD5SIG
835 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
836 #endif
837 			];
838 	} rep;
839 	struct net *net = sock_net(sk);
840 	struct ip_reply_arg arg;
841 	struct sock *ctl_sk;
842 	u64 transmit_time;
843 
844 	memset(&rep.th, 0, sizeof(struct tcphdr));
845 	memset(&arg, 0, sizeof(arg));
846 
847 	arg.iov[0].iov_base = (unsigned char *)&rep;
848 	arg.iov[0].iov_len  = sizeof(rep.th);
849 	if (tsecr) {
850 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
851 				   (TCPOPT_TIMESTAMP << 8) |
852 				   TCPOLEN_TIMESTAMP);
853 		rep.opt[1] = htonl(tsval);
854 		rep.opt[2] = htonl(tsecr);
855 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
856 	}
857 
858 	/* Swap the send and the receive. */
859 	rep.th.dest    = th->source;
860 	rep.th.source  = th->dest;
861 	rep.th.doff    = arg.iov[0].iov_len / 4;
862 	rep.th.seq     = htonl(seq);
863 	rep.th.ack_seq = htonl(ack);
864 	rep.th.ack     = 1;
865 	rep.th.window  = htons(win);
866 
867 #ifdef CONFIG_TCP_MD5SIG
868 	if (key) {
869 		int offset = (tsecr) ? 3 : 0;
870 
871 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
872 					  (TCPOPT_NOP << 16) |
873 					  (TCPOPT_MD5SIG << 8) |
874 					  TCPOLEN_MD5SIG);
875 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
876 		rep.th.doff = arg.iov[0].iov_len/4;
877 
878 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
879 				    key, ip_hdr(skb)->saddr,
880 				    ip_hdr(skb)->daddr, &rep.th);
881 	}
882 #endif
883 	arg.flags = reply_flags;
884 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
885 				      ip_hdr(skb)->saddr, /* XXX */
886 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
887 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
888 	if (oif)
889 		arg.bound_dev_if = oif;
890 	arg.tos = tos;
891 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
892 	local_bh_disable();
893 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
894 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
895 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
896 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
897 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
898 	transmit_time = tcp_transmit_time(sk);
899 	ip_send_unicast_reply(ctl_sk,
900 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
901 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
902 			      &arg, arg.iov[0].iov_len,
903 			      transmit_time);
904 
905 	ctl_sk->sk_mark = 0;
906 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
907 	local_bh_enable();
908 }
909 
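/* Answer a segment received for a TIME-WAIT socket with the appropriate ACK. */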
910 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
911 {
912 	struct inet_timewait_sock *tw = inet_twsk(sk);
913 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
914 
915 	tcp_v4_send_ack(sk, skb,
916 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
917 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
918 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
919 			tcptw->tw_ts_recent,
920 			tw->tw_bound_dev_if,
921 			tcp_twsk_md5_key(tcptw),
922 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
923 			tw->tw_tos
924 			);
925 
926 	inet_twsk_put(tw);
927 }
928 
929 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
930 				  struct request_sock *req)
931 {
932 	const union tcp_md5_addr *addr;
933 	int l3index;
934 
935 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
936 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
937 	 */
938 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
939 					     tcp_sk(sk)->snd_nxt;
940 
941 	/* RFC 7323 2.3
942 	 * The window field (SEG.WND) of every outgoing segment, with the
943 	 * exception of <SYN> segments, MUST be right-shifted by
944 	 * Rcv.Wind.Shift bits:
945 	 */
946 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
947 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
948 	tcp_v4_send_ack(sk, skb, seq,
949 			tcp_rsk(req)->rcv_nxt,
950 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
951 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
952 			req->ts_recent,
953 			0,
954 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
955 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
956 			ip_hdr(skb)->tos);
957 }
958 
959 /*
960  *	Send a SYN-ACK after having received a SYN.
961  *	This still operates on a request_sock only, not on a big
962  *	socket.
963  */
964 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
965 			      struct flowi *fl,
966 			      struct request_sock *req,
967 			      struct tcp_fastopen_cookie *foc,
968 			      enum tcp_synack_type synack_type,
969 			      struct sk_buff *syn_skb)
970 {
971 	const struct inet_request_sock *ireq = inet_rsk(req);
972 	struct flowi4 fl4;
973 	int err = -1;
974 	struct sk_buff *skb;
975 	u8 tos;
976 
977 	/* First, grab a route. */
978 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
979 		return -1;
980 
981 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
982 
983 	if (skb) {
984 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
985 
986 		tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
987 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
988 				(inet_sk(sk)->tos & INET_ECN_MASK) :
989 				inet_sk(sk)->tos;
990 
991 		if (!INET_ECN_is_capable(tos) &&
992 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
993 			tos |= INET_ECN_ECT_0;
994 
995 		rcu_read_lock();
996 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
997 					    ireq->ir_rmt_addr,
998 					    rcu_dereference(ireq->ireq_opt),
999 					    tos);
1000 		rcu_read_unlock();
1001 		err = net_xmit_eval(err);
1002 	}
1003 
1004 	return err;
1005 }
1006 
1007 /*
1008  *	IPv4 request_sock destructor.
1009  */
1010 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1011 {
1012 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1013 }
1014 
1015 #ifdef CONFIG_TCP_MD5SIG
1016 /*
1017  * RFC2385 MD5 checksumming requires a mapping of
1018  * IP address->MD5 Key.
1019  * We need to maintain these in the sk structure.
1020  */
1021 
1022 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1023 EXPORT_SYMBOL(tcp_md5_needed);
1024 
1025 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1026 {
1027 	if (!old)
1028 		return true;
1029 
1030 	/* l3index always overrides non-l3index */
1031 	if (old->l3index && new->l3index == 0)
1032 		return false;
1033 	if (old->l3index == 0 && new->l3index)
1034 		return true;
1035 
1036 	return old->prefixlen < new->prefixlen;
1037 }
1038 
1039 /* Find the Key structure for an address.  */
1040 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1041 					   const union tcp_md5_addr *addr,
1042 					   int family)
1043 {
1044 	const struct tcp_sock *tp = tcp_sk(sk);
1045 	struct tcp_md5sig_key *key;
1046 	const struct tcp_md5sig_info *md5sig;
1047 	__be32 mask;
1048 	struct tcp_md5sig_key *best_match = NULL;
1049 	bool match;
1050 
1051 	/* caller either holds rcu_read_lock() or socket lock */
1052 	md5sig = rcu_dereference_check(tp->md5sig_info,
1053 				       lockdep_sock_is_held(sk));
1054 	if (!md5sig)
1055 		return NULL;
1056 
1057 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1058 				 lockdep_sock_is_held(sk)) {
1059 		if (key->family != family)
1060 			continue;
1061 		if (key->l3index && key->l3index != l3index)
1062 			continue;
1063 		if (family == AF_INET) {
1064 			mask = inet_make_mask(key->prefixlen);
1065 			match = (key->addr.a4.s_addr & mask) ==
1066 				(addr->a4.s_addr & mask);
1067 #if IS_ENABLED(CONFIG_IPV6)
1068 		} else if (family == AF_INET6) {
1069 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1070 						  key->prefixlen);
1071 #endif
1072 		} else {
1073 			match = false;
1074 		}
1075 
1076 		if (match && better_md5_match(best_match, key))
1077 			best_match = key;
1078 	}
1079 	return best_match;
1080 }
1081 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1082 
1083 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1084 						      const union tcp_md5_addr *addr,
1085 						      int family, u8 prefixlen,
1086 						      int l3index)
1087 {
1088 	const struct tcp_sock *tp = tcp_sk(sk);
1089 	struct tcp_md5sig_key *key;
1090 	unsigned int size = sizeof(struct in_addr);
1091 	const struct tcp_md5sig_info *md5sig;
1092 
1093 	/* caller either holds rcu_read_lock() or socket lock */
1094 	md5sig = rcu_dereference_check(tp->md5sig_info,
1095 				       lockdep_sock_is_held(sk));
1096 	if (!md5sig)
1097 		return NULL;
1098 #if IS_ENABLED(CONFIG_IPV6)
1099 	if (family == AF_INET6)
1100 		size = sizeof(struct in6_addr);
1101 #endif
1102 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1103 				 lockdep_sock_is_held(sk)) {
1104 		if (key->family != family)
1105 			continue;
1106 		if (key->l3index != l3index)
1107 			continue;
1108 		if (!memcmp(&key->addr, addr, size) &&
1109 		    key->prefixlen == prefixlen)
1110 			return key;
1111 	}
1112 	return NULL;
1113 }
1114 
1115 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1116 					 const struct sock *addr_sk)
1117 {
1118 	const union tcp_md5_addr *addr;
1119 	int l3index;
1120 
1121 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1122 						 addr_sk->sk_bound_dev_if);
1123 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1124 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1125 }
1126 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1127 
1128 /* This can be called on a newly created socket, from other files */
1129 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1130 		   int family, u8 prefixlen, int l3index,
1131 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1132 {
1133 	/* Add Key to the list */
1134 	struct tcp_md5sig_key *key;
1135 	struct tcp_sock *tp = tcp_sk(sk);
1136 	struct tcp_md5sig_info *md5sig;
1137 
1138 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1139 	if (key) {
1140 		/* Pre-existing entry - just update that one.
1141 		 * Note that the key might be used concurrently.
1142 		 * data_race() is telling kcsan that we do not care of
1143 		 * key mismatches, since changing MD5 key on live flows
1144 		 * can lead to packet drops.
1145 		 */
1146 		data_race(memcpy(key->key, newkey, newkeylen));
1147 
1148 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1149 		 * Also note that a reader could catch new key->keylen value
1150 		 * but old key->key[], this is the reason we use __GFP_ZERO
1151 		 * at sock_kmalloc() time below these lines.
1152 		 */
1153 		WRITE_ONCE(key->keylen, newkeylen);
1154 
1155 		return 0;
1156 	}
1157 
1158 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1159 					   lockdep_sock_is_held(sk));
1160 	if (!md5sig) {
1161 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1162 		if (!md5sig)
1163 			return -ENOMEM;
1164 
1165 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1166 		INIT_HLIST_HEAD(&md5sig->head);
1167 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1168 	}
1169 
1170 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1171 	if (!key)
1172 		return -ENOMEM;
1173 	if (!tcp_alloc_md5sig_pool()) {
1174 		sock_kfree_s(sk, key, sizeof(*key));
1175 		return -ENOMEM;
1176 	}
1177 
1178 	memcpy(key->key, newkey, newkeylen);
1179 	key->keylen = newkeylen;
1180 	key->family = family;
1181 	key->prefixlen = prefixlen;
1182 	key->l3index = l3index;
1183 	memcpy(&key->addr, addr,
1184 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1185 				      sizeof(struct in_addr));
1186 	hlist_add_head_rcu(&key->node, &md5sig->head);
1187 	return 0;
1188 }
1189 EXPORT_SYMBOL(tcp_md5_do_add);
1190 
1191 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1192 		   u8 prefixlen, int l3index)
1193 {
1194 	struct tcp_md5sig_key *key;
1195 
1196 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1197 	if (!key)
1198 		return -ENOENT;
1199 	hlist_del_rcu(&key->node);
1200 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1201 	kfree_rcu(key, rcu);
1202 	return 0;
1203 }
1204 EXPORT_SYMBOL(tcp_md5_do_del);
1205 
1206 static void tcp_clear_md5_list(struct sock *sk)
1207 {
1208 	struct tcp_sock *tp = tcp_sk(sk);
1209 	struct tcp_md5sig_key *key;
1210 	struct hlist_node *n;
1211 	struct tcp_md5sig_info *md5sig;
1212 
1213 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1214 
1215 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1216 		hlist_del_rcu(&key->node);
1217 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1218 		kfree_rcu(key, rcu);
1219 	}
1220 }
1221 
1222 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1223 				 sockptr_t optval, int optlen)
1224 {
1225 	struct tcp_md5sig cmd;
1226 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1227 	const union tcp_md5_addr *addr;
1228 	u8 prefixlen = 32;
1229 	int l3index = 0;
1230 
1231 	if (optlen < sizeof(cmd))
1232 		return -EINVAL;
1233 
1234 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1235 		return -EFAULT;
1236 
1237 	if (sin->sin_family != AF_INET)
1238 		return -EINVAL;
1239 
1240 	if (optname == TCP_MD5SIG_EXT &&
1241 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1242 		prefixlen = cmd.tcpm_prefixlen;
1243 		if (prefixlen > 32)
1244 			return -EINVAL;
1245 	}
1246 
1247 	if (optname == TCP_MD5SIG_EXT &&
1248 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1249 		struct net_device *dev;
1250 
1251 		rcu_read_lock();
1252 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1253 		if (dev && netif_is_l3_master(dev))
1254 			l3index = dev->ifindex;
1255 
1256 		rcu_read_unlock();
1257 
1258 		/* ok to reference set/not set outside of rcu;
1259 		 * right now device MUST be an L3 master
1260 		 */
1261 		if (!dev || !l3index)
1262 			return -EINVAL;
1263 	}
1264 
1265 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1266 
1267 	if (!cmd.tcpm_keylen)
1268 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1269 
1270 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1271 		return -EINVAL;
1272 
1273 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1274 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1275 }
1276 
1277 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1278 				   __be32 daddr, __be32 saddr,
1279 				   const struct tcphdr *th, int nbytes)
1280 {
1281 	struct tcp4_pseudohdr *bp;
1282 	struct scatterlist sg;
1283 	struct tcphdr *_th;
1284 
1285 	bp = hp->scratch;
1286 	bp->saddr = saddr;
1287 	bp->daddr = daddr;
1288 	bp->pad = 0;
1289 	bp->protocol = IPPROTO_TCP;
1290 	bp->len = cpu_to_be16(nbytes);
1291 
1292 	_th = (struct tcphdr *)(bp + 1);
1293 	memcpy(_th, th, sizeof(*th));
1294 	_th->check = 0;
1295 
1296 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1297 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1298 				sizeof(*bp) + sizeof(*th));
1299 	return crypto_ahash_update(hp->md5_req);
1300 }
1301 
1302 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1303 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1304 {
1305 	struct tcp_md5sig_pool *hp;
1306 	struct ahash_request *req;
1307 
1308 	hp = tcp_get_md5sig_pool();
1309 	if (!hp)
1310 		goto clear_hash_noput;
1311 	req = hp->md5_req;
1312 
1313 	if (crypto_ahash_init(req))
1314 		goto clear_hash;
1315 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1316 		goto clear_hash;
1317 	if (tcp_md5_hash_key(hp, key))
1318 		goto clear_hash;
1319 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1320 	if (crypto_ahash_final(req))
1321 		goto clear_hash;
1322 
1323 	tcp_put_md5sig_pool();
1324 	return 0;
1325 
1326 clear_hash:
1327 	tcp_put_md5sig_pool();
1328 clear_hash_noput:
1329 	memset(md5_hash, 0, 16);
1330 	return 1;
1331 }
1332 
1333 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1334 			const struct sock *sk,
1335 			const struct sk_buff *skb)
1336 {
1337 	struct tcp_md5sig_pool *hp;
1338 	struct ahash_request *req;
1339 	const struct tcphdr *th = tcp_hdr(skb);
1340 	__be32 saddr, daddr;
1341 
1342 	if (sk) { /* valid for establish/request sockets */
1343 		saddr = sk->sk_rcv_saddr;
1344 		daddr = sk->sk_daddr;
1345 	} else {
1346 		const struct iphdr *iph = ip_hdr(skb);
1347 		saddr = iph->saddr;
1348 		daddr = iph->daddr;
1349 	}
1350 
1351 	hp = tcp_get_md5sig_pool();
1352 	if (!hp)
1353 		goto clear_hash_noput;
1354 	req = hp->md5_req;
1355 
1356 	if (crypto_ahash_init(req))
1357 		goto clear_hash;
1358 
1359 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1360 		goto clear_hash;
1361 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1362 		goto clear_hash;
1363 	if (tcp_md5_hash_key(hp, key))
1364 		goto clear_hash;
1365 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1366 	if (crypto_ahash_final(req))
1367 		goto clear_hash;
1368 
1369 	tcp_put_md5sig_pool();
1370 	return 0;
1371 
1372 clear_hash:
1373 	tcp_put_md5sig_pool();
1374 clear_hash_noput:
1375 	memset(md5_hash, 0, 16);
1376 	return 1;
1377 }
1378 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1379 
1380 #endif
1381 
1382 /* Called with rcu_read_lock() */
1383 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1384 				    const struct sk_buff *skb,
1385 				    int dif, int sdif)
1386 {
1387 #ifdef CONFIG_TCP_MD5SIG
1388 	/*
1389 	 * This gets called for each TCP segment that arrives
1390 	 * so we want to be efficient.
1391 	 * We have 3 drop cases:
1392 	 * o No MD5 hash and one expected.
1393 	 * o MD5 hash and we're not expecting one.
1394 	 * o MD5 hash and it's wrong.
1395 	 */
1396 	const __u8 *hash_location = NULL;
1397 	struct tcp_md5sig_key *hash_expected;
1398 	const struct iphdr *iph = ip_hdr(skb);
1399 	const struct tcphdr *th = tcp_hdr(skb);
1400 	const union tcp_md5_addr *addr;
1401 	unsigned char newhash[16];
1402 	int genhash, l3index;
1403 
1404 	/* sdif set, means packet ingressed via a device
1405 	 * in an L3 domain and dif is set to the l3mdev
1406 	 */
1407 	l3index = sdif ? dif : 0;
1408 
1409 	addr = (union tcp_md5_addr *)&iph->saddr;
1410 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1411 	hash_location = tcp_parse_md5sig_option(th);
1412 
1413 	/* We've parsed the options - do we have a hash? */
1414 	if (!hash_expected && !hash_location)
1415 		return false;
1416 
1417 	if (hash_expected && !hash_location) {
1418 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1419 		return true;
1420 	}
1421 
1422 	if (!hash_expected && hash_location) {
1423 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1424 		return true;
1425 	}
1426 
1427 	/* Okay, so this is hash_expected and hash_location -
1428 	 * so we need to calculate the checksum.
1429 	 */
1430 	genhash = tcp_v4_md5_hash_skb(newhash,
1431 				      hash_expected,
1432 				      NULL, skb);
1433 
1434 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1435 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1436 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1437 				     &iph->saddr, ntohs(th->source),
1438 				     &iph->daddr, ntohs(th->dest),
1439 				     genhash ? " tcp_v4_calc_md5_hash failed"
1440 				     : "", l3index);
1441 		return true;
1442 	}
1443 	return false;
1444 #endif
1445 	return false;
1446 }
1447 
1448 static void tcp_v4_init_req(struct request_sock *req,
1449 			    const struct sock *sk_listener,
1450 			    struct sk_buff *skb)
1451 {
1452 	struct inet_request_sock *ireq = inet_rsk(req);
1453 	struct net *net = sock_net(sk_listener);
1454 
1455 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1456 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1457 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1458 }
1459 
1460 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1461 					  struct flowi *fl,
1462 					  const struct request_sock *req)
1463 {
1464 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1465 }
1466 
1467 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1468 	.family		=	PF_INET,
1469 	.obj_size	=	sizeof(struct tcp_request_sock),
1470 	.rtx_syn_ack	=	tcp_rtx_synack,
1471 	.send_ack	=	tcp_v4_reqsk_send_ack,
1472 	.destructor	=	tcp_v4_reqsk_destructor,
1473 	.send_reset	=	tcp_v4_send_reset,
1474 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1475 };
1476 
1477 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1478 	.mss_clamp	=	TCP_MSS_DEFAULT,
1479 #ifdef CONFIG_TCP_MD5SIG
1480 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1481 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1482 #endif
1483 	.init_req	=	tcp_v4_init_req,
1484 #ifdef CONFIG_SYN_COOKIES
1485 	.cookie_init_seq =	cookie_v4_init_sequence,
1486 #endif
1487 	.route_req	=	tcp_v4_route_req,
1488 	.init_seq	=	tcp_v4_init_seq,
1489 	.init_ts_off	=	tcp_v4_init_ts_off,
1490 	.send_synack	=	tcp_v4_send_synack,
1491 };
1492 
1493 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1494 {
1495 	/* Never answer SYNs sent to broadcast or multicast */
1496 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1497 		goto drop;
1498 
1499 	return tcp_conn_request(&tcp_request_sock_ops,
1500 				&tcp_request_sock_ipv4_ops, sk, skb);
1501 
1502 drop:
1503 	tcp_listendrop(sk);
1504 	return 0;
1505 }
1506 EXPORT_SYMBOL(tcp_v4_conn_request);
1507 
1508 
1509 /*
1510  * The three way handshake has completed - we got a valid synack -
1511  * now create the new socket.
1512  */
1513 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1514 				  struct request_sock *req,
1515 				  struct dst_entry *dst,
1516 				  struct request_sock *req_unhash,
1517 				  bool *own_req)
1518 {
1519 	struct inet_request_sock *ireq;
1520 	bool found_dup_sk = false;
1521 	struct inet_sock *newinet;
1522 	struct tcp_sock *newtp;
1523 	struct sock *newsk;
1524 #ifdef CONFIG_TCP_MD5SIG
1525 	const union tcp_md5_addr *addr;
1526 	struct tcp_md5sig_key *key;
1527 	int l3index;
1528 #endif
1529 	struct ip_options_rcu *inet_opt;
1530 
1531 	if (sk_acceptq_is_full(sk))
1532 		goto exit_overflow;
1533 
1534 	newsk = tcp_create_openreq_child(sk, req, skb);
1535 	if (!newsk)
1536 		goto exit_nonewsk;
1537 
1538 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1539 	inet_sk_rx_dst_set(newsk, skb);
1540 
1541 	newtp		      = tcp_sk(newsk);
1542 	newinet		      = inet_sk(newsk);
1543 	ireq		      = inet_rsk(req);
1544 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1545 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1546 	newsk->sk_bound_dev_if = ireq->ir_iif;
1547 	newinet->inet_saddr   = ireq->ir_loc_addr;
1548 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1549 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1550 	newinet->mc_index     = inet_iif(skb);
1551 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1552 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1553 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1554 	if (inet_opt)
1555 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1556 	newinet->inet_id = prandom_u32();
1557 
1558 	/* Set ToS of the new socket based upon the value of incoming SYN.
1559 	 * ECT bits are set later in tcp_init_transfer().
1560 	 */
1561 	if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1562 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1563 
1564 	if (!dst) {
1565 		dst = inet_csk_route_child_sock(sk, newsk, req);
1566 		if (!dst)
1567 			goto put_and_exit;
1568 	} else {
1569 		/* syncookie case : see end of cookie_v4_check() */
1570 	}
1571 	sk_setup_caps(newsk, dst);
1572 
1573 	tcp_ca_openreq_child(newsk, dst);
1574 
1575 	tcp_sync_mss(newsk, dst_mtu(dst));
1576 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1577 
1578 	tcp_initialize_rcv_mss(newsk);
1579 
1580 #ifdef CONFIG_TCP_MD5SIG
1581 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1582 	/* Copy over the MD5 key from the original socket */
1583 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1584 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1585 	if (key) {
1586 		/*
1587 		 * We're using one, so create a matching key
1588 		 * on the newsk structure. If we fail to get
1589 		 * memory, then we end up not copying the key
1590 		 * across. Shucks.
1591 		 */
1592 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1593 			       key->key, key->keylen, GFP_ATOMIC);
1594 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1595 	}
1596 #endif
1597 
1598 	if (__inet_inherit_port(sk, newsk) < 0)
1599 		goto put_and_exit;
1600 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1601 				       &found_dup_sk);
1602 	if (likely(*own_req)) {
1603 		tcp_move_syn(newtp, req);
1604 		ireq->ireq_opt = NULL;
1605 	} else {
1606 		newinet->inet_opt = NULL;
1607 
1608 		if (!req_unhash && found_dup_sk) {
1609 			/* This code path should be executed only in the
1610 			 * syncookie case
1611 			 */
1612 			bh_unlock_sock(newsk);
1613 			sock_put(newsk);
1614 			newsk = NULL;
1615 		}
1616 	}
1617 	return newsk;
1618 
1619 exit_overflow:
1620 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1621 exit_nonewsk:
1622 	dst_release(dst);
1623 exit:
1624 	tcp_listendrop(sk);
1625 	return NULL;
1626 put_and_exit:
1627 	newinet->inet_opt = NULL;
1628 	inet_csk_prepare_forced_close(newsk);
1629 	tcp_done(newsk);
1630 	goto exit;
1631 }
1632 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1633 
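/* With syncookies, a non-SYN segment reaching a listener may carry a
 * SYN-cookie ACK; validate it and return the resulting child socket
 * (or the unchanged listener).
 */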
1634 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1635 {
1636 #ifdef CONFIG_SYN_COOKIES
1637 	const struct tcphdr *th = tcp_hdr(skb);
1638 
1639 	if (!th->syn)
1640 		sk = cookie_v4_check(sk, skb);
1641 #endif
1642 	return sk;
1643 }
1644 
1645 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1646 			 struct tcphdr *th, u32 *cookie)
1647 {
1648 	u16 mss = 0;
1649 #ifdef CONFIG_SYN_COOKIES
1650 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1651 				    &tcp_request_sock_ipv4_ops, sk, th);
1652 	if (mss) {
1653 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1654 		tcp_synq_overflow(sk);
1655 	}
1656 #endif
1657 	return mss;
1658 }
1659 
1660 /* The socket must have its spinlock held when we get
1661  * here, unless it is a TCP_LISTEN socket.
1662  *
1663  * We have a potential double-lock case here, so even when
1664  * doing backlog processing we use the BH locking scheme.
1665  * This is because we cannot sleep with the original spinlock
1666  * held.
1667  */
1668 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1669 {
1670 	struct sock *rsk;
1671 
1672 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1673 		struct dst_entry *dst = sk->sk_rx_dst;
1674 
1675 		sock_rps_save_rxhash(sk, skb);
1676 		sk_mark_napi_id(sk, skb);
1677 		if (dst) {
1678 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1679 			    !dst->ops->check(dst, 0)) {
1680 				dst_release(dst);
1681 				sk->sk_rx_dst = NULL;
1682 			}
1683 		}
1684 		tcp_rcv_established(sk, skb);
1685 		return 0;
1686 	}
1687 
1688 	if (tcp_checksum_complete(skb))
1689 		goto csum_err;
1690 
1691 	if (sk->sk_state == TCP_LISTEN) {
1692 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1693 
1694 		if (!nsk)
1695 			goto discard;
1696 		if (nsk != sk) {
1697 			if (tcp_child_process(sk, nsk, skb)) {
1698 				rsk = nsk;
1699 				goto reset;
1700 			}
1701 			return 0;
1702 		}
1703 	} else
1704 		sock_rps_save_rxhash(sk, skb);
1705 
1706 	if (tcp_rcv_state_process(sk, skb)) {
1707 		rsk = sk;
1708 		goto reset;
1709 	}
1710 	return 0;
1711 
1712 reset:
1713 	tcp_v4_send_reset(rsk, skb);
1714 discard:
1715 	kfree_skb(skb);
1716 	/* Be careful here. If this function gets more complicated and
1717 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1718 	 * might be destroyed here. This current version compiles correctly,
1719 	 * but you have been warned.
1720 	 */
1721 	return 0;
1722 
1723 csum_err:
1724 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1725 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1726 	goto discard;
1727 }
1728 EXPORT_SYMBOL(tcp_v4_do_rcv);
1729 
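/* Early demux: called early in the IPv4 receive path to match an
 * ESTABLISHED socket by its 4-tuple.  On a hit the socket is attached to
 * the skb, with sock_edemux as the destructor, and, when the cached rx dst
 * is still valid for the incoming interface, it is attached as a noref dst
 * so the routing lookup can be skipped.
 */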
1730 int tcp_v4_early_demux(struct sk_buff *skb)
1731 {
1732 	const struct iphdr *iph;
1733 	const struct tcphdr *th;
1734 	struct sock *sk;
1735 
1736 	if (skb->pkt_type != PACKET_HOST)
1737 		return 0;
1738 
1739 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1740 		return 0;
1741 
1742 	iph = ip_hdr(skb);
1743 	th = tcp_hdr(skb);
1744 
1745 	if (th->doff < sizeof(struct tcphdr) / 4)
1746 		return 0;
1747 
1748 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1749 				       iph->saddr, th->source,
1750 				       iph->daddr, ntohs(th->dest),
1751 				       skb->skb_iif, inet_sdif(skb));
1752 	if (sk) {
1753 		skb->sk = sk;
1754 		skb->destructor = sock_edemux;
1755 		if (sk_fullsock(sk)) {
1756 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1757 
1758 			if (dst)
1759 				dst = dst_check(dst, 0);
1760 			if (dst &&
1761 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1762 				skb_dst_set_noref(skb, dst);
1763 		}
1764 	}
1765 	return 0;
1766 }
1767 
1768 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1769 {
1770 	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1771 	u32 tail_gso_size, tail_gso_segs;
1772 	struct skb_shared_info *shinfo;
1773 	const struct tcphdr *th;
1774 	struct tcphdr *thtail;
1775 	struct sk_buff *tail;
1776 	unsigned int hdrlen;
1777 	bool fragstolen;
1778 	u32 gso_segs;
1779 	u32 gso_size;
1780 	int delta;
1781 
1782 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1783 	 * we can fix skb->truesize to its real value to avoid future drops.
1784 	 * This is valid because skb is not yet charged to the socket.
1785 	 * It has been noticed that pure SACK packets were sometimes dropped
1786 	 * (if cooked by drivers without the copybreak feature).
1787 	 */
1788 	skb_condense(skb);
1789 
1790 	skb_dst_drop(skb);
1791 
1792 	if (unlikely(tcp_checksum_complete(skb))) {
1793 		bh_unlock_sock(sk);
1794 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1795 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1796 		return true;
1797 	}
1798 
1799 	/* Attempt coalescing to last skb in backlog, even if we are
1800 	 * above the limits.
1801 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1802 	 */
1803 	th = (const struct tcphdr *)skb->data;
1804 	hdrlen = th->doff * 4;
1805 
1806 	tail = sk->sk_backlog.tail;
1807 	if (!tail)
1808 		goto no_coalesce;
1809 	thtail = (struct tcphdr *)tail->data;
1810 
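	/* The tail skb and the new segment are only coalesced when the new
	 * segment directly follows the tail in sequence space, the IP
	 * DSCP/ECN byte matches, neither carries SYN/RST/URG, both carry ACK,
	 * the ECE/CWR flags agree, (with TLS offload) both are in the same
	 * decryption state, and the TCP headers, including options, are
	 * byte-for-byte identical.
	 */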
1811 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1812 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1813 	    ((TCP_SKB_CB(tail)->tcp_flags |
1814 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1815 	    !((TCP_SKB_CB(tail)->tcp_flags &
1816 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1817 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1818 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1819 #ifdef CONFIG_TLS_DEVICE
1820 	    tail->decrypted != skb->decrypted ||
1821 #endif
1822 	    thtail->doff != th->doff ||
1823 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1824 		goto no_coalesce;
1825 
1826 	__skb_pull(skb, hdrlen);
1827 
1828 	shinfo = skb_shinfo(skb);
1829 	gso_size = shinfo->gso_size ?: skb->len;
1830 	gso_segs = shinfo->gso_segs ?: 1;
1831 
1832 	shinfo = skb_shinfo(tail);
1833 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1834 	tail_gso_segs = shinfo->gso_segs ?: 1;
1835 
1836 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1837 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1838 
1839 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1840 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1841 			thtail->window = th->window;
1842 		}
1843 
1844 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1845 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1846 		 * is not entered if we append a packet with a FIN.
1847 		 * SYN, RST, URG are not present.
1848 		 * ACK is set on both packets.
1849 		 * PSH : we do not really care in TCP stack,
1850 		 *       at least for 'GRO' packets.
1851 		 */
1852 		thtail->fin |= th->fin;
1853 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1854 
1855 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1856 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1857 			tail->tstamp = skb->tstamp;
1858 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1859 		}
1860 
1861 		/* Not as strict as GRO. We only need to carry the max mss value */
1862 		shinfo->gso_size = max(gso_size, tail_gso_size);
1863 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1864 
1865 		sk->sk_backlog.len += delta;
1866 		__NET_INC_STATS(sock_net(sk),
1867 				LINUX_MIB_TCPBACKLOGCOALESCE);
1868 		kfree_skb_partial(skb, fragstolen);
1869 		return false;
1870 	}
1871 	__skb_push(skb, hdrlen);
1872 
1873 no_coalesce:
1874 	/* Only the socket owner can try to collapse/prune rx queues
1875 	 * to reduce memory overhead, so add a little headroom here.
1876 	 * Only a few socket backlogs are likely to be non-empty at once.
1877 	 */
1878 	limit += 64*1024;
1879 
1880 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1881 		bh_unlock_sock(sk);
1882 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1883 		return true;
1884 	}
1885 	return false;
1886 }
1887 EXPORT_SYMBOL(tcp_add_backlog);
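/* Rough sizing sketch for the limit used above: the backlog budget is
 * sk_rcvbuf + sk_sndbuf plus 64 KB of headroom.  Assuming, purely for
 * illustration, an untuned socket with roughly 128 KB of sk_rcvbuf and
 * 16 KB of sk_sndbuf, a little over 200 KB of not-yet-processed segments
 * can sit in the backlog before sk_add_backlog() fails and
 * LINUX_MIB_TCPBACKLOGDROP is incremented.
 */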
1888 
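/* Runs the socket filter (if any) attached to sk against skb.  The trim cap
 * of th->doff * 4 ensures that a filter may truncate payload but can never
 * strip any part of the TCP header itself.
 */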
1889 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1890 {
1891 	struct tcphdr *th = (struct tcphdr *)skb->data;
1892 
1893 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1894 }
1895 EXPORT_SYMBOL(tcp_filter);
1896 
1897 static void tcp_v4_restore_cb(struct sk_buff *skb)
1898 {
1899 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1900 		sizeof(struct inet_skb_parm));
1901 }
1902 
1903 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1904 			   const struct tcphdr *th)
1905 {
1906 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1907 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1908 	 */
1909 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1910 		sizeof(struct inet_skb_parm));
1911 	barrier();
1912 
1913 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1914 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1915 				    skb->len - th->doff * 4);
1916 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1917 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1918 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1919 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1920 	TCP_SKB_CB(skb)->sacked	 = 0;
1921 	TCP_SKB_CB(skb)->has_rxtstamp =
1922 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1923 }
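/* Worked example for the end_seq computation above: a segment with
 * seq = 1000 that carries 500 bytes of payload and has FIN set yields
 * end_seq = 1000 + 0 (no SYN) + 1 (FIN) + 500 = 1501, since SYN and FIN
 * each consume one unit of sequence space.
 */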
1924 
1925 /*
1926  *	From tcp_input.c
1927  */
1928 
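/* Main IPv4 TCP receive entry point.  In outline: validate the header and
 * checksum, look the socket up by 4-tuple, divert TIME_WAIT and
 * NEW_SYN_RECV sockets to their dedicated handling, apply the MD5,
 * xfrm-policy and socket-filter checks, then either process the segment
 * directly under the socket lock or queue it to the backlog when the
 * socket is currently owned by the user.
 */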
1929 int tcp_v4_rcv(struct sk_buff *skb)
1930 {
1931 	struct net *net = dev_net(skb->dev);
1932 	struct sk_buff *skb_to_free;
1933 	int sdif = inet_sdif(skb);
1934 	int dif = inet_iif(skb);
1935 	const struct iphdr *iph;
1936 	const struct tcphdr *th;
1937 	bool refcounted;
1938 	struct sock *sk;
1939 	int ret;
1940 
1941 	if (skb->pkt_type != PACKET_HOST)
1942 		goto discard_it;
1943 
1944 	/* Count it even if it's bad */
1945 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1946 
1947 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1948 		goto discard_it;
1949 
1950 	th = (const struct tcphdr *)skb->data;
1951 
1952 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1953 		goto bad_packet;
1954 	if (!pskb_may_pull(skb, th->doff * 4))
1955 		goto discard_it;
1956 
1957 	/* An explanation is required here, I think.
1958 	 * Packet length and doff are validated by header prediction,
1959 	 * provided the case of th->doff == 0 is eliminated.
1960 	 * So, we defer the checks. */
1961 
1962 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1963 		goto csum_error;
1964 
1965 	th = (const struct tcphdr *)skb->data;
1966 	iph = ip_hdr(skb);
1967 lookup:
1968 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1969 			       th->dest, sdif, &refcounted);
1970 	if (!sk)
1971 		goto no_tcp_socket;
1972 
1973 process:
1974 	if (sk->sk_state == TCP_TIME_WAIT)
1975 		goto do_time_wait;
1976 
1977 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1978 		struct request_sock *req = inet_reqsk(sk);
1979 		bool req_stolen = false;
1980 		struct sock *nsk;
1981 
1982 		sk = req->rsk_listener;
1983 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1984 			sk_drops_add(sk, skb);
1985 			reqsk_put(req);
1986 			goto discard_it;
1987 		}
1988 		if (tcp_checksum_complete(skb)) {
1989 			reqsk_put(req);
1990 			goto csum_error;
1991 		}
1992 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1993 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1994 			goto lookup;
1995 		}
1996 		/* We own a reference on the listener; increase it again,
1997 		 * as we might lose it too soon.
1998 		 */
1999 		sock_hold(sk);
2000 		refcounted = true;
2001 		nsk = NULL;
2002 		if (!tcp_filter(sk, skb)) {
2003 			th = (const struct tcphdr *)skb->data;
2004 			iph = ip_hdr(skb);
2005 			tcp_v4_fill_cb(skb, iph, th);
2006 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2007 		}
2008 		if (!nsk) {
2009 			reqsk_put(req);
2010 			if (req_stolen) {
2011 			/* Another CPU got exclusive access to req
2012 			 * and created a full-blown socket.
2013 				 * Try to feed this packet to this socket
2014 				 * instead of discarding it.
2015 				 */
2016 				tcp_v4_restore_cb(skb);
2017 				sock_put(sk);
2018 				goto lookup;
2019 			}
2020 			goto discard_and_relse;
2021 		}
2022 		if (nsk == sk) {
2023 			reqsk_put(req);
2024 			tcp_v4_restore_cb(skb);
2025 		} else if (tcp_child_process(sk, nsk, skb)) {
2026 			tcp_v4_send_reset(nsk, skb);
2027 			goto discard_and_relse;
2028 		} else {
2029 			sock_put(sk);
2030 			return 0;
2031 		}
2032 	}
2033 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2034 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2035 		goto discard_and_relse;
2036 	}
2037 
2038 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2039 		goto discard_and_relse;
2040 
2041 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2042 		goto discard_and_relse;
2043 
2044 	nf_reset_ct(skb);
2045 
2046 	if (tcp_filter(sk, skb))
2047 		goto discard_and_relse;
2048 	th = (const struct tcphdr *)skb->data;
2049 	iph = ip_hdr(skb);
2050 	tcp_v4_fill_cb(skb, iph, th);
2051 
2052 	skb->dev = NULL;
2053 
2054 	if (sk->sk_state == TCP_LISTEN) {
2055 		ret = tcp_v4_do_rcv(sk, skb);
2056 		goto put_and_return;
2057 	}
2058 
2059 	sk_incoming_cpu_update(sk);
2060 
2061 	bh_lock_sock_nested(sk);
2062 	tcp_segs_in(tcp_sk(sk), skb);
2063 	ret = 0;
2064 	if (!sock_owned_by_user(sk)) {
2065 		skb_to_free = sk->sk_rx_skb_cache;
2066 		sk->sk_rx_skb_cache = NULL;
2067 		ret = tcp_v4_do_rcv(sk, skb);
2068 	} else {
2069 		if (tcp_add_backlog(sk, skb))
2070 			goto discard_and_relse;
2071 		skb_to_free = NULL;
2072 	}
2073 	bh_unlock_sock(sk);
2074 	if (skb_to_free)
2075 		__kfree_skb(skb_to_free);
2076 
2077 put_and_return:
2078 	if (refcounted)
2079 		sock_put(sk);
2080 
2081 	return ret;
2082 
2083 no_tcp_socket:
2084 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2085 		goto discard_it;
2086 
2087 	tcp_v4_fill_cb(skb, iph, th);
2088 
2089 	if (tcp_checksum_complete(skb)) {
2090 csum_error:
2091 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2092 bad_packet:
2093 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2094 	} else {
2095 		tcp_v4_send_reset(NULL, skb);
2096 	}
2097 
2098 discard_it:
2099 	/* Discard frame. */
2100 	kfree_skb(skb);
2101 	return 0;
2102 
2103 discard_and_relse:
2104 	sk_drops_add(sk, skb);
2105 	if (refcounted)
2106 		sock_put(sk);
2107 	goto discard_it;
2108 
2109 do_time_wait:
2110 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2111 		inet_twsk_put(inet_twsk(sk));
2112 		goto discard_it;
2113 	}
2114 
2115 	tcp_v4_fill_cb(skb, iph, th);
2116 
2117 	if (tcp_checksum_complete(skb)) {
2118 		inet_twsk_put(inet_twsk(sk));
2119 		goto csum_error;
2120 	}
2121 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2122 	case TCP_TW_SYN: {
2123 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2124 							&tcp_hashinfo, skb,
2125 							__tcp_hdrlen(th),
2126 							iph->saddr, th->source,
2127 							iph->daddr, th->dest,
2128 							inet_iif(skb),
2129 							sdif);
2130 		if (sk2) {
2131 			inet_twsk_deschedule_put(inet_twsk(sk));
2132 			sk = sk2;
2133 			tcp_v4_restore_cb(skb);
2134 			refcounted = false;
2135 			goto process;
2136 		}
2137 	}
2138 		/* to ACK */
2139 		fallthrough;
2140 	case TCP_TW_ACK:
2141 		tcp_v4_timewait_ack(sk, skb);
2142 		break;
2143 	case TCP_TW_RST:
2144 		tcp_v4_send_reset(sk, skb);
2145 		inet_twsk_deschedule_put(inet_twsk(sk));
2146 		goto discard_it;
2147 	case TCP_TW_SUCCESS:;
2148 	}
2149 	goto discard_it;
2150 }
2151 
2152 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2153 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2154 	.twsk_unique	= tcp_twsk_unique,
2155 	.twsk_destructor= tcp_twsk_destructor,
2156 };
2157 
2158 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2159 {
2160 	struct dst_entry *dst = skb_dst(skb);
2161 
2162 	if (dst && dst_hold_safe(dst)) {
2163 		sk->sk_rx_dst = dst;
2164 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2165 	}
2166 }
2167 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2168 
2169 const struct inet_connection_sock_af_ops ipv4_specific = {
2170 	.queue_xmit	   = ip_queue_xmit,
2171 	.send_check	   = tcp_v4_send_check,
2172 	.rebuild_header	   = inet_sk_rebuild_header,
2173 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2174 	.conn_request	   = tcp_v4_conn_request,
2175 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2176 	.net_header_len	   = sizeof(struct iphdr),
2177 	.setsockopt	   = ip_setsockopt,
2178 	.getsockopt	   = ip_getsockopt,
2179 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2180 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2181 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2182 };
2183 EXPORT_SYMBOL(ipv4_specific);
2184 
2185 #ifdef CONFIG_TCP_MD5SIG
2186 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2187 	.md5_lookup		= tcp_v4_md5_lookup,
2188 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2189 	.md5_parse		= tcp_v4_parse_md5_keys,
2190 };
2191 #endif
2192 
2193 /* NOTE: A lot of things are set to zero explicitly by the call to
2194  *       sk_alloc(), so they need not be done here.
2195  */
2196 static int tcp_v4_init_sock(struct sock *sk)
2197 {
2198 	struct inet_connection_sock *icsk = inet_csk(sk);
2199 
2200 	tcp_init_sock(sk);
2201 
2202 	icsk->icsk_af_ops = &ipv4_specific;
2203 
2204 #ifdef CONFIG_TCP_MD5SIG
2205 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2206 #endif
2207 
2208 	return 0;
2209 }
2210 
2211 void tcp_v4_destroy_sock(struct sock *sk)
2212 {
2213 	struct tcp_sock *tp = tcp_sk(sk);
2214 
2215 	trace_tcp_destroy_sock(sk);
2216 
2217 	tcp_clear_xmit_timers(sk);
2218 
2219 	tcp_cleanup_congestion_control(sk);
2220 
2221 	tcp_cleanup_ulp(sk);
2222 
2223 	/* Clean up the write buffer. */
2224 	tcp_write_queue_purge(sk);
2225 
2226 	/* Check if we want to disable active TFO */
2227 	tcp_fastopen_active_disable_ofo_check(sk);
2228 
2229 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2230 	skb_rbtree_purge(&tp->out_of_order_queue);
2231 
2232 #ifdef CONFIG_TCP_MD5SIG
2233 	/* Clean up the MD5 key list, if any */
2234 	if (tp->md5sig_info) {
2235 		tcp_clear_md5_list(sk);
2236 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2237 		tp->md5sig_info = NULL;
2238 	}
2239 #endif
2240 
2241 	/* Clean up a referenced TCP bind bucket. */
2242 	if (inet_csk(sk)->icsk_bind_hash)
2243 		inet_put_port(sk);
2244 
2245 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2246 
2247 	/* If socket is aborted during connect operation */
2248 	tcp_free_fastopen_req(tp);
2249 	tcp_fastopen_destroy_cipher(sk);
2250 	tcp_saved_syn_free(tp);
2251 
2252 	sk_sockets_allocated_dec(sk);
2253 }
2254 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2255 
2256 #ifdef CONFIG_PROC_FS
2257 /* Proc filesystem TCP sock list dumping. */
2258 
2259 /*
2260  * Get the next listener socket following cur.  If cur is NULL, get the first
2261  * socket starting from the bucket given in st->bucket; when st->bucket is zero the
2262  * very first socket in the hash table is returned.
2263  */
2264 static void *listening_get_next(struct seq_file *seq, void *cur)
2265 {
2266 	struct tcp_seq_afinfo *afinfo;
2267 	struct tcp_iter_state *st = seq->private;
2268 	struct net *net = seq_file_net(seq);
2269 	struct inet_listen_hashbucket *ilb;
2270 	struct hlist_nulls_node *node;
2271 	struct sock *sk = cur;
2272 
2273 	if (st->bpf_seq_afinfo)
2274 		afinfo = st->bpf_seq_afinfo;
2275 	else
2276 		afinfo = PDE_DATA(file_inode(seq->file));
2277 
2278 	if (!sk) {
2279 get_head:
2280 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2281 		spin_lock(&ilb->lock);
2282 		sk = sk_nulls_head(&ilb->nulls_head);
2283 		st->offset = 0;
2284 		goto get_sk;
2285 	}
2286 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2287 	++st->num;
2288 	++st->offset;
2289 
2290 	sk = sk_nulls_next(sk);
2291 get_sk:
2292 	sk_nulls_for_each_from(sk, node) {
2293 		if (!net_eq(sock_net(sk), net))
2294 			continue;
2295 		if (afinfo->family == AF_UNSPEC ||
2296 		    sk->sk_family == afinfo->family)
2297 			return sk;
2298 	}
2299 	spin_unlock(&ilb->lock);
2300 	st->offset = 0;
2301 	if (++st->bucket < INET_LHTABLE_SIZE)
2302 		goto get_head;
2303 	return NULL;
2304 }
2305 
2306 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2307 {
2308 	struct tcp_iter_state *st = seq->private;
2309 	void *rc;
2310 
2311 	st->bucket = 0;
2312 	st->offset = 0;
2313 	rc = listening_get_next(seq, NULL);
2314 
2315 	while (rc && *pos) {
2316 		rc = listening_get_next(seq, rc);
2317 		--*pos;
2318 	}
2319 	return rc;
2320 }
2321 
2322 static inline bool empty_bucket(const struct tcp_iter_state *st)
2323 {
2324 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2325 }
2326 
2327 /*
2328  * Get the first established socket starting from the bucket given in st->bucket.
2329  * If st->bucket is zero, the very first socket in the hash is returned.
2330  */
2331 static void *established_get_first(struct seq_file *seq)
2332 {
2333 	struct tcp_seq_afinfo *afinfo;
2334 	struct tcp_iter_state *st = seq->private;
2335 	struct net *net = seq_file_net(seq);
2336 	void *rc = NULL;
2337 
2338 	if (st->bpf_seq_afinfo)
2339 		afinfo = st->bpf_seq_afinfo;
2340 	else
2341 		afinfo = PDE_DATA(file_inode(seq->file));
2342 
2343 	st->offset = 0;
2344 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2345 		struct sock *sk;
2346 		struct hlist_nulls_node *node;
2347 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2348 
2349 		/* Lockless fast path for the common case of empty buckets */
2350 		if (empty_bucket(st))
2351 			continue;
2352 
2353 		spin_lock_bh(lock);
2354 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2355 			if ((afinfo->family != AF_UNSPEC &&
2356 			     sk->sk_family != afinfo->family) ||
2357 			    !net_eq(sock_net(sk), net)) {
2358 				continue;
2359 			}
2360 			rc = sk;
2361 			goto out;
2362 		}
2363 		spin_unlock_bh(lock);
2364 	}
2365 out:
2366 	return rc;
2367 }
2368 
2369 static void *established_get_next(struct seq_file *seq, void *cur)
2370 {
2371 	struct tcp_seq_afinfo *afinfo;
2372 	struct sock *sk = cur;
2373 	struct hlist_nulls_node *node;
2374 	struct tcp_iter_state *st = seq->private;
2375 	struct net *net = seq_file_net(seq);
2376 
2377 	if (st->bpf_seq_afinfo)
2378 		afinfo = st->bpf_seq_afinfo;
2379 	else
2380 		afinfo = PDE_DATA(file_inode(seq->file));
2381 
2382 	++st->num;
2383 	++st->offset;
2384 
2385 	sk = sk_nulls_next(sk);
2386 
2387 	sk_nulls_for_each_from(sk, node) {
2388 		if ((afinfo->family == AF_UNSPEC ||
2389 		     sk->sk_family == afinfo->family) &&
2390 		    net_eq(sock_net(sk), net))
2391 			return sk;
2392 	}
2393 
2394 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2395 	++st->bucket;
2396 	return established_get_first(seq);
2397 }
2398 
2399 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2400 {
2401 	struct tcp_iter_state *st = seq->private;
2402 	void *rc;
2403 
2404 	st->bucket = 0;
2405 	rc = established_get_first(seq);
2406 
2407 	while (rc && pos) {
2408 		rc = established_get_next(seq, rc);
2409 		--pos;
2410 	}
2411 	return rc;
2412 }
2413 
2414 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2415 {
2416 	void *rc;
2417 	struct tcp_iter_state *st = seq->private;
2418 
2419 	st->state = TCP_SEQ_STATE_LISTENING;
2420 	rc	  = listening_get_idx(seq, &pos);
2421 
2422 	if (!rc) {
2423 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2424 		rc	  = established_get_idx(seq, pos);
2425 	}
2426 
2427 	return rc;
2428 }
2429 
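/* Resume the iteration at the bucket/offset remembered in the iterator
 * state, so that a reader coming back for the next chunk of /proc/net/tcp
 * output does not have to rescan the hash tables from the beginning.
 */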
2430 static void *tcp_seek_last_pos(struct seq_file *seq)
2431 {
2432 	struct tcp_iter_state *st = seq->private;
2433 	int bucket = st->bucket;
2434 	int offset = st->offset;
2435 	int orig_num = st->num;
2436 	void *rc = NULL;
2437 
2438 	switch (st->state) {
2439 	case TCP_SEQ_STATE_LISTENING:
2440 		if (st->bucket >= INET_LHTABLE_SIZE)
2441 			break;
2442 		st->state = TCP_SEQ_STATE_LISTENING;
2443 		rc = listening_get_next(seq, NULL);
2444 		while (offset-- && rc && bucket == st->bucket)
2445 			rc = listening_get_next(seq, rc);
2446 		if (rc)
2447 			break;
2448 		st->bucket = 0;
2449 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2450 		fallthrough;
2451 	case TCP_SEQ_STATE_ESTABLISHED:
2452 		if (st->bucket > tcp_hashinfo.ehash_mask)
2453 			break;
2454 		rc = established_get_first(seq);
2455 		while (offset-- && rc && bucket == st->bucket)
2456 			rc = established_get_next(seq, rc);
2457 	}
2458 
2459 	st->num = orig_num;
2460 
2461 	return rc;
2462 }
2463 
2464 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2465 {
2466 	struct tcp_iter_state *st = seq->private;
2467 	void *rc;
2468 
2469 	if (*pos && *pos == st->last_pos) {
2470 		rc = tcp_seek_last_pos(seq);
2471 		if (rc)
2472 			goto out;
2473 	}
2474 
2475 	st->state = TCP_SEQ_STATE_LISTENING;
2476 	st->num = 0;
2477 	st->bucket = 0;
2478 	st->offset = 0;
2479 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2480 
2481 out:
2482 	st->last_pos = *pos;
2483 	return rc;
2484 }
2485 EXPORT_SYMBOL(tcp_seq_start);
2486 
2487 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2488 {
2489 	struct tcp_iter_state *st = seq->private;
2490 	void *rc = NULL;
2491 
2492 	if (v == SEQ_START_TOKEN) {
2493 		rc = tcp_get_idx(seq, 0);
2494 		goto out;
2495 	}
2496 
2497 	switch (st->state) {
2498 	case TCP_SEQ_STATE_LISTENING:
2499 		rc = listening_get_next(seq, v);
2500 		if (!rc) {
2501 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2502 			st->bucket = 0;
2503 			st->offset = 0;
2504 			rc	  = established_get_first(seq);
2505 		}
2506 		break;
2507 	case TCP_SEQ_STATE_ESTABLISHED:
2508 		rc = established_get_next(seq, v);
2509 		break;
2510 	}
2511 out:
2512 	++*pos;
2513 	st->last_pos = *pos;
2514 	return rc;
2515 }
2516 EXPORT_SYMBOL(tcp_seq_next);
2517 
2518 void tcp_seq_stop(struct seq_file *seq, void *v)
2519 {
2520 	struct tcp_iter_state *st = seq->private;
2521 
2522 	switch (st->state) {
2523 	case TCP_SEQ_STATE_LISTENING:
2524 		if (v != SEQ_START_TOKEN)
2525 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2526 		break;
2527 	case TCP_SEQ_STATE_ESTABLISHED:
2528 		if (v)
2529 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2530 		break;
2531 	}
2532 }
2533 EXPORT_SYMBOL(tcp_seq_stop);
2534 
2535 static void get_openreq4(const struct request_sock *req,
2536 			 struct seq_file *f, int i)
2537 {
2538 	const struct inet_request_sock *ireq = inet_rsk(req);
2539 	long delta = req->rsk_timer.expires - jiffies;
2540 
2541 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2542 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2543 		i,
2544 		ireq->ir_loc_addr,
2545 		ireq->ir_num,
2546 		ireq->ir_rmt_addr,
2547 		ntohs(ireq->ir_rmt_port),
2548 		TCP_SYN_RECV,
2549 		0, 0, /* could print option size, but that is af dependent. */
2550 		1,    /* timers active (only the expire timer) */
2551 		jiffies_delta_to_clock_t(delta),
2552 		req->num_timeout,
2553 		from_kuid_munged(seq_user_ns(f),
2554 				 sock_i_uid(req->rsk_listener)),
2555 		0,  /* non standard timer */
2556 		0, /* open_requests have no inode */
2557 		0,
2558 		req);
2559 }
2560 
2561 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2562 {
2563 	int timer_active;
2564 	unsigned long timer_expires;
2565 	const struct tcp_sock *tp = tcp_sk(sk);
2566 	const struct inet_connection_sock *icsk = inet_csk(sk);
2567 	const struct inet_sock *inet = inet_sk(sk);
2568 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2569 	__be32 dest = inet->inet_daddr;
2570 	__be32 src = inet->inet_rcv_saddr;
2571 	__u16 destp = ntohs(inet->inet_dport);
2572 	__u16 srcp = ntohs(inet->inet_sport);
2573 	int rx_queue;
2574 	int state;
2575 
2576 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2577 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2578 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2579 		timer_active	= 1;
2580 		timer_expires	= icsk->icsk_timeout;
2581 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2582 		timer_active	= 4;
2583 		timer_expires	= icsk->icsk_timeout;
2584 	} else if (timer_pending(&sk->sk_timer)) {
2585 		timer_active	= 2;
2586 		timer_expires	= sk->sk_timer.expires;
2587 	} else {
2588 		timer_active	= 0;
2589 		timer_expires = jiffies;
2590 	}
2591 
2592 	state = inet_sk_state_load(sk);
2593 	if (state == TCP_LISTEN)
2594 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2595 	else
2596 		/* Because we don't lock the socket,
2597 		 * we might find a transient negative value.
2598 		 */
2599 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2600 				      READ_ONCE(tp->copied_seq), 0);
2601 
2602 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2603 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2604 		i, src, srcp, dest, destp, state,
2605 		READ_ONCE(tp->write_seq) - tp->snd_una,
2606 		rx_queue,
2607 		timer_active,
2608 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2609 		icsk->icsk_retransmits,
2610 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2611 		icsk->icsk_probes_out,
2612 		sock_i_ino(sk),
2613 		refcount_read(&sk->sk_refcnt), sk,
2614 		jiffies_to_clock_t(icsk->icsk_rto),
2615 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2616 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2617 		tp->snd_cwnd,
2618 		state == TCP_LISTEN ?
2619 		    fastopenq->max_qlen :
2620 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2621 }
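/* Reading the output produced above: addresses are printed as the raw
 * __be32 value in hex, so on a little-endian host 127.0.0.1 shows up as
 * 0100007F, while ports are converted to host order first, so port 22
 * prints as 0016.  The state column is the numeric TCP state
 * (TCP_ESTABLISHED is 01, TCP_LISTEN is 0A).
 */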
2622 
2623 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2624 			       struct seq_file *f, int i)
2625 {
2626 	long delta = tw->tw_timer.expires - jiffies;
2627 	__be32 dest, src;
2628 	__u16 destp, srcp;
2629 
2630 	dest  = tw->tw_daddr;
2631 	src   = tw->tw_rcv_saddr;
2632 	destp = ntohs(tw->tw_dport);
2633 	srcp  = ntohs(tw->tw_sport);
2634 
2635 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2636 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2637 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2638 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2639 		refcount_read(&tw->tw_refcnt), tw);
2640 }
2641 
2642 #define TMPSZ 150
2643 
2644 static int tcp4_seq_show(struct seq_file *seq, void *v)
2645 {
2646 	struct tcp_iter_state *st;
2647 	struct sock *sk = v;
2648 
2649 	seq_setwidth(seq, TMPSZ - 1);
2650 	if (v == SEQ_START_TOKEN) {
2651 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2652 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2653 			   "inode");
2654 		goto out;
2655 	}
2656 	st = seq->private;
2657 
2658 	if (sk->sk_state == TCP_TIME_WAIT)
2659 		get_timewait4_sock(v, seq, st->num);
2660 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2661 		get_openreq4(v, seq, st->num);
2662 	else
2663 		get_tcp4_sock(v, seq, st->num);
2664 out:
2665 	seq_pad(seq, '\n');
2666 	return 0;
2667 }
2668 
2669 #ifdef CONFIG_BPF_SYSCALL
2670 struct bpf_iter__tcp {
2671 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2672 	__bpf_md_ptr(struct sock_common *, sk_common);
2673 	uid_t uid __aligned(8);
2674 };
2675 
2676 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2677 			     struct sock_common *sk_common, uid_t uid)
2678 {
2679 	struct bpf_iter__tcp ctx;
2680 
2681 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2682 	ctx.meta = meta;
2683 	ctx.sk_common = sk_common;
2684 	ctx.uid = uid;
2685 	return bpf_iter_run_prog(prog, &ctx);
2686 }
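/* Illustrative sketch of a BPF iterator program that could be attached to
 * the "tcp" target registered below.  It assumes the usual libbpf
 * conventions ("iter/tcp" section name, bpf_seq_printf() taking its
 * arguments as an array of u64); treat the program and names as an example
 * only, not as kernel API documentation.
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *		static const char fmt[] = "family=%d uid=%u\n";
 *		__u64 args[2];
 *
 *		if (!skc)
 *			return 0;
 *		args[0] = skc->skc_family;
 *		args[1] = ctx->uid;
 *		bpf_seq_printf(ctx->meta->seq, fmt, sizeof(fmt), args, sizeof(args));
 *		return 0;
 *	}
 */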
2687 
2688 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2689 {
2690 	struct bpf_iter_meta meta;
2691 	struct bpf_prog *prog;
2692 	struct sock *sk = v;
2693 	uid_t uid;
2694 
2695 	if (v == SEQ_START_TOKEN)
2696 		return 0;
2697 
2698 	if (sk->sk_state == TCP_TIME_WAIT) {
2699 		uid = 0;
2700 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2701 		const struct request_sock *req = v;
2702 
2703 		uid = from_kuid_munged(seq_user_ns(seq),
2704 				       sock_i_uid(req->rsk_listener));
2705 	} else {
2706 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2707 	}
2708 
2709 	meta.seq = seq;
2710 	prog = bpf_iter_get_info(&meta, false);
2711 	return tcp_prog_seq_show(prog, &meta, v, uid);
2712 }
2713 
2714 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2715 {
2716 	struct bpf_iter_meta meta;
2717 	struct bpf_prog *prog;
2718 
2719 	if (!v) {
2720 		meta.seq = seq;
2721 		prog = bpf_iter_get_info(&meta, true);
2722 		if (prog)
2723 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2724 	}
2725 
2726 	tcp_seq_stop(seq, v);
2727 }
2728 
2729 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2730 	.show		= bpf_iter_tcp_seq_show,
2731 	.start		= tcp_seq_start,
2732 	.next		= tcp_seq_next,
2733 	.stop		= bpf_iter_tcp_seq_stop,
2734 };
2735 #endif
2736 
2737 static const struct seq_operations tcp4_seq_ops = {
2738 	.show		= tcp4_seq_show,
2739 	.start		= tcp_seq_start,
2740 	.next		= tcp_seq_next,
2741 	.stop		= tcp_seq_stop,
2742 };
2743 
2744 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2745 	.family		= AF_INET,
2746 };
2747 
2748 static int __net_init tcp4_proc_init_net(struct net *net)
2749 {
2750 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2751 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2752 		return -ENOMEM;
2753 	return 0;
2754 }
2755 
2756 static void __net_exit tcp4_proc_exit_net(struct net *net)
2757 {
2758 	remove_proc_entry("tcp", net->proc_net);
2759 }
2760 
2761 static struct pernet_operations tcp4_net_ops = {
2762 	.init = tcp4_proc_init_net,
2763 	.exit = tcp4_proc_exit_net,
2764 };
2765 
2766 int __init tcp4_proc_init(void)
2767 {
2768 	return register_pernet_subsys(&tcp4_net_ops);
2769 }
2770 
2771 void tcp4_proc_exit(void)
2772 {
2773 	unregister_pernet_subsys(&tcp4_net_ops);
2774 }
2775 #endif /* CONFIG_PROC_FS */
2776 
2777 struct proto tcp_prot = {
2778 	.name			= "TCP",
2779 	.owner			= THIS_MODULE,
2780 	.close			= tcp_close,
2781 	.pre_connect		= tcp_v4_pre_connect,
2782 	.connect		= tcp_v4_connect,
2783 	.disconnect		= tcp_disconnect,
2784 	.accept			= inet_csk_accept,
2785 	.ioctl			= tcp_ioctl,
2786 	.init			= tcp_v4_init_sock,
2787 	.destroy		= tcp_v4_destroy_sock,
2788 	.shutdown		= tcp_shutdown,
2789 	.setsockopt		= tcp_setsockopt,
2790 	.getsockopt		= tcp_getsockopt,
2791 	.keepalive		= tcp_set_keepalive,
2792 	.recvmsg		= tcp_recvmsg,
2793 	.sendmsg		= tcp_sendmsg,
2794 	.sendpage		= tcp_sendpage,
2795 	.backlog_rcv		= tcp_v4_do_rcv,
2796 	.release_cb		= tcp_release_cb,
2797 	.hash			= inet_hash,
2798 	.unhash			= inet_unhash,
2799 	.get_port		= inet_csk_get_port,
2800 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2801 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2802 	.stream_memory_free	= tcp_stream_memory_free,
2803 	.sockets_allocated	= &tcp_sockets_allocated,
2804 	.orphan_count		= &tcp_orphan_count,
2805 	.memory_allocated	= &tcp_memory_allocated,
2806 	.memory_pressure	= &tcp_memory_pressure,
2807 	.sysctl_mem		= sysctl_tcp_mem,
2808 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2809 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2810 	.max_header		= MAX_TCP_HEADER,
2811 	.obj_size		= sizeof(struct tcp_sock),
2812 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2813 	.twsk_prot		= &tcp_timewait_sock_ops,
2814 	.rsk_prot		= &tcp_request_sock_ops,
2815 	.h.hashinfo		= &tcp_hashinfo,
2816 	.no_autobind		= true,
2817 	.diag_destroy		= tcp_abort,
2818 };
2819 EXPORT_SYMBOL(tcp_prot);
2820 
2821 static void __net_exit tcp_sk_exit(struct net *net)
2822 {
2823 	int cpu;
2824 
2825 	if (net->ipv4.tcp_congestion_control)
2826 		bpf_module_put(net->ipv4.tcp_congestion_control,
2827 			       net->ipv4.tcp_congestion_control->owner);
2828 
2829 	for_each_possible_cpu(cpu)
2830 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2831 	free_percpu(net->ipv4.tcp_sk);
2832 }
2833 
2834 static int __net_init tcp_sk_init(struct net *net)
2835 {
2836 	int res, cpu, cnt;
2837 
2838 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2839 	if (!net->ipv4.tcp_sk)
2840 		return -ENOMEM;
2841 
2842 	for_each_possible_cpu(cpu) {
2843 		struct sock *sk;
2844 
2845 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2846 					   IPPROTO_TCP, net);
2847 		if (res)
2848 			goto fail;
2849 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2850 
2851 		/* Please enforce IP_DF and IPID==0 for RST and
2852 		 * ACK sent in SYN-RECV and TIME-WAIT state.
2853 		 */
2854 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2855 
2856 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2857 	}
2858 
2859 	net->ipv4.sysctl_tcp_ecn = 2;
2860 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2861 
2862 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2863 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2864 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2865 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2866 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2867 
2868 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2869 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2870 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2871 
2872 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2873 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2874 	net->ipv4.sysctl_tcp_syncookies = 1;
2875 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2876 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2877 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2878 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2879 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2880 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2881 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2882 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2883 
2884 	cnt = tcp_hashinfo.ehash_mask + 1;
2885 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2886 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2887 
2888 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2889 	net->ipv4.sysctl_tcp_sack = 1;
2890 	net->ipv4.sysctl_tcp_window_scaling = 1;
2891 	net->ipv4.sysctl_tcp_timestamps = 1;
2892 	net->ipv4.sysctl_tcp_early_retrans = 3;
2893 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2894 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2895 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2896 	net->ipv4.sysctl_tcp_max_reordering = 300;
2897 	net->ipv4.sysctl_tcp_dsack = 1;
2898 	net->ipv4.sysctl_tcp_app_win = 31;
2899 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2900 	net->ipv4.sysctl_tcp_frto = 2;
2901 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2902 	/* This limits the percentage of the congestion window which we
2903 	 * will allow a single TSO frame to consume.  Building TSO frames
2904 	 * which are too large can cause TCP streams to be bursty.
2905 	 */
2906 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2907 	/* Default TSQ limit of 16 TSO segments */
2908 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2909 	/* RFC 5961 challenge ACK rate limiting */
2910 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2911 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2912 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2913 	net->ipv4.sysctl_tcp_autocorking = 1;
2914 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2915 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2916 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2917 	if (net != &init_net) {
2918 		memcpy(net->ipv4.sysctl_tcp_rmem,
2919 		       init_net.ipv4.sysctl_tcp_rmem,
2920 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2921 		memcpy(net->ipv4.sysctl_tcp_wmem,
2922 		       init_net.ipv4.sysctl_tcp_wmem,
2923 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2924 	}
2925 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2926 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2927 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2928 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2929 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2930 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
2931 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2932 
2933 	/* Reno is always built in */
2934 	if (!net_eq(net, &init_net) &&
2935 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2936 			       init_net.ipv4.tcp_congestion_control->owner))
2937 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2938 	else
2939 		net->ipv4.tcp_congestion_control = &tcp_reno;
2940 
2941 	return 0;
2942 fail:
2943 	tcp_sk_exit(net);
2944 
2945 	return res;
2946 }
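/* A child network namespace therefore starts out with the literal defaults
 * listed above; the only state it inherits from init_net is the
 * tcp_rmem/tcp_wmem triplets and, when the module reference can be taken,
 * the congestion control algorithm currently selected in the initial
 * namespace (falling back to the built-in Reno otherwise).
 */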
2947 
2948 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2949 {
2950 	struct net *net;
2951 
2952 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2953 
2954 	list_for_each_entry(net, net_exit_list, exit_list)
2955 		tcp_fastopen_ctx_destroy(net);
2956 }
2957 
2958 static struct pernet_operations __net_initdata tcp_sk_ops = {
2959        .init	   = tcp_sk_init,
2960        .exit	   = tcp_sk_exit,
2961        .exit_batch = tcp_sk_exit_batch,
2962 };
2963 
2964 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2965 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2966 		     struct sock_common *sk_common, uid_t uid)
2967 
2968 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2969 {
2970 	struct tcp_iter_state *st = priv_data;
2971 	struct tcp_seq_afinfo *afinfo;
2972 	int ret;
2973 
2974 	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2975 	if (!afinfo)
2976 		return -ENOMEM;
2977 
2978 	afinfo->family = AF_UNSPEC;
2979 	st->bpf_seq_afinfo = afinfo;
2980 	ret = bpf_iter_init_seq_net(priv_data, aux);
2981 	if (ret)
2982 		kfree(afinfo);
2983 	return ret;
2984 }
2985 
2986 static void bpf_iter_fini_tcp(void *priv_data)
2987 {
2988 	struct tcp_iter_state *st = priv_data;
2989 
2990 	kfree(st->bpf_seq_afinfo);
2991 	bpf_iter_fini_seq_net(priv_data);
2992 }
2993 
2994 static const struct bpf_iter_seq_info tcp_seq_info = {
2995 	.seq_ops		= &bpf_iter_tcp_seq_ops,
2996 	.init_seq_private	= bpf_iter_init_tcp,
2997 	.fini_seq_private	= bpf_iter_fini_tcp,
2998 	.seq_priv_size		= sizeof(struct tcp_iter_state),
2999 };
3000 
3001 static struct bpf_iter_reg tcp_reg_info = {
3002 	.target			= "tcp",
3003 	.ctx_arg_info_size	= 1,
3004 	.ctx_arg_info		= {
3005 		{ offsetof(struct bpf_iter__tcp, sk_common),
3006 		  PTR_TO_BTF_ID_OR_NULL },
3007 	},
3008 	.seq_info		= &tcp_seq_info,
3009 };
3010 
3011 static void __init bpf_iter_register(void)
3012 {
3013 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3014 	if (bpf_iter_reg_target(&tcp_reg_info))
3015 		pr_warn("Warning: could not register bpf iterator tcp\n");
3016 }
3017 
3018 #endif
3019 
3020 void __init tcp_v4_init(void)
3021 {
3022 	if (register_pernet_subsys(&tcp_sk_ops))
3023 		panic("Failed to create the TCP control socket.\n");
3024 
3025 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3026 	bpf_iter_register();
3027 #endif
3028 }
3029