1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after a
40  *					year-long coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 #ifdef CONFIG_TCP_NATA_URC
73 #include <net/nata.h>
74 #endif
75 
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
81 #include <linux/inetdevice.h>
82 #include <linux/btf_ids.h>
83 
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86 
87 #include <trace/events/tcp.h>
88 
89 #ifdef CONFIG_TCP_MD5SIG
90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
91 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
92 #endif
93 
94 struct inet_hashinfo tcp_hashinfo;
95 EXPORT_SYMBOL(tcp_hashinfo);
96 
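/* Per-CPU control socket; tcp_v4_send_reset() and tcp_v4_send_ack() below
 * read it via this_cpu_read() to emit replies that are not tied to a full
 * socket.
 */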
97 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
98 
99 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
100 {
101 	return secure_tcp_seq(ip_hdr(skb)->daddr,
102 			      ip_hdr(skb)->saddr,
103 			      tcp_hdr(skb)->dest,
104 			      tcp_hdr(skb)->source);
105 }
106 
107 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
108 {
109 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
110 }
111 
112 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
113 {
114 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
115 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
116 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117 	struct tcp_sock *tp = tcp_sk(sk);
118 
119 	if (reuse == 2) {
120 		/* Still does not detect *everything* that goes through
121 		 * lo, since we require a loopback src or dst address
122 		 * or direct binding to 'lo' interface.
123 		 */
124 		bool loopback = false;
125 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
126 			loopback = true;
127 #if IS_ENABLED(CONFIG_IPV6)
128 		if (tw->tw_family == AF_INET6) {
129 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
130 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
131 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
133 				loopback = true;
134 		} else
135 #endif
136 		{
137 			if (ipv4_is_loopback(tw->tw_daddr) ||
138 			    ipv4_is_loopback(tw->tw_rcv_saddr))
139 				loopback = true;
140 		}
141 		if (!loopback)
142 			reuse = 0;
143 	}
144 
145 	/* With PAWS, it is safe from the viewpoint
146 	   of data integrity. Even without PAWS it is safe provided sequence
147 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
148 
149 	   Actually, the idea is close to VJ's one, only timestamp cache is
150 	   held not per host, but per port pair and TW bucket is used as state
151 	   holder.
152 
153 	   If TW bucket has been already destroyed we fall back to VJ's scheme
154 	   and use initial timestamp retrieved from peer table.
155 	 */
156 	if (tcptw->tw_ts_recent_stamp &&
157 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
158 					    tcptw->tw_ts_recent_stamp)))) {
159 		/* inet_twsk_hashdance() sets sk_refcnt after putting twsk
160 		 * and releasing the bucket lock.
161 		 */
162 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
163 			return 0;
164 
165 		/* In case of repair and re-using TIME-WAIT sockets we still
166 		 * want to be sure that it is safe as above but honor the
167 		 * sequence numbers and time stamps set as part of the repair
168 		 * process.
169 		 *
170 		 * Without this check re-using a TIME-WAIT socket with TCP
171 		 * repair would accumulate a -1 on the repair assigned
172 		 * sequence number. The first time it is reused the sequence
173 		 * is -1, the second time -2, etc. This fixes that issue
174 		 * without appearing to create any others.
175 		 */
176 		if (likely(!tp->repair)) {
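			/* Presumably the new write_seq is stepped past anything the
			 * previous incarnation could have sent (its snd_nxt plus a
			 * full 64K window), so old duplicates cannot land inside the
			 * new sequence space; 0 is avoided because a zero write_seq
			 * makes tcp_v4_connect() pick a fresh ISN instead.
			 */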
177 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
178 
179 			if (!seq)
180 				seq = 1;
181 			WRITE_ONCE(tp->write_seq, seq);
182 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
183 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
184 		}
185 
186 		return 1;
187 	}
188 
189 	return 0;
190 }
191 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
192 
193 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
194 			      int addr_len)
195 {
196 	/* This check is replicated from tcp_v4_connect() and intended to
197 	 * prevent BPF program called below from accessing bytes that are out
198 	 * of the bound specified by user in addr_len.
199 	 */
200 	if (addr_len < sizeof(struct sockaddr_in))
201 		return -EINVAL;
202 
203 	sock_owned_by_me(sk);
204 
205 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
206 }
207 
208 /* This will initiate an outgoing connection. */
209 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
210 {
211 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
212 	struct inet_sock *inet = inet_sk(sk);
213 	struct tcp_sock *tp = tcp_sk(sk);
214 	__be16 orig_sport, orig_dport;
215 	__be32 daddr, nexthop;
216 	struct flowi4 *fl4;
217 	struct rtable *rt;
218 	int err;
219 	struct ip_options_rcu *inet_opt;
220 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
221 
222 	if (addr_len < sizeof(struct sockaddr_in))
223 		return -EINVAL;
224 
225 	if (usin->sin_family != AF_INET)
226 		return -EAFNOSUPPORT;
227 
228 	nexthop = daddr = usin->sin_addr.s_addr;
229 	inet_opt = rcu_dereference_protected(inet->inet_opt,
230 					     lockdep_sock_is_held(sk));
231 	if (inet_opt && inet_opt->opt.srr) {
232 		if (!daddr)
233 			return -EINVAL;
234 		nexthop = inet_opt->opt.faddr;
235 	}
236 
237 	orig_sport = inet->inet_sport;
238 	orig_dport = usin->sin_port;
239 	fl4 = &inet->cork.fl.u.ip4;
240 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
241 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
242 			      IPPROTO_TCP,
243 			      orig_sport, orig_dport, sk);
244 	if (IS_ERR(rt)) {
245 		err = PTR_ERR(rt);
246 		if (err == -ENETUNREACH)
247 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
248 		return err;
249 	}
250 
251 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
252 		ip_rt_put(rt);
253 		return -ENETUNREACH;
254 	}
255 
256 	if (!inet_opt || !inet_opt->opt.srr)
257 		daddr = fl4->daddr;
258 
259 	if (!inet->inet_saddr)
260 		inet->inet_saddr = fl4->saddr;
261 	sk_rcv_saddr_set(sk, inet->inet_saddr);
262 
263 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
264 		/* Reset inherited state */
265 		tp->rx_opt.ts_recent	   = 0;
266 		tp->rx_opt.ts_recent_stamp = 0;
267 		if (likely(!tp->repair))
268 			WRITE_ONCE(tp->write_seq, 0);
269 	}
270 
271 	inet->inet_dport = usin->sin_port;
272 	sk_daddr_set(sk, daddr);
273 
274 	inet_csk(sk)->icsk_ext_hdr_len = 0;
275 	if (inet_opt)
276 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
277 
278 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
279 
280 	/* Socket identity is still unknown (sport may be zero).
281 	 * However we set the state to SYN-SENT and, without releasing the
282 	 * socket lock, select a source port, enter ourselves into the hash
283 	 * tables and complete the initialization after this.
284 	 */
285 #ifdef CONFIG_TCP_NATA_URC
286 	tcp_set_nata_push_urc(sk);
287 #endif /* CONFIG_TCP_NATA_URC */
288 	tcp_set_state(sk, TCP_SYN_SENT);
289 	err = inet_hash_connect(tcp_death_row, sk);
290 	if (err)
291 		goto failure;
292 
293 	sk_set_txhash(sk);
294 
295 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
296 			       inet->inet_sport, inet->inet_dport, sk);
297 	if (IS_ERR(rt)) {
298 		err = PTR_ERR(rt);
299 		rt = NULL;
300 		goto failure;
301 	}
302 	/* OK, now commit destination to socket.  */
303 	sk->sk_gso_type = SKB_GSO_TCPV4;
304 	sk_setup_caps(sk, &rt->dst);
305 	rt = NULL;
306 
307 	if (likely(!tp->repair)) {
308 		if (!tp->write_seq)
309 			WRITE_ONCE(tp->write_seq,
310 				   secure_tcp_seq(inet->inet_saddr,
311 						  inet->inet_daddr,
312 						  inet->inet_sport,
313 						  usin->sin_port));
314 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
315 						 inet->inet_saddr,
316 						 inet->inet_daddr);
317 	}
318 
319 	inet->inet_id = prandom_u32();
320 
321 	if (tcp_fastopen_defer_connect(sk, &err))
322 		return err;
323 	if (err)
324 		goto failure;
325 
326 	err = tcp_connect(sk);
327 
328 	if (err)
329 		goto failure;
330 
331 	return 0;
332 
333 failure:
334 	/*
335 	 * This unhashes the socket and releases the local port,
336 	 * if necessary.
337 	 */
338 	tcp_set_state(sk, TCP_CLOSE);
339 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
340 		inet_reset_saddr(sk);
341 	ip_rt_put(rt);
342 	sk->sk_route_caps = 0;
343 	inet->inet_dport = 0;
344 	return err;
345 }
346 EXPORT_SYMBOL(tcp_v4_connect);
347 
348 /*
349  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
350  * It can be called through tcp_release_cb() if socket was owned by user
351  * at the time tcp_v4_err() was called to handle ICMP message.
352  */
353 void tcp_v4_mtu_reduced(struct sock *sk)
354 {
355 	struct inet_sock *inet = inet_sk(sk);
356 	struct dst_entry *dst;
357 	u32 mtu;
358 
359 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
360 		return;
361 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
362 	dst = inet_csk_update_pmtu(sk, mtu);
363 	if (!dst)
364 		return;
365 
366 	/* Something is about to go wrong... Remember the soft error
367 	 * in case this connection is not able to recover.
368 	 */
369 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
370 		sk->sk_err_soft = EMSGSIZE;
371 
372 	mtu = dst_mtu(dst);
373 
374 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
375 	    ip_sk_accept_pmtu(sk) &&
376 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
377 		tcp_sync_mss(sk, mtu);
378 
379 		/* Resend the TCP packet because it's
380 		 * clear that the old packet has been
381 		 * dropped. This is the new "fast" path mtu
382 		 * discovery.
383 		 */
384 		tcp_simple_retransmit(sk);
385 	} /* else let the usual retransmit timer handle it */
386 }
387 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
388 
389 static void do_redirect(struct sk_buff *skb, struct sock *sk)
390 {
391 	struct dst_entry *dst = __sk_dst_check(sk, 0);
392 
393 	if (dst)
394 		dst->ops->redirect(dst, sk, skb);
395 }
396 
397 
398 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
399 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
400 {
401 	struct request_sock *req = inet_reqsk(sk);
402 	struct net *net = sock_net(sk);
403 
404 	/* ICMPs are not backlogged, hence we cannot get
405 	 * an established socket here.
406 	 */
407 	if (seq != tcp_rsk(req)->snt_isn) {
408 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
409 	} else if (abort) {
410 		/*
411 		 * Still in SYN_RECV, just remove it silently.
412 		 * There is no good way to pass the error to the newly
413 		 * created socket, and POSIX does not want network
414 		 * errors returned from accept().
415 		 */
416 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
417 		tcp_listendrop(req->rsk_listener);
418 	}
419 	reqsk_put(req);
420 }
421 EXPORT_SYMBOL(tcp_req_err);
422 
423 /* TCP-LD (RFC 6069) logic */
424 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
425 {
426 	struct inet_connection_sock *icsk = inet_csk(sk);
427 	struct tcp_sock *tp = tcp_sk(sk);
428 	struct sk_buff *skb;
429 	s32 remaining;
430 	u32 delta_us;
431 
432 	if (sock_owned_by_user(sk))
433 		return;
434 
435 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
436 	    !icsk->icsk_backoff)
437 		return;
438 
439 	skb = tcp_rtx_queue_head(sk);
440 	if (WARN_ON_ONCE(!skb))
441 		return;
442 
443 	icsk->icsk_backoff--;
444 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
445 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
446 
447 	tcp_mstamp_refresh(tp);
448 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
449 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
450 
451 	if (remaining > 0) {
452 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
453 					  remaining, TCP_RTO_MAX);
454 	} else {
455 		/* RTO revert clocked out retransmission.
456 		 * Will retransmit now.
457 		 */
458 		tcp_retransmit_timer(sk);
459 	}
460 }
461 EXPORT_SYMBOL(tcp_ld_RTO_revert);
462 
463 /*
464  * This routine is called by the ICMP module when it gets some
465  * sort of error condition.  If err < 0 then the socket should
466  * be closed and the error returned to the user.  If err > 0
467  * it's just the icmp type << 8 | icmp code.  After adjustment
468  * header points to the first 8 bytes of the tcp header.  We need
469  * to find the appropriate port.
470  *
471  * The locking strategy used here is very "optimistic". When
472  * someone else accesses the socket the ICMP is just dropped
473  * and for some paths there is no check at all.
474  * A more general error queue to queue errors for later handling
475  * is probably better.
476  *
477  */
478 
479 int tcp_v4_err(struct sk_buff *skb, u32 info)
480 {
481 	const struct iphdr *iph = (const struct iphdr *)skb->data;
482 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
483 	struct tcp_sock *tp;
484 	struct inet_sock *inet;
485 	const int type = icmp_hdr(skb)->type;
486 	const int code = icmp_hdr(skb)->code;
487 	struct sock *sk;
488 	struct request_sock *fastopen;
489 	u32 seq, snd_una;
490 	int err;
491 	struct net *net = dev_net(skb->dev);
492 
493 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
494 				       th->dest, iph->saddr, ntohs(th->source),
495 				       inet_iif(skb), 0);
496 	if (!sk) {
497 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
498 		return -ENOENT;
499 	}
500 	if (sk->sk_state == TCP_TIME_WAIT) {
501 		inet_twsk_put(inet_twsk(sk));
502 		return 0;
503 	}
504 	seq = ntohl(th->seq);
505 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
506 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
507 				     type == ICMP_TIME_EXCEEDED ||
508 				     (type == ICMP_DEST_UNREACH &&
509 				      (code == ICMP_NET_UNREACH ||
510 				       code == ICMP_HOST_UNREACH)));
511 		return 0;
512 	}
513 
514 	bh_lock_sock(sk);
515 	/* If too many ICMPs get dropped on busy
516 	 * servers this needs to be solved differently.
517 	 * We do take care of the PMTU discovery (RFC 1191) special case:
518 	 * we can receive locally generated ICMP messages while socket is held.
519 	 */
520 	if (sock_owned_by_user(sk)) {
521 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
522 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
523 	}
524 	if (sk->sk_state == TCP_CLOSE)
525 		goto out;
526 
527 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
528 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
529 		goto out;
530 	}
531 
532 	tp = tcp_sk(sk);
533 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
534 	fastopen = rcu_dereference(tp->fastopen_rsk);
535 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
536 	if (sk->sk_state != TCP_LISTEN &&
537 	    !between(seq, snd_una, tp->snd_nxt)) {
538 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
539 		goto out;
540 	}
541 
542 	switch (type) {
543 	case ICMP_REDIRECT:
544 		if (!sock_owned_by_user(sk))
545 			do_redirect(skb, sk);
546 		goto out;
547 	case ICMP_SOURCE_QUENCH:
548 		/* Just silently ignore these. */
549 		goto out;
550 	case ICMP_PARAMETERPROB:
551 		err = EPROTO;
552 		break;
553 	case ICMP_DEST_UNREACH:
554 		if (code > NR_ICMP_UNREACH)
555 			goto out;
556 
557 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
558 			/* We are not interested in TCP_LISTEN and open_requests
559 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
560 			 * they should go through unfragmented).
561 			 */
562 			if (sk->sk_state == TCP_LISTEN)
563 				goto out;
564 
565 			WRITE_ONCE(tp->mtu_info, info);
566 			if (!sock_owned_by_user(sk)) {
567 				tcp_v4_mtu_reduced(sk);
568 			} else {
569 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
570 					sock_hold(sk);
571 			}
572 			goto out;
573 		}
574 
575 		err = icmp_err_convert[code].errno;
576 		/* check if this ICMP message allows revert of backoff.
577 		 * (see RFC 6069)
578 		 */
579 		if (!fastopen &&
580 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
581 			tcp_ld_RTO_revert(sk, seq);
582 		break;
583 	case ICMP_TIME_EXCEEDED:
584 		err = EHOSTUNREACH;
585 		break;
586 	default:
587 		goto out;
588 	}
589 
590 	switch (sk->sk_state) {
591 	case TCP_SYN_SENT:
592 	case TCP_SYN_RECV:
593 		/* Only in fast or simultaneous open. If a fast open socket is
594 		 * already accepted it is treated as a connected one below.
595 		 */
596 		if (fastopen && !fastopen->sk)
597 			break;
598 
599 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
600 
601 		if (!sock_owned_by_user(sk)) {
602 			sk->sk_err = err;
603 
604 			sk->sk_error_report(sk);
605 
606 			tcp_done(sk);
607 		} else {
608 			sk->sk_err_soft = err;
609 		}
610 		goto out;
611 	}
612 
613 	/* If we've already connected we will keep trying
614 	 * until we time out, or the user gives up.
615 	 *
616 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
617 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
618 	 * but it is obsoleted by pmtu discovery).
619 	 *
620 	 * Note that in the modern internet, where routing is unreliable
621 	 * and broken firewalls sit in every dark corner sending random
622 	 * errors ordered by their masters, even these two messages finally
623 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
624 	 *
625 	 * Now we are in compliance with RFCs.
626 	 *							--ANK (980905)
627 	 */
628 
629 	inet = inet_sk(sk);
630 	if (!sock_owned_by_user(sk) && inet->recverr) {
631 		sk->sk_err = err;
632 		sk->sk_error_report(sk);
633 	} else	{ /* Only an error on timeout */
634 		sk->sk_err_soft = err;
635 	}
636 
637 out:
638 	bh_unlock_sock(sk);
639 	sock_put(sk);
640 	return 0;
641 }
642 
643 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
644 {
645 	struct tcphdr *th = tcp_hdr(skb);
646 
647 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
648 	skb->csum_start = skb_transport_header(skb) - skb->head;
649 	skb->csum_offset = offsetof(struct tcphdr, check);
650 }
651 
652 /* This routine computes an IPv4 TCP checksum. */
653 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
654 {
655 	const struct inet_sock *inet = inet_sk(sk);
656 
657 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
658 }
659 EXPORT_SYMBOL(tcp_v4_send_check);
660 
661 /*
662  *	This routine will send an RST to the other tcp.
663  *
664  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
665  *		      for the reset?
666  *	Answer: if a packet caused the RST, it is not for a socket
667  *		existing in our system; if it is matched to a socket,
668  *		it is just a duplicate segment or a bug in the other side's TCP.
669  *		So we build the reply based only on the parameters that
670  *		arrived with the segment.
671  *	Exception: precedence violation. We do not implement it in any case.
672  */
673 
674 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
675 {
676 	const struct tcphdr *th = tcp_hdr(skb);
677 	struct {
678 		struct tcphdr th;
679 #ifdef CONFIG_TCP_MD5SIG
680 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
681 #endif
682 	} rep;
683 	struct ip_reply_arg arg;
684 #ifdef CONFIG_TCP_MD5SIG
685 	struct tcp_md5sig_key *key = NULL;
686 	const __u8 *hash_location = NULL;
687 	unsigned char newhash[16];
688 	int genhash;
689 	struct sock *sk1 = NULL;
690 #endif
691 	u64 transmit_time = 0;
692 	struct sock *ctl_sk;
693 	struct net *net;
694 
695 	/* Never send a reset in response to a reset. */
696 	if (th->rst)
697 		return;
698 
699 	/* If sk not NULL, it means we did a successful lookup and incoming
700 	 * route had to be correct. prequeue might have dropped our dst.
701 	 */
702 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
703 		return;
704 
705 	/* Swap the send and the receive. */
706 	memset(&rep, 0, sizeof(rep));
707 	rep.th.dest   = th->source;
708 	rep.th.source = th->dest;
709 	rep.th.doff   = sizeof(struct tcphdr) / 4;
710 	rep.th.rst    = 1;
711 
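	/* Per the RFC 793 reset generation rules: if the offending segment
	 * carried an ACK, the RST takes its sequence number from that ack
	 * field; otherwise the RST uses SEQ=0 and acknowledges everything
	 * the segment occupied (its data length plus the SYN/FIN flags).
	 */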
712 	if (th->ack) {
713 		rep.th.seq = th->ack_seq;
714 	} else {
715 		rep.th.ack = 1;
716 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
717 				       skb->len - (th->doff << 2));
718 	}
719 
720 	memset(&arg, 0, sizeof(arg));
721 	arg.iov[0].iov_base = (unsigned char *)&rep;
722 	arg.iov[0].iov_len  = sizeof(rep.th);
723 
724 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
725 #ifdef CONFIG_TCP_MD5SIG
726 	rcu_read_lock();
727 	hash_location = tcp_parse_md5sig_option(th);
728 	if (sk && sk_fullsock(sk)) {
729 		const union tcp_md5_addr *addr;
730 		int l3index;
731 
732 		/* sdif set, means packet ingressed via a device
733 		 * in an L3 domain and inet_iif is set to it.
734 		 */
735 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
736 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
737 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
738 	} else if (hash_location) {
739 		const union tcp_md5_addr *addr;
740 		int sdif = tcp_v4_sdif(skb);
741 		int dif = inet_iif(skb);
742 		int l3index;
743 
744 		/*
745 		 * The active side is lost. Try to find the listening socket through
746 		 * the source port, and then find the md5 key through the listening socket.
747 		 * We do not lose any security here:
748 		 * the incoming packet is checked against the md5 hash of the found key;
749 		 * no RST is generated if the md5 hash doesn't match.
750 		 */
751 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
752 					     ip_hdr(skb)->saddr,
753 					     th->source, ip_hdr(skb)->daddr,
754 					     ntohs(th->source), dif, sdif);
755 		/* don't send rst if it can't find key */
756 		if (!sk1)
757 			goto out;
758 
759 		/* sdif set, means packet ingressed via a device
760 		 * in an L3 domain and dif is set to it.
761 		 */
762 		l3index = sdif ? dif : 0;
763 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
764 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
765 		if (!key)
766 			goto out;
767 
768 
769 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
770 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
771 			goto out;
772 
773 	}
774 
775 	if (key) {
776 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
777 				   (TCPOPT_NOP << 16) |
778 				   (TCPOPT_MD5SIG << 8) |
779 				   TCPOLEN_MD5SIG);
780 		/* Update length and the length the header thinks exists */
781 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
782 		rep.th.doff = arg.iov[0].iov_len / 4;
783 
784 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
785 				     key, ip_hdr(skb)->saddr,
786 				     ip_hdr(skb)->daddr, &rep.th);
787 	}
788 #endif
789 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
790 				      ip_hdr(skb)->saddr, /* XXX */
791 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
792 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
793 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
794 
795 	/* When socket is gone, all binding information is lost.
796 	 * Routing might fail in this case. No choice here: if we choose to force
797 	 * the input interface, we will misroute in the case of an asymmetric route.
798 	 */
799 	if (sk) {
800 		arg.bound_dev_if = sk->sk_bound_dev_if;
801 		if (sk_fullsock(sk))
802 			trace_tcp_send_reset(sk, skb);
803 	}
804 
805 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
806 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
807 
808 	arg.tos = ip_hdr(skb)->tos;
809 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
810 	local_bh_disable();
811 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
812 	sock_net_set(ctl_sk, net);
813 	if (sk) {
814 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
815 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
816 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
817 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
818 		transmit_time = tcp_transmit_time(sk);
819 		xfrm_sk_clone_policy(ctl_sk, sk);
820 	} else {
821 		ctl_sk->sk_mark = 0;
822 		ctl_sk->sk_priority = 0;
823 	}
824 	ip_send_unicast_reply(ctl_sk,
825 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
826 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
827 			      &arg, arg.iov[0].iov_len,
828 			      transmit_time);
829 
830 	xfrm_sk_free_policy(ctl_sk);
831 	sock_net_set(ctl_sk, &init_net);
832 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
833 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
834 	local_bh_enable();
835 
836 #ifdef CONFIG_TCP_MD5SIG
837 out:
838 	rcu_read_unlock();
839 #endif
840 }
841 
842 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
843    outside of socket context, is certainly ugly. What can I do?
844  */
845 
846 static void tcp_v4_send_ack(const struct sock *sk,
847 			    struct sk_buff *skb, u32 seq, u32 ack,
848 			    u32 win, u32 tsval, u32 tsecr, int oif,
849 			    struct tcp_md5sig_key *key,
850 			    int reply_flags, u8 tos)
851 {
852 	const struct tcphdr *th = tcp_hdr(skb);
853 	struct {
854 		struct tcphdr th;
855 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
856 #ifdef CONFIG_TCP_MD5SIG
857 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
858 #endif
859 			];
860 	} rep;
861 	struct net *net = sock_net(sk);
862 	struct ip_reply_arg arg;
863 	struct sock *ctl_sk;
864 	u64 transmit_time;
865 
866 	memset(&rep.th, 0, sizeof(struct tcphdr));
867 	memset(&arg, 0, sizeof(arg));
868 
869 	arg.iov[0].iov_base = (unsigned char *)&rep;
870 	arg.iov[0].iov_len  = sizeof(rep.th);
871 	if (tsecr) {
872 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
873 				   (TCPOPT_TIMESTAMP << 8) |
874 				   TCPOLEN_TIMESTAMP);
875 		rep.opt[1] = htonl(tsval);
876 		rep.opt[2] = htonl(tsecr);
877 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
878 	}
879 
880 	/* Swap the send and the receive. */
881 	rep.th.dest    = th->source;
882 	rep.th.source  = th->dest;
883 	rep.th.doff    = arg.iov[0].iov_len / 4;
884 	rep.th.seq     = htonl(seq);
885 	rep.th.ack_seq = htonl(ack);
886 	rep.th.ack     = 1;
887 	rep.th.window  = htons(win);
888 
889 #ifdef CONFIG_TCP_MD5SIG
890 	if (key) {
891 		int offset = (tsecr) ? 3 : 0;
892 
893 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
894 					  (TCPOPT_NOP << 16) |
895 					  (TCPOPT_MD5SIG << 8) |
896 					  TCPOLEN_MD5SIG);
897 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
898 		rep.th.doff = arg.iov[0].iov_len/4;
899 
900 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
901 				    key, ip_hdr(skb)->saddr,
902 				    ip_hdr(skb)->daddr, &rep.th);
903 	}
904 #endif
905 	arg.flags = reply_flags;
906 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
907 				      ip_hdr(skb)->saddr, /* XXX */
908 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
909 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
910 	if (oif)
911 		arg.bound_dev_if = oif;
912 	arg.tos = tos;
913 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
914 	local_bh_disable();
915 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
916 	sock_net_set(ctl_sk, net);
917 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
918 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
919 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
920 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
921 	transmit_time = tcp_transmit_time(sk);
922 	ip_send_unicast_reply(ctl_sk,
923 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
924 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
925 			      &arg, arg.iov[0].iov_len,
926 			      transmit_time);
927 
928 	sock_net_set(ctl_sk, &init_net);
929 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
930 	local_bh_enable();
931 }
932 
933 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
934 {
935 	struct inet_timewait_sock *tw = inet_twsk(sk);
936 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
937 
938 	tcp_v4_send_ack(sk, skb,
939 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
940 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
941 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
942 			tcptw->tw_ts_recent,
943 			tw->tw_bound_dev_if,
944 			tcp_twsk_md5_key(tcptw),
945 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
946 			tw->tw_tos
947 			);
948 
949 	inet_twsk_put(tw);
950 }
951 
952 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
953 				  struct request_sock *req)
954 {
955 	const union tcp_md5_addr *addr;
956 	int l3index;
957 
958 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
959 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
960 	 */
961 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
962 					     tcp_sk(sk)->snd_nxt;
963 
964 	/* RFC 7323 2.3
965 	 * The window field (SEG.WND) of every outgoing segment, with the
966 	 * exception of <SYN> segments, MUST be right-shifted by
967 	 * Rcv.Wind.Shift bits:
968 	 */
969 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
970 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
971 	tcp_v4_send_ack(sk, skb, seq,
972 			tcp_rsk(req)->rcv_nxt,
973 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
974 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
975 			READ_ONCE(req->ts_recent),
976 			0,
977 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
978 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
979 			ip_hdr(skb)->tos);
980 }
981 
982 /*
983  *	Send a SYN-ACK after having received a SYN.
984  *	This still operates on a request_sock only, not on a big
985  *	socket.
986  */
987 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
988 			      struct flowi *fl,
989 			      struct request_sock *req,
990 			      struct tcp_fastopen_cookie *foc,
991 			      enum tcp_synack_type synack_type,
992 			      struct sk_buff *syn_skb)
993 {
994 	const struct inet_request_sock *ireq = inet_rsk(req);
995 	struct flowi4 fl4;
996 	int err = -1;
997 	struct sk_buff *skb;
998 	u8 tos;
999 
1000 	/* First, grab a route. */
1001 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1002 		return -1;
1003 
1004 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1005 
1006 	if (skb) {
1007 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1008 
1009 		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1010 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1011 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1012 				inet_sk(sk)->tos;
1013 
1014 		if (!INET_ECN_is_capable(tos) &&
1015 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1016 			tos |= INET_ECN_ECT_0;
1017 
1018 		rcu_read_lock();
1019 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1020 					    ireq->ir_rmt_addr,
1021 					    rcu_dereference(ireq->ireq_opt),
1022 					    tos);
1023 		rcu_read_unlock();
1024 		err = net_xmit_eval(err);
1025 	}
1026 
1027 	return err;
1028 }
1029 
1030 /*
1031  *	IPv4 request_sock destructor.
1032  */
1033 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1034 {
1035 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1036 }
1037 
1038 #ifdef CONFIG_TCP_MD5SIG
1039 /*
1040  * RFC2385 MD5 checksumming requires a mapping of
1041  * IP address->MD5 Key.
1042  * We need to maintain these in the sk structure.
1043  */
1044 
1045 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1046 EXPORT_SYMBOL(tcp_md5_needed);
1047 
1048 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1049 {
1050 	if (!old)
1051 		return true;
1052 
1053 	/* l3index always overrides non-l3index */
1054 	if (old->l3index && new->l3index == 0)
1055 		return false;
1056 	if (old->l3index == 0 && new->l3index)
1057 		return true;
1058 
1059 	return old->prefixlen < new->prefixlen;
1060 }
1061 
1062 /* Find the Key structure for an address.  */
1063 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1064 					   const union tcp_md5_addr *addr,
1065 					   int family)
1066 {
1067 	const struct tcp_sock *tp = tcp_sk(sk);
1068 	struct tcp_md5sig_key *key;
1069 	const struct tcp_md5sig_info *md5sig;
1070 	__be32 mask;
1071 	struct tcp_md5sig_key *best_match = NULL;
1072 	bool match;
1073 
1074 	/* caller either holds rcu_read_lock() or socket lock */
1075 	md5sig = rcu_dereference_check(tp->md5sig_info,
1076 				       lockdep_sock_is_held(sk));
1077 	if (!md5sig)
1078 		return NULL;
1079 
1080 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1081 				 lockdep_sock_is_held(sk)) {
1082 		if (key->family != family)
1083 			continue;
1084 		if (key->l3index && key->l3index != l3index)
1085 			continue;
1086 		if (family == AF_INET) {
1087 			mask = inet_make_mask(key->prefixlen);
1088 			match = (key->addr.a4.s_addr & mask) ==
1089 				(addr->a4.s_addr & mask);
1090 #if IS_ENABLED(CONFIG_IPV6)
1091 		} else if (family == AF_INET6) {
1092 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1093 						  key->prefixlen);
1094 #endif
1095 		} else {
1096 			match = false;
1097 		}
1098 
1099 		if (match && better_md5_match(best_match, key))
1100 			best_match = key;
1101 	}
1102 	return best_match;
1103 }
1104 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1105 
1106 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1107 						      const union tcp_md5_addr *addr,
1108 						      int family, u8 prefixlen,
1109 						      int l3index)
1110 {
1111 	const struct tcp_sock *tp = tcp_sk(sk);
1112 	struct tcp_md5sig_key *key;
1113 	unsigned int size = sizeof(struct in_addr);
1114 	const struct tcp_md5sig_info *md5sig;
1115 
1116 	/* caller either holds rcu_read_lock() or socket lock */
1117 	md5sig = rcu_dereference_check(tp->md5sig_info,
1118 				       lockdep_sock_is_held(sk));
1119 	if (!md5sig)
1120 		return NULL;
1121 #if IS_ENABLED(CONFIG_IPV6)
1122 	if (family == AF_INET6)
1123 		size = sizeof(struct in6_addr);
1124 #endif
1125 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1126 				 lockdep_sock_is_held(sk)) {
1127 		if (key->family != family)
1128 			continue;
1129 		if (key->l3index != l3index)
1130 			continue;
1131 		if (!memcmp(&key->addr, addr, size) &&
1132 		    key->prefixlen == prefixlen)
1133 			return key;
1134 	}
1135 	return NULL;
1136 }
1137 
1138 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1139 					 const struct sock *addr_sk)
1140 {
1141 	const union tcp_md5_addr *addr;
1142 	int l3index;
1143 
1144 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1145 						 addr_sk->sk_bound_dev_if);
1146 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1147 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1148 }
1149 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1150 
1151 /* This can be called on a newly created socket, from other files */
1152 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1153 		   int family, u8 prefixlen, int l3index,
1154 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1155 {
1156 	/* Add Key to the list */
1157 	struct tcp_md5sig_key *key;
1158 	struct tcp_sock *tp = tcp_sk(sk);
1159 	struct tcp_md5sig_info *md5sig;
1160 
1161 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1162 	if (key) {
1163 		/* Pre-existing entry - just update that one.
1164 		 * Note that the key might be used concurrently.
1165 		 * data_race() is telling kcsan that we do not care of
1166 		 * key mismatches, since changing MD5 key on live flows
1167 		 * can lead to packet drops.
1168 		 */
1169 		data_race(memcpy(key->key, newkey, newkeylen));
1170 
1171 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1172 		 * Also note that a reader could catch new key->keylen value
1173 		 * but old key->key[], this is the reason we use __GFP_ZERO
1174 		 * at sock_kmalloc() time below these lines.
1175 		 */
1176 		WRITE_ONCE(key->keylen, newkeylen);
1177 
1178 		return 0;
1179 	}
1180 
1181 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1182 					   lockdep_sock_is_held(sk));
1183 	if (!md5sig) {
1184 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1185 		if (!md5sig)
1186 			return -ENOMEM;
1187 
1188 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1189 		INIT_HLIST_HEAD(&md5sig->head);
1190 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1191 	}
1192 
1193 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1194 	if (!key)
1195 		return -ENOMEM;
1196 	if (!tcp_alloc_md5sig_pool()) {
1197 		sock_kfree_s(sk, key, sizeof(*key));
1198 		return -ENOMEM;
1199 	}
1200 
1201 	memcpy(key->key, newkey, newkeylen);
1202 	key->keylen = newkeylen;
1203 	key->family = family;
1204 	key->prefixlen = prefixlen;
1205 	key->l3index = l3index;
1206 	memcpy(&key->addr, addr,
1207 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1208 				      sizeof(struct in_addr));
1209 	hlist_add_head_rcu(&key->node, &md5sig->head);
1210 	return 0;
1211 }
1212 EXPORT_SYMBOL(tcp_md5_do_add);
1213 
1214 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1215 		   u8 prefixlen, int l3index)
1216 {
1217 	struct tcp_md5sig_key *key;
1218 
1219 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1220 	if (!key)
1221 		return -ENOENT;
1222 	hlist_del_rcu(&key->node);
1223 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1224 	kfree_rcu(key, rcu);
1225 	return 0;
1226 }
1227 EXPORT_SYMBOL(tcp_md5_do_del);
1228 
1229 static void tcp_clear_md5_list(struct sock *sk)
1230 {
1231 	struct tcp_sock *tp = tcp_sk(sk);
1232 	struct tcp_md5sig_key *key;
1233 	struct hlist_node *n;
1234 	struct tcp_md5sig_info *md5sig;
1235 
1236 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1237 
1238 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1239 		hlist_del_rcu(&key->node);
1240 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1241 		kfree_rcu(key, rcu);
1242 	}
1243 }
1244 
1245 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1246 				 sockptr_t optval, int optlen)
1247 {
1248 	struct tcp_md5sig cmd;
1249 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1250 	const union tcp_md5_addr *addr;
1251 	u8 prefixlen = 32;
1252 	int l3index = 0;
1253 
1254 	if (optlen < sizeof(cmd))
1255 		return -EINVAL;
1256 
1257 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1258 		return -EFAULT;
1259 
1260 	if (sin->sin_family != AF_INET)
1261 		return -EINVAL;
1262 
1263 	if (optname == TCP_MD5SIG_EXT &&
1264 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1265 		prefixlen = cmd.tcpm_prefixlen;
1266 		if (prefixlen > 32)
1267 			return -EINVAL;
1268 	}
1269 
1270 	if (optname == TCP_MD5SIG_EXT &&
1271 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1272 		struct net_device *dev;
1273 
1274 		rcu_read_lock();
1275 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1276 		if (dev && netif_is_l3_master(dev))
1277 			l3index = dev->ifindex;
1278 
1279 		rcu_read_unlock();
1280 
1281 		/* ok to reference set/not set outside of rcu;
1282 		 * right now device MUST be an L3 master
1283 		 */
1284 		if (!dev || !l3index)
1285 			return -EINVAL;
1286 	}
1287 
1288 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1289 
1290 	if (!cmd.tcpm_keylen)
1291 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1292 
1293 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1294 		return -EINVAL;
1295 
1296 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1297 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1298 }
1299 
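/* Hash the IPv4 pseudo-header plus the TCP header (checksum zeroed, options
 * excluded) into the MD5 state, matching the RFC 2385 signing scope; callers
 * add the segment payload and the key separately.
 */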
1300 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1301 				   __be32 daddr, __be32 saddr,
1302 				   const struct tcphdr *th, int nbytes)
1303 {
1304 	struct tcp4_pseudohdr *bp;
1305 	struct scatterlist sg;
1306 	struct tcphdr *_th;
1307 
1308 	bp = hp->scratch;
1309 	bp->saddr = saddr;
1310 	bp->daddr = daddr;
1311 	bp->pad = 0;
1312 	bp->protocol = IPPROTO_TCP;
1313 	bp->len = cpu_to_be16(nbytes);
1314 
1315 	_th = (struct tcphdr *)(bp + 1);
1316 	memcpy(_th, th, sizeof(*th));
1317 	_th->check = 0;
1318 
1319 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1320 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1321 				sizeof(*bp) + sizeof(*th));
1322 	return crypto_ahash_update(hp->md5_req);
1323 }
1324 
1325 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1326 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1327 {
1328 	struct tcp_md5sig_pool *hp;
1329 	struct ahash_request *req;
1330 
1331 	hp = tcp_get_md5sig_pool();
1332 	if (!hp)
1333 		goto clear_hash_noput;
1334 	req = hp->md5_req;
1335 
1336 	if (crypto_ahash_init(req))
1337 		goto clear_hash;
1338 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1339 		goto clear_hash;
1340 	if (tcp_md5_hash_key(hp, key))
1341 		goto clear_hash;
1342 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1343 	if (crypto_ahash_final(req))
1344 		goto clear_hash;
1345 
1346 	tcp_put_md5sig_pool();
1347 	return 0;
1348 
1349 clear_hash:
1350 	tcp_put_md5sig_pool();
1351 clear_hash_noput:
1352 	memset(md5_hash, 0, 16);
1353 	return 1;
1354 }
1355 
1356 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1357 			const struct sock *sk,
1358 			const struct sk_buff *skb)
1359 {
1360 	struct tcp_md5sig_pool *hp;
1361 	struct ahash_request *req;
1362 	const struct tcphdr *th = tcp_hdr(skb);
1363 	__be32 saddr, daddr;
1364 
1365 	if (sk) { /* valid for establish/request sockets */
1366 		saddr = sk->sk_rcv_saddr;
1367 		daddr = sk->sk_daddr;
1368 	} else {
1369 		const struct iphdr *iph = ip_hdr(skb);
1370 		saddr = iph->saddr;
1371 		daddr = iph->daddr;
1372 	}
1373 
1374 	hp = tcp_get_md5sig_pool();
1375 	if (!hp)
1376 		goto clear_hash_noput;
1377 	req = hp->md5_req;
1378 
1379 	if (crypto_ahash_init(req))
1380 		goto clear_hash;
1381 
1382 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1383 		goto clear_hash;
1384 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1385 		goto clear_hash;
1386 	if (tcp_md5_hash_key(hp, key))
1387 		goto clear_hash;
1388 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1389 	if (crypto_ahash_final(req))
1390 		goto clear_hash;
1391 
1392 	tcp_put_md5sig_pool();
1393 	return 0;
1394 
1395 clear_hash:
1396 	tcp_put_md5sig_pool();
1397 clear_hash_noput:
1398 	memset(md5_hash, 0, 16);
1399 	return 1;
1400 }
1401 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1402 
1403 #endif
1404 
1405 /* Called with rcu_read_lock() */
1406 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1407 				    const struct sk_buff *skb,
1408 				    int dif, int sdif)
1409 {
1410 #ifdef CONFIG_TCP_MD5SIG
1411 	/*
1412 	 * This gets called for each TCP segment that arrives
1413 	 * so we want to be efficient.
1414 	 * We have 3 drop cases:
1415 	 * o No MD5 hash and one expected.
1416 	 * o MD5 hash and we're not expecting one.
1417 	 * o MD5 hash and it's wrong.
1418 	 */
1419 	const __u8 *hash_location = NULL;
1420 	struct tcp_md5sig_key *hash_expected;
1421 	const struct iphdr *iph = ip_hdr(skb);
1422 	const struct tcphdr *th = tcp_hdr(skb);
1423 	const union tcp_md5_addr *addr;
1424 	unsigned char newhash[16];
1425 	int genhash, l3index;
1426 
1427 	/* sdif set, means packet ingressed via a device
1428 	 * in an L3 domain and dif is set to the l3mdev
1429 	 */
1430 	l3index = sdif ? dif : 0;
1431 
1432 	addr = (union tcp_md5_addr *)&iph->saddr;
1433 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1434 	hash_location = tcp_parse_md5sig_option(th);
1435 
1436 	/* We've parsed the options - do we have a hash? */
1437 	if (!hash_expected && !hash_location)
1438 		return false;
1439 
1440 	if (hash_expected && !hash_location) {
1441 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1442 		return true;
1443 	}
1444 
1445 	if (!hash_expected && hash_location) {
1446 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1447 		return true;
1448 	}
1449 
1450 	/* Okay, so this is hash_expected and hash_location -
1451 	 * so we need to calculate the checksum.
1452 	 */
1453 	genhash = tcp_v4_md5_hash_skb(newhash,
1454 				      hash_expected,
1455 				      NULL, skb);
1456 
1457 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1458 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1459 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1460 				     &iph->saddr, ntohs(th->source),
1461 				     &iph->daddr, ntohs(th->dest),
1462 				     genhash ? " tcp_v4_calc_md5_hash failed"
1463 				     : "", l3index);
1464 		return true;
1465 	}
1466 	return false;
1467 #endif
1468 	return false;
1469 }
1470 
1471 static void tcp_v4_init_req(struct request_sock *req,
1472 			    const struct sock *sk_listener,
1473 			    struct sk_buff *skb)
1474 {
1475 	struct inet_request_sock *ireq = inet_rsk(req);
1476 	struct net *net = sock_net(sk_listener);
1477 
1478 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1479 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1480 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1481 }
1482 
1483 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1484 					  struct flowi *fl,
1485 					  const struct request_sock *req)
1486 {
1487 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1488 }
1489 
1490 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1491 	.family		=	PF_INET,
1492 	.obj_size	=	sizeof(struct tcp_request_sock),
1493 	.rtx_syn_ack	=	tcp_rtx_synack,
1494 	.send_ack	=	tcp_v4_reqsk_send_ack,
1495 	.destructor	=	tcp_v4_reqsk_destructor,
1496 	.send_reset	=	tcp_v4_send_reset,
1497 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1498 };
1499 
1500 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1501 	.mss_clamp	=	TCP_MSS_DEFAULT,
1502 #ifdef CONFIG_TCP_MD5SIG
1503 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1504 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1505 #endif
1506 	.init_req	=	tcp_v4_init_req,
1507 #ifdef CONFIG_SYN_COOKIES
1508 	.cookie_init_seq =	cookie_v4_init_sequence,
1509 #endif
1510 	.route_req	=	tcp_v4_route_req,
1511 	.init_seq	=	tcp_v4_init_seq,
1512 	.init_ts_off	=	tcp_v4_init_ts_off,
1513 	.send_synack	=	tcp_v4_send_synack,
1514 };
1515 
1516 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1517 {
1518 	/* Never answer SYNs sent to broadcast or multicast addresses */
1519 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1520 		goto drop;
1521 
1522 	return tcp_conn_request(&tcp_request_sock_ops,
1523 				&tcp_request_sock_ipv4_ops, sk, skb);
1524 
1525 drop:
1526 	tcp_listendrop(sk);
1527 	return 0;
1528 }
1529 EXPORT_SYMBOL(tcp_v4_conn_request);
1530 
1531 
1532 /*
1533  * The three way handshake has completed - we got a valid synack -
1534  * now create the new socket.
1535  */
1536 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1537 				  struct request_sock *req,
1538 				  struct dst_entry *dst,
1539 				  struct request_sock *req_unhash,
1540 				  bool *own_req)
1541 {
1542 	struct inet_request_sock *ireq;
1543 	bool found_dup_sk = false;
1544 	struct inet_sock *newinet;
1545 	struct tcp_sock *newtp;
1546 	struct sock *newsk;
1547 #ifdef CONFIG_TCP_MD5SIG
1548 	const union tcp_md5_addr *addr;
1549 	struct tcp_md5sig_key *key;
1550 	int l3index;
1551 #endif
1552 	struct ip_options_rcu *inet_opt;
1553 
1554 	if (sk_acceptq_is_full(sk))
1555 		goto exit_overflow;
1556 
1557 	newsk = tcp_create_openreq_child(sk, req, skb);
1558 	if (!newsk)
1559 		goto exit_nonewsk;
1560 
1561 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1562 	inet_sk_rx_dst_set(newsk, skb);
1563 
1564 	newtp		      = tcp_sk(newsk);
1565 	newinet		      = inet_sk(newsk);
1566 	ireq		      = inet_rsk(req);
1567 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1568 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1569 	newsk->sk_bound_dev_if = ireq->ir_iif;
1570 	newinet->inet_saddr   = ireq->ir_loc_addr;
1571 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1572 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1573 	newinet->mc_index     = inet_iif(skb);
1574 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1575 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1576 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1577 	if (inet_opt)
1578 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1579 	newinet->inet_id = prandom_u32();
1580 
1581 	/* Set ToS of the new socket based upon the value of incoming SYN.
1582 	 * ECT bits are set later in tcp_init_transfer().
1583 	 */
1584 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1585 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1586 
1587 	if (!dst) {
1588 		dst = inet_csk_route_child_sock(sk, newsk, req);
1589 		if (!dst)
1590 			goto put_and_exit;
1591 	} else {
1592 		/* syncookie case : see end of cookie_v4_check() */
1593 	}
1594 	sk_setup_caps(newsk, dst);
1595 
1596 	tcp_ca_openreq_child(newsk, dst);
1597 
1598 	tcp_sync_mss(newsk, dst_mtu(dst));
1599 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1600 
1601 	tcp_initialize_rcv_mss(newsk);
1602 
1603 #ifdef CONFIG_TCP_MD5SIG
1604 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1605 	/* Copy over the MD5 key from the original socket */
1606 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1607 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1608 	if (key) {
1609 		/*
1610 		 * We're using one, so create a matching key
1611 		 * on the newsk structure. If we fail to get
1612 		 * memory, then we end up not copying the key
1613 		 * across. Shucks.
1614 		 */
1615 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1616 			       key->key, key->keylen, GFP_ATOMIC);
1617 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1618 	}
1619 #endif
1620 
1621 	if (__inet_inherit_port(sk, newsk) < 0)
1622 		goto put_and_exit;
1623 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1624 				       &found_dup_sk);
1625 	if (likely(*own_req)) {
1626 		tcp_move_syn(newtp, req);
1627 		ireq->ireq_opt = NULL;
1628 	} else {
1629 		newinet->inet_opt = NULL;
1630 
1631 		if (!req_unhash && found_dup_sk) {
1632 			/* This code path should only be executed in the
1633 			 * syncookie case
1634 			 */
1635 			bh_unlock_sock(newsk);
1636 			sock_put(newsk);
1637 			newsk = NULL;
1638 		}
1639 	}
1640 	return newsk;
1641 
1642 exit_overflow:
1643 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1644 exit_nonewsk:
1645 	dst_release(dst);
1646 exit:
1647 	tcp_listendrop(sk);
1648 	return NULL;
1649 put_and_exit:
1650 	newinet->inet_opt = NULL;
1651 	inet_csk_prepare_forced_close(newsk);
1652 	tcp_done(newsk);
1653 	goto exit;
1654 }
1655 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1656 
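/* On a listening socket, a non-SYN segment may be the final ACK of a
 * syncookie handshake, so it is handed to cookie_v4_check() for validation
 * before tcp_v4_do_rcv() decides how to process it.
 */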
1657 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1658 {
1659 #ifdef CONFIG_SYN_COOKIES
1660 	const struct tcphdr *th = tcp_hdr(skb);
1661 
1662 	if (!th->syn)
1663 		sk = cookie_v4_check(sk, skb);
1664 #endif
1665 	return sk;
1666 }
1667 
1668 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1669 			 struct tcphdr *th, u32 *cookie)
1670 {
1671 	u16 mss = 0;
1672 #ifdef CONFIG_SYN_COOKIES
1673 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1674 				    &tcp_request_sock_ipv4_ops, sk, th);
1675 	if (mss) {
1676 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1677 		tcp_synq_overflow(sk);
1678 	}
1679 #endif
1680 	return mss;
1681 }
1682 
1683 /* The socket must have its spinlock held when we get
1684  * here, unless it is a TCP_LISTEN socket.
1685  *
1686  * We have a potential double-lock case here, so even when
1687  * doing backlog processing we use the BH locking scheme.
1688  * This is because we cannot sleep with the original spinlock
1689  * held.
1690  */
1691 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1692 {
1693 	struct sock *rsk;
1694 
1695 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1696 		struct dst_entry *dst;
1697 
1698 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1699 						lockdep_sock_is_held(sk));
1700 
1701 		sock_rps_save_rxhash(sk, skb);
1702 		sk_mark_napi_id(sk, skb);
1703 		if (dst) {
1704 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1705 			    !dst->ops->check(dst, 0)) {
1706 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1707 				dst_release(dst);
1708 			}
1709 		}
1710 		tcp_rcv_established(sk, skb);
1711 		return 0;
1712 	}
1713 
1714 	if (tcp_checksum_complete(skb))
1715 		goto csum_err;
1716 
1717 	if (sk->sk_state == TCP_LISTEN) {
1718 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1719 
1720 		if (!nsk)
1721 			goto discard;
1722 		if (nsk != sk) {
1723 			if (tcp_child_process(sk, nsk, skb)) {
1724 				rsk = nsk;
1725 				goto reset;
1726 			}
1727 			return 0;
1728 		}
1729 	} else
1730 		sock_rps_save_rxhash(sk, skb);
1731 
1732 	if (tcp_rcv_state_process(sk, skb)) {
1733 		rsk = sk;
1734 		goto reset;
1735 	}
1736 	return 0;
1737 
1738 reset:
1739 	tcp_v4_send_reset(rsk, skb);
1740 discard:
1741 	kfree_skb(skb);
1742 	/* Be careful here. If this function gets more complicated and
1743 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1744 	 * might be destroyed here. This current version compiles correctly,
1745 	 * but you have been warned.
1746 	 */
1747 	return 0;
1748 
1749 csum_err:
1750 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1751 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1752 	goto discard;
1753 }
1754 EXPORT_SYMBOL(tcp_v4_do_rcv);
1755 
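/* Early demux, called from the IP input path before routing: look up an
 * established socket for this segment and, if found, attach it to the skb
 * together with its cached rx dst so the later route lookup and full
 * socket lookup can be skipped.
 */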
1756 int tcp_v4_early_demux(struct sk_buff *skb)
1757 {
1758 	const struct iphdr *iph;
1759 	const struct tcphdr *th;
1760 	struct sock *sk;
1761 
1762 	if (skb->pkt_type != PACKET_HOST)
1763 		return 0;
1764 
1765 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1766 		return 0;
1767 
1768 	iph = ip_hdr(skb);
1769 	th = tcp_hdr(skb);
1770 
1771 	if (th->doff < sizeof(struct tcphdr) / 4)
1772 		return 0;
1773 
1774 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1775 				       iph->saddr, th->source,
1776 				       iph->daddr, ntohs(th->dest),
1777 				       skb->skb_iif, inet_sdif(skb));
1778 	if (sk) {
1779 		skb->sk = sk;
1780 		skb->destructor = sock_edemux;
1781 		if (sk_fullsock(sk)) {
1782 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1783 
1784 			if (dst)
1785 				dst = dst_check(dst, 0);
1786 			if (dst &&
1787 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1788 				skb_dst_set_noref(skb, dst);
1789 		}
1790 	}
1791 	return 0;
1792 }
1793 
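/* Queue a segment on the socket backlog while the socket is owned by user
 * context.  Tries hard to coalesce with the backlog tail first; returns
 * true if the segment had to be dropped (the socket is already unlocked in
 * that case), false if it was coalesced or queued.
 */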
1794 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1795 {
1796 	u32 limit, tail_gso_size, tail_gso_segs;
1797 	struct skb_shared_info *shinfo;
1798 	const struct tcphdr *th;
1799 	struct tcphdr *thtail;
1800 	struct sk_buff *tail;
1801 	unsigned int hdrlen;
1802 	bool fragstolen;
1803 	u32 gso_segs;
1804 	u32 gso_size;
1805 	int delta;
1806 
1807 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1808 	 * we can fix skb->truesize to its real value to avoid future drops.
1809 	 * This is valid because skb is not yet charged to the socket.
1810 	 * It has been noticed that pure SACK packets were sometimes dropped
1811 	 * (if cooked by drivers without the copybreak feature).
1812 	 */
1813 	skb_condense(skb);
1814 
1815 	skb_dst_drop(skb);
1816 
1817 	if (unlikely(tcp_checksum_complete(skb))) {
1818 		bh_unlock_sock(sk);
1819 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1820 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1821 		return true;
1822 	}
1823 
1824 	/* Attempt coalescing to last skb in backlog, even if we are
1825 	 * above the limits.
1826 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1827 	 */
1828 	th = (const struct tcphdr *)skb->data;
1829 	hdrlen = th->doff * 4;
1830 
1831 	tail = sk->sk_backlog.tail;
1832 	if (!tail)
1833 		goto no_coalesce;
1834 	thtail = (struct tcphdr *)tail->data;
1835 
1836 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1837 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1838 	    ((TCP_SKB_CB(tail)->tcp_flags |
1839 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1840 	    !((TCP_SKB_CB(tail)->tcp_flags &
1841 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1842 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1843 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1844 #ifdef CONFIG_TLS_DEVICE
1845 	    tail->decrypted != skb->decrypted ||
1846 #endif
1847 	    !mptcp_skb_can_collapse(tail, skb) ||
1848 	    thtail->doff != th->doff ||
1849 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1850 		goto no_coalesce;
1851 
1852 	__skb_pull(skb, hdrlen);
1853 
1854 	shinfo = skb_shinfo(skb);
1855 	gso_size = shinfo->gso_size ?: skb->len;
1856 	gso_segs = shinfo->gso_segs ?: 1;
1857 
1858 	shinfo = skb_shinfo(tail);
1859 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1860 	tail_gso_segs = shinfo->gso_segs ?: 1;
1861 
1862 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1863 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1864 
1865 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1866 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1867 			thtail->window = th->window;
1868 		}
1869 
1870 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1871 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1872 		 * is not entered if we append a packet with a FIN.
1873 		 * SYN, RST, URG are not present.
1874 		 * ACK is set on both packets.
1875 		 * PSH : we do not really care in TCP stack,
1876 		 *       at least for 'GRO' packets.
1877 		 */
1878 		thtail->fin |= th->fin;
1879 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1880 
1881 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1882 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1883 			tail->tstamp = skb->tstamp;
1884 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1885 		}
1886 
1887 		/* Not as strict as GRO. We only need to carry mss max value */
1888 		shinfo->gso_size = max(gso_size, tail_gso_size);
1889 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1890 
1891 		sk->sk_backlog.len += delta;
1892 		__NET_INC_STATS(sock_net(sk),
1893 				LINUX_MIB_TCPBACKLOGCOALESCE);
1894 		kfree_skb_partial(skb, fragstolen);
1895 		return false;
1896 	}
1897 	__skb_push(skb, hdrlen);
1898 
1899 no_coalesce:
1900 	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1901 
1902 	/* Only the socket owner can try to collapse/prune rx queues
1903 	 * to reduce memory overhead, so add a little headroom here.
1904 	 * Only a few socket backlogs are likely to be non-empty at the same time.
1905 	 */
1906 	limit += 64 * 1024;
1907 
1908 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1909 		bh_unlock_sock(sk);
1910 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1911 		return true;
1912 	}
1913 	return false;
1914 }
1915 EXPORT_SYMBOL(tcp_add_backlog);
1916 
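/* Run the attached socket filter (if any) on the segment; the filter may
 * trim the packet, but never below the TCP header length.
 */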
1917 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1918 {
1919 	struct tcphdr *th = (struct tcphdr *)skb->data;
1920 
1921 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1922 }
1923 EXPORT_SYMBOL(tcp_filter);
1924 
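/* Undo tcp_v4_fill_cb(): move the saved IP control block back to IPCB(skb)
 * so the skb can be fed to another socket lookup and filled in again.
 */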
1925 static void tcp_v4_restore_cb(struct sk_buff *skb)
1926 {
1927 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1928 		sizeof(struct inet_skb_parm));
1929 }
1930 
1931 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1932 			   const struct tcphdr *th)
1933 {
1934 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1935 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1936 	 */
1937 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1938 		sizeof(struct inet_skb_parm));
1939 	barrier();
1940 
1941 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1942 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1943 				    skb->len - th->doff * 4);
1944 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1945 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1946 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1947 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1948 	TCP_SKB_CB(skb)->sacked	 = 0;
1949 	TCP_SKB_CB(skb)->has_rxtstamp =
1950 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1951 }
1952 
1953 /*
1954  *	From tcp_input.c
1955  */
1956 
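/* Main IPv4 TCP receive routine: validate the header and checksum, look up
 * the owning socket, let NEW_SYN_RECV and TIME_WAIT minisocks handle their
 * special cases, then hand the segment to tcp_v4_do_rcv() directly or via
 * the backlog depending on whether the socket is owned by user context.
 */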
1957 int tcp_v4_rcv(struct sk_buff *skb)
1958 {
1959 	struct net *net = dev_net(skb->dev);
1960 	struct sk_buff *skb_to_free;
1961 	int sdif = inet_sdif(skb);
1962 	int dif = inet_iif(skb);
1963 	const struct iphdr *iph;
1964 	const struct tcphdr *th;
1965 	bool refcounted;
1966 	struct sock *sk;
1967 	int ret;
1968 
1969 	if (skb->pkt_type != PACKET_HOST)
1970 		goto discard_it;
1971 
1972 	/* Count it even if it's bad */
1973 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1974 
1975 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1976 		goto discard_it;
1977 
1978 	th = (const struct tcphdr *)skb->data;
1979 
1980 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1981 		goto bad_packet;
1982 	if (!pskb_may_pull(skb, th->doff * 4))
1983 		goto discard_it;
1984 
1985 	/* An explanation is required here, I think.
1986 	 * Packet length and doff are validated by header prediction,
1987 	 * provided the case of th->doff==0 is eliminated.
1988 	 * So, we defer the checks. */
1989 
1990 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1991 		goto csum_error;
1992 
1993 	th = (const struct tcphdr *)skb->data;
1994 	iph = ip_hdr(skb);
1995 lookup:
1996 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1997 			       th->dest, sdif, &refcounted);
1998 	if (!sk)
1999 		goto no_tcp_socket;
2000 
2001 process:
2002 	if (sk->sk_state == TCP_TIME_WAIT)
2003 		goto do_time_wait;
2004 
2005 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2006 		struct request_sock *req = inet_reqsk(sk);
2007 		bool req_stolen = false;
2008 		struct sock *nsk;
2009 
2010 		sk = req->rsk_listener;
2011 		if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
2012 			     tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2013 			sk_drops_add(sk, skb);
2014 			reqsk_put(req);
2015 			goto discard_it;
2016 		}
2017 		if (tcp_checksum_complete(skb)) {
2018 			reqsk_put(req);
2019 			goto csum_error;
2020 		}
2021 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2022 			inet_csk_reqsk_queue_drop_and_put(sk, req);
2023 			goto lookup;
2024 		}
2025 		/* We own a reference on the listener, increase it again
2026 		 * as we might lose it too soon.
2027 		 */
2028 		sock_hold(sk);
2029 		refcounted = true;
2030 		nsk = NULL;
2031 		if (!tcp_filter(sk, skb)) {
2032 			th = (const struct tcphdr *)skb->data;
2033 			iph = ip_hdr(skb);
2034 			tcp_v4_fill_cb(skb, iph, th);
2035 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2036 		}
2037 		if (!nsk) {
2038 			reqsk_put(req);
2039 			if (req_stolen) {
2040 				/* Another cpu got exclusive access to req
2041 				 * and created a full blown socket.
2042 				 * Try to feed this packet to this socket
2043 				 * instead of discarding it.
2044 				 */
2045 				tcp_v4_restore_cb(skb);
2046 				sock_put(sk);
2047 				goto lookup;
2048 			}
2049 			goto discard_and_relse;
2050 		}
2051 		nf_reset_ct(skb);
2052 		if (nsk == sk) {
2053 			reqsk_put(req);
2054 			tcp_v4_restore_cb(skb);
2055 		} else if (tcp_child_process(sk, nsk, skb)) {
2056 			tcp_v4_send_reset(nsk, skb);
2057 			goto discard_and_relse;
2058 		} else {
2059 			sock_put(sk);
2060 			return 0;
2061 		}
2062 	}
2063 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2064 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2065 		goto discard_and_relse;
2066 	}
2067 
2068 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2069 		goto discard_and_relse;
2070 
2071 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2072 		goto discard_and_relse;
2073 
2074 	nf_reset_ct(skb);
2075 
2076 	if (tcp_filter(sk, skb))
2077 		goto discard_and_relse;
2078 	th = (const struct tcphdr *)skb->data;
2079 	iph = ip_hdr(skb);
2080 	tcp_v4_fill_cb(skb, iph, th);
2081 
2082 	skb->dev = NULL;
2083 
2084 	if (sk->sk_state == TCP_LISTEN) {
2085 		ret = tcp_v4_do_rcv(sk, skb);
2086 		goto put_and_return;
2087 	}
2088 
2089 	sk_incoming_cpu_update(sk);
2090 
2091 	bh_lock_sock_nested(sk);
2092 	tcp_segs_in(tcp_sk(sk), skb);
2093 	ret = 0;
2094 	if (!sock_owned_by_user(sk)) {
2095 		skb_to_free = sk->sk_rx_skb_cache;
2096 		sk->sk_rx_skb_cache = NULL;
2097 		ret = tcp_v4_do_rcv(sk, skb);
2098 	} else {
2099 		if (tcp_add_backlog(sk, skb))
2100 			goto discard_and_relse;
2101 		skb_to_free = NULL;
2102 	}
2103 	bh_unlock_sock(sk);
2104 	if (skb_to_free)
2105 		__kfree_skb(skb_to_free);
2106 
2107 put_and_return:
2108 	if (refcounted)
2109 		sock_put(sk);
2110 
2111 	return ret;
2112 
2113 no_tcp_socket:
2114 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2115 		goto discard_it;
2116 
2117 	tcp_v4_fill_cb(skb, iph, th);
2118 
2119 	if (tcp_checksum_complete(skb)) {
2120 csum_error:
2121 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2122 bad_packet:
2123 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2124 	} else {
2125 		tcp_v4_send_reset(NULL, skb);
2126 	}
2127 
2128 discard_it:
2129 	/* Discard frame. */
2130 	kfree_skb(skb);
2131 	return 0;
2132 
2133 discard_and_relse:
2134 	sk_drops_add(sk, skb);
2135 	if (refcounted)
2136 		sock_put(sk);
2137 	goto discard_it;
2138 
2139 do_time_wait:
2140 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2141 		inet_twsk_put(inet_twsk(sk));
2142 		goto discard_it;
2143 	}
2144 
2145 	tcp_v4_fill_cb(skb, iph, th);
2146 
2147 	if (tcp_checksum_complete(skb)) {
2148 		inet_twsk_put(inet_twsk(sk));
2149 		goto csum_error;
2150 	}
2151 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2152 	case TCP_TW_SYN: {
2153 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2154 							&tcp_hashinfo, skb,
2155 							__tcp_hdrlen(th),
2156 							iph->saddr, th->source,
2157 							iph->daddr, th->dest,
2158 							inet_iif(skb),
2159 							sdif);
2160 		if (sk2) {
2161 			inet_twsk_deschedule_put(inet_twsk(sk));
2162 			sk = sk2;
2163 			tcp_v4_restore_cb(skb);
2164 			refcounted = false;
2165 			goto process;
2166 		}
2167 	}
2168 		/* to ACK */
2169 		fallthrough;
2170 	case TCP_TW_ACK:
2171 		tcp_v4_timewait_ack(sk, skb);
2172 		break;
2173 	case TCP_TW_RST:
2174 		tcp_v4_send_reset(sk, skb);
2175 		inet_twsk_deschedule_put(inet_twsk(sk));
2176 		goto discard_it;
2177 	case TCP_TW_SUCCESS:;
2178 	}
2179 	goto discard_it;
2180 }
2181 
2182 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2183 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2184 	.twsk_unique	= tcp_twsk_unique,
2185 	.twsk_destructor= tcp_twsk_destructor,
2186 };
2187 
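/* Cache the input route and incoming ifindex on the socket; the established
 * fast path in tcp_v4_do_rcv() and tcp_v4_early_demux() revalidate this
 * cached dst before reusing it.
 */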
2188 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2189 {
2190 	struct dst_entry *dst = skb_dst(skb);
2191 
2192 	if (dst && dst_hold_safe(dst)) {
2193 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2194 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2195 	}
2196 }
2197 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2198 
2199 const struct inet_connection_sock_af_ops ipv4_specific = {
2200 	.queue_xmit	   = ip_queue_xmit,
2201 	.send_check	   = tcp_v4_send_check,
2202 	.rebuild_header	   = inet_sk_rebuild_header,
2203 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2204 	.conn_request	   = tcp_v4_conn_request,
2205 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2206 	.net_header_len	   = sizeof(struct iphdr),
2207 	.setsockopt	   = ip_setsockopt,
2208 	.getsockopt	   = ip_getsockopt,
2209 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2210 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2211 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2212 };
2213 EXPORT_SYMBOL(ipv4_specific);
2214 
2215 #ifdef CONFIG_TCP_MD5SIG
2216 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2217 	.md5_lookup		= tcp_v4_md5_lookup,
2218 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2219 	.md5_parse		= tcp_v4_parse_md5_keys,
2220 };
2221 #endif
2222 
2223 /* NOTE: A lot of things are set to zero explicitly by the call to
2224  *       sk_alloc(), so they need not be done here.
2225  */
2226 static int tcp_v4_init_sock(struct sock *sk)
2227 {
2228 	struct inet_connection_sock *icsk = inet_csk(sk);
2229 
2230 	tcp_init_sock(sk);
2231 
2232 	icsk->icsk_af_ops = &ipv4_specific;
2233 
2234 #ifdef CONFIG_TCP_MD5SIG
2235 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2236 #endif
2237 
2238 	return 0;
2239 }
2240 
2241 void tcp_v4_destroy_sock(struct sock *sk)
2242 {
2243 	struct tcp_sock *tp = tcp_sk(sk);
2244 
2245 	trace_tcp_destroy_sock(sk);
2246 
2247 	tcp_clear_xmit_timers(sk);
2248 
2249 	tcp_cleanup_congestion_control(sk);
2250 
2251 	tcp_cleanup_ulp(sk);
2252 
2253 	/* Clean up the write buffer. */
2254 	tcp_write_queue_purge(sk);
2255 
2256 	/* Check if we want to disable active TFO */
2257 	tcp_fastopen_active_disable_ofo_check(sk);
2258 
2259 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2260 	skb_rbtree_purge(&tp->out_of_order_queue);
2261 
2262 #ifdef CONFIG_TCP_MD5SIG
2263 	/* Clean up the MD5 key list, if any */
2264 	if (tp->md5sig_info) {
2265 		tcp_clear_md5_list(sk);
2266 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2267 		tp->md5sig_info = NULL;
2268 	}
2269 #endif
2270 
2271 	/* Clean up a referenced TCP bind bucket. */
2272 	if (inet_csk(sk)->icsk_bind_hash)
2273 		inet_put_port(sk);
2274 
2275 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2276 
2277 	/* If socket is aborted during connect operation */
2278 	tcp_free_fastopen_req(tp);
2279 	tcp_fastopen_destroy_cipher(sk);
2280 	tcp_saved_syn_free(tp);
2281 
2282 	sk_sockets_allocated_dec(sk);
2283 }
2284 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2285 
2286 #ifdef CONFIG_PROC_FS
2287 /* Proc filesystem TCP sock list dumping. */
2288 
2289 /*
2290  * Get the next listener socket following cur.  If cur is NULL, get the first
2291  * socket starting from the bucket given in st->bucket; when st->bucket is zero the
2292  * very first socket in the hash table is returned.
2293  */
2294 static void *listening_get_next(struct seq_file *seq, void *cur)
2295 {
2296 	struct tcp_seq_afinfo *afinfo;
2297 	struct tcp_iter_state *st = seq->private;
2298 	struct net *net = seq_file_net(seq);
2299 	struct inet_listen_hashbucket *ilb;
2300 	struct hlist_nulls_node *node;
2301 	struct sock *sk = cur;
2302 
2303 	if (st->bpf_seq_afinfo)
2304 		afinfo = st->bpf_seq_afinfo;
2305 	else
2306 		afinfo = PDE_DATA(file_inode(seq->file));
2307 
2308 	if (!sk) {
2309 get_head:
2310 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2311 		spin_lock(&ilb->lock);
2312 		sk = sk_nulls_head(&ilb->nulls_head);
2313 		st->offset = 0;
2314 		goto get_sk;
2315 	}
2316 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2317 	++st->num;
2318 	++st->offset;
2319 
2320 	sk = sk_nulls_next(sk);
2321 get_sk:
2322 	sk_nulls_for_each_from(sk, node) {
2323 		if (!net_eq(sock_net(sk), net))
2324 			continue;
2325 		if (afinfo->family == AF_UNSPEC ||
2326 		    sk->sk_family == afinfo->family)
2327 			return sk;
2328 	}
2329 	spin_unlock(&ilb->lock);
2330 	st->offset = 0;
2331 	if (++st->bucket < INET_LHTABLE_SIZE)
2332 		goto get_head;
2333 	return NULL;
2334 }
2335 
2336 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2337 {
2338 	struct tcp_iter_state *st = seq->private;
2339 	void *rc;
2340 
2341 	st->bucket = 0;
2342 	st->offset = 0;
2343 	rc = listening_get_next(seq, NULL);
2344 
2345 	while (rc && *pos) {
2346 		rc = listening_get_next(seq, rc);
2347 		--*pos;
2348 	}
2349 	return rc;
2350 }
2351 
2352 static inline bool empty_bucket(const struct tcp_iter_state *st)
2353 {
2354 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2355 }
2356 
2357 /*
2358  * Get the first established socket, starting from the bucket given in st->bucket.
2359  * If st->bucket is zero, the very first socket in the hash is returned.
2360  */
2361 static void *established_get_first(struct seq_file *seq)
2362 {
2363 	struct tcp_seq_afinfo *afinfo;
2364 	struct tcp_iter_state *st = seq->private;
2365 	struct net *net = seq_file_net(seq);
2366 	void *rc = NULL;
2367 
2368 	if (st->bpf_seq_afinfo)
2369 		afinfo = st->bpf_seq_afinfo;
2370 	else
2371 		afinfo = PDE_DATA(file_inode(seq->file));
2372 
2373 	st->offset = 0;
2374 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2375 		struct sock *sk;
2376 		struct hlist_nulls_node *node;
2377 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2378 
2379 		/* Lockless fast path for the common case of empty buckets */
2380 		if (empty_bucket(st))
2381 			continue;
2382 
2383 		spin_lock_bh(lock);
2384 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2385 			if ((afinfo->family != AF_UNSPEC &&
2386 			     sk->sk_family != afinfo->family) ||
2387 			    !net_eq(sock_net(sk), net)) {
2388 				continue;
2389 			}
2390 			rc = sk;
2391 			goto out;
2392 		}
2393 		spin_unlock_bh(lock);
2394 	}
2395 out:
2396 	return rc;
2397 }
2398 
2399 static void *established_get_next(struct seq_file *seq, void *cur)
2400 {
2401 	struct tcp_seq_afinfo *afinfo;
2402 	struct sock *sk = cur;
2403 	struct hlist_nulls_node *node;
2404 	struct tcp_iter_state *st = seq->private;
2405 	struct net *net = seq_file_net(seq);
2406 
2407 	if (st->bpf_seq_afinfo)
2408 		afinfo = st->bpf_seq_afinfo;
2409 	else
2410 		afinfo = PDE_DATA(file_inode(seq->file));
2411 
2412 	++st->num;
2413 	++st->offset;
2414 
2415 	sk = sk_nulls_next(sk);
2416 
2417 	sk_nulls_for_each_from(sk, node) {
2418 		if ((afinfo->family == AF_UNSPEC ||
2419 		     sk->sk_family == afinfo->family) &&
2420 		    net_eq(sock_net(sk), net))
2421 			return sk;
2422 	}
2423 
2424 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2425 	++st->bucket;
2426 	return established_get_first(seq);
2427 }
2428 
2429 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2430 {
2431 	struct tcp_iter_state *st = seq->private;
2432 	void *rc;
2433 
2434 	st->bucket = 0;
2435 	rc = established_get_first(seq);
2436 
2437 	while (rc && pos) {
2438 		rc = established_get_next(seq, rc);
2439 		--pos;
2440 	}
2441 	return rc;
2442 }
2443 
2444 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2445 {
2446 	void *rc;
2447 	struct tcp_iter_state *st = seq->private;
2448 
2449 	st->state = TCP_SEQ_STATE_LISTENING;
2450 	rc	  = listening_get_idx(seq, &pos);
2451 
2452 	if (!rc) {
2453 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2454 		rc	  = established_get_idx(seq, pos);
2455 	}
2456 
2457 	return rc;
2458 }
2459 
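/* Resume iteration at the bucket/offset recorded by the previous read so
 * that sequential reads of /proc/net/tcp do not rescan the tables from the
 * beginning every time.
 */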
2460 static void *tcp_seek_last_pos(struct seq_file *seq)
2461 {
2462 	struct tcp_iter_state *st = seq->private;
2463 	int bucket = st->bucket;
2464 	int offset = st->offset;
2465 	int orig_num = st->num;
2466 	void *rc = NULL;
2467 
2468 	switch (st->state) {
2469 	case TCP_SEQ_STATE_LISTENING:
2470 		if (st->bucket >= INET_LHTABLE_SIZE)
2471 			break;
2472 		st->state = TCP_SEQ_STATE_LISTENING;
2473 		rc = listening_get_next(seq, NULL);
2474 		while (offset-- && rc && bucket == st->bucket)
2475 			rc = listening_get_next(seq, rc);
2476 		if (rc)
2477 			break;
2478 		st->bucket = 0;
2479 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2480 		fallthrough;
2481 	case TCP_SEQ_STATE_ESTABLISHED:
2482 		if (st->bucket > tcp_hashinfo.ehash_mask)
2483 			break;
2484 		rc = established_get_first(seq);
2485 		while (offset-- && rc && bucket == st->bucket)
2486 			rc = established_get_next(seq, rc);
2487 	}
2488 
2489 	st->num = orig_num;
2490 
2491 	return rc;
2492 }
2493 
2494 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2495 {
2496 	struct tcp_iter_state *st = seq->private;
2497 	void *rc;
2498 
2499 	if (*pos && *pos == st->last_pos) {
2500 		rc = tcp_seek_last_pos(seq);
2501 		if (rc)
2502 			goto out;
2503 	}
2504 
2505 	st->state = TCP_SEQ_STATE_LISTENING;
2506 	st->num = 0;
2507 	st->bucket = 0;
2508 	st->offset = 0;
2509 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2510 
2511 out:
2512 	st->last_pos = *pos;
2513 	return rc;
2514 }
2515 EXPORT_SYMBOL(tcp_seq_start);
2516 
2517 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2518 {
2519 	struct tcp_iter_state *st = seq->private;
2520 	void *rc = NULL;
2521 
2522 	if (v == SEQ_START_TOKEN) {
2523 		rc = tcp_get_idx(seq, 0);
2524 		goto out;
2525 	}
2526 
2527 	switch (st->state) {
2528 	case TCP_SEQ_STATE_LISTENING:
2529 		rc = listening_get_next(seq, v);
2530 		if (!rc) {
2531 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2532 			st->bucket = 0;
2533 			st->offset = 0;
2534 			rc	  = established_get_first(seq);
2535 		}
2536 		break;
2537 	case TCP_SEQ_STATE_ESTABLISHED:
2538 		rc = established_get_next(seq, v);
2539 		break;
2540 	}
2541 out:
2542 	++*pos;
2543 	st->last_pos = *pos;
2544 	return rc;
2545 }
2546 EXPORT_SYMBOL(tcp_seq_next);
2547 
2548 void tcp_seq_stop(struct seq_file *seq, void *v)
2549 {
2550 	struct tcp_iter_state *st = seq->private;
2551 
2552 	switch (st->state) {
2553 	case TCP_SEQ_STATE_LISTENING:
2554 		if (v != SEQ_START_TOKEN)
2555 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2556 		break;
2557 	case TCP_SEQ_STATE_ESTABLISHED:
2558 		if (v)
2559 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2560 		break;
2561 	}
2562 }
2563 EXPORT_SYMBOL(tcp_seq_stop);
2564 
2565 static void get_openreq4(const struct request_sock *req,
2566 			 struct seq_file *f, int i)
2567 {
2568 	const struct inet_request_sock *ireq = inet_rsk(req);
2569 	long delta = req->rsk_timer.expires - jiffies;
2570 
2571 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2572 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2573 		i,
2574 		ireq->ir_loc_addr,
2575 		ireq->ir_num,
2576 		ireq->ir_rmt_addr,
2577 		ntohs(ireq->ir_rmt_port),
2578 		TCP_SYN_RECV,
2579 		0, 0, /* could print option size, but that is af dependent. */
2580 		1,    /* timers active (only the expire timer) */
2581 		jiffies_delta_to_clock_t(delta),
2582 		req->num_timeout,
2583 		from_kuid_munged(seq_user_ns(f),
2584 				 sock_i_uid(req->rsk_listener)),
2585 		0,  /* non standard timer */
2586 		0, /* open_requests have no inode */
2587 		0,
2588 		req);
2589 }
2590 
2591 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2592 {
2593 	int timer_active;
2594 	unsigned long timer_expires;
2595 	const struct tcp_sock *tp = tcp_sk(sk);
2596 	const struct inet_connection_sock *icsk = inet_csk(sk);
2597 	const struct inet_sock *inet = inet_sk(sk);
2598 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2599 	__be32 dest = inet->inet_daddr;
2600 	__be32 src = inet->inet_rcv_saddr;
2601 	__u16 destp = ntohs(inet->inet_dport);
2602 	__u16 srcp = ntohs(inet->inet_sport);
2603 	int rx_queue;
2604 	int state;
2605 
2606 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2607 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2608 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2609 		timer_active	= 1;
2610 		timer_expires	= icsk->icsk_timeout;
2611 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2612 		timer_active	= 4;
2613 		timer_expires	= icsk->icsk_timeout;
2614 	} else if (timer_pending(&sk->sk_timer)) {
2615 		timer_active	= 2;
2616 		timer_expires	= sk->sk_timer.expires;
2617 	} else {
2618 		timer_active	= 0;
2619 		timer_expires = jiffies;
2620 	}
2621 
2622 	state = inet_sk_state_load(sk);
2623 	if (state == TCP_LISTEN)
2624 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2625 	else
2626 		/* Because we don't lock the socket,
2627 		 * we might find a transient negative value.
2628 		 */
2629 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2630 				      READ_ONCE(tp->copied_seq), 0);
2631 
2632 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2633 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2634 		i, src, srcp, dest, destp, state,
2635 		READ_ONCE(tp->write_seq) - tp->snd_una,
2636 		rx_queue,
2637 		timer_active,
2638 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2639 		icsk->icsk_retransmits,
2640 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2641 		icsk->icsk_probes_out,
2642 		sock_i_ino(sk),
2643 		refcount_read(&sk->sk_refcnt), sk,
2644 		jiffies_to_clock_t(icsk->icsk_rto),
2645 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2646 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2647 		tp->snd_cwnd,
2648 		state == TCP_LISTEN ?
2649 		    fastopenq->max_qlen :
2650 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2651 }
2652 
2653 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2654 			       struct seq_file *f, int i)
2655 {
2656 	long delta = tw->tw_timer.expires - jiffies;
2657 	__be32 dest, src;
2658 	__u16 destp, srcp;
2659 
2660 	dest  = tw->tw_daddr;
2661 	src   = tw->tw_rcv_saddr;
2662 	destp = ntohs(tw->tw_dport);
2663 	srcp  = ntohs(tw->tw_sport);
2664 
2665 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2666 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2667 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2668 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2669 		refcount_read(&tw->tw_refcnt), tw);
2670 }
2671 
2672 #define TMPSZ 150
2673 
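/* Emit one line of /proc/net/tcp.  Addresses and ports are printed in
 * hexadecimal and the state as a numeric TCP state.  An illustrative
 * (hand-written, not captured) listener entry looks roughly like:
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 ...
 */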
2674 static int tcp4_seq_show(struct seq_file *seq, void *v)
2675 {
2676 	struct tcp_iter_state *st;
2677 	struct sock *sk = v;
2678 
2679 	seq_setwidth(seq, TMPSZ - 1);
2680 	if (v == SEQ_START_TOKEN) {
2681 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2682 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2683 			   "inode");
2684 		goto out;
2685 	}
2686 	st = seq->private;
2687 
2688 	if (sk->sk_state == TCP_TIME_WAIT)
2689 		get_timewait4_sock(v, seq, st->num);
2690 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2691 		get_openreq4(v, seq, st->num);
2692 	else
2693 		get_tcp4_sock(v, seq, st->num);
2694 out:
2695 	seq_pad(seq, '\n');
2696 	return 0;
2697 }
2698 
2699 #ifdef CONFIG_BPF_SYSCALL
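/* Context handed to BPF programs attached to the "tcp" bpf_iter target;
 * sk_common may point to a full socket, a request socket or a timewait
 * socket, so programs must check the state before casting.
 */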
2700 struct bpf_iter__tcp {
2701 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2702 	__bpf_md_ptr(struct sock_common *, sk_common);
2703 	uid_t uid __aligned(8);
2704 };
2705 
2706 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2707 			     struct sock_common *sk_common, uid_t uid)
2708 {
2709 	struct bpf_iter__tcp ctx;
2710 
2711 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2712 	ctx.meta = meta;
2713 	ctx.sk_common = sk_common;
2714 	ctx.uid = uid;
2715 	return bpf_iter_run_prog(prog, &ctx);
2716 }
2717 
2718 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2719 {
2720 	struct bpf_iter_meta meta;
2721 	struct bpf_prog *prog;
2722 	struct sock *sk = v;
2723 	uid_t uid;
2724 
2725 	if (v == SEQ_START_TOKEN)
2726 		return 0;
2727 
2728 	if (sk->sk_state == TCP_TIME_WAIT) {
2729 		uid = 0;
2730 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2731 		const struct request_sock *req = v;
2732 
2733 		uid = from_kuid_munged(seq_user_ns(seq),
2734 				       sock_i_uid(req->rsk_listener));
2735 	} else {
2736 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2737 	}
2738 
2739 	meta.seq = seq;
2740 	prog = bpf_iter_get_info(&meta, false);
2741 	return tcp_prog_seq_show(prog, &meta, v, uid);
2742 }
2743 
2744 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2745 {
2746 	struct bpf_iter_meta meta;
2747 	struct bpf_prog *prog;
2748 
2749 	if (!v) {
2750 		meta.seq = seq;
2751 		prog = bpf_iter_get_info(&meta, true);
2752 		if (prog)
2753 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2754 	}
2755 
2756 	tcp_seq_stop(seq, v);
2757 }
2758 
2759 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2760 	.show		= bpf_iter_tcp_seq_show,
2761 	.start		= tcp_seq_start,
2762 	.next		= tcp_seq_next,
2763 	.stop		= bpf_iter_tcp_seq_stop,
2764 };
2765 #endif
2766 
2767 static const struct seq_operations tcp4_seq_ops = {
2768 	.show		= tcp4_seq_show,
2769 	.start		= tcp_seq_start,
2770 	.next		= tcp_seq_next,
2771 	.stop		= tcp_seq_stop,
2772 };
2773 
2774 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2775 	.family		= AF_INET,
2776 };
2777 
2778 static int __net_init tcp4_proc_init_net(struct net *net)
2779 {
2780 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2781 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2782 		return -ENOMEM;
2783 	return 0;
2784 }
2785 
2786 static void __net_exit tcp4_proc_exit_net(struct net *net)
2787 {
2788 	remove_proc_entry("tcp", net->proc_net);
2789 }
2790 
2791 static struct pernet_operations tcp4_net_ops = {
2792 	.init = tcp4_proc_init_net,
2793 	.exit = tcp4_proc_exit_net,
2794 };
2795 
2796 int __init tcp4_proc_init(void)
2797 {
2798 	return register_pernet_subsys(&tcp4_net_ops);
2799 }
2800 
2801 void tcp4_proc_exit(void)
2802 {
2803 	unregister_pernet_subsys(&tcp4_net_ops);
2804 }
2805 #endif /* CONFIG_PROC_FS */
2806 
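/* Protocol operations that plug TCP into the generic AF_INET socket layer. */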
2807 struct proto tcp_prot = {
2808 	.name			= "TCP",
2809 	.owner			= THIS_MODULE,
2810 	.close			= tcp_close,
2811 	.pre_connect		= tcp_v4_pre_connect,
2812 	.connect		= tcp_v4_connect,
2813 	.disconnect		= tcp_disconnect,
2814 	.accept			= inet_csk_accept,
2815 	.ioctl			= tcp_ioctl,
2816 	.init			= tcp_v4_init_sock,
2817 	.destroy		= tcp_v4_destroy_sock,
2818 	.shutdown		= tcp_shutdown,
2819 	.setsockopt		= tcp_setsockopt,
2820 	.getsockopt		= tcp_getsockopt,
2821 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
2822 	.keepalive		= tcp_set_keepalive,
2823 	.recvmsg		= tcp_recvmsg,
2824 	.sendmsg		= tcp_sendmsg,
2825 	.sendpage		= tcp_sendpage,
2826 	.backlog_rcv		= tcp_v4_do_rcv,
2827 	.release_cb		= tcp_release_cb,
2828 	.hash			= inet_hash,
2829 	.unhash			= inet_unhash,
2830 	.get_port		= inet_csk_get_port,
2831 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2832 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2833 	.stream_memory_free	= tcp_stream_memory_free,
2834 	.sockets_allocated	= &tcp_sockets_allocated,
2835 	.orphan_count		= &tcp_orphan_count,
2836 	.memory_allocated	= &tcp_memory_allocated,
2837 	.memory_pressure	= &tcp_memory_pressure,
2838 	.sysctl_mem		= sysctl_tcp_mem,
2839 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2840 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2841 	.max_header		= MAX_TCP_HEADER,
2842 	.obj_size		= sizeof(struct tcp_sock),
2843 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2844 	.twsk_prot		= &tcp_timewait_sock_ops,
2845 	.rsk_prot		= &tcp_request_sock_ops,
2846 	.h.hashinfo		= &tcp_hashinfo,
2847 	.no_autobind		= true,
2848 	.diag_destroy		= tcp_abort,
2849 };
2850 EXPORT_SYMBOL(tcp_prot);
2851 
2852 static void __net_exit tcp_sk_exit(struct net *net)
2853 {
2854 	if (net->ipv4.tcp_congestion_control)
2855 		bpf_module_put(net->ipv4.tcp_congestion_control,
2856 			       net->ipv4.tcp_congestion_control->owner);
2857 }
2858 
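/* Per-network-namespace initialisation: set the default values for every
 * tcp_* sysctl.  Child namespaces inherit the rmem/wmem limits and the
 * congestion control module currently selected in init_net.
 */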
2859 static int __net_init tcp_sk_init(struct net *net)
2860 {
2861 	int cnt;
2862 
2863 	net->ipv4.sysctl_tcp_ecn = 2;
2864 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2865 
2866 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2867 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2868 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2869 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2870 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2871 
2872 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2873 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2874 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2875 
2876 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2877 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2878 	net->ipv4.sysctl_tcp_syncookies = 1;
2879 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2880 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2881 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2882 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2883 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2884 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2885 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2886 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2887 
2888 	cnt = tcp_hashinfo.ehash_mask + 1;
2889 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2890 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2891 
2892 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2893 	net->ipv4.sysctl_tcp_sack = 1;
2894 	net->ipv4.sysctl_tcp_window_scaling = 1;
2895 	net->ipv4.sysctl_tcp_timestamps = 1;
2896 	net->ipv4.sysctl_tcp_early_retrans = 3;
2897 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2898 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2899 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2900 	net->ipv4.sysctl_tcp_max_reordering = 300;
2901 	net->ipv4.sysctl_tcp_dsack = 1;
2902 	net->ipv4.sysctl_tcp_app_win = 31;
2903 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2904 	net->ipv4.sysctl_tcp_frto = 2;
2905 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2906 	/* This limits the percentage of the congestion window which we
2907 	 * will allow a single TSO frame to consume.  Building TSO frames
2908 	 * which are too large can cause TCP streams to be bursty.
2909 	 */
2910 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2911 	/* Default TSQ limit of 16 TSO segments */
2912 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2913 	/* rfc5961 challenge ack rate limiting */
2914 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2915 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2916 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2917 	net->ipv4.sysctl_tcp_autocorking = 1;
2918 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2919 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2920 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2921 	if (net != &init_net) {
2922 		memcpy(net->ipv4.sysctl_tcp_rmem,
2923 		       init_net.ipv4.sysctl_tcp_rmem,
2924 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2925 		memcpy(net->ipv4.sysctl_tcp_wmem,
2926 		       init_net.ipv4.sysctl_tcp_wmem,
2927 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2928 	}
2929 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2930 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2931 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2932 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2933 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2934 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
2935 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2936 
2937 	/* Reno is always built in */
2938 	if (!net_eq(net, &init_net) &&
2939 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2940 			       init_net.ipv4.tcp_congestion_control->owner))
2941 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2942 	else
2943 		net->ipv4.tcp_congestion_control = &tcp_reno;
2944 
2945 	return 0;
2946 }
2947 
2948 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2949 {
2950 	struct net *net;
2951 
2952 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2953 
2954 	list_for_each_entry(net, net_exit_list, exit_list)
2955 		tcp_fastopen_ctx_destroy(net);
2956 }
2957 
2958 static struct pernet_operations __net_initdata tcp_sk_ops = {
2959        .init	   = tcp_sk_init,
2960        .exit	   = tcp_sk_exit,
2961        .exit_batch = tcp_sk_exit_batch,
2962 };
2963 
2964 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2965 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2966 		     struct sock_common *sk_common, uid_t uid)
2967 
2968 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2969 {
2970 	struct tcp_iter_state *st = priv_data;
2971 	struct tcp_seq_afinfo *afinfo;
2972 	int ret;
2973 
2974 	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2975 	if (!afinfo)
2976 		return -ENOMEM;
2977 
2978 	afinfo->family = AF_UNSPEC;
2979 	st->bpf_seq_afinfo = afinfo;
2980 	ret = bpf_iter_init_seq_net(priv_data, aux);
2981 	if (ret)
2982 		kfree(afinfo);
2983 	return ret;
2984 }
2985 
2986 static void bpf_iter_fini_tcp(void *priv_data)
2987 {
2988 	struct tcp_iter_state *st = priv_data;
2989 
2990 	kfree(st->bpf_seq_afinfo);
2991 	bpf_iter_fini_seq_net(priv_data);
2992 }
2993 
2994 static const struct bpf_iter_seq_info tcp_seq_info = {
2995 	.seq_ops		= &bpf_iter_tcp_seq_ops,
2996 	.init_seq_private	= bpf_iter_init_tcp,
2997 	.fini_seq_private	= bpf_iter_fini_tcp,
2998 	.seq_priv_size		= sizeof(struct tcp_iter_state),
2999 };
3000 
3001 static struct bpf_iter_reg tcp_reg_info = {
3002 	.target			= "tcp",
3003 	.ctx_arg_info_size	= 1,
3004 	.ctx_arg_info		= {
3005 		{ offsetof(struct bpf_iter__tcp, sk_common),
3006 		  PTR_TO_BTF_ID_OR_NULL },
3007 	},
3008 	.seq_info		= &tcp_seq_info,
3009 };
3010 
3011 static void __init bpf_iter_register(void)
3012 {
3013 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3014 	if (bpf_iter_reg_target(&tcp_reg_info))
3015 		pr_warn("Warning: could not register bpf iterator tcp\n");
3016 }
3017 
3018 #endif
3019 
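/* Boot-time initialisation: create one raw control socket per possible CPU
 * (used when sending RSTs and ACKs that are not associated with a full
 * socket) and register the per-netns operations.
 */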
3020 void __init tcp_v4_init(void)
3021 {
3022 	int cpu, res;
3023 
3024 	for_each_possible_cpu(cpu) {
3025 		struct sock *sk;
3026 
3027 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3028 					   IPPROTO_TCP, &init_net);
3029 		if (res)
3030 			panic("Failed to create the TCP control socket.\n");
3031 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3032 
3033 		/* Please enforce IP_DF and IPID==0 for RST and
3034 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3035 		 */
3036 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3037 
3038 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3039 	}
3040 	if (register_pernet_subsys(&tcp_sk_ops))
3041 		panic("Failed to create the TCP control socket.\n");
3042 
3043 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3044 	bpf_iter_register();
3045 #endif
3046 }
3047