1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93 
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95 
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98 	return secure_tcp_seq(ip_hdr(skb)->daddr,
99 			      ip_hdr(skb)->saddr,
100 			      tcp_hdr(skb)->dest,
101 			      tcp_hdr(skb)->source);
102 }
103 
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108 
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 	struct tcp_sock *tp = tcp_sk(sk);
115 
116 	if (reuse == 2) {
117 		/* Still does not detect *everything* that goes through
118 		 * lo, since we require a loopback src or dst address
119 		 * or direct binding to 'lo' interface.
120 		 */
121 		bool loopback = false;
122 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 			loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125 		if (tw->tw_family == AF_INET6) {
126 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130 				loopback = true;
131 		} else
132 #endif
133 		{
134 			if (ipv4_is_loopback(tw->tw_daddr) ||
135 			    ipv4_is_loopback(tw->tw_rcv_saddr))
136 				loopback = true;
137 		}
138 		if (!loopback)
139 			reuse = 0;
140 	}
141 
142 	/* With PAWS, it is safe from the viewpoint
143 	   of data integrity. Even without PAWS it is safe provided sequence
144 	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
145 
146 	   Actually, the idea is close to VJ's one, only timestamp cache is
147 	   held not per host, but per port pair and TW bucket is used as state
148 	   holder.
149 
150 	   If TW bucket has been already destroyed we fall back to VJ's scheme
151 	   and use initial timestamp retrieved from peer table.
152 	 */
153 	if (tcptw->tw_ts_recent_stamp &&
154 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
155 					    tcptw->tw_ts_recent_stamp)))) {
156 		/* In case of repair and re-using TIME-WAIT sockets we still
157 		 * want to be sure that it is safe as above but honor the
158 		 * sequence numbers and time stamps set as part of the repair
159 		 * process.
160 		 *
161 		 * Without this check re-using a TIME-WAIT socket with TCP
162 		 * repair would accumulate a -1 on the repair assigned
163 		 * sequence number. The first time it is reused the sequence
164 		 * is -1, the second time -2, etc. This fixes that issue
165 		 * without appearing to create any others.
166 		 */
167 		if (likely(!tp->repair)) {
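			/* Editor's note (assumption): starting write_seq a full
			 * 64K window (plus 2) past the old connection's
			 * tw_snd_nxt presumably keeps the reused 4-tuple's
			 * sequence space clear of anything the previous
			 * incarnation could still have in flight.
			 */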
168 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169 
170 			if (!seq)
171 				seq = 1;
172 			WRITE_ONCE(tp->write_seq, seq);
173 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
174 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175 		}
176 		sock_hold(sktw);
177 		return 1;
178 	}
179 
180 	return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183 
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 			      int addr_len)
186 {
187 	/* This check is replicated from tcp_v4_connect() and intended to
188 	 * prevent BPF program called below from accessing bytes that are out
189 	 * of the bound specified by user in addr_len.
190 	 */
191 	if (addr_len < sizeof(struct sockaddr_in))
192 		return -EINVAL;
193 
194 	sock_owned_by_me(sk);
195 
196 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198 
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 	struct inet_sock *inet = inet_sk(sk);
204 	struct tcp_sock *tp = tcp_sk(sk);
205 	__be16 orig_sport, orig_dport;
206 	__be32 daddr, nexthop;
207 	struct flowi4 *fl4;
208 	struct rtable *rt;
209 	int err;
210 	struct ip_options_rcu *inet_opt;
211 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
212 
213 	if (addr_len < sizeof(struct sockaddr_in))
214 		return -EINVAL;
215 
216 	if (usin->sin_family != AF_INET)
217 		return -EAFNOSUPPORT;
218 
219 	nexthop = daddr = usin->sin_addr.s_addr;
220 	inet_opt = rcu_dereference_protected(inet->inet_opt,
221 					     lockdep_sock_is_held(sk));
222 	if (inet_opt && inet_opt->opt.srr) {
223 		if (!daddr)
224 			return -EINVAL;
225 		nexthop = inet_opt->opt.faddr;
226 	}
227 
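	/* If IP source routing (SRR) is in use, nexthop now points at the
	 * first hop from the IP options while daddr stays the final
	 * destination; the route below is resolved towards nexthop.
	 */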
228 	orig_sport = inet->inet_sport;
229 	orig_dport = usin->sin_port;
230 	fl4 = &inet->cork.fl.u.ip4;
231 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
233 			      IPPROTO_TCP,
234 			      orig_sport, orig_dport, sk);
235 	if (IS_ERR(rt)) {
236 		err = PTR_ERR(rt);
237 		if (err == -ENETUNREACH)
238 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
239 		return err;
240 	}
241 
242 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
243 		ip_rt_put(rt);
244 		return -ENETUNREACH;
245 	}
246 
247 	if (!inet_opt || !inet_opt->opt.srr)
248 		daddr = fl4->daddr;
249 
250 	if (!inet->inet_saddr)
251 		inet->inet_saddr = fl4->saddr;
252 	sk_rcv_saddr_set(sk, inet->inet_saddr);
253 
254 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
255 		/* Reset inherited state */
256 		tp->rx_opt.ts_recent	   = 0;
257 		tp->rx_opt.ts_recent_stamp = 0;
258 		if (likely(!tp->repair))
259 			WRITE_ONCE(tp->write_seq, 0);
260 	}
261 
262 	inet->inet_dport = usin->sin_port;
263 	sk_daddr_set(sk, daddr);
264 
265 	inet_csk(sk)->icsk_ext_hdr_len = 0;
266 	if (inet_opt)
267 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
268 
269 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
270 
271 	/* Socket identity is still unknown (sport may be zero).
272 	 * However, we set the state to SYN-SENT and, without releasing the
273 	 * socket lock, select a source port, enter ourselves into the hash
274 	 * tables and complete initialization afterwards.
275 	 */
276 	tcp_set_state(sk, TCP_SYN_SENT);
277 	err = inet_hash_connect(tcp_death_row, sk);
278 	if (err)
279 		goto failure;
280 
281 	sk_set_txhash(sk);
282 
283 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
284 			       inet->inet_sport, inet->inet_dport, sk);
285 	if (IS_ERR(rt)) {
286 		err = PTR_ERR(rt);
287 		rt = NULL;
288 		goto failure;
289 	}
290 	/* OK, now commit destination to socket.  */
291 	sk->sk_gso_type = SKB_GSO_TCPV4;
292 	sk_setup_caps(sk, &rt->dst);
293 	rt = NULL;
294 
295 	if (likely(!tp->repair)) {
296 		if (!tp->write_seq)
297 			WRITE_ONCE(tp->write_seq,
298 				   secure_tcp_seq(inet->inet_saddr,
299 						  inet->inet_daddr,
300 						  inet->inet_sport,
301 						  usin->sin_port));
302 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
303 						 inet->inet_saddr,
304 						 inet->inet_daddr);
305 	}
306 
307 	inet->inet_id = prandom_u32();
308 
309 	if (tcp_fastopen_defer_connect(sk, &err))
310 		return err;
311 	if (err)
312 		goto failure;
313 
314 	err = tcp_connect(sk);
315 
316 	if (err)
317 		goto failure;
318 
319 	return 0;
320 
321 failure:
322 	/*
323 	 * This unhashes the socket and releases the local port,
324 	 * if necessary.
325 	 */
326 	tcp_set_state(sk, TCP_CLOSE);
327 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
328 		inet_reset_saddr(sk);
329 	ip_rt_put(rt);
330 	sk->sk_route_caps = 0;
331 	inet->inet_dport = 0;
332 	return err;
333 }
334 EXPORT_SYMBOL(tcp_v4_connect);
335 
336 /*
337  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
338  * It can be called through tcp_release_cb() if socket was owned by user
339  * at the time tcp_v4_err() was called to handle ICMP message.
340  */
341 void tcp_v4_mtu_reduced(struct sock *sk)
342 {
343 	struct inet_sock *inet = inet_sk(sk);
344 	struct dst_entry *dst;
345 	u32 mtu;
346 
347 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
348 		return;
349 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
350 	dst = inet_csk_update_pmtu(sk, mtu);
351 	if (!dst)
352 		return;
353 
354 	/* Something is about to go wrong... Remember the soft error
355 	 * in case this connection is not able to recover.
356 	 */
357 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
358 		sk->sk_err_soft = EMSGSIZE;
359 
360 	mtu = dst_mtu(dst);
361 
362 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
363 	    ip_sk_accept_pmtu(sk) &&
364 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
365 		tcp_sync_mss(sk, mtu);
366 
367 		/* Resend the TCP packet because it's
368 		 * clear that the old packet has been
369 		 * dropped. This is the new "fast" path mtu
370 		 * discovery.
371 		 */
372 		tcp_simple_retransmit(sk);
373 	} /* else let the usual retransmit timer handle it */
374 }
375 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
376 
377 static void do_redirect(struct sk_buff *skb, struct sock *sk)
378 {
379 	struct dst_entry *dst = __sk_dst_check(sk, 0);
380 
381 	if (dst)
382 		dst->ops->redirect(dst, sk, skb);
383 }
384 
385 
386 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
387 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
388 {
389 	struct request_sock *req = inet_reqsk(sk);
390 	struct net *net = sock_net(sk);
391 
392 	/* ICMPs are not backlogged, hence we cannot get
393 	 * an established socket here.
394 	 */
395 	if (seq != tcp_rsk(req)->snt_isn) {
396 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
397 	} else if (abort) {
398 		/*
399 		 * Still in SYN_RECV, just remove it silently.
400 		 * There is no good way to pass the error to the newly
401 		 * created socket, and POSIX does not want network
402 		 * errors returned from accept().
403 		 */
404 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
405 		tcp_listendrop(req->rsk_listener);
406 	}
407 	reqsk_put(req);
408 }
409 EXPORT_SYMBOL(tcp_req_err);
410 
411 /* TCP-LD (RFC 6069) logic */
412 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
413 {
414 	struct inet_connection_sock *icsk = inet_csk(sk);
415 	struct tcp_sock *tp = tcp_sk(sk);
416 	struct sk_buff *skb;
417 	s32 remaining;
418 	u32 delta_us;
419 
420 	if (sock_owned_by_user(sk))
421 		return;
422 
423 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
424 	    !icsk->icsk_backoff)
425 		return;
426 
427 	skb = tcp_rtx_queue_head(sk);
428 	if (WARN_ON_ONCE(!skb))
429 		return;
430 
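	/* RFC 6069: undo one exponential backoff step, recompute the RTO from
	 * the current SRTT, then re-arm (or fire) the retransmit timer with
	 * whatever time is left of that RTO.
	 */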
431 	icsk->icsk_backoff--;
432 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
433 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
434 
435 	tcp_mstamp_refresh(tp);
436 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
437 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
438 
439 	if (remaining > 0) {
440 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
441 					  remaining, TCP_RTO_MAX);
442 	} else {
443 		/* RTO revert clocked out retransmission.
444 		 * Will retransmit now.
445 		 */
446 		tcp_retransmit_timer(sk);
447 	}
448 }
449 EXPORT_SYMBOL(tcp_ld_RTO_revert);
450 
451 /*
452  * This routine is called by the ICMP module when it gets some
453  * sort of error condition.  If err < 0 then the socket should
454  * be closed and the error returned to the user.  If err > 0
455  * it's just the icmp type << 8 | icmp code.  After adjustment
456  * header points to the first 8 bytes of the tcp header.  We need
457  * to find the appropriate port.
458  *
459  * The locking strategy used here is very "optimistic". When
460  * someone else accesses the socket the ICMP is just dropped
461  * and for some paths there is no check at all.
462  * A more general error queue to queue errors for later handling
463  * is probably better.
464  *
465  */
466 
467 int tcp_v4_err(struct sk_buff *skb, u32 info)
468 {
469 	const struct iphdr *iph = (const struct iphdr *)skb->data;
470 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
471 	struct tcp_sock *tp;
472 	struct inet_sock *inet;
473 	const int type = icmp_hdr(skb)->type;
474 	const int code = icmp_hdr(skb)->code;
475 	struct sock *sk;
476 	struct request_sock *fastopen;
477 	u32 seq, snd_una;
478 	int err;
479 	struct net *net = dev_net(skb->dev);
480 
481 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
482 				       th->dest, iph->saddr, ntohs(th->source),
483 				       inet_iif(skb), 0);
484 	if (!sk) {
485 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
486 		return -ENOENT;
487 	}
488 	if (sk->sk_state == TCP_TIME_WAIT) {
489 		inet_twsk_put(inet_twsk(sk));
490 		return 0;
491 	}
492 	seq = ntohl(th->seq);
493 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
494 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
495 				     type == ICMP_TIME_EXCEEDED ||
496 				     (type == ICMP_DEST_UNREACH &&
497 				      (code == ICMP_NET_UNREACH ||
498 				       code == ICMP_HOST_UNREACH)));
499 		return 0;
500 	}
501 
502 	bh_lock_sock(sk);
503 	/* If too many ICMPs get dropped on busy
504 	 * servers this needs to be solved differently.
505 	 * We do take care of the PMTU discovery (RFC1191) special case:
506 	 * we can receive locally generated ICMP messages while the socket is held.
507 	 */
508 	if (sock_owned_by_user(sk)) {
509 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
510 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
511 	}
512 	if (sk->sk_state == TCP_CLOSE)
513 		goto out;
514 
515 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
516 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
517 		goto out;
518 	}
519 
520 	tp = tcp_sk(sk);
521 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
522 	fastopen = rcu_dereference(tp->fastopen_rsk);
523 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
524 	if (sk->sk_state != TCP_LISTEN &&
525 	    !between(seq, snd_una, tp->snd_nxt)) {
526 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
527 		goto out;
528 	}
529 
530 	switch (type) {
531 	case ICMP_REDIRECT:
532 		if (!sock_owned_by_user(sk))
533 			do_redirect(skb, sk);
534 		goto out;
535 	case ICMP_SOURCE_QUENCH:
536 		/* Just silently ignore these. */
537 		goto out;
538 	case ICMP_PARAMETERPROB:
539 		err = EPROTO;
540 		break;
541 	case ICMP_DEST_UNREACH:
542 		if (code > NR_ICMP_UNREACH)
543 			goto out;
544 
545 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
546 			/* We are not interested in TCP_LISTEN and open_requests
547 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
548 			 * they should go through unfragmented).
549 			 */
550 			if (sk->sk_state == TCP_LISTEN)
551 				goto out;
552 
553 			WRITE_ONCE(tp->mtu_info, info);
554 			if (!sock_owned_by_user(sk)) {
555 				tcp_v4_mtu_reduced(sk);
556 			} else {
557 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
558 					sock_hold(sk);
559 			}
560 			goto out;
561 		}
562 
563 		err = icmp_err_convert[code].errno;
564 		/* check if this ICMP message allows revert of backoff.
565 		 * (see RFC 6069)
566 		 */
567 		if (!fastopen &&
568 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
569 			tcp_ld_RTO_revert(sk, seq);
570 		break;
571 	case ICMP_TIME_EXCEEDED:
572 		err = EHOSTUNREACH;
573 		break;
574 	default:
575 		goto out;
576 	}
577 
578 	switch (sk->sk_state) {
579 	case TCP_SYN_SENT:
580 	case TCP_SYN_RECV:
581 		/* Only in fast or simultaneous open. If a fast open socket is
582 		 * already accepted it is treated as a connected one below.
583 		 */
584 		if (fastopen && !fastopen->sk)
585 			break;
586 
587 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
588 
589 		if (!sock_owned_by_user(sk)) {
590 			sk->sk_err = err;
591 
592 			sk->sk_error_report(sk);
593 
594 			tcp_done(sk);
595 		} else {
596 			sk->sk_err_soft = err;
597 		}
598 		goto out;
599 	}
600 
601 	/* If we've already connected we will keep trying
602 	 * until we time out, or the user gives up.
603 	 *
604 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
605 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
606 	 * but it is obsoleted by pmtu discovery).
607 	 *
608 	 * Note, that in modern internet, where routing is unreliable
609 	 * and in each dark corner broken firewalls sit, sending random
610 	 * errors ordered by their masters, even these two messages finally lose
611 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
612 	 *
613 	 * Now we are in compliance with RFCs.
614 	 *							--ANK (980905)
615 	 */
616 
617 	inet = inet_sk(sk);
618 	if (!sock_owned_by_user(sk) && inet->recverr) {
619 		sk->sk_err = err;
620 		sk->sk_error_report(sk);
621 	} else	{ /* Only an error on timeout */
622 		sk->sk_err_soft = err;
623 	}
624 
625 out:
626 	bh_unlock_sock(sk);
627 	sock_put(sk);
628 	return 0;
629 }
630 
631 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
632 {
633 	struct tcphdr *th = tcp_hdr(skb);
634 
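	/* Only the pseudo-header sum is filled in here; csum_start/csum_offset
	 * let the NIC (or skb_checksum_help()) complete the real TCP checksum,
	 * CHECKSUM_PARTIAL style.
	 */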
635 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636 	skb->csum_start = skb_transport_header(skb) - skb->head;
637 	skb->csum_offset = offsetof(struct tcphdr, check);
638 }
639 
640 /* This routine computes an IPv4 TCP checksum. */
641 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
642 {
643 	const struct inet_sock *inet = inet_sk(sk);
644 
645 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
646 }
647 EXPORT_SYMBOL(tcp_v4_send_check);
648 
649 /*
650  *	This routine will send an RST to the other tcp.
651  *
652  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
653  *		      for the reset?
654  *	Answer: if a packet caused the RST, it is not for a socket
655  *		existing in our system; if it is matched to a socket,
656  *		it is just a duplicate segment or a bug in the other side's TCP.
657  *		So we build the reply based only on the parameters that
658  *		arrived with the segment.
659  *	Exception: precedence violation. We do not implement it in any case.
660  */
661 
662 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
663 {
664 	const struct tcphdr *th = tcp_hdr(skb);
665 	struct {
666 		struct tcphdr th;
667 #ifdef CONFIG_TCP_MD5SIG
668 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
669 #endif
670 	} rep;
671 	struct ip_reply_arg arg;
672 #ifdef CONFIG_TCP_MD5SIG
673 	struct tcp_md5sig_key *key = NULL;
674 	const __u8 *hash_location = NULL;
675 	unsigned char newhash[16];
676 	int genhash;
677 	struct sock *sk1 = NULL;
678 #endif
679 	u64 transmit_time = 0;
680 	struct sock *ctl_sk;
681 	struct net *net;
682 
683 	/* Never send a reset in response to a reset. */
684 	if (th->rst)
685 		return;
686 
687 	/* If sk is not NULL, it means we did a successful lookup and the
688 	 * incoming route had to be correct. prequeue might have dropped our dst.
689 	 */
690 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
691 		return;
692 
693 	/* Swap the send and the receive. */
694 	memset(&rep, 0, sizeof(rep));
695 	rep.th.dest   = th->source;
696 	rep.th.source = th->dest;
697 	rep.th.doff   = sizeof(struct tcphdr) / 4;
698 	rep.th.rst    = 1;
699 
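	/* RFC 793 reset generation: if the incoming segment carried an ACK,
	 * use its ack_seq as our sequence number; otherwise send an
	 * ACK-bearing RST that acknowledges exactly what the segment occupied.
	 */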
700 	if (th->ack) {
701 		rep.th.seq = th->ack_seq;
702 	} else {
703 		rep.th.ack = 1;
704 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
705 				       skb->len - (th->doff << 2));
706 	}
707 
708 	memset(&arg, 0, sizeof(arg));
709 	arg.iov[0].iov_base = (unsigned char *)&rep;
710 	arg.iov[0].iov_len  = sizeof(rep.th);
711 
712 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
713 #ifdef CONFIG_TCP_MD5SIG
714 	rcu_read_lock();
715 	hash_location = tcp_parse_md5sig_option(th);
716 	if (sk && sk_fullsock(sk)) {
717 		const union tcp_md5_addr *addr;
718 		int l3index;
719 
720 		/* sdif set, means packet ingressed via a device
721 		 * in an L3 domain and inet_iif is set to it.
722 		 */
723 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
724 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
725 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
726 	} else if (hash_location) {
727 		const union tcp_md5_addr *addr;
728 		int sdif = tcp_v4_sdif(skb);
729 		int dif = inet_iif(skb);
730 		int l3index;
731 
732 		/*
733 		 * The active side is lost. Try to find the listening socket
734 		 * through the source port, and then find the md5 key through
735 		 * the listening socket. We do not lose security here:
736 		 * the incoming packet is checked against the md5 hash of the
737 		 * key we find, and no RST is generated if the hash doesn't match.
738 		 */
739 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
740 					     ip_hdr(skb)->saddr,
741 					     th->source, ip_hdr(skb)->daddr,
742 					     ntohs(th->source), dif, sdif);
743 		/* don't send rst if it can't find key */
744 		if (!sk1)
745 			goto out;
746 
747 		/* sdif set, means packet ingressed via a device
748 		 * in an L3 domain and dif is set to it.
749 		 */
750 		l3index = sdif ? dif : 0;
751 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
752 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
753 		if (!key)
754 			goto out;
755 
756 
757 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
758 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
759 			goto out;
760 
761 	}
762 
763 	if (key) {
764 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
765 				   (TCPOPT_NOP << 16) |
766 				   (TCPOPT_MD5SIG << 8) |
767 				   TCPOLEN_MD5SIG);
768 		/* Update length and the length the header thinks exists */
769 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
770 		rep.th.doff = arg.iov[0].iov_len / 4;
771 
772 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
773 				     key, ip_hdr(skb)->saddr,
774 				     ip_hdr(skb)->daddr, &rep.th);
775 	}
776 #endif
777 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
778 				      ip_hdr(skb)->saddr, /* XXX */
779 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
780 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
781 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
782 
783 	/* When the socket is gone, all binding information is lost.
784 	 * Routing might fail in this case. No choice here: if we choose to force
785 	 * the input interface, we will misroute in case of an asymmetric route.
786 	 */
787 	if (sk) {
788 		arg.bound_dev_if = sk->sk_bound_dev_if;
789 		if (sk_fullsock(sk))
790 			trace_tcp_send_reset(sk, skb);
791 	}
792 
793 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
794 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
795 
796 	arg.tos = ip_hdr(skb)->tos;
797 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
798 	local_bh_disable();
799 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
800 	sock_net_set(ctl_sk, net);
801 	if (sk) {
802 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
803 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
804 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
805 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
806 		transmit_time = tcp_transmit_time(sk);
807 		xfrm_sk_clone_policy(ctl_sk, sk);
808 	} else {
809 		ctl_sk->sk_mark = 0;
810 		ctl_sk->sk_priority = 0;
811 	}
812 	ip_send_unicast_reply(ctl_sk,
813 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
814 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
815 			      &arg, arg.iov[0].iov_len,
816 			      transmit_time);
817 
818 	xfrm_sk_free_policy(ctl_sk);
819 	sock_net_set(ctl_sk, &init_net);
820 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
821 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
822 	local_bh_enable();
823 
824 #ifdef CONFIG_TCP_MD5SIG
825 out:
826 	rcu_read_unlock();
827 #endif
828 }
829 
830 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
831    outside of socket context, is certainly ugly. What can I do?
832  */
833 
834 static void tcp_v4_send_ack(const struct sock *sk,
835 			    struct sk_buff *skb, u32 seq, u32 ack,
836 			    u32 win, u32 tsval, u32 tsecr, int oif,
837 			    struct tcp_md5sig_key *key,
838 			    int reply_flags, u8 tos)
839 {
840 	const struct tcphdr *th = tcp_hdr(skb);
841 	struct {
842 		struct tcphdr th;
843 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
844 #ifdef CONFIG_TCP_MD5SIG
845 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
846 #endif
847 			];
848 	} rep;
849 	struct net *net = sock_net(sk);
850 	struct ip_reply_arg arg;
851 	struct sock *ctl_sk;
852 	u64 transmit_time;
853 
854 	memset(&rep.th, 0, sizeof(struct tcphdr));
855 	memset(&arg, 0, sizeof(arg));
856 
857 	arg.iov[0].iov_base = (unsigned char *)&rep;
858 	arg.iov[0].iov_len  = sizeof(rep.th);
859 	if (tsecr) {
860 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
861 				   (TCPOPT_TIMESTAMP << 8) |
862 				   TCPOLEN_TIMESTAMP);
863 		rep.opt[1] = htonl(tsval);
864 		rep.opt[2] = htonl(tsecr);
865 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
866 	}
867 
868 	/* Swap the send and the receive. */
869 	rep.th.dest    = th->source;
870 	rep.th.source  = th->dest;
871 	rep.th.doff    = arg.iov[0].iov_len / 4;
872 	rep.th.seq     = htonl(seq);
873 	rep.th.ack_seq = htonl(ack);
874 	rep.th.ack     = 1;
875 	rep.th.window  = htons(win);
876 
877 #ifdef CONFIG_TCP_MD5SIG
878 	if (key) {
879 		int offset = (tsecr) ? 3 : 0;
880 
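		/* If a timestamp echo was added above, it already occupies
		 * rep.opt[0..2], so the MD5 option starts at index 3.
		 */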
881 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
882 					  (TCPOPT_NOP << 16) |
883 					  (TCPOPT_MD5SIG << 8) |
884 					  TCPOLEN_MD5SIG);
885 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
886 		rep.th.doff = arg.iov[0].iov_len/4;
887 
888 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
889 				    key, ip_hdr(skb)->saddr,
890 				    ip_hdr(skb)->daddr, &rep.th);
891 	}
892 #endif
893 	arg.flags = reply_flags;
894 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
895 				      ip_hdr(skb)->saddr, /* XXX */
896 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
897 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
898 	if (oif)
899 		arg.bound_dev_if = oif;
900 	arg.tos = tos;
901 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
902 	local_bh_disable();
903 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
904 	sock_net_set(ctl_sk, net);
905 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
906 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
907 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
908 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
909 	transmit_time = tcp_transmit_time(sk);
910 	ip_send_unicast_reply(ctl_sk,
911 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
912 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
913 			      &arg, arg.iov[0].iov_len,
914 			      transmit_time);
915 
916 	sock_net_set(ctl_sk, &init_net);
917 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
918 	local_bh_enable();
919 }
920 
921 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
922 {
923 	struct inet_timewait_sock *tw = inet_twsk(sk);
924 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
925 
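	/* As in tcp_v4_reqsk_send_ack(), the advertised window is
	 * right-shifted by the receive window scale (RFC 7323 2.3).
	 */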
926 	tcp_v4_send_ack(sk, skb,
927 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
928 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
929 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
930 			tcptw->tw_ts_recent,
931 			tw->tw_bound_dev_if,
932 			tcp_twsk_md5_key(tcptw),
933 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
934 			tw->tw_tos
935 			);
936 
937 	inet_twsk_put(tw);
938 }
939 
940 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
941 				  struct request_sock *req)
942 {
943 	const union tcp_md5_addr *addr;
944 	int l3index;
945 
946 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
947 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
948 	 */
949 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
950 					     tcp_sk(sk)->snd_nxt;
951 
952 	/* RFC 7323 2.3
953 	 * The window field (SEG.WND) of every outgoing segment, with the
954 	 * exception of <SYN> segments, MUST be right-shifted by
955 	 * Rcv.Wind.Shift bits:
956 	 */
957 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
958 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
959 	tcp_v4_send_ack(sk, skb, seq,
960 			tcp_rsk(req)->rcv_nxt,
961 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
962 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
963 			READ_ONCE(req->ts_recent),
964 			0,
965 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
966 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
967 			ip_hdr(skb)->tos);
968 }
969 
970 /*
971  *	Send a SYN-ACK after having received a SYN.
972  *	This still operates on a request_sock only, not on a big
973  *	socket.
974  */
975 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
976 			      struct flowi *fl,
977 			      struct request_sock *req,
978 			      struct tcp_fastopen_cookie *foc,
979 			      enum tcp_synack_type synack_type,
980 			      struct sk_buff *syn_skb)
981 {
982 	const struct inet_request_sock *ireq = inet_rsk(req);
983 	struct flowi4 fl4;
984 	int err = -1;
985 	struct sk_buff *skb;
986 	u8 tos;
987 
988 	/* First, grab a route. */
989 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
990 		return -1;
991 
992 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
993 
994 	if (skb) {
995 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
996 
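		/* With sysctl_tcp_reflect_tos enabled, echo the DSCP bits from
		 * the client's SYN while keeping the listener's own ECN bits;
		 * otherwise use the listener's TOS unchanged.
		 */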
997 		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
998 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
999 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1000 				inet_sk(sk)->tos;
1001 
1002 		if (!INET_ECN_is_capable(tos) &&
1003 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1004 			tos |= INET_ECN_ECT_0;
1005 
1006 		rcu_read_lock();
1007 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1008 					    ireq->ir_rmt_addr,
1009 					    rcu_dereference(ireq->ireq_opt),
1010 					    tos);
1011 		rcu_read_unlock();
1012 		err = net_xmit_eval(err);
1013 	}
1014 
1015 	return err;
1016 }
1017 
1018 /*
1019  *	IPv4 request_sock destructor.
1020  */
1021 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1022 {
1023 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1024 }
1025 
1026 #ifdef CONFIG_TCP_MD5SIG
1027 /*
1028  * RFC2385 MD5 checksumming requires a mapping of
1029  * IP address->MD5 Key.
1030  * We need to maintain these in the sk structure.
1031  */
1032 
1033 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1034 EXPORT_SYMBOL(tcp_md5_needed);
1035 
1036 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1037 {
1038 	if (!old)
1039 		return true;
1040 
1041 	/* l3index always overrides non-l3index */
1042 	if (old->l3index && new->l3index == 0)
1043 		return false;
1044 	if (old->l3index == 0 && new->l3index)
1045 		return true;
1046 
1047 	return old->prefixlen < new->prefixlen;
1048 }
1049 
1050 /* Find the Key structure for an address.  */
1051 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1052 					   const union tcp_md5_addr *addr,
1053 					   int family)
1054 {
1055 	const struct tcp_sock *tp = tcp_sk(sk);
1056 	struct tcp_md5sig_key *key;
1057 	const struct tcp_md5sig_info *md5sig;
1058 	__be32 mask;
1059 	struct tcp_md5sig_key *best_match = NULL;
1060 	bool match;
1061 
1062 	/* caller either holds rcu_read_lock() or socket lock */
1063 	md5sig = rcu_dereference_check(tp->md5sig_info,
1064 				       lockdep_sock_is_held(sk));
1065 	if (!md5sig)
1066 		return NULL;
1067 
1068 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1069 				 lockdep_sock_is_held(sk)) {
1070 		if (key->family != family)
1071 			continue;
1072 		if (key->l3index && key->l3index != l3index)
1073 			continue;
1074 		if (family == AF_INET) {
1075 			mask = inet_make_mask(key->prefixlen);
1076 			match = (key->addr.a4.s_addr & mask) ==
1077 				(addr->a4.s_addr & mask);
1078 #if IS_ENABLED(CONFIG_IPV6)
1079 		} else if (family == AF_INET6) {
1080 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1081 						  key->prefixlen);
1082 #endif
1083 		} else {
1084 			match = false;
1085 		}
1086 
1087 		if (match && better_md5_match(best_match, key))
1088 			best_match = key;
1089 	}
1090 	return best_match;
1091 }
1092 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1093 
1094 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1095 						      const union tcp_md5_addr *addr,
1096 						      int family, u8 prefixlen,
1097 						      int l3index)
1098 {
1099 	const struct tcp_sock *tp = tcp_sk(sk);
1100 	struct tcp_md5sig_key *key;
1101 	unsigned int size = sizeof(struct in_addr);
1102 	const struct tcp_md5sig_info *md5sig;
1103 
1104 	/* caller either holds rcu_read_lock() or socket lock */
1105 	md5sig = rcu_dereference_check(tp->md5sig_info,
1106 				       lockdep_sock_is_held(sk));
1107 	if (!md5sig)
1108 		return NULL;
1109 #if IS_ENABLED(CONFIG_IPV6)
1110 	if (family == AF_INET6)
1111 		size = sizeof(struct in6_addr);
1112 #endif
1113 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1114 				 lockdep_sock_is_held(sk)) {
1115 		if (key->family != family)
1116 			continue;
1117 		if (key->l3index != l3index)
1118 			continue;
1119 		if (!memcmp(&key->addr, addr, size) &&
1120 		    key->prefixlen == prefixlen)
1121 			return key;
1122 	}
1123 	return NULL;
1124 }
1125 
1126 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1127 					 const struct sock *addr_sk)
1128 {
1129 	const union tcp_md5_addr *addr;
1130 	int l3index;
1131 
1132 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1133 						 addr_sk->sk_bound_dev_if);
1134 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1135 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1136 }
1137 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1138 
1139 /* This can be called on a newly created socket, from other files */
1140 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1141 		   int family, u8 prefixlen, int l3index,
1142 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1143 {
1144 	/* Add Key to the list */
1145 	struct tcp_md5sig_key *key;
1146 	struct tcp_sock *tp = tcp_sk(sk);
1147 	struct tcp_md5sig_info *md5sig;
1148 
1149 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1150 	if (key) {
1151 		/* Pre-existing entry - just update that one.
1152 		 * Note that the key might be used concurrently.
1153 		 * data_race() is telling kcsan that we do not care about
1154 		 * key mismatches, since changing MD5 key on live flows
1155 		 * can lead to packet drops.
1156 		 */
1157 		data_race(memcpy(key->key, newkey, newkeylen));
1158 
1159 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1160 		 * Also note that a reader could catch new key->keylen value
1161 		 * but old key->key[], this is the reason we use __GFP_ZERO
1162 		 * at sock_kmalloc() time below these lines.
1163 		 */
1164 		WRITE_ONCE(key->keylen, newkeylen);
1165 
1166 		return 0;
1167 	}
1168 
1169 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1170 					   lockdep_sock_is_held(sk));
1171 	if (!md5sig) {
1172 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1173 		if (!md5sig)
1174 			return -ENOMEM;
1175 
1176 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1177 		INIT_HLIST_HEAD(&md5sig->head);
1178 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1179 	}
1180 
1181 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1182 	if (!key)
1183 		return -ENOMEM;
1184 	if (!tcp_alloc_md5sig_pool()) {
1185 		sock_kfree_s(sk, key, sizeof(*key));
1186 		return -ENOMEM;
1187 	}
1188 
1189 	memcpy(key->key, newkey, newkeylen);
1190 	key->keylen = newkeylen;
1191 	key->family = family;
1192 	key->prefixlen = prefixlen;
1193 	key->l3index = l3index;
1194 	memcpy(&key->addr, addr,
1195 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1196 				      sizeof(struct in_addr));
1197 	hlist_add_head_rcu(&key->node, &md5sig->head);
1198 	return 0;
1199 }
1200 EXPORT_SYMBOL(tcp_md5_do_add);
1201 
1202 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1203 		   u8 prefixlen, int l3index)
1204 {
1205 	struct tcp_md5sig_key *key;
1206 
1207 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1208 	if (!key)
1209 		return -ENOENT;
1210 	hlist_del_rcu(&key->node);
1211 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1212 	kfree_rcu(key, rcu);
1213 	return 0;
1214 }
1215 EXPORT_SYMBOL(tcp_md5_do_del);
1216 
1217 static void tcp_clear_md5_list(struct sock *sk)
1218 {
1219 	struct tcp_sock *tp = tcp_sk(sk);
1220 	struct tcp_md5sig_key *key;
1221 	struct hlist_node *n;
1222 	struct tcp_md5sig_info *md5sig;
1223 
1224 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1225 
1226 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1227 		hlist_del_rcu(&key->node);
1228 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1229 		kfree_rcu(key, rcu);
1230 	}
1231 }
1232 
1233 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1234 				 sockptr_t optval, int optlen)
1235 {
1236 	struct tcp_md5sig cmd;
1237 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1238 	const union tcp_md5_addr *addr;
1239 	u8 prefixlen = 32;
1240 	int l3index = 0;
1241 
1242 	if (optlen < sizeof(cmd))
1243 		return -EINVAL;
1244 
1245 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1246 		return -EFAULT;
1247 
1248 	if (sin->sin_family != AF_INET)
1249 		return -EINVAL;
1250 
1251 	if (optname == TCP_MD5SIG_EXT &&
1252 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1253 		prefixlen = cmd.tcpm_prefixlen;
1254 		if (prefixlen > 32)
1255 			return -EINVAL;
1256 	}
1257 
1258 	if (optname == TCP_MD5SIG_EXT &&
1259 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1260 		struct net_device *dev;
1261 
1262 		rcu_read_lock();
1263 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1264 		if (dev && netif_is_l3_master(dev))
1265 			l3index = dev->ifindex;
1266 
1267 		rcu_read_unlock();
1268 
1269 		/* ok to reference set/not set outside of rcu;
1270 		 * right now device MUST be an L3 master
1271 		 */
1272 		if (!dev || !l3index)
1273 			return -EINVAL;
1274 	}
1275 
1276 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1277 
1278 	if (!cmd.tcpm_keylen)
1279 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1280 
1281 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1282 		return -EINVAL;
1283 
1284 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1285 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1286 }
1287 
1288 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1289 				   __be32 daddr, __be32 saddr,
1290 				   const struct tcphdr *th, int nbytes)
1291 {
1292 	struct tcp4_pseudohdr *bp;
1293 	struct scatterlist sg;
1294 	struct tcphdr *_th;
1295 
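	/* RFC 2385: the digest covers an IPv4 pseudo-header followed by the
	 * TCP header with its checksum zeroed; build both in the per-cpu
	 * scratch buffer and feed them to the ahash request.
	 */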
1296 	bp = hp->scratch;
1297 	bp->saddr = saddr;
1298 	bp->daddr = daddr;
1299 	bp->pad = 0;
1300 	bp->protocol = IPPROTO_TCP;
1301 	bp->len = cpu_to_be16(nbytes);
1302 
1303 	_th = (struct tcphdr *)(bp + 1);
1304 	memcpy(_th, th, sizeof(*th));
1305 	_th->check = 0;
1306 
1307 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1308 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1309 				sizeof(*bp) + sizeof(*th));
1310 	return crypto_ahash_update(hp->md5_req);
1311 }
1312 
1313 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1314 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1315 {
1316 	struct tcp_md5sig_pool *hp;
1317 	struct ahash_request *req;
1318 
1319 	hp = tcp_get_md5sig_pool();
1320 	if (!hp)
1321 		goto clear_hash_noput;
1322 	req = hp->md5_req;
1323 
1324 	if (crypto_ahash_init(req))
1325 		goto clear_hash;
1326 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1327 		goto clear_hash;
1328 	if (tcp_md5_hash_key(hp, key))
1329 		goto clear_hash;
1330 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1331 	if (crypto_ahash_final(req))
1332 		goto clear_hash;
1333 
1334 	tcp_put_md5sig_pool();
1335 	return 0;
1336 
1337 clear_hash:
1338 	tcp_put_md5sig_pool();
1339 clear_hash_noput:
1340 	memset(md5_hash, 0, 16);
1341 	return 1;
1342 }
1343 
1344 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1345 			const struct sock *sk,
1346 			const struct sk_buff *skb)
1347 {
1348 	struct tcp_md5sig_pool *hp;
1349 	struct ahash_request *req;
1350 	const struct tcphdr *th = tcp_hdr(skb);
1351 	__be32 saddr, daddr;
1352 
1353 	if (sk) { /* valid for establish/request sockets */
1354 		saddr = sk->sk_rcv_saddr;
1355 		daddr = sk->sk_daddr;
1356 	} else {
1357 		const struct iphdr *iph = ip_hdr(skb);
1358 		saddr = iph->saddr;
1359 		daddr = iph->daddr;
1360 	}
1361 
1362 	hp = tcp_get_md5sig_pool();
1363 	if (!hp)
1364 		goto clear_hash_noput;
1365 	req = hp->md5_req;
1366 
1367 	if (crypto_ahash_init(req))
1368 		goto clear_hash;
1369 
1370 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1371 		goto clear_hash;
1372 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1373 		goto clear_hash;
1374 	if (tcp_md5_hash_key(hp, key))
1375 		goto clear_hash;
1376 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1377 	if (crypto_ahash_final(req))
1378 		goto clear_hash;
1379 
1380 	tcp_put_md5sig_pool();
1381 	return 0;
1382 
1383 clear_hash:
1384 	tcp_put_md5sig_pool();
1385 clear_hash_noput:
1386 	memset(md5_hash, 0, 16);
1387 	return 1;
1388 }
1389 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1390 
1391 #endif
1392 
1393 /* Called with rcu_read_lock() */
1394 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1395 				    const struct sk_buff *skb,
1396 				    int dif, int sdif)
1397 {
1398 #ifdef CONFIG_TCP_MD5SIG
1399 	/*
1400 	 * This gets called for each TCP segment that arrives
1401 	 * so we want to be efficient.
1402 	 * We have 3 drop cases:
1403 	 * o No MD5 hash and one expected.
1404 	 * o MD5 hash and we're not expecting one.
1405 	 * o MD5 hash and it's wrong.
1406 	 */
1407 	const __u8 *hash_location = NULL;
1408 	struct tcp_md5sig_key *hash_expected;
1409 	const struct iphdr *iph = ip_hdr(skb);
1410 	const struct tcphdr *th = tcp_hdr(skb);
1411 	const union tcp_md5_addr *addr;
1412 	unsigned char newhash[16];
1413 	int genhash, l3index;
1414 
1415 	/* sdif set, means packet ingressed via a device
1416 	 * in an L3 domain and dif is set to the l3mdev
1417 	 */
1418 	l3index = sdif ? dif : 0;
1419 
1420 	addr = (union tcp_md5_addr *)&iph->saddr;
1421 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1422 	hash_location = tcp_parse_md5sig_option(th);
1423 
1424 	/* We've parsed the options - do we have a hash? */
1425 	if (!hash_expected && !hash_location)
1426 		return false;
1427 
1428 	if (hash_expected && !hash_location) {
1429 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1430 		return true;
1431 	}
1432 
1433 	if (!hash_expected && hash_location) {
1434 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1435 		return true;
1436 	}
1437 
1438 	/* Okay, so this is hash_expected and hash_location -
1439 	 * so we need to calculate the checksum.
1440 	 */
1441 	genhash = tcp_v4_md5_hash_skb(newhash,
1442 				      hash_expected,
1443 				      NULL, skb);
1444 
1445 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1446 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1447 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1448 				     &iph->saddr, ntohs(th->source),
1449 				     &iph->daddr, ntohs(th->dest),
1450 				     genhash ? " tcp_v4_calc_md5_hash failed"
1451 				     : "", l3index);
1452 		return true;
1453 	}
1454 	return false;
1455 #endif
1456 	return false;
1457 }
1458 
1459 static void tcp_v4_init_req(struct request_sock *req,
1460 			    const struct sock *sk_listener,
1461 			    struct sk_buff *skb)
1462 {
1463 	struct inet_request_sock *ireq = inet_rsk(req);
1464 	struct net *net = sock_net(sk_listener);
1465 
1466 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1467 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1468 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1469 }
1470 
1471 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1472 					  struct flowi *fl,
1473 					  const struct request_sock *req)
1474 {
1475 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1476 }
1477 
1478 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1479 	.family		=	PF_INET,
1480 	.obj_size	=	sizeof(struct tcp_request_sock),
1481 	.rtx_syn_ack	=	tcp_rtx_synack,
1482 	.send_ack	=	tcp_v4_reqsk_send_ack,
1483 	.destructor	=	tcp_v4_reqsk_destructor,
1484 	.send_reset	=	tcp_v4_send_reset,
1485 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1486 };
1487 
1488 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1489 	.mss_clamp	=	TCP_MSS_DEFAULT,
1490 #ifdef CONFIG_TCP_MD5SIG
1491 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1492 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1493 #endif
1494 	.init_req	=	tcp_v4_init_req,
1495 #ifdef CONFIG_SYN_COOKIES
1496 	.cookie_init_seq =	cookie_v4_init_sequence,
1497 #endif
1498 	.route_req	=	tcp_v4_route_req,
1499 	.init_seq	=	tcp_v4_init_seq,
1500 	.init_ts_off	=	tcp_v4_init_ts_off,
1501 	.send_synack	=	tcp_v4_send_synack,
1502 };
1503 
1504 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1505 {
1506 	/* Never answer SYNs sent to broadcast or multicast */
1507 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1508 		goto drop;
1509 
1510 	return tcp_conn_request(&tcp_request_sock_ops,
1511 				&tcp_request_sock_ipv4_ops, sk, skb);
1512 
1513 drop:
1514 	tcp_listendrop(sk);
1515 	return 0;
1516 }
1517 EXPORT_SYMBOL(tcp_v4_conn_request);
1518 
1519 
1520 /*
1521  * The three way handshake has completed - we got a valid synack -
1522  * now create the new socket.
1523  */
1524 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1525 				  struct request_sock *req,
1526 				  struct dst_entry *dst,
1527 				  struct request_sock *req_unhash,
1528 				  bool *own_req)
1529 {
1530 	struct inet_request_sock *ireq;
1531 	bool found_dup_sk = false;
1532 	struct inet_sock *newinet;
1533 	struct tcp_sock *newtp;
1534 	struct sock *newsk;
1535 #ifdef CONFIG_TCP_MD5SIG
1536 	const union tcp_md5_addr *addr;
1537 	struct tcp_md5sig_key *key;
1538 	int l3index;
1539 #endif
1540 	struct ip_options_rcu *inet_opt;
1541 
1542 	if (sk_acceptq_is_full(sk))
1543 		goto exit_overflow;
1544 
1545 	newsk = tcp_create_openreq_child(sk, req, skb);
1546 	if (!newsk)
1547 		goto exit_nonewsk;
1548 
1549 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1550 	inet_sk_rx_dst_set(newsk, skb);
1551 
1552 	newtp		      = tcp_sk(newsk);
1553 	newinet		      = inet_sk(newsk);
1554 	ireq		      = inet_rsk(req);
1555 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1556 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1557 	newsk->sk_bound_dev_if = ireq->ir_iif;
1558 	newinet->inet_saddr   = ireq->ir_loc_addr;
1559 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1560 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1561 	newinet->mc_index     = inet_iif(skb);
1562 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1563 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1564 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1565 	if (inet_opt)
1566 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1567 	newinet->inet_id = prandom_u32();
1568 
1569 	/* Set ToS of the new socket based upon the value of incoming SYN.
1570 	 * ECT bits are set later in tcp_init_transfer().
1571 	 */
1572 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1573 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1574 
1575 	if (!dst) {
1576 		dst = inet_csk_route_child_sock(sk, newsk, req);
1577 		if (!dst)
1578 			goto put_and_exit;
1579 	} else {
1580 		/* syncookie case : see end of cookie_v4_check() */
1581 	}
1582 	sk_setup_caps(newsk, dst);
1583 
1584 	tcp_ca_openreq_child(newsk, dst);
1585 
1586 	tcp_sync_mss(newsk, dst_mtu(dst));
1587 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1588 
1589 	tcp_initialize_rcv_mss(newsk);
1590 
1591 #ifdef CONFIG_TCP_MD5SIG
1592 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1593 	/* Copy over the MD5 key from the original socket */
1594 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1595 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1596 	if (key) {
1597 		/*
1598 		 * We're using one, so create a matching key
1599 		 * on the newsk structure. If we fail to get
1600 		 * memory, then we end up not copying the key
1601 		 * across. Shucks.
1602 		 */
1603 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1604 			       key->key, key->keylen, GFP_ATOMIC);
1605 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1606 	}
1607 #endif
1608 
1609 	if (__inet_inherit_port(sk, newsk) < 0)
1610 		goto put_and_exit;
1611 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1612 				       &found_dup_sk);
1613 	if (likely(*own_req)) {
1614 		tcp_move_syn(newtp, req);
1615 		ireq->ireq_opt = NULL;
1616 	} else {
1617 		newinet->inet_opt = NULL;
1618 
1619 		if (!req_unhash && found_dup_sk) {
1620 			/* This code path should be executed only in the
1621 			 * syncookie case
1622 			 */
1623 			bh_unlock_sock(newsk);
1624 			sock_put(newsk);
1625 			newsk = NULL;
1626 		}
1627 	}
1628 	return newsk;
1629 
1630 exit_overflow:
1631 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1632 exit_nonewsk:
1633 	dst_release(dst);
1634 exit:
1635 	tcp_listendrop(sk);
1636 	return NULL;
1637 put_and_exit:
1638 	newinet->inet_opt = NULL;
1639 	inet_csk_prepare_forced_close(newsk);
1640 	tcp_done(newsk);
1641 	goto exit;
1642 }
1643 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1644 
1645 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1646 {
1647 #ifdef CONFIG_SYN_COOKIES
1648 	const struct tcphdr *th = tcp_hdr(skb);
1649 
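	/* A non-SYN segment reaching a listener may be the ACK that completes
	 * a syncookie handshake; cookie_v4_check() validates the cookie and,
	 * if it is good, creates the child socket.
	 */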
1650 	if (!th->syn)
1651 		sk = cookie_v4_check(sk, skb);
1652 #endif
1653 	return sk;
1654 }
1655 
1656 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1657 			 struct tcphdr *th, u32 *cookie)
1658 {
1659 	u16 mss = 0;
1660 #ifdef CONFIG_SYN_COOKIES
1661 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1662 				    &tcp_request_sock_ipv4_ops, sk, th);
1663 	if (mss) {
1664 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1665 		tcp_synq_overflow(sk);
1666 	}
1667 #endif
1668 	return mss;
1669 }
1670 
1671 /* The socket must have its spinlock held when we get
1672  * here, unless it is a TCP_LISTEN socket.
1673  *
1674  * We have a potential double-lock case here, so even when
1675  * doing backlog processing we use the BH locking scheme.
1676  * This is because we cannot sleep with the original spinlock
1677  * held.
1678  */
1679 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1680 {
1681 	struct sock *rsk;
1682 
1683 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1684 		struct dst_entry *dst;
1685 
1686 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1687 						lockdep_sock_is_held(sk));
1688 
1689 		sock_rps_save_rxhash(sk, skb);
1690 		sk_mark_napi_id(sk, skb);
1691 		if (dst) {
1692 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1693 			    !dst->ops->check(dst, 0)) {
1694 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1695 				dst_release(dst);
1696 			}
1697 		}
1698 		tcp_rcv_established(sk, skb);
1699 		return 0;
1700 	}
1701 
1702 	if (tcp_checksum_complete(skb))
1703 		goto csum_err;
1704 
1705 	if (sk->sk_state == TCP_LISTEN) {
1706 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1707 
1708 		if (!nsk)
1709 			goto discard;
1710 		if (nsk != sk) {
1711 			if (tcp_child_process(sk, nsk, skb)) {
1712 				rsk = nsk;
1713 				goto reset;
1714 			}
1715 			return 0;
1716 		}
1717 	} else
1718 		sock_rps_save_rxhash(sk, skb);
1719 
1720 	if (tcp_rcv_state_process(sk, skb)) {
1721 		rsk = sk;
1722 		goto reset;
1723 	}
1724 	return 0;
1725 
1726 reset:
1727 	tcp_v4_send_reset(rsk, skb);
1728 discard:
1729 	kfree_skb(skb);
1730 	/* Be careful here. If this function gets more complicated and
1731 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1732 	 * might be destroyed here. This current version compiles correctly,
1733 	 * but you have been warned.
1734 	 */
1735 	return 0;
1736 
1737 csum_err:
1738 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1739 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1740 	goto discard;
1741 }
1742 EXPORT_SYMBOL(tcp_v4_do_rcv);
1743 
1744 int tcp_v4_early_demux(struct sk_buff *skb)
1745 {
1746 	const struct iphdr *iph;
1747 	const struct tcphdr *th;
1748 	struct sock *sk;
1749 
1750 	if (skb->pkt_type != PACKET_HOST)
1751 		return 0;
1752 
1753 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1754 		return 0;
1755 
1756 	iph = ip_hdr(skb);
1757 	th = tcp_hdr(skb);
1758 
1759 	if (th->doff < sizeof(struct tcphdr) / 4)
1760 		return 0;
1761 
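	/* Early demux only consults the established hash: for a connected
	 * socket we can attach sk and its cached rx dst to the skb before
	 * the routing decision, saving an input route lookup.  Listening
	 * sockets are handled by the normal lookup in tcp_v4_rcv().
	 */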
1762 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1763 				       iph->saddr, th->source,
1764 				       iph->daddr, ntohs(th->dest),
1765 				       skb->skb_iif, inet_sdif(skb));
1766 	if (sk) {
1767 		skb->sk = sk;
1768 		skb->destructor = sock_edemux;
1769 		if (sk_fullsock(sk)) {
1770 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1771 
1772 			if (dst)
1773 				dst = dst_check(dst, 0);
1774 			if (dst &&
1775 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1776 				skb_dst_set_noref(skb, dst);
1777 		}
1778 	}
1779 	return 0;
1780 }
1781 
1782 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1783 {
1784 	u32 limit, tail_gso_size, tail_gso_segs;
1785 	struct skb_shared_info *shinfo;
1786 	const struct tcphdr *th;
1787 	struct tcphdr *thtail;
1788 	struct sk_buff *tail;
1789 	unsigned int hdrlen;
1790 	bool fragstolen;
1791 	u32 gso_segs;
1792 	u32 gso_size;
1793 	int delta;
1794 
1795 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1796 	 * we can fix skb->truesize to its real value to avoid future drops.
1797 	 * This is valid because skb is not yet charged to the socket.
1798 	 * It has been noticed that pure SACK packets were sometimes dropped
1799 	 * (if cooked by drivers without the copybreak feature).
1800 	 */
1801 	skb_condense(skb);
1802 
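	/* The dst is not needed while the skb sits in the backlog;
	 * dropping it here releases the route reference earlier.
	 */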
1803 	skb_dst_drop(skb);
1804 
1805 	if (unlikely(tcp_checksum_complete(skb))) {
1806 		bh_unlock_sock(sk);
1807 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1808 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1809 		return true;
1810 	}
1811 
1812 	/* Attempt coalescing to last skb in backlog, even if we are
1813 	 * above the limits.
1814 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1815 	 */
1816 	th = (const struct tcphdr *)skb->data;
1817 	hdrlen = th->doff * 4;
1818 
1819 	tail = sk->sk_backlog.tail;
1820 	if (!tail)
1821 		goto no_coalesce;
1822 	thtail = (struct tcphdr *)tail->data;
1823 
1824 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1825 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1826 	    ((TCP_SKB_CB(tail)->tcp_flags |
1827 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1828 	    !((TCP_SKB_CB(tail)->tcp_flags &
1829 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1830 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1831 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1832 #ifdef CONFIG_TLS_DEVICE
1833 	    tail->decrypted != skb->decrypted ||
1834 #endif
1835 	    thtail->doff != th->doff ||
1836 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1837 		goto no_coalesce;
1838 
1839 	__skb_pull(skb, hdrlen);
1840 
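	/* A non-GSO skb has gso_size/gso_segs of zero; treat it as a single
	 * segment covering its whole payload so the bookkeeping below works
	 * for both GSO and linear packets.
	 */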
1841 	shinfo = skb_shinfo(skb);
1842 	gso_size = shinfo->gso_size ?: skb->len;
1843 	gso_segs = shinfo->gso_segs ?: 1;
1844 
1845 	shinfo = skb_shinfo(tail);
1846 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1847 	tail_gso_segs = shinfo->gso_segs ?: 1;
1848 
1849 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1850 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1851 
1852 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1853 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1854 			thtail->window = th->window;
1855 		}
1856 
1857 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1858 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1859 		 * is not entered if we append a packet with a FIN.
1860 		 * SYN, RST, URG are not present.
1861 		 * ACK is set on both packets.
1862 		 * PSH : we do not really care in TCP stack,
1863 		 *       at least for 'GRO' packets.
1864 		 */
1865 		thtail->fin |= th->fin;
1866 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1867 
1868 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1869 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1870 			tail->tstamp = skb->tstamp;
1871 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1872 		}
1873 
1874 		/* Not as strict as GRO. We only need to carry the max mss value */
1875 		shinfo->gso_size = max(gso_size, tail_gso_size);
1876 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1877 
1878 		sk->sk_backlog.len += delta;
1879 		__NET_INC_STATS(sock_net(sk),
1880 				LINUX_MIB_TCPBACKLOGCOALESCE);
1881 		kfree_skb_partial(skb, fragstolen);
1882 		return false;
1883 	}
1884 	__skb_push(skb, hdrlen);
1885 
1886 no_coalesce:
1887 	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1888 
1889 	/* Only socket owner can try to collapse/prune rx queues
1890 	 * to reduce memory overhead, so add a little headroom here.
1891 	 * Only a few socket backlogs are likely to be non-empty at any given time.
1892 	 */
1893 	limit += 64 * 1024;
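	/* Illustrative only: with the common defaults of tcp_rmem[1] = 131072
	 * and tcp_wmem[1] = 16384 (so sk_rcvbuf = 131072 and sk_sndbuf = 16384,
	 * barring setsockopt() or autotuning growth), the backlog limit would
	 * be 131072 + 16384/2 + 65536 = 204800 bytes.
	 */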
1894 
1895 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1896 		bh_unlock_sock(sk);
1897 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1898 		return true;
1899 	}
1900 	return false;
1901 }
1902 EXPORT_SYMBOL(tcp_add_backlog);
1903 
1904 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1905 {
1906 	struct tcphdr *th = (struct tcphdr *)skb->data;
1907 
1908 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1909 }
1910 EXPORT_SYMBOL(tcp_filter);
1911 
1912 static void tcp_v4_restore_cb(struct sk_buff *skb)
1913 {
1914 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1915 		sizeof(struct inet_skb_parm));
1916 }
1917 
1918 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1919 			   const struct tcphdr *th)
1920 {
1921 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1922 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1923 	 */
1924 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1925 		sizeof(struct inet_skb_parm));
1926 	barrier();
1927 
1928 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1929 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1930 				    skb->len - th->doff * 4);
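	/* SYN and FIN each consume one unit of sequence space, hence the
	 * th->syn + th->fin terms in end_seq above.
	 */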
1931 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1932 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1933 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1934 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1935 	TCP_SKB_CB(skb)->sacked	 = 0;
1936 	TCP_SKB_CB(skb)->has_rxtstamp =
1937 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1938 }
1939 
1940 /*
1941  *	From tcp_input.c
1942  */
1943 
1944 int tcp_v4_rcv(struct sk_buff *skb)
1945 {
1946 	struct net *net = dev_net(skb->dev);
1947 	struct sk_buff *skb_to_free;
1948 	int sdif = inet_sdif(skb);
1949 	int dif = inet_iif(skb);
1950 	const struct iphdr *iph;
1951 	const struct tcphdr *th;
1952 	bool refcounted;
1953 	struct sock *sk;
1954 	int ret;
1955 
1956 	if (skb->pkt_type != PACKET_HOST)
1957 		goto discard_it;
1958 
1959 	/* Count it even if it's bad */
1960 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1961 
1962 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1963 		goto discard_it;
1964 
1965 	th = (const struct tcphdr *)skb->data;
1966 
1967 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1968 		goto bad_packet;
1969 	if (!pskb_may_pull(skb, th->doff * 4))
1970 		goto discard_it;
1971 
1972 	/* An explanation is required here, I think.
1973 	 * Packet length and doff are validated by header prediction,
1974 	 * provided the case of th->doff == 0 is eliminated.
1975 	 * So, we defer the checks. */
1976 
1977 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1978 		goto csum_error;
1979 
1980 	th = (const struct tcphdr *)skb->data;
1981 	iph = ip_hdr(skb);
1982 lookup:
1983 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1984 			       th->dest, sdif, &refcounted);
1985 	if (!sk)
1986 		goto no_tcp_socket;
1987 
1988 process:
1989 	if (sk->sk_state == TCP_TIME_WAIT)
1990 		goto do_time_wait;
1991 
1992 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1993 		struct request_sock *req = inet_reqsk(sk);
1994 		bool req_stolen = false;
1995 		struct sock *nsk;
1996 
1997 		sk = req->rsk_listener;
1998 		if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
1999 			     tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2000 			sk_drops_add(sk, skb);
2001 			reqsk_put(req);
2002 			goto discard_it;
2003 		}
2004 		if (tcp_checksum_complete(skb)) {
2005 			reqsk_put(req);
2006 			goto csum_error;
2007 		}
2008 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2009 			inet_csk_reqsk_queue_drop_and_put(sk, req);
2010 			goto lookup;
2011 		}
2012 		/* We own a reference on the listener, increase it again
2013 		 * as we might lose it too soon.
2014 		 */
2015 		sock_hold(sk);
2016 		refcounted = true;
2017 		nsk = NULL;
2018 		if (!tcp_filter(sk, skb)) {
2019 			th = (const struct tcphdr *)skb->data;
2020 			iph = ip_hdr(skb);
2021 			tcp_v4_fill_cb(skb, iph, th);
2022 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2023 		}
2024 		if (!nsk) {
2025 			reqsk_put(req);
2026 			if (req_stolen) {
2027 				/* Another cpu got exclusive access to req
2028 				 * and created a full blown socket.
2029 				 * Try to feed this packet to this socket
2030 				 * instead of discarding it.
2031 				 */
2032 				tcp_v4_restore_cb(skb);
2033 				sock_put(sk);
2034 				goto lookup;
2035 			}
2036 			goto discard_and_relse;
2037 		}
2038 		nf_reset_ct(skb);
2039 		if (nsk == sk) {
2040 			reqsk_put(req);
2041 			tcp_v4_restore_cb(skb);
2042 		} else if (tcp_child_process(sk, nsk, skb)) {
2043 			tcp_v4_send_reset(nsk, skb);
2044 			goto discard_and_relse;
2045 		} else {
2046 			sock_put(sk);
2047 			return 0;
2048 		}
2049 	}
2050 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2051 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2052 		goto discard_and_relse;
2053 	}
2054 
2055 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2056 		goto discard_and_relse;
2057 
2058 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2059 		goto discard_and_relse;
2060 
2061 	nf_reset_ct(skb);
2062 
2063 	if (tcp_filter(sk, skb))
2064 		goto discard_and_relse;
2065 	th = (const struct tcphdr *)skb->data;
2066 	iph = ip_hdr(skb);
2067 	tcp_v4_fill_cb(skb, iph, th);
2068 
2069 	skb->dev = NULL;
2070 
2071 	if (sk->sk_state == TCP_LISTEN) {
2072 		ret = tcp_v4_do_rcv(sk, skb);
2073 		goto put_and_return;
2074 	}
2075 
2076 	sk_incoming_cpu_update(sk);
2077 
2078 	bh_lock_sock_nested(sk);
2079 	tcp_segs_in(tcp_sk(sk), skb);
2080 	ret = 0;
2081 	if (!sock_owned_by_user(sk)) {
2082 		skb_to_free = sk->sk_rx_skb_cache;
2083 		sk->sk_rx_skb_cache = NULL;
2084 		ret = tcp_v4_do_rcv(sk, skb);
2085 	} else {
2086 		if (tcp_add_backlog(sk, skb))
2087 			goto discard_and_relse;
2088 		skb_to_free = NULL;
2089 	}
2090 	bh_unlock_sock(sk);
2091 	if (skb_to_free)
2092 		__kfree_skb(skb_to_free);
2093 
2094 put_and_return:
2095 	if (refcounted)
2096 		sock_put(sk);
2097 
2098 	return ret;
2099 
2100 no_tcp_socket:
2101 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2102 		goto discard_it;
2103 
2104 	tcp_v4_fill_cb(skb, iph, th);
2105 
2106 	if (tcp_checksum_complete(skb)) {
2107 csum_error:
2108 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2109 bad_packet:
2110 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2111 	} else {
2112 		tcp_v4_send_reset(NULL, skb);
2113 	}
2114 
2115 discard_it:
2116 	/* Discard frame. */
2117 	kfree_skb(skb);
2118 	return 0;
2119 
2120 discard_and_relse:
2121 	sk_drops_add(sk, skb);
2122 	if (refcounted)
2123 		sock_put(sk);
2124 	goto discard_it;
2125 
2126 do_time_wait:
2127 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2128 		inet_twsk_put(inet_twsk(sk));
2129 		goto discard_it;
2130 	}
2131 
2132 	tcp_v4_fill_cb(skb, iph, th);
2133 
2134 	if (tcp_checksum_complete(skb)) {
2135 		inet_twsk_put(inet_twsk(sk));
2136 		goto csum_error;
2137 	}
2138 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2139 	case TCP_TW_SYN: {
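		/* tcp_timewait_state_process() decided this SYN may open a new
		 * incarnation of the old connection.  Look for a listener; if
		 * one exists, retire the TIME_WAIT socket and handle the SYN
		 * as a fresh connection request.
		 */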
2140 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2141 							&tcp_hashinfo, skb,
2142 							__tcp_hdrlen(th),
2143 							iph->saddr, th->source,
2144 							iph->daddr, th->dest,
2145 							inet_iif(skb),
2146 							sdif);
2147 		if (sk2) {
2148 			inet_twsk_deschedule_put(inet_twsk(sk));
2149 			sk = sk2;
2150 			tcp_v4_restore_cb(skb);
2151 			refcounted = false;
2152 			goto process;
2153 		}
2154 	}
2155 		/* to ACK */
2156 		fallthrough;
2157 	case TCP_TW_ACK:
2158 		tcp_v4_timewait_ack(sk, skb);
2159 		break;
2160 	case TCP_TW_RST:
2161 		tcp_v4_send_reset(sk, skb);
2162 		inet_twsk_deschedule_put(inet_twsk(sk));
2163 		goto discard_it;
2164 	case TCP_TW_SUCCESS:;
2165 	}
2166 	goto discard_it;
2167 }
2168 
2169 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2170 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2171 	.twsk_unique	= tcp_twsk_unique,
2172 	.twsk_destructor= tcp_twsk_destructor,
2173 };
2174 
2175 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2176 {
2177 	struct dst_entry *dst = skb_dst(skb);
2178 
2179 	if (dst && dst_hold_safe(dst)) {
2180 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2181 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2182 	}
2183 }
2184 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2185 
2186 const struct inet_connection_sock_af_ops ipv4_specific = {
2187 	.queue_xmit	   = ip_queue_xmit,
2188 	.send_check	   = tcp_v4_send_check,
2189 	.rebuild_header	   = inet_sk_rebuild_header,
2190 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2191 	.conn_request	   = tcp_v4_conn_request,
2192 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2193 	.net_header_len	   = sizeof(struct iphdr),
2194 	.setsockopt	   = ip_setsockopt,
2195 	.getsockopt	   = ip_getsockopt,
2196 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2197 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2198 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2199 };
2200 EXPORT_SYMBOL(ipv4_specific);
2201 
2202 #ifdef CONFIG_TCP_MD5SIG
2203 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2204 	.md5_lookup		= tcp_v4_md5_lookup,
2205 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2206 	.md5_parse		= tcp_v4_parse_md5_keys,
2207 };
2208 #endif
2209 
2210 /* NOTE: A lot of things are set to zero explicitly by the call to
2211  *       sk_alloc(), so they need not be done here.
2212  */
2213 static int tcp_v4_init_sock(struct sock *sk)
2214 {
2215 	struct inet_connection_sock *icsk = inet_csk(sk);
2216 
2217 	tcp_init_sock(sk);
2218 
2219 	icsk->icsk_af_ops = &ipv4_specific;
2220 
2221 #ifdef CONFIG_TCP_MD5SIG
2222 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2223 #endif
2224 
2225 	return 0;
2226 }
2227 
2228 void tcp_v4_destroy_sock(struct sock *sk)
2229 {
2230 	struct tcp_sock *tp = tcp_sk(sk);
2231 
2232 	trace_tcp_destroy_sock(sk);
2233 
2234 	tcp_clear_xmit_timers(sk);
2235 
2236 	tcp_cleanup_congestion_control(sk);
2237 
2238 	tcp_cleanup_ulp(sk);
2239 
2240 	/* Clean up the write buffer. */
2241 	tcp_write_queue_purge(sk);
2242 
2243 	/* Check if we want to disable active TFO */
2244 	tcp_fastopen_active_disable_ofo_check(sk);
2245 
2246 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2247 	skb_rbtree_purge(&tp->out_of_order_queue);
2248 
2249 #ifdef CONFIG_TCP_MD5SIG
2250 	/* Clean up the MD5 key list, if any */
2251 	if (tp->md5sig_info) {
2252 		tcp_clear_md5_list(sk);
2253 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2254 		tp->md5sig_info = NULL;
2255 	}
2256 #endif
2257 
2258 	/* Clean up a referenced TCP bind bucket. */
2259 	if (inet_csk(sk)->icsk_bind_hash)
2260 		inet_put_port(sk);
2261 
2262 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2263 
2264 	/* If socket is aborted during connect operation */
2265 	tcp_free_fastopen_req(tp);
2266 	tcp_fastopen_destroy_cipher(sk);
2267 	tcp_saved_syn_free(tp);
2268 
2269 	sk_sockets_allocated_dec(sk);
2270 }
2271 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2272 
2273 #ifdef CONFIG_PROC_FS
2274 /* Proc filesystem TCP sock list dumping. */
2275 
2276 /*
2277  * Get the next listener socket following cur.  If cur is NULL, get the
2278  * first socket, starting from the bucket given in st->bucket; when
2279  * st->bucket is zero the very first socket in the hash table is returned.
2280  */
2281 static void *listening_get_next(struct seq_file *seq, void *cur)
2282 {
2283 	struct tcp_seq_afinfo *afinfo;
2284 	struct tcp_iter_state *st = seq->private;
2285 	struct net *net = seq_file_net(seq);
2286 	struct inet_listen_hashbucket *ilb;
2287 	struct hlist_nulls_node *node;
2288 	struct sock *sk = cur;
2289 
2290 	if (st->bpf_seq_afinfo)
2291 		afinfo = st->bpf_seq_afinfo;
2292 	else
2293 		afinfo = PDE_DATA(file_inode(seq->file));
2294 
2295 	if (!sk) {
2296 get_head:
2297 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2298 		spin_lock(&ilb->lock);
2299 		sk = sk_nulls_head(&ilb->nulls_head);
2300 		st->offset = 0;
2301 		goto get_sk;
2302 	}
2303 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2304 	++st->num;
2305 	++st->offset;
2306 
2307 	sk = sk_nulls_next(sk);
2308 get_sk:
2309 	sk_nulls_for_each_from(sk, node) {
2310 		if (!net_eq(sock_net(sk), net))
2311 			continue;
2312 		if (afinfo->family == AF_UNSPEC ||
2313 		    sk->sk_family == afinfo->family)
2314 			return sk;
2315 	}
2316 	spin_unlock(&ilb->lock);
2317 	st->offset = 0;
2318 	if (++st->bucket < INET_LHTABLE_SIZE)
2319 		goto get_head;
2320 	return NULL;
2321 }
2322 
2323 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2324 {
2325 	struct tcp_iter_state *st = seq->private;
2326 	void *rc;
2327 
2328 	st->bucket = 0;
2329 	st->offset = 0;
2330 	rc = listening_get_next(seq, NULL);
2331 
2332 	while (rc && *pos) {
2333 		rc = listening_get_next(seq, rc);
2334 		--*pos;
2335 	}
2336 	return rc;
2337 }
2338 
2339 static inline bool empty_bucket(const struct tcp_iter_state *st)
2340 {
2341 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2342 }
2343 
2344 /*
2345  * Get the first established socket, starting from the bucket given in st->bucket.
2346  * If st->bucket is zero, the very first socket in the hash is returned.
2347  */
2348 static void *established_get_first(struct seq_file *seq)
2349 {
2350 	struct tcp_seq_afinfo *afinfo;
2351 	struct tcp_iter_state *st = seq->private;
2352 	struct net *net = seq_file_net(seq);
2353 	void *rc = NULL;
2354 
2355 	if (st->bpf_seq_afinfo)
2356 		afinfo = st->bpf_seq_afinfo;
2357 	else
2358 		afinfo = PDE_DATA(file_inode(seq->file));
2359 
2360 	st->offset = 0;
2361 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2362 		struct sock *sk;
2363 		struct hlist_nulls_node *node;
2364 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2365 
2366 		/* Lockless fast path for the common case of empty buckets */
2367 		if (empty_bucket(st))
2368 			continue;
2369 
2370 		spin_lock_bh(lock);
2371 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2372 			if ((afinfo->family != AF_UNSPEC &&
2373 			     sk->sk_family != afinfo->family) ||
2374 			    !net_eq(sock_net(sk), net)) {
2375 				continue;
2376 			}
2377 			rc = sk;
2378 			goto out;
2379 		}
2380 		spin_unlock_bh(lock);
2381 	}
2382 out:
2383 	return rc;
2384 }
2385 
2386 static void *established_get_next(struct seq_file *seq, void *cur)
2387 {
2388 	struct tcp_seq_afinfo *afinfo;
2389 	struct sock *sk = cur;
2390 	struct hlist_nulls_node *node;
2391 	struct tcp_iter_state *st = seq->private;
2392 	struct net *net = seq_file_net(seq);
2393 
2394 	if (st->bpf_seq_afinfo)
2395 		afinfo = st->bpf_seq_afinfo;
2396 	else
2397 		afinfo = PDE_DATA(file_inode(seq->file));
2398 
2399 	++st->num;
2400 	++st->offset;
2401 
2402 	sk = sk_nulls_next(sk);
2403 
2404 	sk_nulls_for_each_from(sk, node) {
2405 		if ((afinfo->family == AF_UNSPEC ||
2406 		     sk->sk_family == afinfo->family) &&
2407 		    net_eq(sock_net(sk), net))
2408 			return sk;
2409 	}
2410 
2411 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2412 	++st->bucket;
2413 	return established_get_first(seq);
2414 }
2415 
2416 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2417 {
2418 	struct tcp_iter_state *st = seq->private;
2419 	void *rc;
2420 
2421 	st->bucket = 0;
2422 	rc = established_get_first(seq);
2423 
2424 	while (rc && pos) {
2425 		rc = established_get_next(seq, rc);
2426 		--pos;
2427 	}
2428 	return rc;
2429 }
2430 
2431 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2432 {
2433 	void *rc;
2434 	struct tcp_iter_state *st = seq->private;
2435 
2436 	st->state = TCP_SEQ_STATE_LISTENING;
2437 	rc	  = listening_get_idx(seq, &pos);
2438 
2439 	if (!rc) {
2440 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2441 		rc	  = established_get_idx(seq, pos);
2442 	}
2443 
2444 	return rc;
2445 }
2446 
2447 static void *tcp_seek_last_pos(struct seq_file *seq)
2448 {
2449 	struct tcp_iter_state *st = seq->private;
2450 	int bucket = st->bucket;
2451 	int offset = st->offset;
2452 	int orig_num = st->num;
2453 	void *rc = NULL;
2454 
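	/* Resume the walk from the bucket/offset recorded by the previous
	 * read so that large /proc/net/tcp dumps do not rescan earlier
	 * buckets on every read() call; st->num is restored at the end
	 * since the skipped entries were already emitted.
	 */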
2455 	switch (st->state) {
2456 	case TCP_SEQ_STATE_LISTENING:
2457 		if (st->bucket >= INET_LHTABLE_SIZE)
2458 			break;
2459 		st->state = TCP_SEQ_STATE_LISTENING;
2460 		rc = listening_get_next(seq, NULL);
2461 		while (offset-- && rc && bucket == st->bucket)
2462 			rc = listening_get_next(seq, rc);
2463 		if (rc)
2464 			break;
2465 		st->bucket = 0;
2466 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2467 		fallthrough;
2468 	case TCP_SEQ_STATE_ESTABLISHED:
2469 		if (st->bucket > tcp_hashinfo.ehash_mask)
2470 			break;
2471 		rc = established_get_first(seq);
2472 		while (offset-- && rc && bucket == st->bucket)
2473 			rc = established_get_next(seq, rc);
2474 	}
2475 
2476 	st->num = orig_num;
2477 
2478 	return rc;
2479 }
2480 
2481 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2482 {
2483 	struct tcp_iter_state *st = seq->private;
2484 	void *rc;
2485 
2486 	if (*pos && *pos == st->last_pos) {
2487 		rc = tcp_seek_last_pos(seq);
2488 		if (rc)
2489 			goto out;
2490 	}
2491 
2492 	st->state = TCP_SEQ_STATE_LISTENING;
2493 	st->num = 0;
2494 	st->bucket = 0;
2495 	st->offset = 0;
2496 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2497 
2498 out:
2499 	st->last_pos = *pos;
2500 	return rc;
2501 }
2502 EXPORT_SYMBOL(tcp_seq_start);
2503 
2504 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2505 {
2506 	struct tcp_iter_state *st = seq->private;
2507 	void *rc = NULL;
2508 
2509 	if (v == SEQ_START_TOKEN) {
2510 		rc = tcp_get_idx(seq, 0);
2511 		goto out;
2512 	}
2513 
2514 	switch (st->state) {
2515 	case TCP_SEQ_STATE_LISTENING:
2516 		rc = listening_get_next(seq, v);
2517 		if (!rc) {
2518 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2519 			st->bucket = 0;
2520 			st->offset = 0;
2521 			rc	  = established_get_first(seq);
2522 		}
2523 		break;
2524 	case TCP_SEQ_STATE_ESTABLISHED:
2525 		rc = established_get_next(seq, v);
2526 		break;
2527 	}
2528 out:
2529 	++*pos;
2530 	st->last_pos = *pos;
2531 	return rc;
2532 }
2533 EXPORT_SYMBOL(tcp_seq_next);
2534 
2535 void tcp_seq_stop(struct seq_file *seq, void *v)
2536 {
2537 	struct tcp_iter_state *st = seq->private;
2538 
2539 	switch (st->state) {
2540 	case TCP_SEQ_STATE_LISTENING:
2541 		if (v != SEQ_START_TOKEN)
2542 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2543 		break;
2544 	case TCP_SEQ_STATE_ESTABLISHED:
2545 		if (v)
2546 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2547 		break;
2548 	}
2549 }
2550 EXPORT_SYMBOL(tcp_seq_stop);
2551 
2552 static void get_openreq4(const struct request_sock *req,
2553 			 struct seq_file *f, int i)
2554 {
2555 	const struct inet_request_sock *ireq = inet_rsk(req);
2556 	long delta = req->rsk_timer.expires - jiffies;
2557 
2558 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2559 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2560 		i,
2561 		ireq->ir_loc_addr,
2562 		ireq->ir_num,
2563 		ireq->ir_rmt_addr,
2564 		ntohs(ireq->ir_rmt_port),
2565 		TCP_SYN_RECV,
2566 		0, 0, /* could print option size, but that is af dependent. */
2567 		1,    /* timers active (only the expire timer) */
2568 		jiffies_delta_to_clock_t(delta),
2569 		req->num_timeout,
2570 		from_kuid_munged(seq_user_ns(f),
2571 				 sock_i_uid(req->rsk_listener)),
2572 		0,  /* non standard timer */
2573 		0, /* open_requests have no inode */
2574 		0,
2575 		req);
2576 }
2577 
2578 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2579 {
2580 	int timer_active;
2581 	unsigned long timer_expires;
2582 	const struct tcp_sock *tp = tcp_sk(sk);
2583 	const struct inet_connection_sock *icsk = inet_csk(sk);
2584 	const struct inet_sock *inet = inet_sk(sk);
2585 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2586 	__be32 dest = inet->inet_daddr;
2587 	__be32 src = inet->inet_rcv_saddr;
2588 	__u16 destp = ntohs(inet->inet_dport);
2589 	__u16 srcp = ntohs(inet->inet_sport);
2590 	int rx_queue;
2591 	int state;
2592 
2593 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2594 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2595 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2596 		timer_active	= 1;
2597 		timer_expires	= icsk->icsk_timeout;
2598 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2599 		timer_active	= 4;
2600 		timer_expires	= icsk->icsk_timeout;
2601 	} else if (timer_pending(&sk->sk_timer)) {
2602 		timer_active	= 2;
2603 		timer_expires	= sk->sk_timer.expires;
2604 	} else {
2605 		timer_active	= 0;
2606 		timer_expires = jiffies;
2607 	}
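	/* timer_active becomes the "tr" column of /proc/net/tcp:
	 * 1 retransmit/loss-probe timer, 2 keepalive timer, 4 zero window
	 * probe timer, 0 none (TIME_WAIT sockets report 3, see
	 * get_timewait4_sock()).
	 */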
2608 
2609 	state = inet_sk_state_load(sk);
2610 	if (state == TCP_LISTEN)
2611 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2612 	else
2613 		/* Because we don't lock the socket,
2614 		 * we might find a transient negative value.
2615 		 */
2616 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2617 				      READ_ONCE(tp->copied_seq), 0);
2618 
2619 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2620 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2621 		i, src, srcp, dest, destp, state,
2622 		READ_ONCE(tp->write_seq) - tp->snd_una,
2623 		rx_queue,
2624 		timer_active,
2625 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2626 		icsk->icsk_retransmits,
2627 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2628 		icsk->icsk_probes_out,
2629 		sock_i_ino(sk),
2630 		refcount_read(&sk->sk_refcnt), sk,
2631 		jiffies_to_clock_t(icsk->icsk_rto),
2632 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2633 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2634 		tp->snd_cwnd,
2635 		state == TCP_LISTEN ?
2636 		    fastopenq->max_qlen :
2637 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2638 }
2639 
2640 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2641 			       struct seq_file *f, int i)
2642 {
2643 	long delta = tw->tw_timer.expires - jiffies;
2644 	__be32 dest, src;
2645 	__u16 destp, srcp;
2646 
2647 	dest  = tw->tw_daddr;
2648 	src   = tw->tw_rcv_saddr;
2649 	destp = ntohs(tw->tw_dport);
2650 	srcp  = ntohs(tw->tw_sport);
2651 
2652 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2653 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2654 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2655 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2656 		refcount_read(&tw->tw_refcnt), tw);
2657 }
2658 
2659 #define TMPSZ 150
2660 
2661 static int tcp4_seq_show(struct seq_file *seq, void *v)
2662 {
2663 	struct tcp_iter_state *st;
2664 	struct sock *sk = v;
2665 
2666 	seq_setwidth(seq, TMPSZ - 1);
2667 	if (v == SEQ_START_TOKEN) {
2668 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2669 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2670 			   "inode");
2671 		goto out;
2672 	}
2673 	st = seq->private;
2674 
2675 	if (sk->sk_state == TCP_TIME_WAIT)
2676 		get_timewait4_sock(v, seq, st->num);
2677 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2678 		get_openreq4(v, seq, st->num);
2679 	else
2680 		get_tcp4_sock(v, seq, st->num);
2681 out:
2682 	seq_pad(seq, '\n');
2683 	return 0;
2684 }
2685 
2686 #ifdef CONFIG_BPF_SYSCALL
2687 struct bpf_iter__tcp {
2688 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2689 	__bpf_md_ptr(struct sock_common *, sk_common);
2690 	uid_t uid __aligned(8);
2691 };
2692 
2693 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2694 			     struct sock_common *sk_common, uid_t uid)
2695 {
2696 	struct bpf_iter__tcp ctx;
2697 
2698 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2699 	ctx.meta = meta;
2700 	ctx.sk_common = sk_common;
2701 	ctx.uid = uid;
2702 	return bpf_iter_run_prog(prog, &ctx);
2703 }
2704 
2705 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2706 {
2707 	struct bpf_iter_meta meta;
2708 	struct bpf_prog *prog;
2709 	struct sock *sk = v;
2710 	uid_t uid;
2711 
2712 	if (v == SEQ_START_TOKEN)
2713 		return 0;
2714 
2715 	if (sk->sk_state == TCP_TIME_WAIT) {
2716 		uid = 0;
2717 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2718 		const struct request_sock *req = v;
2719 
2720 		uid = from_kuid_munged(seq_user_ns(seq),
2721 				       sock_i_uid(req->rsk_listener));
2722 	} else {
2723 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2724 	}
2725 
2726 	meta.seq = seq;
2727 	prog = bpf_iter_get_info(&meta, false);
2728 	return tcp_prog_seq_show(prog, &meta, v, uid);
2729 }
2730 
2731 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2732 {
2733 	struct bpf_iter_meta meta;
2734 	struct bpf_prog *prog;
2735 
2736 	if (!v) {
2737 		meta.seq = seq;
2738 		prog = bpf_iter_get_info(&meta, true);
2739 		if (prog)
2740 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2741 	}
2742 
2743 	tcp_seq_stop(seq, v);
2744 }
2745 
2746 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2747 	.show		= bpf_iter_tcp_seq_show,
2748 	.start		= tcp_seq_start,
2749 	.next		= tcp_seq_next,
2750 	.stop		= bpf_iter_tcp_seq_stop,
2751 };
2752 #endif
2753 
2754 static const struct seq_operations tcp4_seq_ops = {
2755 	.show		= tcp4_seq_show,
2756 	.start		= tcp_seq_start,
2757 	.next		= tcp_seq_next,
2758 	.stop		= tcp_seq_stop,
2759 };
2760 
2761 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2762 	.family		= AF_INET,
2763 };
2764 
2765 static int __net_init tcp4_proc_init_net(struct net *net)
2766 {
2767 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2768 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2769 		return -ENOMEM;
2770 	return 0;
2771 }
2772 
2773 static void __net_exit tcp4_proc_exit_net(struct net *net)
2774 {
2775 	remove_proc_entry("tcp", net->proc_net);
2776 }
2777 
2778 static struct pernet_operations tcp4_net_ops = {
2779 	.init = tcp4_proc_init_net,
2780 	.exit = tcp4_proc_exit_net,
2781 };
2782 
2783 int __init tcp4_proc_init(void)
2784 {
2785 	return register_pernet_subsys(&tcp4_net_ops);
2786 }
2787 
2788 void tcp4_proc_exit(void)
2789 {
2790 	unregister_pernet_subsys(&tcp4_net_ops);
2791 }
2792 #endif /* CONFIG_PROC_FS */
2793 
2794 struct proto tcp_prot = {
2795 	.name			= "TCP",
2796 	.owner			= THIS_MODULE,
2797 	.close			= tcp_close,
2798 	.pre_connect		= tcp_v4_pre_connect,
2799 	.connect		= tcp_v4_connect,
2800 	.disconnect		= tcp_disconnect,
2801 	.accept			= inet_csk_accept,
2802 	.ioctl			= tcp_ioctl,
2803 	.init			= tcp_v4_init_sock,
2804 	.destroy		= tcp_v4_destroy_sock,
2805 	.shutdown		= tcp_shutdown,
2806 	.setsockopt		= tcp_setsockopt,
2807 	.getsockopt		= tcp_getsockopt,
2808 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
2809 	.keepalive		= tcp_set_keepalive,
2810 	.recvmsg		= tcp_recvmsg,
2811 	.sendmsg		= tcp_sendmsg,
2812 	.sendpage		= tcp_sendpage,
2813 	.backlog_rcv		= tcp_v4_do_rcv,
2814 	.release_cb		= tcp_release_cb,
2815 	.hash			= inet_hash,
2816 	.unhash			= inet_unhash,
2817 	.get_port		= inet_csk_get_port,
2818 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2819 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2820 	.stream_memory_free	= tcp_stream_memory_free,
2821 	.sockets_allocated	= &tcp_sockets_allocated,
2822 	.orphan_count		= &tcp_orphan_count,
2823 	.memory_allocated	= &tcp_memory_allocated,
2824 	.memory_pressure	= &tcp_memory_pressure,
2825 	.sysctl_mem		= sysctl_tcp_mem,
2826 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2827 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2828 	.max_header		= MAX_TCP_HEADER,
2829 	.obj_size		= sizeof(struct tcp_sock),
2830 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2831 	.twsk_prot		= &tcp_timewait_sock_ops,
2832 	.rsk_prot		= &tcp_request_sock_ops,
2833 	.h.hashinfo		= &tcp_hashinfo,
2834 	.no_autobind		= true,
2835 	.diag_destroy		= tcp_abort,
2836 };
2837 EXPORT_SYMBOL(tcp_prot);
2838 
2839 static void __net_exit tcp_sk_exit(struct net *net)
2840 {
2841 	if (net->ipv4.tcp_congestion_control)
2842 		bpf_module_put(net->ipv4.tcp_congestion_control,
2843 			       net->ipv4.tcp_congestion_control->owner);
2844 }
2845 
2846 static int __net_init tcp_sk_init(struct net *net)
2847 {
2848 	int cnt;
2849 
2850 	net->ipv4.sysctl_tcp_ecn = 2;
2851 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2852 
2853 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2854 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2855 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2856 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2857 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2858 
2859 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2860 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2861 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2862 
2863 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2864 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2865 	net->ipv4.sysctl_tcp_syncookies = 1;
2866 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2867 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2868 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2869 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2870 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2871 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2872 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2873 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2874 
2875 	cnt = tcp_hashinfo.ehash_mask + 1;
2876 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2877 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2878 
2879 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
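	/* Illustrative only: with an established hash of 524288 slots,
	 * cnt / 128 yields a default max_syn_backlog of 4096; small
	 * machines are floored at 128.
	 */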
2880 	net->ipv4.sysctl_tcp_sack = 1;
2881 	net->ipv4.sysctl_tcp_window_scaling = 1;
2882 	net->ipv4.sysctl_tcp_timestamps = 1;
2883 	net->ipv4.sysctl_tcp_early_retrans = 3;
2884 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2885 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2886 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2887 	net->ipv4.sysctl_tcp_max_reordering = 300;
2888 	net->ipv4.sysctl_tcp_dsack = 1;
2889 	net->ipv4.sysctl_tcp_app_win = 31;
2890 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2891 	net->ipv4.sysctl_tcp_frto = 2;
2892 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2893 	/* This limits the percentage of the congestion window which we
2894 	 * will allow a single TSO frame to consume.  Building TSO frames
2895 	 * which are too large can cause TCP streams to be bursty.
2896 	 */
2897 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2898 	/* Default TSQ limit of 16 TSO segments */
2899 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2900 	/* rfc5961 challenge ack rate limiting */
2901 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2902 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2903 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2904 	net->ipv4.sysctl_tcp_autocorking = 1;
2905 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2906 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2907 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2908 	if (net != &init_net) {
2909 		memcpy(net->ipv4.sysctl_tcp_rmem,
2910 		       init_net.ipv4.sysctl_tcp_rmem,
2911 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2912 		memcpy(net->ipv4.sysctl_tcp_wmem,
2913 		       init_net.ipv4.sysctl_tcp_wmem,
2914 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2915 	}
2916 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2917 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2918 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2919 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2920 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2921 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
2922 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2923 
2924 	/* Reno is always built in */
2925 	if (!net_eq(net, &init_net) &&
2926 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2927 			       init_net.ipv4.tcp_congestion_control->owner))
2928 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2929 	else
2930 		net->ipv4.tcp_congestion_control = &tcp_reno;
2931 
2932 	return 0;
2933 }
2934 
2935 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2936 {
2937 	struct net *net;
2938 
2939 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2940 
2941 	list_for_each_entry(net, net_exit_list, exit_list)
2942 		tcp_fastopen_ctx_destroy(net);
2943 }
2944 
2945 static struct pernet_operations __net_initdata tcp_sk_ops = {
2946        .init	   = tcp_sk_init,
2947        .exit	   = tcp_sk_exit,
2948        .exit_batch = tcp_sk_exit_batch,
2949 };
2950 
2951 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2952 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2953 		     struct sock_common *sk_common, uid_t uid)
2954 
2955 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2956 {
2957 	struct tcp_iter_state *st = priv_data;
2958 	struct tcp_seq_afinfo *afinfo;
2959 	int ret;
2960 
2961 	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2962 	if (!afinfo)
2963 		return -ENOMEM;
2964 
2965 	afinfo->family = AF_UNSPEC;
2966 	st->bpf_seq_afinfo = afinfo;
2967 	ret = bpf_iter_init_seq_net(priv_data, aux);
2968 	if (ret)
2969 		kfree(afinfo);
2970 	return ret;
2971 }
2972 
2973 static void bpf_iter_fini_tcp(void *priv_data)
2974 {
2975 	struct tcp_iter_state *st = priv_data;
2976 
2977 	kfree(st->bpf_seq_afinfo);
2978 	bpf_iter_fini_seq_net(priv_data);
2979 }
2980 
2981 static const struct bpf_iter_seq_info tcp_seq_info = {
2982 	.seq_ops		= &bpf_iter_tcp_seq_ops,
2983 	.init_seq_private	= bpf_iter_init_tcp,
2984 	.fini_seq_private	= bpf_iter_fini_tcp,
2985 	.seq_priv_size		= sizeof(struct tcp_iter_state),
2986 };
2987 
2988 static struct bpf_iter_reg tcp_reg_info = {
2989 	.target			= "tcp",
2990 	.ctx_arg_info_size	= 1,
2991 	.ctx_arg_info		= {
2992 		{ offsetof(struct bpf_iter__tcp, sk_common),
2993 		  PTR_TO_BTF_ID_OR_NULL },
2994 	},
2995 	.seq_info		= &tcp_seq_info,
2996 };
2997 
2998 static void __init bpf_iter_register(void)
2999 {
3000 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3001 	if (bpf_iter_reg_target(&tcp_reg_info))
3002 		pr_warn("Warning: could not register bpf iterator tcp\n");
3003 }
3004 
3005 #endif
3006 
3007 void __init tcp_v4_init(void)
3008 {
3009 	int cpu, res;
3010 
3011 	for_each_possible_cpu(cpu) {
3012 		struct sock *sk;
3013 
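		/* One kernel control socket per possible CPU: these back the
		 * packets the stack sends on its own behalf (such as RSTs and
		 * ACKs for connections without a full socket), avoiding any
		 * shared socket lock on those paths.
		 */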
3014 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3015 					   IPPROTO_TCP, &init_net);
3016 		if (res)
3017 			panic("Failed to create the TCP control socket.\n");
3018 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3019 
3020 		/* Please enforce IP_DF and IPID==0 for RST and
3021 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3022 		 */
3023 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3024 
3025 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3026 	}
3027 	if (register_pernet_subsys(&tcp_sk_ops))
3028 		panic("Failed to create the TCP control socket.\n");
3029 
3030 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3031 	bpf_iter_register();
3032 #endif
3033 }
3034