1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93 
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95 
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98 	return secure_tcp_seq(ip_hdr(skb)->daddr,
99 			      ip_hdr(skb)->saddr,
100 			      tcp_hdr(skb)->dest,
101 			      tcp_hdr(skb)->source);
102 }
103 
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108 
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 	struct tcp_sock *tp = tcp_sk(sk);
115 
116 	if (reuse == 2) {
117 		/* Still does not detect *everything* that goes through
118 		 * lo, since we require a loopback src or dst address
119 		 * or direct binding to 'lo' interface.
120 		 */
121 		bool loopback = false;
122 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 			loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125 		if (tw->tw_family == AF_INET6) {
126 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130 				loopback = true;
131 		} else
132 #endif
133 		{
134 			if (ipv4_is_loopback(tw->tw_daddr) ||
135 			    ipv4_is_loopback(tw->tw_rcv_saddr))
136 				loopback = true;
137 		}
138 		if (!loopback)
139 			reuse = 0;
140 	}
141 
142 	/* With PAWS, it is safe from the viewpoint
143 	   of data integrity. Even without PAWS it is safe provided sequence
144 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
145 
146 	   Actually, the idea is close to VJ's, only the timestamp cache is
147 	   held not per host but per port pair, and the TW bucket is used as
148 	   the state holder.
149 
150 	   If the TW bucket has already been destroyed we fall back to VJ's
151 	   scheme and use the initial timestamp retrieved from the peer table.
152 	 */
153 	if (tcptw->tw_ts_recent_stamp &&
154 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
155 					    tcptw->tw_ts_recent_stamp)))) {
156 		/* In case of repair and re-using TIME-WAIT sockets we still
157 		 * want to be sure that it is safe as above but honor the
158 		 * sequence numbers and time stamps set as part of the repair
159 		 * process.
160 		 *
161 		 * Without this check re-using a TIME-WAIT socket with TCP
162 		 * repair would accumulate a -1 on the repair assigned
163 		 * sequence number. The first time it is reused the sequence
164 		 * is -1, the second time -2, etc. This fixes that issue
165 		 * without appearing to create any others.
166 		 */
167 		if (likely(!tp->repair)) {
168 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169 
170 			if (!seq)
171 				seq = 1;
172 			WRITE_ONCE(tp->write_seq, seq);
173 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
174 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175 		}
176 		sock_hold(sktw);
177 		return 1;
178 	}
179 
180 	return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183 
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 			      int addr_len)
186 {
187 	/* This check is replicated from tcp_v4_connect() and intended to
188 	 * prevent the BPF program called below from accessing bytes that are
189 	 * outside the bound specified by the user in addr_len.
190 	 */
191 	if (addr_len < sizeof(struct sockaddr_in))
192 		return -EINVAL;
193 
194 	sock_owned_by_me(sk);
195 
196 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198 
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 	struct inet_sock *inet = inet_sk(sk);
204 	struct tcp_sock *tp = tcp_sk(sk);
205 	__be16 orig_sport, orig_dport;
206 	__be32 daddr, nexthop;
207 	struct flowi4 *fl4;
208 	struct rtable *rt;
209 	int err;
210 	struct ip_options_rcu *inet_opt;
211 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
212 
213 	if (addr_len < sizeof(struct sockaddr_in))
214 		return -EINVAL;
215 
216 	if (usin->sin_family != AF_INET)
217 		return -EAFNOSUPPORT;
218 
219 	nexthop = daddr = usin->sin_addr.s_addr;
220 	inet_opt = rcu_dereference_protected(inet->inet_opt,
221 					     lockdep_sock_is_held(sk));
222 	if (inet_opt && inet_opt->opt.srr) {
223 		if (!daddr)
224 			return -EINVAL;
225 		nexthop = inet_opt->opt.faddr;
226 	}
227 
228 	orig_sport = inet->inet_sport;
229 	orig_dport = usin->sin_port;
230 	fl4 = &inet->cork.fl.u.ip4;
231 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
233 			      IPPROTO_TCP,
234 			      orig_sport, orig_dport, sk);
235 	if (IS_ERR(rt)) {
236 		err = PTR_ERR(rt);
237 		if (err == -ENETUNREACH)
238 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
239 		return err;
240 	}
241 
242 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
243 		ip_rt_put(rt);
244 		return -ENETUNREACH;
245 	}
246 
247 	if (!inet_opt || !inet_opt->opt.srr)
248 		daddr = fl4->daddr;
249 
250 	if (!inet->inet_saddr)
251 		inet->inet_saddr = fl4->saddr;
252 	sk_rcv_saddr_set(sk, inet->inet_saddr);
253 
254 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
255 		/* Reset inherited state */
256 		tp->rx_opt.ts_recent	   = 0;
257 		tp->rx_opt.ts_recent_stamp = 0;
258 		if (likely(!tp->repair))
259 			WRITE_ONCE(tp->write_seq, 0);
260 	}
261 
262 	inet->inet_dport = usin->sin_port;
263 	sk_daddr_set(sk, daddr);
264 
265 	inet_csk(sk)->icsk_ext_hdr_len = 0;
266 	if (inet_opt)
267 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
268 
269 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
270 
271 	/* Socket identity is still unknown (sport may be zero).
272 	 * However we set state to SYN-SENT and, without releasing the socket
273 	 * lock, select a source port, enter ourselves into the hash tables and
274 	 * complete initialization after this.
275 	 */
276 	tcp_set_state(sk, TCP_SYN_SENT);
277 	err = inet_hash_connect(tcp_death_row, sk);
278 	if (err)
279 		goto failure;
280 
281 	sk_set_txhash(sk);
282 
283 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
284 			       inet->inet_sport, inet->inet_dport, sk);
285 	if (IS_ERR(rt)) {
286 		err = PTR_ERR(rt);
287 		rt = NULL;
288 		goto failure;
289 	}
290 	/* OK, now commit destination to socket.  */
291 	sk->sk_gso_type = SKB_GSO_TCPV4;
292 	sk_setup_caps(sk, &rt->dst);
293 	rt = NULL;
294 
295 	if (likely(!tp->repair)) {
296 		if (!tp->write_seq)
297 			WRITE_ONCE(tp->write_seq,
298 				   secure_tcp_seq(inet->inet_saddr,
299 						  inet->inet_daddr,
300 						  inet->inet_sport,
301 						  usin->sin_port));
302 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
303 						 inet->inet_saddr,
304 						 inet->inet_daddr);
305 	}
306 
307 	inet->inet_id = prandom_u32();
308 
309 	if (tcp_fastopen_defer_connect(sk, &err))
310 		return err;
311 	if (err)
312 		goto failure;
313 
314 	err = tcp_connect(sk);
315 
316 	if (err)
317 		goto failure;
318 
319 	return 0;
320 
321 failure:
322 	/*
323 	 * This unhashes the socket and releases the local port,
324 	 * if necessary.
325 	 */
326 	tcp_set_state(sk, TCP_CLOSE);
327 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
328 		inet_reset_saddr(sk);
329 	ip_rt_put(rt);
330 	sk->sk_route_caps = 0;
331 	inet->inet_dport = 0;
332 	return err;
333 }
334 EXPORT_SYMBOL(tcp_v4_connect);
335 
336 /*
337  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
338  * It can be called through tcp_release_cb() if the socket was owned by the
339  * user at the time tcp_v4_err() was called to handle the ICMP message.
340  */
341 void tcp_v4_mtu_reduced(struct sock *sk)
342 {
343 	struct inet_sock *inet = inet_sk(sk);
344 	struct dst_entry *dst;
345 	u32 mtu;
346 
347 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
348 		return;
349 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
350 	dst = inet_csk_update_pmtu(sk, mtu);
351 	if (!dst)
352 		return;
353 
354 	/* Something is about to go wrong... Remember the soft error
355 	 * in case this connection is not able to recover.
356 	 */
357 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
358 		sk->sk_err_soft = EMSGSIZE;
359 
360 	mtu = dst_mtu(dst);
361 
362 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
363 	    ip_sk_accept_pmtu(sk) &&
364 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
365 		tcp_sync_mss(sk, mtu);
366 
367 		/* Resend the TCP packet because it's
368 		 * clear that the old packet has been
369 		 * dropped. This is the new "fast" path mtu
370 		 * discovery.
371 		 */
372 		tcp_simple_retransmit(sk);
373 	} /* else let the usual retransmit timer handle it */
374 }
375 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
376 
377 static void do_redirect(struct sk_buff *skb, struct sock *sk)
378 {
379 	struct dst_entry *dst = __sk_dst_check(sk, 0);
380 
381 	if (dst)
382 		dst->ops->redirect(dst, sk, skb);
383 }
384 
385 
386 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
387 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
388 {
389 	struct request_sock *req = inet_reqsk(sk);
390 	struct net *net = sock_net(sk);
391 
392 	/* ICMPs are not backlogged, hence we cannot get
393 	 * an established socket here.
394 	 */
395 	if (seq != tcp_rsk(req)->snt_isn) {
396 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
397 	} else if (abort) {
398 		/*
399 		 * Still in SYN_RECV, just remove it silently.
400 		 * There is no good way to pass the error to the newly
401 		 * created socket, and POSIX does not want network
402 		 * errors returned from accept().
403 		 */
404 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
405 		tcp_listendrop(req->rsk_listener);
406 	}
407 	reqsk_put(req);
408 }
409 EXPORT_SYMBOL(tcp_req_err);
410 
411 /* TCP-LD (RFC 6069) logic */
412 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
413 {
414 	struct inet_connection_sock *icsk = inet_csk(sk);
415 	struct tcp_sock *tp = tcp_sk(sk);
416 	struct sk_buff *skb;
417 	s32 remaining;
418 	u32 delta_us;
419 
420 	if (sock_owned_by_user(sk))
421 		return;
422 
423 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
424 	    !icsk->icsk_backoff)
425 		return;
426 
427 	skb = tcp_rtx_queue_head(sk);
428 	if (WARN_ON_ONCE(!skb))
429 		return;
430 
431 	icsk->icsk_backoff--;
432 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
433 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
434 
435 	tcp_mstamp_refresh(tp);
436 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
437 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
438 
439 	if (remaining > 0) {
440 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
441 					  remaining, TCP_RTO_MAX);
442 	} else {
443 		/* RTO revert clocked out retransmission.
444 		 * Will retransmit now.
445 		 */
446 		tcp_retransmit_timer(sk);
447 	}
448 }
449 EXPORT_SYMBOL(tcp_ld_RTO_revert);
450 
451 /*
452  * This routine is called by the ICMP module when it gets some
453  * sort of error condition.  If err < 0 then the socket should
454  * be closed and the error returned to the user.  If err > 0
455  * it's just the icmp type << 8 | icmp code.  After adjustment
456  * header points to the first 8 bytes of the tcp header.  We need
457  * to find the appropriate port.
458  *
459  * The locking strategy used here is very "optimistic". When
460  * someone else accesses the socket the ICMP is just dropped
461  * and for some paths there is no check at all.
462  * A more general error queue to queue errors for later handling
463  * is probably better.
464  *
465  */
466 
467 int tcp_v4_err(struct sk_buff *skb, u32 info)
468 {
469 	const struct iphdr *iph = (const struct iphdr *)skb->data;
470 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
471 	struct tcp_sock *tp;
472 	struct inet_sock *inet;
473 	const int type = icmp_hdr(skb)->type;
474 	const int code = icmp_hdr(skb)->code;
475 	struct sock *sk;
476 	struct request_sock *fastopen;
477 	u32 seq, snd_una;
478 	int err;
479 	struct net *net = dev_net(skb->dev);
480 
481 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
482 				       th->dest, iph->saddr, ntohs(th->source),
483 				       inet_iif(skb), 0);
484 	if (!sk) {
485 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
486 		return -ENOENT;
487 	}
488 	if (sk->sk_state == TCP_TIME_WAIT) {
489 		inet_twsk_put(inet_twsk(sk));
490 		return 0;
491 	}
492 	seq = ntohl(th->seq);
493 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
494 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
495 				     type == ICMP_TIME_EXCEEDED ||
496 				     (type == ICMP_DEST_UNREACH &&
497 				      (code == ICMP_NET_UNREACH ||
498 				       code == ICMP_HOST_UNREACH)));
499 		return 0;
500 	}
501 
502 	bh_lock_sock(sk);
503 	/* If too many ICMPs get dropped on busy
504 	 * servers this needs to be solved differently.
505 	 * We do take care of the PMTU discovery (RFC1191) special case:
506 	 * we can receive locally generated ICMP messages while socket is held.
507 	 */
508 	if (sock_owned_by_user(sk)) {
509 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
510 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
511 	}
512 	if (sk->sk_state == TCP_CLOSE)
513 		goto out;
514 
515 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
516 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
517 		goto out;
518 	}
519 
520 	tp = tcp_sk(sk);
521 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
522 	fastopen = rcu_dereference(tp->fastopen_rsk);
523 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
524 	if (sk->sk_state != TCP_LISTEN &&
525 	    !between(seq, snd_una, tp->snd_nxt)) {
526 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
527 		goto out;
528 	}
529 
530 	switch (type) {
531 	case ICMP_REDIRECT:
532 		if (!sock_owned_by_user(sk))
533 			do_redirect(skb, sk);
534 		goto out;
535 	case ICMP_SOURCE_QUENCH:
536 		/* Just silently ignore these. */
537 		goto out;
538 	case ICMP_PARAMETERPROB:
539 		err = EPROTO;
540 		break;
541 	case ICMP_DEST_UNREACH:
542 		if (code > NR_ICMP_UNREACH)
543 			goto out;
544 
545 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
546 			/* We are not interested in TCP_LISTEN and open_requests
547 	 * (SYN-ACKs sent out by Linux are always < 576 bytes so
548 			 * they should go through unfragmented).
549 			 */
550 			if (sk->sk_state == TCP_LISTEN)
551 				goto out;
552 
553 			WRITE_ONCE(tp->mtu_info, info);
554 			if (!sock_owned_by_user(sk)) {
555 				tcp_v4_mtu_reduced(sk);
556 			} else {
557 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
558 					sock_hold(sk);
559 			}
560 			goto out;
561 		}
562 
563 		err = icmp_err_convert[code].errno;
564 		/* check if this ICMP message allows revert of backoff.
565 		 * (see RFC 6069)
566 		 */
567 		if (!fastopen &&
568 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
569 			tcp_ld_RTO_revert(sk, seq);
570 		break;
571 	case ICMP_TIME_EXCEEDED:
572 		err = EHOSTUNREACH;
573 		break;
574 	default:
575 		goto out;
576 	}
577 
578 	switch (sk->sk_state) {
579 	case TCP_SYN_SENT:
580 	case TCP_SYN_RECV:
581 		/* Only in fast or simultaneous open. If a fast open socket is
582 		 * already accepted it is treated as a connected one below.
583 		 */
584 		if (fastopen && !fastopen->sk)
585 			break;
586 
587 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
588 
589 		if (!sock_owned_by_user(sk)) {
590 			sk->sk_err = err;
591 
592 			sk_error_report(sk);
593 
594 			tcp_done(sk);
595 		} else {
596 			sk->sk_err_soft = err;
597 		}
598 		goto out;
599 	}
600 
601 	/* If we've already connected we will keep trying
602 	 * until we time out, or the user gives up.
603 	 *
604 	 * rfc1122 4.2.3.9 allows to consider as hard errors
605 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
606 	 * but it is obsoleted by pmtu discovery).
607 	 *
608 	 * Note that in the modern internet, where routing is unreliable
609 	 * and broken firewalls sit in every dark corner, sending random
610 	 * errors ordered by their masters, even these two messages finally
611 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs)
612 	 *
613 	 * Now we are in compliance with RFCs.
614 	 *							--ANK (980905)
615 	 */
616 
617 	inet = inet_sk(sk);
618 	if (!sock_owned_by_user(sk) && inet->recverr) {
619 		sk->sk_err = err;
620 		sk_error_report(sk);
621 	} else	{ /* Only an error on timeout */
622 		sk->sk_err_soft = err;
623 	}
624 
625 out:
626 	bh_unlock_sock(sk);
627 	sock_put(sk);
628 	return 0;
629 }
630 
631 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
632 {
633 	struct tcphdr *th = tcp_hdr(skb);
634 
635 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636 	skb->csum_start = skb_transport_header(skb) - skb->head;
637 	skb->csum_offset = offsetof(struct tcphdr, check);
638 }
639 
640 /* This routine computes an IPv4 TCP checksum. */
641 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
642 {
643 	const struct inet_sock *inet = inet_sk(sk);
644 
645 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
646 }
647 EXPORT_SYMBOL(tcp_v4_send_check);
648 
649 /*
650  *	This routine will send an RST to the other tcp.
651  *
652  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
653  *		      for the reset?
654  *	Answer: if a packet caused an RST, it is not for a socket
655  *		existing in our system; if it did match a socket,
656  *		it is just a duplicate segment or a bug in the other side's TCP.
657  *		So we build the reply based only on parameters that
658  *		arrived with the segment.
659  *	Exception: precedence violation. We do not implement it in any case.
660  */
661 
662 #ifdef CONFIG_TCP_MD5SIG
663 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
664 #else
665 #define OPTION_BYTES sizeof(__be32)
666 #endif
667 
668 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
669 {
670 	const struct tcphdr *th = tcp_hdr(skb);
671 	struct {
672 		struct tcphdr th;
673 		__be32 opt[OPTION_BYTES / sizeof(__be32)];
674 	} rep;
675 	struct ip_reply_arg arg;
676 #ifdef CONFIG_TCP_MD5SIG
677 	struct tcp_md5sig_key *key = NULL;
678 	const __u8 *hash_location = NULL;
679 	unsigned char newhash[16];
680 	int genhash;
681 	struct sock *sk1 = NULL;
682 #endif
683 	u64 transmit_time = 0;
684 	struct sock *ctl_sk;
685 	struct net *net;
686 
687 	/* Never send a reset in response to a reset. */
688 	if (th->rst)
689 		return;
690 
691 	/* If sk is not NULL, it means we did a successful lookup and the
692 	 * incoming route had to be correct. prequeue might have dropped our dst.
693 	 */
694 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
695 		return;
696 
697 	/* Swap the send and the receive. */
698 	memset(&rep, 0, sizeof(rep));
699 	rep.th.dest   = th->source;
700 	rep.th.source = th->dest;
701 	rep.th.doff   = sizeof(struct tcphdr) / 4;
702 	rep.th.rst    = 1;
703 
704 	if (th->ack) {
705 		rep.th.seq = th->ack_seq;
706 	} else {
707 		rep.th.ack = 1;
708 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
709 				       skb->len - (th->doff << 2));
710 	}
711 
712 	memset(&arg, 0, sizeof(arg));
713 	arg.iov[0].iov_base = (unsigned char *)&rep;
714 	arg.iov[0].iov_len  = sizeof(rep.th);
715 
716 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
717 #ifdef CONFIG_TCP_MD5SIG
718 	rcu_read_lock();
719 	hash_location = tcp_parse_md5sig_option(th);
720 	if (sk && sk_fullsock(sk)) {
721 		const union tcp_md5_addr *addr;
722 		int l3index;
723 
724 		/* sdif set, means packet ingressed via a device
725 		 * in an L3 domain and inet_iif is set to it.
726 		 */
727 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
728 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
729 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
730 	} else if (hash_location) {
731 		const union tcp_md5_addr *addr;
732 		int sdif = tcp_v4_sdif(skb);
733 		int dif = inet_iif(skb);
734 		int l3index;
735 
736 		/*
737 		 * The active side is lost. Try to find the listening socket via
738 		 * the source port, and then find the md5 key via that socket.
739 		 * We do not lose security here:
740 		 * the incoming packet is checked against the md5 hash of the found
741 		 * key, and no RST is generated if the hash doesn't match.
742 		 */
743 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
744 					     ip_hdr(skb)->saddr,
745 					     th->source, ip_hdr(skb)->daddr,
746 					     ntohs(th->source), dif, sdif);
747 		/* don't send an rst if we can't find a key */
748 		if (!sk1)
749 			goto out;
750 
751 		/* sdif set, means packet ingressed via a device
752 		 * in an L3 domain and dif is set to it.
753 		 */
754 		l3index = sdif ? dif : 0;
755 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
756 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
757 		if (!key)
758 			goto out;
759 
760 
761 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
762 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
763 			goto out;
764 
765 	}
766 
767 	if (key) {
768 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
769 				   (TCPOPT_NOP << 16) |
770 				   (TCPOPT_MD5SIG << 8) |
771 				   TCPOLEN_MD5SIG);
772 		/* Update length and the length the header thinks exists */
773 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
774 		rep.th.doff = arg.iov[0].iov_len / 4;
775 
776 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
777 				     key, ip_hdr(skb)->saddr,
778 				     ip_hdr(skb)->daddr, &rep.th);
779 	}
780 #endif
781 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
782 	if (rep.opt[0] == 0) {
783 		__be32 mrst = mptcp_reset_option(skb);
784 
785 		if (mrst) {
786 			rep.opt[0] = mrst;
787 			arg.iov[0].iov_len += sizeof(mrst);
788 			rep.th.doff = arg.iov[0].iov_len / 4;
789 		}
790 	}
791 
792 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
793 				      ip_hdr(skb)->saddr, /* XXX */
794 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
795 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
796 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
797 
798 	/* When the socket is gone, all binding information is lost and
799 	 * routing might fail in this case. No choice here: if we force the
800 	 * input interface, we will misroute in case of an asymmetric route.
801 	 */
802 	if (sk) {
803 		arg.bound_dev_if = sk->sk_bound_dev_if;
804 		if (sk_fullsock(sk))
805 			trace_tcp_send_reset(sk, skb);
806 	}
807 
808 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
809 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
810 
811 	arg.tos = ip_hdr(skb)->tos;
812 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
813 	local_bh_disable();
814 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
815 	sock_net_set(ctl_sk, net);
816 	if (sk) {
817 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
818 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
819 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
820 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
821 		transmit_time = tcp_transmit_time(sk);
822 		xfrm_sk_clone_policy(ctl_sk, sk);
823 	} else {
824 		ctl_sk->sk_mark = 0;
825 		ctl_sk->sk_priority = 0;
826 	}
827 	ip_send_unicast_reply(ctl_sk,
828 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
829 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
830 			      &arg, arg.iov[0].iov_len,
831 			      transmit_time);
832 
833 	xfrm_sk_free_policy(ctl_sk);
834 	sock_net_set(ctl_sk, &init_net);
835 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
836 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
837 	local_bh_enable();
838 
839 #ifdef CONFIG_TCP_MD5SIG
840 out:
841 	rcu_read_unlock();
842 #endif
843 }
844 
845 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
846    outside of socket context, is certainly ugly. What can I do?
847  */
848 
849 static void tcp_v4_send_ack(const struct sock *sk,
850 			    struct sk_buff *skb, u32 seq, u32 ack,
851 			    u32 win, u32 tsval, u32 tsecr, int oif,
852 			    struct tcp_md5sig_key *key,
853 			    int reply_flags, u8 tos)
854 {
855 	const struct tcphdr *th = tcp_hdr(skb);
856 	struct {
857 		struct tcphdr th;
858 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
859 #ifdef CONFIG_TCP_MD5SIG
860 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
861 #endif
862 			];
863 	} rep;
864 	struct net *net = sock_net(sk);
865 	struct ip_reply_arg arg;
866 	struct sock *ctl_sk;
867 	u64 transmit_time;
868 
869 	memset(&rep.th, 0, sizeof(struct tcphdr));
870 	memset(&arg, 0, sizeof(arg));
871 
872 	arg.iov[0].iov_base = (unsigned char *)&rep;
873 	arg.iov[0].iov_len  = sizeof(rep.th);
874 	if (tsecr) {
875 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
876 				   (TCPOPT_TIMESTAMP << 8) |
877 				   TCPOLEN_TIMESTAMP);
878 		rep.opt[1] = htonl(tsval);
879 		rep.opt[2] = htonl(tsecr);
880 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
881 	}
882 
883 	/* Swap the send and the receive. */
884 	rep.th.dest    = th->source;
885 	rep.th.source  = th->dest;
886 	rep.th.doff    = arg.iov[0].iov_len / 4;
887 	rep.th.seq     = htonl(seq);
888 	rep.th.ack_seq = htonl(ack);
889 	rep.th.ack     = 1;
890 	rep.th.window  = htons(win);
891 
892 #ifdef CONFIG_TCP_MD5SIG
893 	if (key) {
894 		int offset = (tsecr) ? 3 : 0;
895 
896 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
897 					  (TCPOPT_NOP << 16) |
898 					  (TCPOPT_MD5SIG << 8) |
899 					  TCPOLEN_MD5SIG);
900 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
901 		rep.th.doff = arg.iov[0].iov_len/4;
902 
903 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
904 				    key, ip_hdr(skb)->saddr,
905 				    ip_hdr(skb)->daddr, &rep.th);
906 	}
907 #endif
908 	arg.flags = reply_flags;
909 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
910 				      ip_hdr(skb)->saddr, /* XXX */
911 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
912 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
913 	if (oif)
914 		arg.bound_dev_if = oif;
915 	arg.tos = tos;
916 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
917 	local_bh_disable();
918 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
919 	sock_net_set(ctl_sk, net);
920 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
921 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
922 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
923 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
924 	transmit_time = tcp_transmit_time(sk);
925 	ip_send_unicast_reply(ctl_sk,
926 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
927 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
928 			      &arg, arg.iov[0].iov_len,
929 			      transmit_time);
930 
931 	sock_net_set(ctl_sk, &init_net);
932 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
933 	local_bh_enable();
934 }
935 
936 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
937 {
938 	struct inet_timewait_sock *tw = inet_twsk(sk);
939 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
940 
941 	tcp_v4_send_ack(sk, skb,
942 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
943 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
944 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
945 			tcptw->tw_ts_recent,
946 			tw->tw_bound_dev_if,
947 			tcp_twsk_md5_key(tcptw),
948 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
949 			tw->tw_tos
950 			);
951 
952 	inet_twsk_put(tw);
953 }
954 
955 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
956 				  struct request_sock *req)
957 {
958 	const union tcp_md5_addr *addr;
959 	int l3index;
960 
961 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
962 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
963 	 */
964 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
965 					     tcp_sk(sk)->snd_nxt;
966 
967 	/* RFC 7323 2.3
968 	 * The window field (SEG.WND) of every outgoing segment, with the
969 	 * exception of <SYN> segments, MUST be right-shifted by
970 	 * Rcv.Wind.Shift bits:
971 	 */
972 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
973 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
974 	tcp_v4_send_ack(sk, skb, seq,
975 			tcp_rsk(req)->rcv_nxt,
976 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
977 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
978 			READ_ONCE(req->ts_recent),
979 			0,
980 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
981 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
982 			ip_hdr(skb)->tos);
983 }
984 
985 /*
986  *	Send a SYN-ACK after having received a SYN.
987  *	This still operates on a request_sock only, not on a big
988  *	socket.
989  */
990 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
991 			      struct flowi *fl,
992 			      struct request_sock *req,
993 			      struct tcp_fastopen_cookie *foc,
994 			      enum tcp_synack_type synack_type,
995 			      struct sk_buff *syn_skb)
996 {
997 	const struct inet_request_sock *ireq = inet_rsk(req);
998 	struct flowi4 fl4;
999 	int err = -1;
1000 	struct sk_buff *skb;
1001 	u8 tos;
1002 
1003 	/* First, grab a route. */
1004 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1005 		return -1;
1006 
1007 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1008 
1009 	if (skb) {
1010 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1011 
1012 		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1013 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1014 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1015 				inet_sk(sk)->tos;
1016 
1017 		if (!INET_ECN_is_capable(tos) &&
1018 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1019 			tos |= INET_ECN_ECT_0;
1020 
1021 		rcu_read_lock();
1022 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1023 					    ireq->ir_rmt_addr,
1024 					    rcu_dereference(ireq->ireq_opt),
1025 					    tos);
1026 		rcu_read_unlock();
1027 		err = net_xmit_eval(err);
1028 	}
1029 
1030 	return err;
1031 }
1032 
1033 /*
1034  *	IPv4 request_sock destructor.
1035  */
1036 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1037 {
1038 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1039 }
1040 
1041 #ifdef CONFIG_TCP_MD5SIG
1042 /*
1043  * RFC2385 MD5 checksumming requires a mapping of
1044  * IP address->MD5 Key.
1045  * We need to maintain these in the sk structure.
1046  */
1047 
1048 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1049 EXPORT_SYMBOL(tcp_md5_needed);
1050 
1051 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1052 {
1053 	if (!old)
1054 		return true;
1055 
1056 	/* l3index always overrides non-l3index */
1057 	if (old->l3index && new->l3index == 0)
1058 		return false;
1059 	if (old->l3index == 0 && new->l3index)
1060 		return true;
1061 
1062 	return old->prefixlen < new->prefixlen;
1063 }
1064 
1065 /* Find the Key structure for an address.  */
1066 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1067 					   const union tcp_md5_addr *addr,
1068 					   int family)
1069 {
1070 	const struct tcp_sock *tp = tcp_sk(sk);
1071 	struct tcp_md5sig_key *key;
1072 	const struct tcp_md5sig_info *md5sig;
1073 	__be32 mask;
1074 	struct tcp_md5sig_key *best_match = NULL;
1075 	bool match;
1076 
1077 	/* caller either holds rcu_read_lock() or socket lock */
1078 	md5sig = rcu_dereference_check(tp->md5sig_info,
1079 				       lockdep_sock_is_held(sk));
1080 	if (!md5sig)
1081 		return NULL;
1082 
1083 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1084 				 lockdep_sock_is_held(sk)) {
1085 		if (key->family != family)
1086 			continue;
1087 		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1088 			continue;
1089 		if (family == AF_INET) {
1090 			mask = inet_make_mask(key->prefixlen);
1091 			match = (key->addr.a4.s_addr & mask) ==
1092 				(addr->a4.s_addr & mask);
1093 #if IS_ENABLED(CONFIG_IPV6)
1094 		} else if (family == AF_INET6) {
1095 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1096 						  key->prefixlen);
1097 #endif
1098 		} else {
1099 			match = false;
1100 		}
1101 
1102 		if (match && better_md5_match(best_match, key))
1103 			best_match = key;
1104 	}
1105 	return best_match;
1106 }
1107 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1108 
1109 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1110 						      const union tcp_md5_addr *addr,
1111 						      int family, u8 prefixlen,
1112 						      int l3index, u8 flags)
1113 {
1114 	const struct tcp_sock *tp = tcp_sk(sk);
1115 	struct tcp_md5sig_key *key;
1116 	unsigned int size = sizeof(struct in_addr);
1117 	const struct tcp_md5sig_info *md5sig;
1118 
1119 	/* caller either holds rcu_read_lock() or socket lock */
1120 	md5sig = rcu_dereference_check(tp->md5sig_info,
1121 				       lockdep_sock_is_held(sk));
1122 	if (!md5sig)
1123 		return NULL;
1124 #if IS_ENABLED(CONFIG_IPV6)
1125 	if (family == AF_INET6)
1126 		size = sizeof(struct in6_addr);
1127 #endif
1128 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1129 				 lockdep_sock_is_held(sk)) {
1130 		if (key->family != family)
1131 			continue;
1132 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1133 			continue;
1134 		if (key->l3index != l3index)
1135 			continue;
1136 		if (!memcmp(&key->addr, addr, size) &&
1137 		    key->prefixlen == prefixlen)
1138 			return key;
1139 	}
1140 	return NULL;
1141 }
1142 
1143 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1144 					 const struct sock *addr_sk)
1145 {
1146 	const union tcp_md5_addr *addr;
1147 	int l3index;
1148 
1149 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1150 						 addr_sk->sk_bound_dev_if);
1151 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1152 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1153 }
1154 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1155 
1156 /* This can be called on a newly created socket, from other files */
1157 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1158 		   int family, u8 prefixlen, int l3index, u8 flags,
1159 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1160 {
1161 	/* Add Key to the list */
1162 	struct tcp_md5sig_key *key;
1163 	struct tcp_sock *tp = tcp_sk(sk);
1164 	struct tcp_md5sig_info *md5sig;
1165 
1166 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1167 	if (key) {
1168 		/* Pre-existing entry - just update that one.
1169 		 * Note that the key might be used concurrently.
1170 		 * data_race() is telling kcsan that we do not care of
1171 		 * key mismatches, since changing MD5 key on live flows
1172 		 * can lead to packet drops.
1173 		 */
1174 		data_race(memcpy(key->key, newkey, newkeylen));
1175 
1176 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1177 		 * Also note that a reader could catch new key->keylen value
1178 		 * but old key->key[], this is the reason we use __GFP_ZERO
1179 		 * at sock_kmalloc() time below these lines.
1180 		 */
1181 		WRITE_ONCE(key->keylen, newkeylen);
1182 
1183 		return 0;
1184 	}
1185 
1186 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1187 					   lockdep_sock_is_held(sk));
1188 	if (!md5sig) {
1189 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1190 		if (!md5sig)
1191 			return -ENOMEM;
1192 
1193 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1194 		INIT_HLIST_HEAD(&md5sig->head);
1195 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1196 	}
1197 
1198 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1199 	if (!key)
1200 		return -ENOMEM;
1201 	if (!tcp_alloc_md5sig_pool()) {
1202 		sock_kfree_s(sk, key, sizeof(*key));
1203 		return -ENOMEM;
1204 	}
1205 
1206 	memcpy(key->key, newkey, newkeylen);
1207 	key->keylen = newkeylen;
1208 	key->family = family;
1209 	key->prefixlen = prefixlen;
1210 	key->l3index = l3index;
1211 	key->flags = flags;
1212 	memcpy(&key->addr, addr,
1213 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1214 				      sizeof(struct in_addr));
1215 	hlist_add_head_rcu(&key->node, &md5sig->head);
1216 	return 0;
1217 }
1218 EXPORT_SYMBOL(tcp_md5_do_add);
1219 
1220 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1221 		   u8 prefixlen, int l3index, u8 flags)
1222 {
1223 	struct tcp_md5sig_key *key;
1224 
1225 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1226 	if (!key)
1227 		return -ENOENT;
1228 	hlist_del_rcu(&key->node);
1229 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1230 	kfree_rcu(key, rcu);
1231 	return 0;
1232 }
1233 EXPORT_SYMBOL(tcp_md5_do_del);
1234 
1235 static void tcp_clear_md5_list(struct sock *sk)
1236 {
1237 	struct tcp_sock *tp = tcp_sk(sk);
1238 	struct tcp_md5sig_key *key;
1239 	struct hlist_node *n;
1240 	struct tcp_md5sig_info *md5sig;
1241 
1242 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1243 
1244 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1245 		hlist_del_rcu(&key->node);
1246 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1247 		kfree_rcu(key, rcu);
1248 	}
1249 }
1250 
1251 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1252 				 sockptr_t optval, int optlen)
1253 {
1254 	struct tcp_md5sig cmd;
1255 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1256 	const union tcp_md5_addr *addr;
1257 	u8 prefixlen = 32;
1258 	int l3index = 0;
1259 	u8 flags;
1260 
1261 	if (optlen < sizeof(cmd))
1262 		return -EINVAL;
1263 
1264 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1265 		return -EFAULT;
1266 
1267 	if (sin->sin_family != AF_INET)
1268 		return -EINVAL;
1269 
1270 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1271 
1272 	if (optname == TCP_MD5SIG_EXT &&
1273 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1274 		prefixlen = cmd.tcpm_prefixlen;
1275 		if (prefixlen > 32)
1276 			return -EINVAL;
1277 	}
1278 
1279 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1280 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1281 		struct net_device *dev;
1282 
1283 		rcu_read_lock();
1284 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1285 		if (dev && netif_is_l3_master(dev))
1286 			l3index = dev->ifindex;
1287 
1288 		rcu_read_unlock();
1289 
1290 		/* ok to reference set/not set outside of rcu;
1291 		 * right now device MUST be an L3 master
1292 		 */
1293 		if (!dev || !l3index)
1294 			return -EINVAL;
1295 	}
1296 
1297 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1298 
1299 	if (!cmd.tcpm_keylen)
1300 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1301 
1302 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1303 		return -EINVAL;
1304 
1305 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1306 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1307 }
1308 
1309 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1310 				   __be32 daddr, __be32 saddr,
1311 				   const struct tcphdr *th, int nbytes)
1312 {
1313 	struct tcp4_pseudohdr *bp;
1314 	struct scatterlist sg;
1315 	struct tcphdr *_th;
1316 
1317 	bp = hp->scratch;
1318 	bp->saddr = saddr;
1319 	bp->daddr = daddr;
1320 	bp->pad = 0;
1321 	bp->protocol = IPPROTO_TCP;
1322 	bp->len = cpu_to_be16(nbytes);
1323 
1324 	_th = (struct tcphdr *)(bp + 1);
1325 	memcpy(_th, th, sizeof(*th));
1326 	_th->check = 0;
1327 
1328 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1329 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1330 				sizeof(*bp) + sizeof(*th));
1331 	return crypto_ahash_update(hp->md5_req);
1332 }
1333 
1334 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1335 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1336 {
1337 	struct tcp_md5sig_pool *hp;
1338 	struct ahash_request *req;
1339 
1340 	hp = tcp_get_md5sig_pool();
1341 	if (!hp)
1342 		goto clear_hash_noput;
1343 	req = hp->md5_req;
1344 
1345 	if (crypto_ahash_init(req))
1346 		goto clear_hash;
1347 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1348 		goto clear_hash;
1349 	if (tcp_md5_hash_key(hp, key))
1350 		goto clear_hash;
1351 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1352 	if (crypto_ahash_final(req))
1353 		goto clear_hash;
1354 
1355 	tcp_put_md5sig_pool();
1356 	return 0;
1357 
1358 clear_hash:
1359 	tcp_put_md5sig_pool();
1360 clear_hash_noput:
1361 	memset(md5_hash, 0, 16);
1362 	return 1;
1363 }
1364 
1365 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1366 			const struct sock *sk,
1367 			const struct sk_buff *skb)
1368 {
1369 	struct tcp_md5sig_pool *hp;
1370 	struct ahash_request *req;
1371 	const struct tcphdr *th = tcp_hdr(skb);
1372 	__be32 saddr, daddr;
1373 
1374 	if (sk) { /* valid for establish/request sockets */
1375 		saddr = sk->sk_rcv_saddr;
1376 		daddr = sk->sk_daddr;
1377 	} else {
1378 		const struct iphdr *iph = ip_hdr(skb);
1379 		saddr = iph->saddr;
1380 		daddr = iph->daddr;
1381 	}
1382 
1383 	hp = tcp_get_md5sig_pool();
1384 	if (!hp)
1385 		goto clear_hash_noput;
1386 	req = hp->md5_req;
1387 
1388 	if (crypto_ahash_init(req))
1389 		goto clear_hash;
1390 
1391 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1392 		goto clear_hash;
1393 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1394 		goto clear_hash;
1395 	if (tcp_md5_hash_key(hp, key))
1396 		goto clear_hash;
1397 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1398 	if (crypto_ahash_final(req))
1399 		goto clear_hash;
1400 
1401 	tcp_put_md5sig_pool();
1402 	return 0;
1403 
1404 clear_hash:
1405 	tcp_put_md5sig_pool();
1406 clear_hash_noput:
1407 	memset(md5_hash, 0, 16);
1408 	return 1;
1409 }
1410 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1411 
1412 #endif
1413 
1414 /* Called with rcu_read_lock() */
1415 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1416 				    const struct sk_buff *skb,
1417 				    int dif, int sdif)
1418 {
1419 #ifdef CONFIG_TCP_MD5SIG
1420 	/*
1421 	 * This gets called for each TCP segment that arrives
1422 	 * so we want to be efficient.
1423 	 * We have 3 drop cases:
1424 	 * o No MD5 hash and one expected.
1425 	 * o MD5 hash and we're not expecting one.
1426 	 * o MD5 hash and it's wrong.
1427 	 */
1428 	const __u8 *hash_location = NULL;
1429 	struct tcp_md5sig_key *hash_expected;
1430 	const struct iphdr *iph = ip_hdr(skb);
1431 	const struct tcphdr *th = tcp_hdr(skb);
1432 	const union tcp_md5_addr *addr;
1433 	unsigned char newhash[16];
1434 	int genhash, l3index;
1435 
1436 	/* sdif set, means packet ingressed via a device
1437 	 * in an L3 domain and dif is set to the l3mdev
1438 	 */
1439 	l3index = sdif ? dif : 0;
1440 
1441 	addr = (union tcp_md5_addr *)&iph->saddr;
1442 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1443 	hash_location = tcp_parse_md5sig_option(th);
1444 
1445 	/* We've parsed the options - do we have a hash? */
1446 	if (!hash_expected && !hash_location)
1447 		return false;
1448 
1449 	if (hash_expected && !hash_location) {
1450 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1451 		return true;
1452 	}
1453 
1454 	if (!hash_expected && hash_location) {
1455 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1456 		return true;
1457 	}
1458 
1459 	/* Okay, so this is hash_expected and hash_location -
1460 	 * so we need to calculate the checksum.
1461 	 */
1462 	genhash = tcp_v4_md5_hash_skb(newhash,
1463 				      hash_expected,
1464 				      NULL, skb);
1465 
1466 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1467 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1468 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1469 				     &iph->saddr, ntohs(th->source),
1470 				     &iph->daddr, ntohs(th->dest),
1471 				     genhash ? " tcp_v4_calc_md5_hash failed"
1472 				     : "", l3index);
1473 		return true;
1474 	}
1475 	return false;
1476 #endif
1477 	return false;
1478 }
1479 
1480 static void tcp_v4_init_req(struct request_sock *req,
1481 			    const struct sock *sk_listener,
1482 			    struct sk_buff *skb)
1483 {
1484 	struct inet_request_sock *ireq = inet_rsk(req);
1485 	struct net *net = sock_net(sk_listener);
1486 
1487 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1488 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1489 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1490 }
1491 
1492 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1493 					  struct sk_buff *skb,
1494 					  struct flowi *fl,
1495 					  struct request_sock *req)
1496 {
1497 	tcp_v4_init_req(req, sk, skb);
1498 
1499 	if (security_inet_conn_request(sk, skb, req))
1500 		return NULL;
1501 
1502 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1503 }
1504 
1505 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1506 	.family		=	PF_INET,
1507 	.obj_size	=	sizeof(struct tcp_request_sock),
1508 	.rtx_syn_ack	=	tcp_rtx_synack,
1509 	.send_ack	=	tcp_v4_reqsk_send_ack,
1510 	.destructor	=	tcp_v4_reqsk_destructor,
1511 	.send_reset	=	tcp_v4_send_reset,
1512 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1513 };
1514 
1515 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1516 	.mss_clamp	=	TCP_MSS_DEFAULT,
1517 #ifdef CONFIG_TCP_MD5SIG
1518 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1519 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1520 #endif
1521 #ifdef CONFIG_SYN_COOKIES
1522 	.cookie_init_seq =	cookie_v4_init_sequence,
1523 #endif
1524 	.route_req	=	tcp_v4_route_req,
1525 	.init_seq	=	tcp_v4_init_seq,
1526 	.init_ts_off	=	tcp_v4_init_ts_off,
1527 	.send_synack	=	tcp_v4_send_synack,
1528 };
1529 
1530 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1531 {
1532 	/* Never answer SYNs sent to broadcast or multicast */
1533 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1534 		goto drop;
1535 
1536 	return tcp_conn_request(&tcp_request_sock_ops,
1537 				&tcp_request_sock_ipv4_ops, sk, skb);
1538 
1539 drop:
1540 	tcp_listendrop(sk);
1541 	return 0;
1542 }
1543 EXPORT_SYMBOL(tcp_v4_conn_request);
1544 
1545 
1546 /*
1547  * The three-way handshake has completed - we got a valid synack -
1548  * now create the new socket.
1549  */
1550 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1551 				  struct request_sock *req,
1552 				  struct dst_entry *dst,
1553 				  struct request_sock *req_unhash,
1554 				  bool *own_req)
1555 {
1556 	struct inet_request_sock *ireq;
1557 	bool found_dup_sk = false;
1558 	struct inet_sock *newinet;
1559 	struct tcp_sock *newtp;
1560 	struct sock *newsk;
1561 #ifdef CONFIG_TCP_MD5SIG
1562 	const union tcp_md5_addr *addr;
1563 	struct tcp_md5sig_key *key;
1564 	int l3index;
1565 #endif
1566 	struct ip_options_rcu *inet_opt;
1567 
1568 	if (sk_acceptq_is_full(sk))
1569 		goto exit_overflow;
1570 
1571 	newsk = tcp_create_openreq_child(sk, req, skb);
1572 	if (!newsk)
1573 		goto exit_nonewsk;
1574 
1575 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1576 	inet_sk_rx_dst_set(newsk, skb);
1577 
1578 	newtp		      = tcp_sk(newsk);
1579 	newinet		      = inet_sk(newsk);
1580 	ireq		      = inet_rsk(req);
1581 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1582 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1583 	newsk->sk_bound_dev_if = ireq->ir_iif;
1584 	newinet->inet_saddr   = ireq->ir_loc_addr;
1585 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1586 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1587 	newinet->mc_index     = inet_iif(skb);
1588 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1589 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1590 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1591 	if (inet_opt)
1592 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1593 	newinet->inet_id = prandom_u32();
1594 
1595 	/* Set ToS of the new socket based upon the value of incoming SYN.
1596 	 * ECT bits are set later in tcp_init_transfer().
1597 	 */
1598 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1599 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1600 
1601 	if (!dst) {
1602 		dst = inet_csk_route_child_sock(sk, newsk, req);
1603 		if (!dst)
1604 			goto put_and_exit;
1605 	} else {
1606 		/* syncookie case : see end of cookie_v4_check() */
1607 	}
1608 	sk_setup_caps(newsk, dst);
1609 
1610 	tcp_ca_openreq_child(newsk, dst);
1611 
1612 	tcp_sync_mss(newsk, dst_mtu(dst));
1613 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1614 
1615 	tcp_initialize_rcv_mss(newsk);
1616 
1617 #ifdef CONFIG_TCP_MD5SIG
1618 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1619 	/* Copy over the MD5 key from the original socket */
1620 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1621 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1622 	if (key) {
1623 		/*
1624 		 * We're using one, so create a matching key
1625 		 * on the newsk structure. If we fail to get
1626 		 * memory, then we end up not copying the key
1627 		 * across. Shucks.
1628 		 */
1629 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1630 			       key->key, key->keylen, GFP_ATOMIC);
1631 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1632 	}
1633 #endif
1634 
1635 	if (__inet_inherit_port(sk, newsk) < 0)
1636 		goto put_and_exit;
1637 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1638 				       &found_dup_sk);
1639 	if (likely(*own_req)) {
1640 		tcp_move_syn(newtp, req);
1641 		ireq->ireq_opt = NULL;
1642 	} else {
1643 		newinet->inet_opt = NULL;
1644 
1645 		if (!req_unhash && found_dup_sk) {
1646 			/* This code path should only be executed in the
1647 			 * syncookie case
1648 			 */
1649 			bh_unlock_sock(newsk);
1650 			sock_put(newsk);
1651 			newsk = NULL;
1652 		}
1653 	}
1654 	return newsk;
1655 
1656 exit_overflow:
1657 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1658 exit_nonewsk:
1659 	dst_release(dst);
1660 exit:
1661 	tcp_listendrop(sk);
1662 	return NULL;
1663 put_and_exit:
1664 	newinet->inet_opt = NULL;
1665 	inet_csk_prepare_forced_close(newsk);
1666 	tcp_done(newsk);
1667 	goto exit;
1668 }
1669 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1670 
1671 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1672 {
1673 #ifdef CONFIG_SYN_COOKIES
1674 	const struct tcphdr *th = tcp_hdr(skb);
1675 
1676 	if (!th->syn)
1677 		sk = cookie_v4_check(sk, skb);
1678 #endif
1679 	return sk;
1680 }
1681 
1682 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1683 			 struct tcphdr *th, u32 *cookie)
1684 {
1685 	u16 mss = 0;
1686 #ifdef CONFIG_SYN_COOKIES
1687 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1688 				    &tcp_request_sock_ipv4_ops, sk, th);
1689 	if (mss) {
1690 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1691 		tcp_synq_overflow(sk);
1692 	}
1693 #endif
1694 	return mss;
1695 }
1696 
1697 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1698 							   u32));
1699 /* The socket must have its spinlock held when we get
1700  * here, unless it is a TCP_LISTEN socket.
1701  *
1702  * We have a potential double-lock case here, so even when
1703  * doing backlog processing we use the BH locking scheme.
1704  * This is because we cannot sleep with the original spinlock
1705  * held.
1706  */
1707 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1708 {
1709 	struct sock *rsk;
1710 
1711 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1712 		struct dst_entry *dst;
1713 
1714 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1715 						lockdep_sock_is_held(sk));
1716 
1717 		sock_rps_save_rxhash(sk, skb);
1718 		sk_mark_napi_id(sk, skb);
1719 		if (dst) {
1720 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1721 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1722 					     dst, 0)) {
1723 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1724 				dst_release(dst);
1725 			}
1726 		}
1727 		tcp_rcv_established(sk, skb);
1728 		return 0;
1729 	}
1730 
1731 	if (tcp_checksum_complete(skb))
1732 		goto csum_err;
1733 
1734 	if (sk->sk_state == TCP_LISTEN) {
1735 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1736 
1737 		if (!nsk)
1738 			goto discard;
1739 		if (nsk != sk) {
1740 			if (tcp_child_process(sk, nsk, skb)) {
1741 				rsk = nsk;
1742 				goto reset;
1743 			}
1744 			return 0;
1745 		}
1746 	} else
1747 		sock_rps_save_rxhash(sk, skb);
1748 
1749 	if (tcp_rcv_state_process(sk, skb)) {
1750 		rsk = sk;
1751 		goto reset;
1752 	}
1753 	return 0;
1754 
1755 reset:
1756 	tcp_v4_send_reset(rsk, skb);
1757 discard:
1758 	kfree_skb(skb);
1759 	/* Be careful here. If this function gets more complicated and
1760 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1761 	 * might be destroyed here. This current version compiles correctly,
1762 	 * but you have been warned.
1763 	 */
1764 	return 0;
1765 
1766 csum_err:
1767 	trace_tcp_bad_csum(skb);
1768 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1769 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1770 	goto discard;
1771 }
1772 EXPORT_SYMBOL(tcp_v4_do_rcv);
1773 
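/* Early demux, invoked from the IP receive path before routing: if an
 * established socket matches the segment, attach it to the skb and reuse the
 * socket's cached rx dst (when it is still valid and was learned on the same
 * interface), avoiding a per-packet route lookup.
 */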
1774 int tcp_v4_early_demux(struct sk_buff *skb)
1775 {
1776 	const struct iphdr *iph;
1777 	const struct tcphdr *th;
1778 	struct sock *sk;
1779 
1780 	if (skb->pkt_type != PACKET_HOST)
1781 		return 0;
1782 
1783 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1784 		return 0;
1785 
1786 	iph = ip_hdr(skb);
1787 	th = tcp_hdr(skb);
1788 
1789 	if (th->doff < sizeof(struct tcphdr) / 4)
1790 		return 0;
1791 
1792 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1793 				       iph->saddr, th->source,
1794 				       iph->daddr, ntohs(th->dest),
1795 				       skb->skb_iif, inet_sdif(skb));
1796 	if (sk) {
1797 		skb->sk = sk;
1798 		skb->destructor = sock_edemux;
1799 		if (sk_fullsock(sk)) {
1800 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1801 
1802 			if (dst)
1803 				dst = dst_check(dst, 0);
1804 			if (dst &&
1805 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1806 				skb_dst_set_noref(skb, dst);
1807 		}
1808 	}
1809 	return 0;
1810 }
1811 
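/* Queue a segment that arrived while the socket is owned by user context:
 * first try to coalesce it onto the tail of the backlog, otherwise append it,
 * dropping it only if the backlog would exceed the rcvbuf/sndbuf based limit
 * computed below.
 */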
1812 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1813 {
1814 	u32 limit, tail_gso_size, tail_gso_segs;
1815 	struct skb_shared_info *shinfo;
1816 	const struct tcphdr *th;
1817 	struct tcphdr *thtail;
1818 	struct sk_buff *tail;
1819 	unsigned int hdrlen;
1820 	bool fragstolen;
1821 	u32 gso_segs;
1822 	u32 gso_size;
1823 	int delta;
1824 
1825 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1826 	 * we can fix skb->truesize to its real value to avoid future drops.
1827 	 * This is valid because skb is not yet charged to the socket.
1828 	 * It has been noticed that pure SACK packets were sometimes dropped
1829 	 * (if cooked by drivers without the copybreak feature).
1830 	 */
1831 	skb_condense(skb);
1832 
1833 	skb_dst_drop(skb);
1834 
1835 	if (unlikely(tcp_checksum_complete(skb))) {
1836 		bh_unlock_sock(sk);
1837 		trace_tcp_bad_csum(skb);
1838 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1839 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1840 		return true;
1841 	}
1842 
1843 	/* Attempt coalescing to last skb in backlog, even if we are
1844 	 * above the limits.
1845 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1846 	 */
1847 	th = (const struct tcphdr *)skb->data;
1848 	hdrlen = th->doff * 4;
1849 
1850 	tail = sk->sk_backlog.tail;
1851 	if (!tail)
1852 		goto no_coalesce;
1853 	thtail = (struct tcphdr *)tail->data;
1854 
1855 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1856 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1857 	    ((TCP_SKB_CB(tail)->tcp_flags |
1858 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1859 	    !((TCP_SKB_CB(tail)->tcp_flags &
1860 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1861 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1862 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1863 #ifdef CONFIG_TLS_DEVICE
1864 	    tail->decrypted != skb->decrypted ||
1865 #endif
1866 	    !mptcp_skb_can_collapse(tail, skb) ||
1867 	    thtail->doff != th->doff ||
1868 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1869 		goto no_coalesce;
1870 
1871 	__skb_pull(skb, hdrlen);
1872 
1873 	shinfo = skb_shinfo(skb);
1874 	gso_size = shinfo->gso_size ?: skb->len;
1875 	gso_segs = shinfo->gso_segs ?: 1;
1876 
1877 	shinfo = skb_shinfo(tail);
1878 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1879 	tail_gso_segs = shinfo->gso_segs ?: 1;
1880 
1881 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1882 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1883 
1884 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1885 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1886 			thtail->window = th->window;
1887 		}
1888 
1889 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1890 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1891 		 * is not entered if we append a packet with a FIN.
1892 		 * SYN, RST, URG are not present.
1893 		 * ACK is set on both packets.
1894 		 * PSH: we do not really care in the TCP stack,
1895 		 *       at least for 'GRO' packets.
1896 		 */
1897 		thtail->fin |= th->fin;
1898 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1899 
1900 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1901 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1902 			tail->tstamp = skb->tstamp;
1903 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1904 		}
1905 
1906 		/* Not as strict as GRO. We only need to carry the max mss value */
1907 		shinfo->gso_size = max(gso_size, tail_gso_size);
1908 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1909 
1910 		sk->sk_backlog.len += delta;
1911 		__NET_INC_STATS(sock_net(sk),
1912 				LINUX_MIB_TCPBACKLOGCOALESCE);
1913 		kfree_skb_partial(skb, fragstolen);
1914 		return false;
1915 	}
1916 	__skb_push(skb, hdrlen);
1917 
1918 no_coalesce:
1919 	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1920 
1921 	/* Only socket owner can try to collapse/prune rx queues
1922 	 * to reduce memory overhead, so add a little headroom here.
1923 	 * Only a few socket backlogs are likely to be non-empty at the same time.
1924 	 */
1925 	limit += 64 * 1024;
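	/* Example sizing: with sk_rcvbuf == 1 MB and sk_sndbuf == 512 kB the
	 * backlog may grow to roughly 1 MB + 256 kB + 64 kB before
	 * sk_add_backlog() starts refusing segments.
	 */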
1926 
1927 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1928 		bh_unlock_sock(sk);
1929 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1930 		return true;
1931 	}
1932 	return false;
1933 }
1934 EXPORT_SYMBOL(tcp_add_backlog);
1935 
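/* Run any attached socket filter on the skb; the trim cap ensures that at
 * least the full TCP header (th->doff * 4 bytes) is preserved even if the
 * filter returns a shorter length.
 */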
1936 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1937 {
1938 	struct tcphdr *th = (struct tcphdr *)skb->data;
1939 
1940 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1941 }
1942 EXPORT_SYMBOL(tcp_filter);
1943 
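/* Undo tcp_v4_fill_cb(): copy the saved inet_skb_parm back to the front of
 * skb->cb so the skb can be fed to another socket lookup or back into
 * IP-level processing.
 */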
1944 static void tcp_v4_restore_cb(struct sk_buff *skb)
1945 {
1946 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1947 		sizeof(struct inet_skb_parm));
1948 }
1949 
1950 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1951 			   const struct tcphdr *th)
1952 {
1953 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1954 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1955 	 */
1956 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1957 		sizeof(struct inet_skb_parm));
1958 	barrier();
1959 
1960 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1961 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1962 				    skb->len - th->doff * 4);
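	/* For illustration: a segment with seq 1000 carrying 500 payload bytes
	 * and the FIN flag set ends up with end_seq == 1501, since SYN and FIN
	 * each consume one unit of sequence space.
	 */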
1963 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1964 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1965 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1966 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1967 	TCP_SKB_CB(skb)->sacked	 = 0;
1968 	TCP_SKB_CB(skb)->has_rxtstamp =
1969 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1970 }
1971 
1972 /*
1973  *	From tcp_input.c
1974  */
1975 
1976 int tcp_v4_rcv(struct sk_buff *skb)
1977 {
1978 	struct net *net = dev_net(skb->dev);
1979 	struct sk_buff *skb_to_free;
1980 	int sdif = inet_sdif(skb);
1981 	int dif = inet_iif(skb);
1982 	const struct iphdr *iph;
1983 	const struct tcphdr *th;
1984 	bool refcounted;
1985 	struct sock *sk;
1986 	int drop_reason;
1987 	int ret;
1988 
1989 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1990 	if (skb->pkt_type != PACKET_HOST)
1991 		goto discard_it;
1992 
1993 	/* Count it even if it's bad */
1994 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1995 
1996 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1997 		goto discard_it;
1998 
1999 	th = (const struct tcphdr *)skb->data;
2000 
2001 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2002 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2003 		goto bad_packet;
2004 	}
2005 	if (!pskb_may_pull(skb, th->doff * 4))
2006 		goto discard_it;
2007 
2008 	/* An explanation is required here, I think.
2009 	 * Packet length and doff are validated by header prediction,
2010 	 * provided the case of th->doff == 0 is eliminated.
2011 	 * So, we defer the checks. */
2012 
2013 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2014 		goto csum_error;
2015 
2016 	th = (const struct tcphdr *)skb->data;
2017 	iph = ip_hdr(skb);
2018 lookup:
2019 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
2020 			       th->dest, sdif, &refcounted);
2021 	if (!sk)
2022 		goto no_tcp_socket;
2023 
2024 process:
2025 	if (sk->sk_state == TCP_TIME_WAIT)
2026 		goto do_time_wait;
2027 
2028 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2029 		struct request_sock *req = inet_reqsk(sk);
2030 		bool req_stolen = false;
2031 		struct sock *nsk;
2032 
2033 		sk = req->rsk_listener;
2034 		if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
2035 			     tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2036 			sk_drops_add(sk, skb);
2037 			reqsk_put(req);
2038 			goto discard_it;
2039 		}
2040 		if (tcp_checksum_complete(skb)) {
2041 			reqsk_put(req);
2042 			goto csum_error;
2043 		}
2044 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2045 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2046 			if (!nsk) {
2047 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2048 				goto lookup;
2049 			}
2050 			sk = nsk;
2051 			/* reuseport_migrate_sock() has already held one sk_refcnt
2052 			 * before returning.
2053 			 */
2054 		} else {
2055 			/* We own a reference on the listener, increase it again
2056 			 * as we might lose it too soon.
2057 			 */
2058 			sock_hold(sk);
2059 		}
2060 		refcounted = true;
2061 		nsk = NULL;
2062 		if (!tcp_filter(sk, skb)) {
2063 			th = (const struct tcphdr *)skb->data;
2064 			iph = ip_hdr(skb);
2065 			tcp_v4_fill_cb(skb, iph, th);
2066 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2067 		}
2068 		if (!nsk) {
2069 			reqsk_put(req);
2070 			if (req_stolen) {
2071 				/* Another cpu got exclusive access to req
2072 				 * and created a full blown socket.
2073 				 * Try to feed this packet to this socket
2074 				 * instead of discarding it.
2075 				 */
2076 				tcp_v4_restore_cb(skb);
2077 				sock_put(sk);
2078 				goto lookup;
2079 			}
2080 			goto discard_and_relse;
2081 		}
2082 		nf_reset_ct(skb);
2083 		if (nsk == sk) {
2084 			reqsk_put(req);
2085 			tcp_v4_restore_cb(skb);
2086 		} else if (tcp_child_process(sk, nsk, skb)) {
2087 			tcp_v4_send_reset(nsk, skb);
2088 			goto discard_and_relse;
2089 		} else {
2090 			sock_put(sk);
2091 			return 0;
2092 		}
2093 	}
2094 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2095 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2096 		goto discard_and_relse;
2097 	}
2098 
2099 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2100 		goto discard_and_relse;
2101 
2102 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2103 		goto discard_and_relse;
2104 
2105 	nf_reset_ct(skb);
2106 
2107 	if (tcp_filter(sk, skb)) {
2108 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2109 		goto discard_and_relse;
2110 	}
2111 	th = (const struct tcphdr *)skb->data;
2112 	iph = ip_hdr(skb);
2113 	tcp_v4_fill_cb(skb, iph, th);
2114 
2115 	skb->dev = NULL;
2116 
2117 	if (sk->sk_state == TCP_LISTEN) {
2118 		ret = tcp_v4_do_rcv(sk, skb);
2119 		goto put_and_return;
2120 	}
2121 
2122 	sk_incoming_cpu_update(sk);
2123 
2124 	bh_lock_sock_nested(sk);
2125 	tcp_segs_in(tcp_sk(sk), skb);
2126 	ret = 0;
2127 	if (!sock_owned_by_user(sk)) {
2128 		skb_to_free = sk->sk_rx_skb_cache;
2129 		sk->sk_rx_skb_cache = NULL;
2130 		ret = tcp_v4_do_rcv(sk, skb);
2131 	} else {
2132 		if (tcp_add_backlog(sk, skb))
2133 			goto discard_and_relse;
2134 		skb_to_free = NULL;
2135 	}
2136 	bh_unlock_sock(sk);
2137 	if (skb_to_free)
2138 		__kfree_skb(skb_to_free);
2139 
2140 put_and_return:
2141 	if (refcounted)
2142 		sock_put(sk);
2143 
2144 	return ret;
2145 
2146 no_tcp_socket:
2147 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2148 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2149 		goto discard_it;
2150 
2151 	tcp_v4_fill_cb(skb, iph, th);
2152 
2153 	if (tcp_checksum_complete(skb)) {
2154 csum_error:
2155 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2156 		trace_tcp_bad_csum(skb);
2157 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2158 bad_packet:
2159 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2160 	} else {
2161 		tcp_v4_send_reset(NULL, skb);
2162 	}
2163 
2164 discard_it:
2165 	/* Discard frame. */
2166 	kfree_skb_reason(skb, drop_reason);
2167 	return 0;
2168 
2169 discard_and_relse:
2170 	sk_drops_add(sk, skb);
2171 	if (refcounted)
2172 		sock_put(sk);
2173 	goto discard_it;
2174 
2175 do_time_wait:
2176 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2177 		inet_twsk_put(inet_twsk(sk));
2178 		goto discard_it;
2179 	}
2180 
2181 	tcp_v4_fill_cb(skb, iph, th);
2182 
2183 	if (tcp_checksum_complete(skb)) {
2184 		inet_twsk_put(inet_twsk(sk));
2185 		goto csum_error;
2186 	}
2187 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2188 	case TCP_TW_SYN: {
2189 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2190 							&tcp_hashinfo, skb,
2191 							__tcp_hdrlen(th),
2192 							iph->saddr, th->source,
2193 							iph->daddr, th->dest,
2194 							inet_iif(skb),
2195 							sdif);
2196 		if (sk2) {
2197 			inet_twsk_deschedule_put(inet_twsk(sk));
2198 			sk = sk2;
2199 			tcp_v4_restore_cb(skb);
2200 			refcounted = false;
2201 			goto process;
2202 		}
2203 	}
2204 		/* to ACK */
2205 		fallthrough;
2206 	case TCP_TW_ACK:
2207 		tcp_v4_timewait_ack(sk, skb);
2208 		break;
2209 	case TCP_TW_RST:
2210 		tcp_v4_send_reset(sk, skb);
2211 		inet_twsk_deschedule_put(inet_twsk(sk));
2212 		goto discard_it;
2213 	case TCP_TW_SUCCESS:;
2214 	}
2215 	goto discard_it;
2216 }
2217 
2218 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2219 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2220 	.twsk_unique	= tcp_twsk_unique,
2221 	.twsk_destructor= tcp_twsk_destructor,
2222 };
2223 
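/* Cache the input route on the socket so the established fast path in
 * tcp_v4_do_rcv() can validate and reuse it instead of performing a route
 * lookup for every incoming segment.
 */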
2224 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2225 {
2226 	struct dst_entry *dst = skb_dst(skb);
2227 
2228 	if (dst && dst_hold_safe(dst)) {
2229 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2230 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2231 	}
2232 }
2233 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2234 
2235 const struct inet_connection_sock_af_ops ipv4_specific = {
2236 	.queue_xmit	   = ip_queue_xmit,
2237 	.send_check	   = tcp_v4_send_check,
2238 	.rebuild_header	   = inet_sk_rebuild_header,
2239 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2240 	.conn_request	   = tcp_v4_conn_request,
2241 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2242 	.net_header_len	   = sizeof(struct iphdr),
2243 	.setsockopt	   = ip_setsockopt,
2244 	.getsockopt	   = ip_getsockopt,
2245 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2246 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2247 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2248 };
2249 EXPORT_SYMBOL(ipv4_specific);
2250 
2251 #ifdef CONFIG_TCP_MD5SIG
2252 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2253 	.md5_lookup		= tcp_v4_md5_lookup,
2254 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2255 	.md5_parse		= tcp_v4_parse_md5_keys,
2256 };
2257 #endif
2258 
2259 /* NOTE: A lot of things are set to zero explicitly by the call to
2260  *       sk_alloc(), so they need not be done here.
2261  */
2262 static int tcp_v4_init_sock(struct sock *sk)
2263 {
2264 	struct inet_connection_sock *icsk = inet_csk(sk);
2265 
2266 	tcp_init_sock(sk);
2267 
2268 	icsk->icsk_af_ops = &ipv4_specific;
2269 
2270 #ifdef CONFIG_TCP_MD5SIG
2271 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2272 #endif
2273 
2274 	return 0;
2275 }
2276 
2277 void tcp_v4_destroy_sock(struct sock *sk)
2278 {
2279 	struct tcp_sock *tp = tcp_sk(sk);
2280 
2281 	trace_tcp_destroy_sock(sk);
2282 
2283 	tcp_clear_xmit_timers(sk);
2284 
2285 	tcp_cleanup_congestion_control(sk);
2286 
2287 	tcp_cleanup_ulp(sk);
2288 
2289 	/* Clean up the write buffer. */
2290 	tcp_write_queue_purge(sk);
2291 
2292 	/* Check if we want to disable active TFO */
2293 	tcp_fastopen_active_disable_ofo_check(sk);
2294 
2295 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2296 	skb_rbtree_purge(&tp->out_of_order_queue);
2297 
2298 #ifdef CONFIG_TCP_MD5SIG
2299 	/* Clean up the MD5 key list, if any */
2300 	if (tp->md5sig_info) {
2301 		tcp_clear_md5_list(sk);
2302 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2303 		tp->md5sig_info = NULL;
2304 	}
2305 #endif
2306 
2307 	/* Clean up a referenced TCP bind bucket. */
2308 	if (inet_csk(sk)->icsk_bind_hash)
2309 		inet_put_port(sk);
2310 
2311 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2312 
2313 	/* If socket is aborted during connect operation */
2314 	tcp_free_fastopen_req(tp);
2315 	tcp_fastopen_destroy_cipher(sk);
2316 	tcp_saved_syn_free(tp);
2317 
2318 	sk_sockets_allocated_dec(sk);
2319 }
2320 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2321 
2322 #ifdef CONFIG_PROC_FS
2323 /* Proc filesystem TCP sock list dumping. */
2324 
2325 static unsigned short seq_file_family(const struct seq_file *seq);
2326 
2327 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2328 {
2329 	unsigned short family = seq_file_family(seq);
2330 
2331 	/* AF_UNSPEC is used as a match-all */
2332 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2333 		net_eq(sock_net(sk), seq_file_net(seq)));
2334 }
2335 
2336 /* Find a non-empty bucket (starting from st->bucket)
2337  * and return the first sk from it.
2338  */
2339 static void *listening_get_first(struct seq_file *seq)
2340 {
2341 	struct tcp_iter_state *st = seq->private;
2342 
2343 	st->offset = 0;
2344 	for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2345 		struct inet_listen_hashbucket *ilb2;
2346 		struct inet_connection_sock *icsk;
2347 		struct sock *sk;
2348 
2349 		ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2350 		if (hlist_empty(&ilb2->head))
2351 			continue;
2352 
2353 		spin_lock(&ilb2->lock);
2354 		inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2355 			sk = (struct sock *)icsk;
2356 			if (seq_sk_match(seq, sk))
2357 				return sk;
2358 		}
2359 		spin_unlock(&ilb2->lock);
2360 	}
2361 
2362 	return NULL;
2363 }
2364 
2365 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2366  * If "cur" is the last one in the st->bucket,
2367  * call listening_get_first() to return the first sk of the next
2368  * non empty bucket.
2369  */
2370 static void *listening_get_next(struct seq_file *seq, void *cur)
2371 {
2372 	struct tcp_iter_state *st = seq->private;
2373 	struct inet_listen_hashbucket *ilb2;
2374 	struct inet_connection_sock *icsk;
2375 	struct sock *sk = cur;
2376 
2377 	++st->num;
2378 	++st->offset;
2379 
2380 	icsk = inet_csk(sk);
2381 	inet_lhash2_for_each_icsk_continue(icsk) {
2382 		sk = (struct sock *)icsk;
2383 		if (seq_sk_match(seq, sk))
2384 			return sk;
2385 	}
2386 
2387 	ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2388 	spin_unlock(&ilb2->lock);
2389 	++st->bucket;
2390 	return listening_get_first(seq);
2391 }
2392 
2393 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2394 {
2395 	struct tcp_iter_state *st = seq->private;
2396 	void *rc;
2397 
2398 	st->bucket = 0;
2399 	st->offset = 0;
2400 	rc = listening_get_first(seq);
2401 
2402 	while (rc && *pos) {
2403 		rc = listening_get_next(seq, rc);
2404 		--*pos;
2405 	}
2406 	return rc;
2407 }
2408 
2409 static inline bool empty_bucket(const struct tcp_iter_state *st)
2410 {
2411 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2412 }
2413 
2414 /*
2415  * Get first established socket starting from bucket given in st->bucket.
2416  * If st->bucket is zero, the very first socket in the hash is returned.
2417  */
2418 static void *established_get_first(struct seq_file *seq)
2419 {
2420 	struct tcp_iter_state *st = seq->private;
2421 
2422 	st->offset = 0;
2423 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2424 		struct sock *sk;
2425 		struct hlist_nulls_node *node;
2426 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2427 
2428 		/* Lockless fast path for the common case of empty buckets */
2429 		if (empty_bucket(st))
2430 			continue;
2431 
2432 		spin_lock_bh(lock);
2433 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2434 			if (seq_sk_match(seq, sk))
2435 				return sk;
2436 		}
2437 		spin_unlock_bh(lock);
2438 	}
2439 
2440 	return NULL;
2441 }
2442 
2443 static void *established_get_next(struct seq_file *seq, void *cur)
2444 {
2445 	struct sock *sk = cur;
2446 	struct hlist_nulls_node *node;
2447 	struct tcp_iter_state *st = seq->private;
2448 
2449 	++st->num;
2450 	++st->offset;
2451 
2452 	sk = sk_nulls_next(sk);
2453 
2454 	sk_nulls_for_each_from(sk, node) {
2455 		if (seq_sk_match(seq, sk))
2456 			return sk;
2457 	}
2458 
2459 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2460 	++st->bucket;
2461 	return established_get_first(seq);
2462 }
2463 
2464 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2465 {
2466 	struct tcp_iter_state *st = seq->private;
2467 	void *rc;
2468 
2469 	st->bucket = 0;
2470 	rc = established_get_first(seq);
2471 
2472 	while (rc && pos) {
2473 		rc = established_get_next(seq, rc);
2474 		--pos;
2475 	}
2476 	return rc;
2477 }
2478 
2479 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2480 {
2481 	void *rc;
2482 	struct tcp_iter_state *st = seq->private;
2483 
2484 	st->state = TCP_SEQ_STATE_LISTENING;
2485 	rc	  = listening_get_idx(seq, &pos);
2486 
2487 	if (!rc) {
2488 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2489 		rc	  = established_get_idx(seq, pos);
2490 	}
2491 
2492 	return rc;
2493 }
2494 
2495 static void *tcp_seek_last_pos(struct seq_file *seq)
2496 {
2497 	struct tcp_iter_state *st = seq->private;
2498 	int bucket = st->bucket;
2499 	int offset = st->offset;
2500 	int orig_num = st->num;
2501 	void *rc = NULL;
2502 
2503 	switch (st->state) {
2504 	case TCP_SEQ_STATE_LISTENING:
2505 		if (st->bucket > tcp_hashinfo.lhash2_mask)
2506 			break;
2507 		st->state = TCP_SEQ_STATE_LISTENING;
2508 		rc = listening_get_first(seq);
2509 		while (offset-- && rc && bucket == st->bucket)
2510 			rc = listening_get_next(seq, rc);
2511 		if (rc)
2512 			break;
2513 		st->bucket = 0;
2514 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2515 		fallthrough;
2516 	case TCP_SEQ_STATE_ESTABLISHED:
2517 		if (st->bucket > tcp_hashinfo.ehash_mask)
2518 			break;
2519 		rc = established_get_first(seq);
2520 		while (offset-- && rc && bucket == st->bucket)
2521 			rc = established_get_next(seq, rc);
2522 	}
2523 
2524 	st->num = orig_num;
2525 
2526 	return rc;
2527 }
2528 
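/* seq_file entry points for /proc/net/tcp: start() resumes from the cached
 * st->last_pos when possible, next() walks the listening buckets first and
 * then the established hash, and stop() releases whichever bucket lock is
 * still held.
 */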
2529 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2530 {
2531 	struct tcp_iter_state *st = seq->private;
2532 	void *rc;
2533 
2534 	if (*pos && *pos == st->last_pos) {
2535 		rc = tcp_seek_last_pos(seq);
2536 		if (rc)
2537 			goto out;
2538 	}
2539 
2540 	st->state = TCP_SEQ_STATE_LISTENING;
2541 	st->num = 0;
2542 	st->bucket = 0;
2543 	st->offset = 0;
2544 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2545 
2546 out:
2547 	st->last_pos = *pos;
2548 	return rc;
2549 }
2550 EXPORT_SYMBOL(tcp_seq_start);
2551 
2552 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2553 {
2554 	struct tcp_iter_state *st = seq->private;
2555 	void *rc = NULL;
2556 
2557 	if (v == SEQ_START_TOKEN) {
2558 		rc = tcp_get_idx(seq, 0);
2559 		goto out;
2560 	}
2561 
2562 	switch (st->state) {
2563 	case TCP_SEQ_STATE_LISTENING:
2564 		rc = listening_get_next(seq, v);
2565 		if (!rc) {
2566 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2567 			st->bucket = 0;
2568 			st->offset = 0;
2569 			rc	  = established_get_first(seq);
2570 		}
2571 		break;
2572 	case TCP_SEQ_STATE_ESTABLISHED:
2573 		rc = established_get_next(seq, v);
2574 		break;
2575 	}
2576 out:
2577 	++*pos;
2578 	st->last_pos = *pos;
2579 	return rc;
2580 }
2581 EXPORT_SYMBOL(tcp_seq_next);
2582 
2583 void tcp_seq_stop(struct seq_file *seq, void *v)
2584 {
2585 	struct tcp_iter_state *st = seq->private;
2586 
2587 	switch (st->state) {
2588 	case TCP_SEQ_STATE_LISTENING:
2589 		if (v != SEQ_START_TOKEN)
2590 			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2591 		break;
2592 	case TCP_SEQ_STATE_ESTABLISHED:
2593 		if (v)
2594 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2595 		break;
2596 	}
2597 }
2598 EXPORT_SYMBOL(tcp_seq_stop);
2599 
2600 static void get_openreq4(const struct request_sock *req,
2601 			 struct seq_file *f, int i)
2602 {
2603 	const struct inet_request_sock *ireq = inet_rsk(req);
2604 	long delta = req->rsk_timer.expires - jiffies;
2605 
2606 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2607 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2608 		i,
2609 		ireq->ir_loc_addr,
2610 		ireq->ir_num,
2611 		ireq->ir_rmt_addr,
2612 		ntohs(ireq->ir_rmt_port),
2613 		TCP_SYN_RECV,
2614 		0, 0, /* could print option size, but that is af dependent. */
2615 		1,    /* timers active (only the expire timer) */
2616 		jiffies_delta_to_clock_t(delta),
2617 		req->num_timeout,
2618 		from_kuid_munged(seq_user_ns(f),
2619 				 sock_i_uid(req->rsk_listener)),
2620 		0,  /* non standard timer */
2621 		0, /* open_requests have no inode */
2622 		0,
2623 		req);
2624 }
2625 
2626 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2627 {
2628 	int timer_active;
2629 	unsigned long timer_expires;
2630 	const struct tcp_sock *tp = tcp_sk(sk);
2631 	const struct inet_connection_sock *icsk = inet_csk(sk);
2632 	const struct inet_sock *inet = inet_sk(sk);
2633 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2634 	__be32 dest = inet->inet_daddr;
2635 	__be32 src = inet->inet_rcv_saddr;
2636 	__u16 destp = ntohs(inet->inet_dport);
2637 	__u16 srcp = ntohs(inet->inet_sport);
2638 	int rx_queue;
2639 	int state;
2640 
2641 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2642 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2643 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2644 		timer_active	= 1;
2645 		timer_expires	= icsk->icsk_timeout;
2646 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2647 		timer_active	= 4;
2648 		timer_expires	= icsk->icsk_timeout;
2649 	} else if (timer_pending(&sk->sk_timer)) {
2650 		timer_active	= 2;
2651 		timer_expires	= sk->sk_timer.expires;
2652 	} else {
2653 		timer_active	= 0;
2654 		timer_expires = jiffies;
2655 	}
2656 
2657 	state = inet_sk_state_load(sk);
2658 	if (state == TCP_LISTEN)
2659 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2660 	else
2661 		/* Because we don't lock the socket,
2662 		 * we might find a transient negative value.
2663 		 */
2664 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2665 				      READ_ONCE(tp->copied_seq), 0);
2666 
2667 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2668 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2669 		i, src, srcp, dest, destp, state,
2670 		READ_ONCE(tp->write_seq) - tp->snd_una,
2671 		rx_queue,
2672 		timer_active,
2673 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2674 		icsk->icsk_retransmits,
2675 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2676 		icsk->icsk_probes_out,
2677 		sock_i_ino(sk),
2678 		refcount_read(&sk->sk_refcnt), sk,
2679 		jiffies_to_clock_t(icsk->icsk_rto),
2680 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2681 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2682 		tcp_snd_cwnd(tp),
2683 		state == TCP_LISTEN ?
2684 		    fastopenq->max_qlen :
2685 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2686 }
2687 
2688 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2689 			       struct seq_file *f, int i)
2690 {
2691 	long delta = tw->tw_timer.expires - jiffies;
2692 	__be32 dest, src;
2693 	__u16 destp, srcp;
2694 
2695 	dest  = tw->tw_daddr;
2696 	src   = tw->tw_rcv_saddr;
2697 	destp = ntohs(tw->tw_dport);
2698 	srcp  = ntohs(tw->tw_sport);
2699 
2700 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2701 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2702 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2703 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2704 		refcount_read(&tw->tw_refcnt), tw);
2705 }
2706 
2707 #define TMPSZ 150
2708 
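/* Every entry is padded to TMPSZ - 1 characters.  As a rough illustration
 * (field values here are made up), a listening socket on 0.0.0.0:22 would
 * appear as something like:
 *
 *   0: 00000000:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 ...
 *
 * with addresses and ports printed in hex and the columns matching the
 * header emitted by tcp4_seq_show() below.
 */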
2709 static int tcp4_seq_show(struct seq_file *seq, void *v)
2710 {
2711 	struct tcp_iter_state *st;
2712 	struct sock *sk = v;
2713 
2714 	seq_setwidth(seq, TMPSZ - 1);
2715 	if (v == SEQ_START_TOKEN) {
2716 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2717 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2718 			   "inode");
2719 		goto out;
2720 	}
2721 	st = seq->private;
2722 
2723 	if (sk->sk_state == TCP_TIME_WAIT)
2724 		get_timewait4_sock(v, seq, st->num);
2725 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2726 		get_openreq4(v, seq, st->num);
2727 	else
2728 		get_tcp4_sock(v, seq, st->num);
2729 out:
2730 	seq_pad(seq, '\n');
2731 	return 0;
2732 }
2733 
2734 #ifdef CONFIG_BPF_SYSCALL
2735 struct bpf_tcp_iter_state {
2736 	struct tcp_iter_state state;
2737 	unsigned int cur_sk;
2738 	unsigned int end_sk;
2739 	unsigned int max_sk;
2740 	struct sock **batch;
2741 	bool st_bucket_done;
2742 };
2743 
2744 struct bpf_iter__tcp {
2745 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2746 	__bpf_md_ptr(struct sock_common *, sk_common);
2747 	uid_t uid __aligned(8);
2748 };
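/* The structure above is the context a BPF program attached to the "tcp"
 * iterator receives.  A minimal consumer (a sketch only; such a program lives
 * in a separate BPF object, not in this file) might look like:
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "uid %u\n", ctx->uid);
 *		return 0;
 *	}
 *
 * tcp_prog_seq_show() below runs the program once per batched socket, and
 * bpf_iter_tcp_seq_stop() runs it a final time with a NULL sk_common when the
 * iteration ends.
 */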
2749 
2750 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2751 			     struct sock_common *sk_common, uid_t uid)
2752 {
2753 	struct bpf_iter__tcp ctx;
2754 
2755 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2756 	ctx.meta = meta;
2757 	ctx.sk_common = sk_common;
2758 	ctx.uid = uid;
2759 	return bpf_iter_run_prog(prog, &ctx);
2760 }
2761 
2762 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2763 {
2764 	while (iter->cur_sk < iter->end_sk)
2765 		sock_gen_put(iter->batch[iter->cur_sk++]);
2766 }
2767 
2768 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2769 				      unsigned int new_batch_sz)
2770 {
2771 	struct sock **new_batch;
2772 
2773 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2774 			     GFP_USER | __GFP_NOWARN);
2775 	if (!new_batch)
2776 		return -ENOMEM;
2777 
2778 	bpf_iter_tcp_put_batch(iter);
2779 	kvfree(iter->batch);
2780 	iter->batch = new_batch;
2781 	iter->max_sk = new_batch_sz;
2782 
2783 	return 0;
2784 }
2785 
2786 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2787 						 struct sock *start_sk)
2788 {
2789 	struct bpf_tcp_iter_state *iter = seq->private;
2790 	struct tcp_iter_state *st = &iter->state;
2791 	struct inet_connection_sock *icsk;
2792 	unsigned int expected = 1;
2793 	struct sock *sk;
2794 
2795 	sock_hold(start_sk);
2796 	iter->batch[iter->end_sk++] = start_sk;
2797 
2798 	icsk = inet_csk(start_sk);
2799 	inet_lhash2_for_each_icsk_continue(icsk) {
2800 		sk = (struct sock *)icsk;
2801 		if (seq_sk_match(seq, sk)) {
2802 			if (iter->end_sk < iter->max_sk) {
2803 				sock_hold(sk);
2804 				iter->batch[iter->end_sk++] = sk;
2805 			}
2806 			expected++;
2807 		}
2808 	}
2809 	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2810 
2811 	return expected;
2812 }
2813 
2814 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2815 						   struct sock *start_sk)
2816 {
2817 	struct bpf_tcp_iter_state *iter = seq->private;
2818 	struct tcp_iter_state *st = &iter->state;
2819 	struct hlist_nulls_node *node;
2820 	unsigned int expected = 1;
2821 	struct sock *sk;
2822 
2823 	sock_hold(start_sk);
2824 	iter->batch[iter->end_sk++] = start_sk;
2825 
2826 	sk = sk_nulls_next(start_sk);
2827 	sk_nulls_for_each_from(sk, node) {
2828 		if (seq_sk_match(seq, sk)) {
2829 			if (iter->end_sk < iter->max_sk) {
2830 				sock_hold(sk);
2831 				iter->batch[iter->end_sk++] = sk;
2832 			}
2833 			expected++;
2834 		}
2835 	}
2836 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2837 
2838 	return expected;
2839 }
2840 
2841 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2842 {
2843 	struct bpf_tcp_iter_state *iter = seq->private;
2844 	struct tcp_iter_state *st = &iter->state;
2845 	unsigned int expected;
2846 	bool resized = false;
2847 	struct sock *sk;
2848 
2849 	/* The st->bucket is done.  Directly advance to the next
2850 	 * bucket instead of having tcp_seek_last_pos() skip entries
2851 	 * one by one in the current bucket and eventually find out
2852 	 * it has to advance to the next bucket.
2853 	 */
2854 	if (iter->st_bucket_done) {
2855 		st->offset = 0;
2856 		st->bucket++;
2857 		if (st->state == TCP_SEQ_STATE_LISTENING &&
2858 		    st->bucket > tcp_hashinfo.lhash2_mask) {
2859 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2860 			st->bucket = 0;
2861 		}
2862 	}
2863 
2864 again:
2865 	/* Get a new batch */
2866 	iter->cur_sk = 0;
2867 	iter->end_sk = 0;
2868 	iter->st_bucket_done = false;
2869 
2870 	sk = tcp_seek_last_pos(seq);
2871 	if (!sk)
2872 		return NULL; /* Done */
2873 
2874 	if (st->state == TCP_SEQ_STATE_LISTENING)
2875 		expected = bpf_iter_tcp_listening_batch(seq, sk);
2876 	else
2877 		expected = bpf_iter_tcp_established_batch(seq, sk);
2878 
2879 	if (iter->end_sk == expected) {
2880 		iter->st_bucket_done = true;
2881 		return sk;
2882 	}
2883 
2884 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2885 		resized = true;
2886 		goto again;
2887 	}
2888 
2889 	return sk;
2890 }
2891 
2892 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2893 {
2894 	/* bpf iter does not support lseek, so it always
2895 	 * continues from where it was stop()-ped.
2896 	 */
2897 	if (*pos)
2898 		return bpf_iter_tcp_batch(seq);
2899 
2900 	return SEQ_START_TOKEN;
2901 }
2902 
2903 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2904 {
2905 	struct bpf_tcp_iter_state *iter = seq->private;
2906 	struct tcp_iter_state *st = &iter->state;
2907 	struct sock *sk;
2908 
2909 	/* Whenever seq_next() is called, the iter->cur_sk is
2910 	 * done with seq_show(), so advance to the next sk in
2911 	 * the batch.
2912 	 */
2913 	if (iter->cur_sk < iter->end_sk) {
2914 		/* Keeping st->num consistent in tcp_iter_state.
2915 		 * bpf_iter_tcp does not use st->num.
2916 		 * meta.seq_num is used instead.
2917 		 */
2918 		st->num++;
2919 		/* Move st->offset to the next sk in the bucket such that
2920 		 * the future start() will resume at st->offset in
2921 		 * st->bucket.  See tcp_seek_last_pos().
2922 		 */
2923 		st->offset++;
2924 		sock_gen_put(iter->batch[iter->cur_sk++]);
2925 	}
2926 
2927 	if (iter->cur_sk < iter->end_sk)
2928 		sk = iter->batch[iter->cur_sk];
2929 	else
2930 		sk = bpf_iter_tcp_batch(seq);
2931 
2932 	++*pos;
2933 	/* Keeping st->last_pos consistent in tcp_iter_state.
2934 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2935 	 */
2936 	st->last_pos = *pos;
2937 	return sk;
2938 }
2939 
2940 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2941 {
2942 	struct bpf_iter_meta meta;
2943 	struct bpf_prog *prog;
2944 	struct sock *sk = v;
2945 	bool slow;
2946 	uid_t uid;
2947 	int ret;
2948 
2949 	if (v == SEQ_START_TOKEN)
2950 		return 0;
2951 
2952 	if (sk_fullsock(sk))
2953 		slow = lock_sock_fast(sk);
2954 
2955 	if (unlikely(sk_unhashed(sk))) {
2956 		ret = SEQ_SKIP;
2957 		goto unlock;
2958 	}
2959 
2960 	if (sk->sk_state == TCP_TIME_WAIT) {
2961 		uid = 0;
2962 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2963 		const struct request_sock *req = v;
2964 
2965 		uid = from_kuid_munged(seq_user_ns(seq),
2966 				       sock_i_uid(req->rsk_listener));
2967 	} else {
2968 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2969 	}
2970 
2971 	meta.seq = seq;
2972 	prog = bpf_iter_get_info(&meta, false);
2973 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
2974 
2975 unlock:
2976 	if (sk_fullsock(sk))
2977 		unlock_sock_fast(sk, slow);
2978 	return ret;
2979 
2980 }
2981 
2982 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2983 {
2984 	struct bpf_tcp_iter_state *iter = seq->private;
2985 	struct bpf_iter_meta meta;
2986 	struct bpf_prog *prog;
2987 
2988 	if (!v) {
2989 		meta.seq = seq;
2990 		prog = bpf_iter_get_info(&meta, true);
2991 		if (prog)
2992 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2993 	}
2994 
2995 	if (iter->cur_sk < iter->end_sk) {
2996 		bpf_iter_tcp_put_batch(iter);
2997 		iter->st_bucket_done = false;
2998 	}
2999 }
3000 
3001 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3002 	.show		= bpf_iter_tcp_seq_show,
3003 	.start		= bpf_iter_tcp_seq_start,
3004 	.next		= bpf_iter_tcp_seq_next,
3005 	.stop		= bpf_iter_tcp_seq_stop,
3006 };
3007 #endif
3008 static unsigned short seq_file_family(const struct seq_file *seq)
3009 {
3010 	const struct tcp_seq_afinfo *afinfo;
3011 
3012 #ifdef CONFIG_BPF_SYSCALL
3013 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3014 	if (seq->op == &bpf_iter_tcp_seq_ops)
3015 		return AF_UNSPEC;
3016 #endif
3017 
3018 	/* Iterated from proc fs */
3019 	afinfo = PDE_DATA(file_inode(seq->file));
3020 	return afinfo->family;
3021 }
3022 
3023 static const struct seq_operations tcp4_seq_ops = {
3024 	.show		= tcp4_seq_show,
3025 	.start		= tcp_seq_start,
3026 	.next		= tcp_seq_next,
3027 	.stop		= tcp_seq_stop,
3028 };
3029 
3030 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3031 	.family		= AF_INET,
3032 };
3033 
3034 static int __net_init tcp4_proc_init_net(struct net *net)
3035 {
3036 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3037 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3038 		return -ENOMEM;
3039 	return 0;
3040 }
3041 
3042 static void __net_exit tcp4_proc_exit_net(struct net *net)
3043 {
3044 	remove_proc_entry("tcp", net->proc_net);
3045 }
3046 
3047 static struct pernet_operations tcp4_net_ops = {
3048 	.init = tcp4_proc_init_net,
3049 	.exit = tcp4_proc_exit_net,
3050 };
3051 
3052 int __init tcp4_proc_init(void)
3053 {
3054 	return register_pernet_subsys(&tcp4_net_ops);
3055 }
3056 
3057 void tcp4_proc_exit(void)
3058 {
3059 	unregister_pernet_subsys(&tcp4_net_ops);
3060 }
3061 #endif /* CONFIG_PROC_FS */
3062 
3063 /* @wake is one when sk_stream_write_space() calls us.
3064  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3065  * This mimics the strategy used in sock_def_write_space().
3066  */
3067 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3068 {
3069 	const struct tcp_sock *tp = tcp_sk(sk);
3070 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3071 			    READ_ONCE(tp->snd_nxt);
3072 
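	/* With @wake == 1 the not-sent byte count is doubled before the
	 * comparison, so EPOLLOUT fires only once less than half of
	 * tcp_notsent_lowat() remains unsent, e.g. below 64 kB for a
	 * lowat of 128 kB.
	 */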
3073 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3074 }
3075 EXPORT_SYMBOL(tcp_stream_memory_free);
3076 
3077 struct proto tcp_prot = {
3078 	.name			= "TCP",
3079 	.owner			= THIS_MODULE,
3080 	.close			= tcp_close,
3081 	.pre_connect		= tcp_v4_pre_connect,
3082 	.connect		= tcp_v4_connect,
3083 	.disconnect		= tcp_disconnect,
3084 	.accept			= inet_csk_accept,
3085 	.ioctl			= tcp_ioctl,
3086 	.init			= tcp_v4_init_sock,
3087 	.destroy		= tcp_v4_destroy_sock,
3088 	.shutdown		= tcp_shutdown,
3089 	.setsockopt		= tcp_setsockopt,
3090 	.getsockopt		= tcp_getsockopt,
3091 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3092 	.keepalive		= tcp_set_keepalive,
3093 	.recvmsg		= tcp_recvmsg,
3094 	.sendmsg		= tcp_sendmsg,
3095 	.sendpage		= tcp_sendpage,
3096 	.backlog_rcv		= tcp_v4_do_rcv,
3097 	.release_cb		= tcp_release_cb,
3098 	.hash			= inet_hash,
3099 	.unhash			= inet_unhash,
3100 	.get_port		= inet_csk_get_port,
3101 #ifdef CONFIG_BPF_SYSCALL
3102 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3103 #endif
3104 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3105 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3106 	.stream_memory_free	= tcp_stream_memory_free,
3107 	.sockets_allocated	= &tcp_sockets_allocated,
3108 	.orphan_count		= &tcp_orphan_count,
3109 	.memory_allocated	= &tcp_memory_allocated,
3110 	.memory_pressure	= &tcp_memory_pressure,
3111 	.sysctl_mem		= sysctl_tcp_mem,
3112 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3113 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3114 	.max_header		= MAX_TCP_HEADER,
3115 	.obj_size		= sizeof(struct tcp_sock),
3116 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3117 	.twsk_prot		= &tcp_timewait_sock_ops,
3118 	.rsk_prot		= &tcp_request_sock_ops,
3119 	.h.hashinfo		= &tcp_hashinfo,
3120 	.no_autobind		= true,
3121 	.diag_destroy		= tcp_abort,
3122 };
3123 EXPORT_SYMBOL(tcp_prot);
3124 
3125 static void __net_exit tcp_sk_exit(struct net *net)
3126 {
3127 	if (net->ipv4.tcp_congestion_control)
3128 		bpf_module_put(net->ipv4.tcp_congestion_control,
3129 			       net->ipv4.tcp_congestion_control->owner);
3130 }
3131 
3132 static int __net_init tcp_sk_init(struct net *net)
3133 {
3134 	int cnt;
3135 
3136 	net->ipv4.sysctl_tcp_ecn = 2;
3137 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3138 
3139 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3140 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3141 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3142 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3143 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3144 
3145 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3146 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3147 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3148 
3149 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3150 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3151 	net->ipv4.sysctl_tcp_syncookies = 1;
3152 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3153 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3154 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3155 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3156 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3157 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3158 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3159 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3160 
3161 	cnt = tcp_hashinfo.ehash_mask + 1;
3162 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3163 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
3164 
3165 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3166 	net->ipv4.sysctl_tcp_sack = 1;
3167 	net->ipv4.sysctl_tcp_window_scaling = 1;
3168 	net->ipv4.sysctl_tcp_timestamps = 1;
3169 	net->ipv4.sysctl_tcp_early_retrans = 3;
3170 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3171 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3172 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3173 	net->ipv4.sysctl_tcp_max_reordering = 300;
3174 	net->ipv4.sysctl_tcp_dsack = 1;
3175 	net->ipv4.sysctl_tcp_app_win = 31;
3176 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3177 	net->ipv4.sysctl_tcp_frto = 2;
3178 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3179 	/* This limits the percentage of the congestion window which we
3180 	 * will allow a single TSO frame to consume.  Building TSO frames
3181 	 * which are too large can cause TCP streams to be bursty.
3182 	 */
3183 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3184 	/* Default TSQ limit of 16 TSO segments */
3185 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3186 	/* rfc5961 challenge ack rate limiting */
3187 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3188 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3189 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3190 	net->ipv4.sysctl_tcp_autocorking = 1;
3191 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3192 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3193 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3194 	if (net != &init_net) {
3195 		memcpy(net->ipv4.sysctl_tcp_rmem,
3196 		       init_net.ipv4.sysctl_tcp_rmem,
3197 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3198 		memcpy(net->ipv4.sysctl_tcp_wmem,
3199 		       init_net.ipv4.sysctl_tcp_wmem,
3200 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3201 	}
3202 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3203 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3204 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3205 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3206 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3207 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3208 
3209 	/* Reno is always built in */
3210 	if (!net_eq(net, &init_net) &&
3211 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3212 			       init_net.ipv4.tcp_congestion_control->owner))
3213 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3214 	else
3215 		net->ipv4.tcp_congestion_control = &tcp_reno;
3216 
3217 	return 0;
3218 }
3219 
3220 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3221 {
3222 	struct net *net;
3223 
3224 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
3225 
3226 	list_for_each_entry(net, net_exit_list, exit_list)
3227 		tcp_fastopen_ctx_destroy(net);
3228 }
3229 
3230 static struct pernet_operations __net_initdata tcp_sk_ops = {
3231        .init	   = tcp_sk_init,
3232        .exit	   = tcp_sk_exit,
3233        .exit_batch = tcp_sk_exit_batch,
3234 };
3235 
3236 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3237 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3238 		     struct sock_common *sk_common, uid_t uid)
3239 
3240 #define INIT_BATCH_SZ 16
3241 
3242 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3243 {
3244 	struct bpf_tcp_iter_state *iter = priv_data;
3245 	int err;
3246 
3247 	err = bpf_iter_init_seq_net(priv_data, aux);
3248 	if (err)
3249 		return err;
3250 
3251 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3252 	if (err) {
3253 		bpf_iter_fini_seq_net(priv_data);
3254 		return err;
3255 	}
3256 
3257 	return 0;
3258 }
3259 
3260 static void bpf_iter_fini_tcp(void *priv_data)
3261 {
3262 	struct bpf_tcp_iter_state *iter = priv_data;
3263 
3264 	bpf_iter_fini_seq_net(priv_data);
3265 	kvfree(iter->batch);
3266 }
3267 
3268 static const struct bpf_iter_seq_info tcp_seq_info = {
3269 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3270 	.init_seq_private	= bpf_iter_init_tcp,
3271 	.fini_seq_private	= bpf_iter_fini_tcp,
3272 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3273 };
3274 
3275 static const struct bpf_func_proto *
3276 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3277 			    const struct bpf_prog *prog)
3278 {
3279 	switch (func_id) {
3280 	case BPF_FUNC_setsockopt:
3281 		return &bpf_sk_setsockopt_proto;
3282 	case BPF_FUNC_getsockopt:
3283 		return &bpf_sk_getsockopt_proto;
3284 	default:
3285 		return NULL;
3286 	}
3287 }
3288 
3289 static struct bpf_iter_reg tcp_reg_info = {
3290 	.target			= "tcp",
3291 	.ctx_arg_info_size	= 1,
3292 	.ctx_arg_info		= {
3293 		{ offsetof(struct bpf_iter__tcp, sk_common),
3294 		  PTR_TO_BTF_ID_OR_NULL },
3295 	},
3296 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3297 	.seq_info		= &tcp_seq_info,
3298 };
3299 
3300 static void __init bpf_iter_register(void)
3301 {
3302 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3303 	if (bpf_iter_reg_target(&tcp_reg_info))
3304 		pr_warn("Warning: could not register bpf iterator tcp\n");
3305 }
3306 
3307 #endif
3308 
3309 void __init tcp_v4_init(void)
3310 {
3311 	int cpu, res;
3312 
3313 	for_each_possible_cpu(cpu) {
3314 		struct sock *sk;
3315 
3316 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3317 					   IPPROTO_TCP, &init_net);
3318 		if (res)
3319 			panic("Failed to create the TCP control socket.\n");
3320 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3321 
3322 		/* Please enforce IP_DF and IPID==0 for RST and
3323 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3324 		 */
3325 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3326 
3327 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3328 	}
3329 	if (register_pernet_subsys(&tcp_sk_ops))
3330 		panic("Failed to create the TCP control socket.\n");
3331 
3332 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3333 	bpf_iter_register();
3334 #endif
3335 }
3336