1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93 
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95 
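/* Pick the initial sequence number for a passive open, keyed on the
 * address and port pairs of the incoming segment.
 */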
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98 	return secure_tcp_seq(ip_hdr(skb)->daddr,
99 			      ip_hdr(skb)->saddr,
100 			      tcp_hdr(skb)->dest,
101 			      tcp_hdr(skb)->source);
102 }
103 
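/* Compute the per-connection timestamp offset from the address pair. */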
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108 
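/* Decide whether a TIME-WAIT socket occupying the wanted four-tuple may be
 * reused for a new outgoing connection.  Returns 1, after taking a reference
 * on the timewait socket, if reuse is allowed.
 */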
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 	struct tcp_sock *tp = tcp_sk(sk);
115 
116 	if (reuse == 2) {
117 		/* Still does not detect *everything* that goes through
118 		 * lo, since we require a loopback src or dst address
119 		 * or direct binding to 'lo' interface.
120 		 */
121 		bool loopback = false;
122 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 			loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125 		if (tw->tw_family == AF_INET6) {
126 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130 				loopback = true;
131 		} else
132 #endif
133 		{
134 			if (ipv4_is_loopback(tw->tw_daddr) ||
135 			    ipv4_is_loopback(tw->tw_rcv_saddr))
136 				loopback = true;
137 		}
138 		if (!loopback)
139 			reuse = 0;
140 	}
141 
142 	/* With PAWS, it is safe from the viewpoint
143 	   of data integrity. Even without PAWS it is safe provided sequence
144 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
145 
146 	   Actually, the idea is close to VJ's one, only timestamp cache is
147 	   held not per host, but per port pair and TW bucket is used as state
148 	   holder.
149 
150 	   If TW bucket has been already destroyed we fall back to VJ's scheme
151 	   and use initial timestamp retrieved from peer table.
152 	 */
153 	if (tcptw->tw_ts_recent_stamp &&
154 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
155 					    tcptw->tw_ts_recent_stamp)))) {
156 		/* In case of repair and re-using TIME-WAIT sockets we still
157 		 * want to be sure that it is safe as above but honor the
158 		 * sequence numbers and time stamps set as part of the repair
159 		 * process.
160 		 *
161 		 * Without this check re-using a TIME-WAIT socket with TCP
162 		 * repair would accumulate a -1 on the repair assigned
163 		 * sequence number. The first time it is reused the sequence
164 		 * is -1, the second time -2, etc. This fixes that issue
165 		 * without appearing to create any others.
166 		 */
167 		if (likely(!tp->repair)) {
168 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169 
170 			if (!seq)
171 				seq = 1;
172 			WRITE_ONCE(tp->write_seq, seq);
173 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
174 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175 		}
176 		sock_hold(sktw);
177 		return 1;
178 	}
179 
180 	return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183 
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 			      int addr_len)
186 {
187 	/* This check is replicated from tcp_v4_connect() and intended to
188 	 * prevent the BPF program called below from accessing bytes that are
189 	 * outside the bound specified by the user in addr_len.
190 	 */
191 	if (addr_len < sizeof(struct sockaddr_in))
192 		return -EINVAL;
193 
194 	sock_owned_by_me(sk);
195 
196 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198 
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 	struct inet_timewait_death_row *tcp_death_row;
204 	struct inet_sock *inet = inet_sk(sk);
205 	struct tcp_sock *tp = tcp_sk(sk);
206 	struct ip_options_rcu *inet_opt;
207 	struct net *net = sock_net(sk);
208 	__be16 orig_sport, orig_dport;
209 	__be32 daddr, nexthop;
210 	struct flowi4 *fl4;
211 	struct rtable *rt;
212 	int err;
213 
214 	if (addr_len < sizeof(struct sockaddr_in))
215 		return -EINVAL;
216 
217 	if (usin->sin_family != AF_INET)
218 		return -EAFNOSUPPORT;
219 
220 	nexthop = daddr = usin->sin_addr.s_addr;
221 	inet_opt = rcu_dereference_protected(inet->inet_opt,
222 					     lockdep_sock_is_held(sk));
223 	if (inet_opt && inet_opt->opt.srr) {
224 		if (!daddr)
225 			return -EINVAL;
226 		nexthop = inet_opt->opt.faddr;
227 	}
228 
229 	orig_sport = inet->inet_sport;
230 	orig_dport = usin->sin_port;
231 	fl4 = &inet->cork.fl.u.ip4;
232 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
233 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
234 			      orig_dport, sk);
235 	if (IS_ERR(rt)) {
236 		err = PTR_ERR(rt);
237 		if (err == -ENETUNREACH)
238 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
239 		return err;
240 	}
241 
242 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
243 		ip_rt_put(rt);
244 		return -ENETUNREACH;
245 	}
246 
247 	if (!inet_opt || !inet_opt->opt.srr)
248 		daddr = fl4->daddr;
249 
250 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
251 
252 	if (!inet->inet_saddr) {
253 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
254 		if (err) {
255 			ip_rt_put(rt);
256 			return err;
257 		}
258 	} else {
259 		sk_rcv_saddr_set(sk, inet->inet_saddr);
260 	}
261 
262 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
263 		/* Reset inherited state */
264 		tp->rx_opt.ts_recent	   = 0;
265 		tp->rx_opt.ts_recent_stamp = 0;
266 		if (likely(!tp->repair))
267 			WRITE_ONCE(tp->write_seq, 0);
268 	}
269 
270 	inet->inet_dport = usin->sin_port;
271 	sk_daddr_set(sk, daddr);
272 
273 	inet_csk(sk)->icsk_ext_hdr_len = 0;
274 	if (inet_opt)
275 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
276 
277 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
278 
279 	/* Socket identity is still unknown (sport may be zero).
280 	 * However we set state to SYN-SENT and, without releasing the socket
281 	 * lock, select a source port, enter ourselves into the hash tables and
282 	 * complete initialization after this.
283 	 */
284 	tcp_set_state(sk, TCP_SYN_SENT);
285 	err = inet_hash_connect(tcp_death_row, sk);
286 	if (err)
287 		goto failure;
288 
289 	sk_set_txhash(sk);
290 
291 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
292 			       inet->inet_sport, inet->inet_dport, sk);
293 	if (IS_ERR(rt)) {
294 		err = PTR_ERR(rt);
295 		rt = NULL;
296 		goto failure;
297 	}
298 	/* OK, now commit destination to socket.  */
299 	sk->sk_gso_type = SKB_GSO_TCPV4;
300 	sk_setup_caps(sk, &rt->dst);
301 	rt = NULL;
302 
303 	if (likely(!tp->repair)) {
304 		if (!tp->write_seq)
305 			WRITE_ONCE(tp->write_seq,
306 				   secure_tcp_seq(inet->inet_saddr,
307 						  inet->inet_daddr,
308 						  inet->inet_sport,
309 						  usin->sin_port));
310 		WRITE_ONCE(tp->tsoffset,
311 			   secure_tcp_ts_off(net, inet->inet_saddr,
312 					     inet->inet_daddr));
313 	}
314 
315 	inet->inet_id = get_random_u16();
316 
317 	if (tcp_fastopen_defer_connect(sk, &err))
318 		return err;
319 	if (err)
320 		goto failure;
321 
322 	err = tcp_connect(sk);
323 
324 	if (err)
325 		goto failure;
326 
327 	return 0;
328 
329 failure:
330 	/*
331 	 * This unhashes the socket and releases the local port,
332 	 * if necessary.
333 	 */
334 	tcp_set_state(sk, TCP_CLOSE);
335 	inet_bhash2_reset_saddr(sk);
336 	ip_rt_put(rt);
337 	sk->sk_route_caps = 0;
338 	inet->inet_dport = 0;
339 	return err;
340 }
341 EXPORT_SYMBOL(tcp_v4_connect);
342 
343 /*
344  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
345  * It can be called through tcp_release_cb() if socket was owned by user
346  * at the time tcp_v4_err() was called to handle ICMP message.
347  */
348 void tcp_v4_mtu_reduced(struct sock *sk)
349 {
350 	struct inet_sock *inet = inet_sk(sk);
351 	struct dst_entry *dst;
352 	u32 mtu;
353 
354 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
355 		return;
356 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
357 	dst = inet_csk_update_pmtu(sk, mtu);
358 	if (!dst)
359 		return;
360 
361 	/* Something is about to go wrong... Remember the soft error
362 	 * in case this connection is not able to recover.
363 	 */
364 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
365 		sk->sk_err_soft = EMSGSIZE;
366 
367 	mtu = dst_mtu(dst);
368 
369 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
370 	    ip_sk_accept_pmtu(sk) &&
371 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
372 		tcp_sync_mss(sk, mtu);
373 
374 		/* Resend the TCP packet because it's
375 		 * clear that the old packet has been
376 		 * dropped. This is the new "fast" path mtu
377 		 * discovery.
378 		 */
379 		tcp_simple_retransmit(sk);
380 	} /* else let the usual retransmit timer handle it */
381 }
382 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
383 
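/* Hand an ICMP redirect to the dst cached on this socket, if any. */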
384 static void do_redirect(struct sk_buff *skb, struct sock *sk)
385 {
386 	struct dst_entry *dst = __sk_dst_check(sk, 0);
387 
388 	if (dst)
389 		dst->ops->redirect(dst, sk, skb);
390 }
391 
392 
393 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
394 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
395 {
396 	struct request_sock *req = inet_reqsk(sk);
397 	struct net *net = sock_net(sk);
398 
399 	/* ICMPs are not backlogged, hence we cannot get
400 	 * an established socket here.
401 	 */
402 	if (seq != tcp_rsk(req)->snt_isn) {
403 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
404 	} else if (abort) {
405 		/*
406 		 * Still in SYN_RECV, just remove it silently.
407 		 * There is no good way to pass the error to the newly
408 		 * created socket, and POSIX does not want network
409 		 * errors returned from accept().
410 		 */
411 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
412 		tcp_listendrop(req->rsk_listener);
413 	}
414 	reqsk_put(req);
415 }
416 EXPORT_SYMBOL(tcp_req_err);
417 
418 /* TCP-LD (RFC 6069) logic */
419 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
420 {
421 	struct inet_connection_sock *icsk = inet_csk(sk);
422 	struct tcp_sock *tp = tcp_sk(sk);
423 	struct sk_buff *skb;
424 	s32 remaining;
425 	u32 delta_us;
426 
427 	if (sock_owned_by_user(sk))
428 		return;
429 
430 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
431 	    !icsk->icsk_backoff)
432 		return;
433 
434 	skb = tcp_rtx_queue_head(sk);
435 	if (WARN_ON_ONCE(!skb))
436 		return;
437 
438 	icsk->icsk_backoff--;
439 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
440 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
441 
442 	tcp_mstamp_refresh(tp);
443 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
444 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
445 
446 	if (remaining > 0) {
447 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
448 					  remaining, TCP_RTO_MAX);
449 	} else {
450 		/* RTO revert clocked out retransmission.
451 		 * Will retransmit now.
452 		 */
453 		tcp_retransmit_timer(sk);
454 	}
455 }
456 EXPORT_SYMBOL(tcp_ld_RTO_revert);
457 
458 /*
459  * This routine is called by the ICMP module when it gets some
460  * sort of error condition.  If err < 0 then the socket should
461  * be closed and the error returned to the user.  If err > 0
462  * it's just the icmp type << 8 | icmp code.  After adjustment
463  * header points to the first 8 bytes of the tcp header.  We need
464  * to find the appropriate port.
465  *
466  * The locking strategy used here is very "optimistic". When
467  * someone else accesses the socket the ICMP is just dropped
468  * and for some paths there is no check at all.
469  * A more general error queue to queue errors for later handling
470  * is probably better.
471  *
472  */
473 
474 int tcp_v4_err(struct sk_buff *skb, u32 info)
475 {
476 	const struct iphdr *iph = (const struct iphdr *)skb->data;
477 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
478 	struct tcp_sock *tp;
479 	struct inet_sock *inet;
480 	const int type = icmp_hdr(skb)->type;
481 	const int code = icmp_hdr(skb)->code;
482 	struct sock *sk;
483 	struct request_sock *fastopen;
484 	u32 seq, snd_una;
485 	int err;
486 	struct net *net = dev_net(skb->dev);
487 
488 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
489 				       iph->daddr, th->dest, iph->saddr,
490 				       ntohs(th->source), inet_iif(skb), 0);
491 	if (!sk) {
492 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
493 		return -ENOENT;
494 	}
495 	if (sk->sk_state == TCP_TIME_WAIT) {
496 		inet_twsk_put(inet_twsk(sk));
497 		return 0;
498 	}
499 	seq = ntohl(th->seq);
500 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
501 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
502 				     type == ICMP_TIME_EXCEEDED ||
503 				     (type == ICMP_DEST_UNREACH &&
504 				      (code == ICMP_NET_UNREACH ||
505 				       code == ICMP_HOST_UNREACH)));
506 		return 0;
507 	}
508 
509 	bh_lock_sock(sk);
510 	/* If too many ICMPs get dropped on busy
511 	 * servers this needs to be solved differently.
512 	 * We do take care of the PMTU discovery (RFC1191) special case:
513 	 * we can receive locally generated ICMP messages while socket is held.
514 	 */
515 	if (sock_owned_by_user(sk)) {
516 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
517 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
518 	}
519 	if (sk->sk_state == TCP_CLOSE)
520 		goto out;
521 
522 	if (static_branch_unlikely(&ip4_min_ttl)) {
523 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
524 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
525 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
526 			goto out;
527 		}
528 	}
529 
530 	tp = tcp_sk(sk);
531 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
532 	fastopen = rcu_dereference(tp->fastopen_rsk);
533 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
534 	if (sk->sk_state != TCP_LISTEN &&
535 	    !between(seq, snd_una, tp->snd_nxt)) {
536 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
537 		goto out;
538 	}
539 
540 	switch (type) {
541 	case ICMP_REDIRECT:
542 		if (!sock_owned_by_user(sk))
543 			do_redirect(skb, sk);
544 		goto out;
545 	case ICMP_SOURCE_QUENCH:
546 		/* Just silently ignore these. */
547 		goto out;
548 	case ICMP_PARAMETERPROB:
549 		err = EPROTO;
550 		break;
551 	case ICMP_DEST_UNREACH:
552 		if (code > NR_ICMP_UNREACH)
553 			goto out;
554 
555 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
556 			/* We are not interested in TCP_LISTEN and open_requests
557 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
558 			 * they should go through unfragmented).
559 			 */
560 			if (sk->sk_state == TCP_LISTEN)
561 				goto out;
562 
563 			WRITE_ONCE(tp->mtu_info, info);
564 			if (!sock_owned_by_user(sk)) {
565 				tcp_v4_mtu_reduced(sk);
566 			} else {
567 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
568 					sock_hold(sk);
569 			}
570 			goto out;
571 		}
572 
573 		err = icmp_err_convert[code].errno;
574 		/* check if this ICMP message allows revert of backoff.
575 		 * (see RFC 6069)
576 		 */
577 		if (!fastopen &&
578 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
579 			tcp_ld_RTO_revert(sk, seq);
580 		break;
581 	case ICMP_TIME_EXCEEDED:
582 		err = EHOSTUNREACH;
583 		break;
584 	default:
585 		goto out;
586 	}
587 
588 	switch (sk->sk_state) {
589 	case TCP_SYN_SENT:
590 	case TCP_SYN_RECV:
591 		/* Only in fast or simultaneous open. If a fast open socket is
592 		 * already accepted it is treated as a connected one below.
593 		 */
594 		if (fastopen && !fastopen->sk)
595 			break;
596 
597 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
598 
599 		if (!sock_owned_by_user(sk)) {
600 			sk->sk_err = err;
601 
602 			sk_error_report(sk);
603 
604 			tcp_done(sk);
605 		} else {
606 			sk->sk_err_soft = err;
607 		}
608 		goto out;
609 	}
610 
611 	/* If we've already connected we will keep trying
612 	 * until we time out, or the user gives up.
613 	 *
614 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
615 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
616 	 * but it is obsoleted by pmtu discovery).
617 	 *
618 	 * Note that in the modern internet, where routing is unreliable
619 	 * and broken firewalls sit in every dark corner sending random
620 	 * errors ordered by their masters, even these two messages finally lose
621 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
622 	 *
623 	 * Now we are in compliance with RFCs.
624 	 *							--ANK (980905)
625 	 */
626 
627 	inet = inet_sk(sk);
628 	if (!sock_owned_by_user(sk) && inet->recverr) {
629 		sk->sk_err = err;
630 		sk_error_report(sk);
631 	} else	{ /* Only an error on timeout */
632 		sk->sk_err_soft = err;
633 	}
634 
635 out:
636 	bh_unlock_sock(sk);
637 	sock_put(sk);
638 	return 0;
639 }
640 
641 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
642 {
643 	struct tcphdr *th = tcp_hdr(skb);
644 
645 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
646 	skb->csum_start = skb_transport_header(skb) - skb->head;
647 	skb->csum_offset = offsetof(struct tcphdr, check);
648 }
649 
650 /* This routine computes an IPv4 TCP checksum. */
651 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
652 {
653 	const struct inet_sock *inet = inet_sk(sk);
654 
655 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
656 }
657 EXPORT_SYMBOL(tcp_v4_send_check);
658 
659 /*
660  *	This routine will send an RST to the other tcp.
661  *
662  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
663  *		      for the reset.
664  *	Answer: if a packet caused the RST, it is not for a socket
665  *		existing in our system; if it is matched to a socket,
666  *		it is just a duplicate segment or a bug in the other side's TCP.
667  *		So we build the reply based only on the parameters
668  *		that arrived with the segment.
669  *	Exception: precedence violation. We do not implement it in any case.
670  */
671 
672 #ifdef CONFIG_TCP_MD5SIG
673 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
674 #else
675 #define OPTION_BYTES sizeof(__be32)
676 #endif
677 
678 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
679 {
680 	const struct tcphdr *th = tcp_hdr(skb);
681 	struct {
682 		struct tcphdr th;
683 		__be32 opt[OPTION_BYTES / sizeof(__be32)];
684 	} rep;
685 	struct ip_reply_arg arg;
686 #ifdef CONFIG_TCP_MD5SIG
687 	struct tcp_md5sig_key *key = NULL;
688 	const __u8 *hash_location = NULL;
689 	unsigned char newhash[16];
690 	int genhash;
691 	struct sock *sk1 = NULL;
692 #endif
693 	u64 transmit_time = 0;
694 	struct sock *ctl_sk;
695 	struct net *net;
696 	u32 txhash = 0;
697 
698 	/* Never send a reset in response to a reset. */
699 	if (th->rst)
700 		return;
701 
702 	/* If sk not NULL, it means we did a successful lookup and incoming
703 	 * route had to be correct. prequeue might have dropped our dst.
704 	 */
705 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
706 		return;
707 
708 	/* Swap the send and the receive. */
709 	memset(&rep, 0, sizeof(rep));
710 	rep.th.dest   = th->source;
711 	rep.th.source = th->dest;
712 	rep.th.doff   = sizeof(struct tcphdr) / 4;
713 	rep.th.rst    = 1;
714 
715 	if (th->ack) {
716 		rep.th.seq = th->ack_seq;
717 	} else {
718 		rep.th.ack = 1;
719 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
720 				       skb->len - (th->doff << 2));
721 	}
722 
723 	memset(&arg, 0, sizeof(arg));
724 	arg.iov[0].iov_base = (unsigned char *)&rep;
725 	arg.iov[0].iov_len  = sizeof(rep.th);
726 
727 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
728 #ifdef CONFIG_TCP_MD5SIG
729 	rcu_read_lock();
730 	hash_location = tcp_parse_md5sig_option(th);
731 	if (sk && sk_fullsock(sk)) {
732 		const union tcp_md5_addr *addr;
733 		int l3index;
734 
735 		/* sdif set, means packet ingressed via a device
736 		 * in an L3 domain and inet_iif is set to it.
737 		 */
738 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
739 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
740 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
741 	} else if (hash_location) {
742 		const union tcp_md5_addr *addr;
743 		int sdif = tcp_v4_sdif(skb);
744 		int dif = inet_iif(skb);
745 		int l3index;
746 
747 		/*
748 		 * The active side is lost. Try to find the listening socket through
749 		 * the source port, and then find the md5 key through the listening
750 		 * socket. We do not lose security here:
751 		 * the incoming packet is checked with the md5 hash of the found key,
752 		 * and no RST is generated if the md5 hash doesn't match.
753 		 */
754 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
755 					     NULL, 0, ip_hdr(skb)->saddr,
756 					     th->source, ip_hdr(skb)->daddr,
757 					     ntohs(th->source), dif, sdif);
758 		/* don't send rst if it can't find key */
759 		if (!sk1)
760 			goto out;
761 
762 		/* sdif set, means packet ingressed via a device
763 		 * in an L3 domain and dif is set to it.
764 		 */
765 		l3index = sdif ? dif : 0;
766 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
767 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
768 		if (!key)
769 			goto out;
770 
771 
772 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
773 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
774 			goto out;
775 
776 	}
777 
778 	if (key) {
779 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
780 				   (TCPOPT_NOP << 16) |
781 				   (TCPOPT_MD5SIG << 8) |
782 				   TCPOLEN_MD5SIG);
783 		/* Update length and the length the header thinks exists */
784 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
785 		rep.th.doff = arg.iov[0].iov_len / 4;
786 
787 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
788 				     key, ip_hdr(skb)->saddr,
789 				     ip_hdr(skb)->daddr, &rep.th);
790 	}
791 #endif
792 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
793 	if (rep.opt[0] == 0) {
794 		__be32 mrst = mptcp_reset_option(skb);
795 
796 		if (mrst) {
797 			rep.opt[0] = mrst;
798 			arg.iov[0].iov_len += sizeof(mrst);
799 			rep.th.doff = arg.iov[0].iov_len / 4;
800 		}
801 	}
802 
803 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
804 				      ip_hdr(skb)->saddr, /* XXX */
805 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
806 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
807 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
808 
809 	/* When socket is gone, all binding information is lost.
810 	 * Routing might fail in this case. No choice here: if we choose to force
811 	 * the input interface, we will misroute in case of an asymmetric route.
812 	 */
813 	if (sk) {
814 		arg.bound_dev_if = sk->sk_bound_dev_if;
815 		if (sk_fullsock(sk))
816 			trace_tcp_send_reset(sk, skb);
817 	}
818 
819 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
820 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
821 
822 	arg.tos = ip_hdr(skb)->tos;
823 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
824 	local_bh_disable();
825 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
826 	sock_net_set(ctl_sk, net);
827 	if (sk) {
828 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
829 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
830 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
831 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
832 		transmit_time = tcp_transmit_time(sk);
833 		xfrm_sk_clone_policy(ctl_sk, sk);
834 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
835 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
836 	} else {
837 		ctl_sk->sk_mark = 0;
838 		ctl_sk->sk_priority = 0;
839 	}
840 	ip_send_unicast_reply(ctl_sk,
841 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
842 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
843 			      &arg, arg.iov[0].iov_len,
844 			      transmit_time, txhash);
845 
846 	xfrm_sk_free_policy(ctl_sk);
847 	sock_net_set(ctl_sk, &init_net);
848 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
849 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
850 	local_bh_enable();
851 
852 #ifdef CONFIG_TCP_MD5SIG
853 out:
854 	rcu_read_unlock();
855 #endif
856 }
857 
858 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
859    outside socket context, is ugly, certainly. What can I do?
860  */
861 
862 static void tcp_v4_send_ack(const struct sock *sk,
863 			    struct sk_buff *skb, u32 seq, u32 ack,
864 			    u32 win, u32 tsval, u32 tsecr, int oif,
865 			    struct tcp_md5sig_key *key,
866 			    int reply_flags, u8 tos, u32 txhash)
867 {
868 	const struct tcphdr *th = tcp_hdr(skb);
869 	struct {
870 		struct tcphdr th;
871 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
872 #ifdef CONFIG_TCP_MD5SIG
873 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
874 #endif
875 			];
876 	} rep;
877 	struct net *net = sock_net(sk);
878 	struct ip_reply_arg arg;
879 	struct sock *ctl_sk;
880 	u64 transmit_time;
881 
882 	memset(&rep.th, 0, sizeof(struct tcphdr));
883 	memset(&arg, 0, sizeof(arg));
884 
885 	arg.iov[0].iov_base = (unsigned char *)&rep;
886 	arg.iov[0].iov_len  = sizeof(rep.th);
887 	if (tsecr) {
888 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
889 				   (TCPOPT_TIMESTAMP << 8) |
890 				   TCPOLEN_TIMESTAMP);
891 		rep.opt[1] = htonl(tsval);
892 		rep.opt[2] = htonl(tsecr);
893 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
894 	}
895 
896 	/* Swap the send and the receive. */
897 	rep.th.dest    = th->source;
898 	rep.th.source  = th->dest;
899 	rep.th.doff    = arg.iov[0].iov_len / 4;
900 	rep.th.seq     = htonl(seq);
901 	rep.th.ack_seq = htonl(ack);
902 	rep.th.ack     = 1;
903 	rep.th.window  = htons(win);
904 
905 #ifdef CONFIG_TCP_MD5SIG
906 	if (key) {
907 		int offset = (tsecr) ? 3 : 0;
908 
909 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
910 					  (TCPOPT_NOP << 16) |
911 					  (TCPOPT_MD5SIG << 8) |
912 					  TCPOLEN_MD5SIG);
913 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
914 		rep.th.doff = arg.iov[0].iov_len/4;
915 
916 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
917 				    key, ip_hdr(skb)->saddr,
918 				    ip_hdr(skb)->daddr, &rep.th);
919 	}
920 #endif
921 	arg.flags = reply_flags;
922 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
923 				      ip_hdr(skb)->saddr, /* XXX */
924 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
925 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
926 	if (oif)
927 		arg.bound_dev_if = oif;
928 	arg.tos = tos;
929 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
930 	local_bh_disable();
931 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
932 	sock_net_set(ctl_sk, net);
933 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
934 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
935 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
936 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
937 	transmit_time = tcp_transmit_time(sk);
938 	ip_send_unicast_reply(ctl_sk,
939 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
940 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
941 			      &arg, arg.iov[0].iov_len,
942 			      transmit_time, txhash);
943 
944 	sock_net_set(ctl_sk, &init_net);
945 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
946 	local_bh_enable();
947 }
948 
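/* Answer a segment received for a TIME-WAIT socket with an ACK built from
 * the state kept in the timewait bucket: sequence numbers, scaled window,
 * timestamps and, when configured, the MD5 key.
 */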
949 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
950 {
951 	struct inet_timewait_sock *tw = inet_twsk(sk);
952 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
953 
954 	tcp_v4_send_ack(sk, skb,
955 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
956 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
957 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
958 			tcptw->tw_ts_recent,
959 			tw->tw_bound_dev_if,
960 			tcp_twsk_md5_key(tcptw),
961 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
962 			tw->tw_tos,
963 			tw->tw_txhash
964 			);
965 
966 	inet_twsk_put(tw);
967 }
968 
969 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
970 				  struct request_sock *req)
971 {
972 	const union tcp_md5_addr *addr;
973 	int l3index;
974 
975 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
976 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
977 	 */
978 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
979 					     tcp_sk(sk)->snd_nxt;
980 
981 	/* RFC 7323 2.3
982 	 * The window field (SEG.WND) of every outgoing segment, with the
983 	 * exception of <SYN> segments, MUST be right-shifted by
984 	 * Rcv.Wind.Shift bits:
985 	 */
986 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
987 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
988 	tcp_v4_send_ack(sk, skb, seq,
989 			tcp_rsk(req)->rcv_nxt,
990 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
991 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
992 			READ_ONCE(req->ts_recent),
993 			0,
994 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
995 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
996 			ip_hdr(skb)->tos,
997 			READ_ONCE(tcp_rsk(req)->txhash));
998 }
999 
1000 /*
1001  *	Send a SYN-ACK after having received a SYN.
1002  *	This still operates on a request_sock only, not on a big
1003  *	socket.
1004  */
1005 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1006 			      struct flowi *fl,
1007 			      struct request_sock *req,
1008 			      struct tcp_fastopen_cookie *foc,
1009 			      enum tcp_synack_type synack_type,
1010 			      struct sk_buff *syn_skb)
1011 {
1012 	const struct inet_request_sock *ireq = inet_rsk(req);
1013 	struct flowi4 fl4;
1014 	int err = -1;
1015 	struct sk_buff *skb;
1016 	u8 tos;
1017 
1018 	/* First, grab a route. */
1019 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1020 		return -1;
1021 
1022 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1023 
1024 	if (skb) {
1025 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1026 
1027 		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1028 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1029 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1030 				inet_sk(sk)->tos;
1031 
1032 		if (!INET_ECN_is_capable(tos) &&
1033 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1034 			tos |= INET_ECN_ECT_0;
1035 
1036 		rcu_read_lock();
1037 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1038 					    ireq->ir_rmt_addr,
1039 					    rcu_dereference(ireq->ireq_opt),
1040 					    tos);
1041 		rcu_read_unlock();
1042 		err = net_xmit_eval(err);
1043 	}
1044 
1045 	return err;
1046 }
1047 
1048 /*
1049  *	IPv4 request_sock destructor.
1050  */
1051 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1052 {
1053 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1054 }
1055 
1056 #ifdef CONFIG_TCP_MD5SIG
1057 /*
1058  * RFC2385 MD5 checksumming requires a mapping of
1059  * IP address->MD5 Key.
1060  * We need to maintain these in the sk structure.
1061  */
1062 
1063 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1064 EXPORT_SYMBOL(tcp_md5_needed);
1065 
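/* Prefer the more specific key: one bound to an L3 (VRF) domain beats one
 * that is not, and a longer address prefix beats a shorter one.
 */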
1066 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1067 {
1068 	if (!old)
1069 		return true;
1070 
1071 	/* l3index always overrides non-l3index */
1072 	if (old->l3index && new->l3index == 0)
1073 		return false;
1074 	if (old->l3index == 0 && new->l3index)
1075 		return true;
1076 
1077 	return old->prefixlen < new->prefixlen;
1078 }
1079 
1080 /* Find the Key structure for an address.  */
1081 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1082 					   const union tcp_md5_addr *addr,
1083 					   int family)
1084 {
1085 	const struct tcp_sock *tp = tcp_sk(sk);
1086 	struct tcp_md5sig_key *key;
1087 	const struct tcp_md5sig_info *md5sig;
1088 	__be32 mask;
1089 	struct tcp_md5sig_key *best_match = NULL;
1090 	bool match;
1091 
1092 	/* caller either holds rcu_read_lock() or socket lock */
1093 	md5sig = rcu_dereference_check(tp->md5sig_info,
1094 				       lockdep_sock_is_held(sk));
1095 	if (!md5sig)
1096 		return NULL;
1097 
1098 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1099 				 lockdep_sock_is_held(sk)) {
1100 		if (key->family != family)
1101 			continue;
1102 		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1103 			continue;
1104 		if (family == AF_INET) {
1105 			mask = inet_make_mask(key->prefixlen);
1106 			match = (key->addr.a4.s_addr & mask) ==
1107 				(addr->a4.s_addr & mask);
1108 #if IS_ENABLED(CONFIG_IPV6)
1109 		} else if (family == AF_INET6) {
1110 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1111 						  key->prefixlen);
1112 #endif
1113 		} else {
1114 			match = false;
1115 		}
1116 
1117 		if (match && better_md5_match(best_match, key))
1118 			best_match = key;
1119 	}
1120 	return best_match;
1121 }
1122 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1123 
1124 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1125 						      const union tcp_md5_addr *addr,
1126 						      int family, u8 prefixlen,
1127 						      int l3index, u8 flags)
1128 {
1129 	const struct tcp_sock *tp = tcp_sk(sk);
1130 	struct tcp_md5sig_key *key;
1131 	unsigned int size = sizeof(struct in_addr);
1132 	const struct tcp_md5sig_info *md5sig;
1133 
1134 	/* caller either holds rcu_read_lock() or socket lock */
1135 	md5sig = rcu_dereference_check(tp->md5sig_info,
1136 				       lockdep_sock_is_held(sk));
1137 	if (!md5sig)
1138 		return NULL;
1139 #if IS_ENABLED(CONFIG_IPV6)
1140 	if (family == AF_INET6)
1141 		size = sizeof(struct in6_addr);
1142 #endif
1143 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1144 				 lockdep_sock_is_held(sk)) {
1145 		if (key->family != family)
1146 			continue;
1147 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1148 			continue;
1149 		if (key->l3index != l3index)
1150 			continue;
1151 		if (!memcmp(&key->addr, addr, size) &&
1152 		    key->prefixlen == prefixlen)
1153 			return key;
1154 	}
1155 	return NULL;
1156 }
1157 
1158 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1159 					 const struct sock *addr_sk)
1160 {
1161 	const union tcp_md5_addr *addr;
1162 	int l3index;
1163 
1164 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1165 						 addr_sk->sk_bound_dev_if);
1166 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1167 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1168 }
1169 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1170 
1171 /* This can be called on a newly created socket, from other files */
1172 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1173 		   int family, u8 prefixlen, int l3index, u8 flags,
1174 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1175 {
1176 	/* Add Key to the list */
1177 	struct tcp_md5sig_key *key;
1178 	struct tcp_sock *tp = tcp_sk(sk);
1179 	struct tcp_md5sig_info *md5sig;
1180 
1181 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1182 	if (key) {
1183 		/* Pre-existing entry - just update that one.
1184 		 * Note that the key might be used concurrently.
1185 		 * data_race() is telling kcsan that we do not care of
1186 		 * key mismatches, since changing MD5 key on live flows
1187 		 * can lead to packet drops.
1188 		 */
1189 		data_race(memcpy(key->key, newkey, newkeylen));
1190 
1191 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1192 		 * Also note that a reader could catch new key->keylen value
1193 		 * but old key->key[], this is the reason we use __GFP_ZERO
1194 		 * at sock_kmalloc() time below these lines.
1195 		 */
1196 		WRITE_ONCE(key->keylen, newkeylen);
1197 
1198 		return 0;
1199 	}
1200 
1201 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1202 					   lockdep_sock_is_held(sk));
1203 	if (!md5sig) {
1204 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1205 		if (!md5sig)
1206 			return -ENOMEM;
1207 
1208 		sk_gso_disable(sk);
1209 		INIT_HLIST_HEAD(&md5sig->head);
1210 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1211 	}
1212 
1213 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1214 	if (!key)
1215 		return -ENOMEM;
1216 	if (!tcp_alloc_md5sig_pool()) {
1217 		sock_kfree_s(sk, key, sizeof(*key));
1218 		return -ENOMEM;
1219 	}
1220 
1221 	memcpy(key->key, newkey, newkeylen);
1222 	key->keylen = newkeylen;
1223 	key->family = family;
1224 	key->prefixlen = prefixlen;
1225 	key->l3index = l3index;
1226 	key->flags = flags;
1227 	memcpy(&key->addr, addr,
1228 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1229 								 sizeof(struct in_addr));
1230 	hlist_add_head_rcu(&key->node, &md5sig->head);
1231 	return 0;
1232 }
1233 EXPORT_SYMBOL(tcp_md5_do_add);
1234 
1235 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1236 		   u8 prefixlen, int l3index, u8 flags)
1237 {
1238 	struct tcp_md5sig_key *key;
1239 
1240 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1241 	if (!key)
1242 		return -ENOENT;
1243 	hlist_del_rcu(&key->node);
1244 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1245 	kfree_rcu(key, rcu);
1246 	return 0;
1247 }
1248 EXPORT_SYMBOL(tcp_md5_do_del);
1249 
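/* Remove and free every MD5 key attached to this socket. */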
1250 static void tcp_clear_md5_list(struct sock *sk)
1251 {
1252 	struct tcp_sock *tp = tcp_sk(sk);
1253 	struct tcp_md5sig_key *key;
1254 	struct hlist_node *n;
1255 	struct tcp_md5sig_info *md5sig;
1256 
1257 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1258 
1259 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1260 		hlist_del_rcu(&key->node);
1261 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1262 		kfree_rcu(key, rcu);
1263 	}
1264 }
1265 
1266 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1267 				 sockptr_t optval, int optlen)
1268 {
1269 	struct tcp_md5sig cmd;
1270 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1271 	const union tcp_md5_addr *addr;
1272 	u8 prefixlen = 32;
1273 	int l3index = 0;
1274 	u8 flags;
1275 
1276 	if (optlen < sizeof(cmd))
1277 		return -EINVAL;
1278 
1279 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1280 		return -EFAULT;
1281 
1282 	if (sin->sin_family != AF_INET)
1283 		return -EINVAL;
1284 
1285 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1286 
1287 	if (optname == TCP_MD5SIG_EXT &&
1288 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1289 		prefixlen = cmd.tcpm_prefixlen;
1290 		if (prefixlen > 32)
1291 			return -EINVAL;
1292 	}
1293 
1294 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1295 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1296 		struct net_device *dev;
1297 
1298 		rcu_read_lock();
1299 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1300 		if (dev && netif_is_l3_master(dev))
1301 			l3index = dev->ifindex;
1302 
1303 		rcu_read_unlock();
1304 
1305 		/* ok to reference set/not set outside of rcu;
1306 		 * right now device MUST be an L3 master
1307 		 */
1308 		if (!dev || !l3index)
1309 			return -EINVAL;
1310 	}
1311 
1312 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1313 
1314 	if (!cmd.tcpm_keylen)
1315 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1316 
1317 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1318 		return -EINVAL;
1319 
1320 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1321 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1322 }
1323 
1324 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1325 				   __be32 daddr, __be32 saddr,
1326 				   const struct tcphdr *th, int nbytes)
1327 {
1328 	struct tcp4_pseudohdr *bp;
1329 	struct scatterlist sg;
1330 	struct tcphdr *_th;
1331 
1332 	bp = hp->scratch;
1333 	bp->saddr = saddr;
1334 	bp->daddr = daddr;
1335 	bp->pad = 0;
1336 	bp->protocol = IPPROTO_TCP;
1337 	bp->len = cpu_to_be16(nbytes);
1338 
1339 	_th = (struct tcphdr *)(bp + 1);
1340 	memcpy(_th, th, sizeof(*th));
1341 	_th->check = 0;
1342 
1343 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1344 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1345 				sizeof(*bp) + sizeof(*th));
1346 	return crypto_ahash_update(hp->md5_req);
1347 }
1348 
1349 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1350 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1351 {
1352 	struct tcp_md5sig_pool *hp;
1353 	struct ahash_request *req;
1354 
1355 	hp = tcp_get_md5sig_pool();
1356 	if (!hp)
1357 		goto clear_hash_noput;
1358 	req = hp->md5_req;
1359 
1360 	if (crypto_ahash_init(req))
1361 		goto clear_hash;
1362 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1363 		goto clear_hash;
1364 	if (tcp_md5_hash_key(hp, key))
1365 		goto clear_hash;
1366 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1367 	if (crypto_ahash_final(req))
1368 		goto clear_hash;
1369 
1370 	tcp_put_md5sig_pool();
1371 	return 0;
1372 
1373 clear_hash:
1374 	tcp_put_md5sig_pool();
1375 clear_hash_noput:
1376 	memset(md5_hash, 0, 16);
1377 	return 1;
1378 }
1379 
1380 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1381 			const struct sock *sk,
1382 			const struct sk_buff *skb)
1383 {
1384 	struct tcp_md5sig_pool *hp;
1385 	struct ahash_request *req;
1386 	const struct tcphdr *th = tcp_hdr(skb);
1387 	__be32 saddr, daddr;
1388 
1389 	if (sk) { /* valid for establish/request sockets */
1390 		saddr = sk->sk_rcv_saddr;
1391 		daddr = sk->sk_daddr;
1392 	} else {
1393 		const struct iphdr *iph = ip_hdr(skb);
1394 		saddr = iph->saddr;
1395 		daddr = iph->daddr;
1396 	}
1397 
1398 	hp = tcp_get_md5sig_pool();
1399 	if (!hp)
1400 		goto clear_hash_noput;
1401 	req = hp->md5_req;
1402 
1403 	if (crypto_ahash_init(req))
1404 		goto clear_hash;
1405 
1406 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1407 		goto clear_hash;
1408 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1409 		goto clear_hash;
1410 	if (tcp_md5_hash_key(hp, key))
1411 		goto clear_hash;
1412 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1413 	if (crypto_ahash_final(req))
1414 		goto clear_hash;
1415 
1416 	tcp_put_md5sig_pool();
1417 	return 0;
1418 
1419 clear_hash:
1420 	tcp_put_md5sig_pool();
1421 clear_hash_noput:
1422 	memset(md5_hash, 0, 16);
1423 	return 1;
1424 }
1425 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1426 
1427 #endif
1428 
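/* Fill in the IPv4-specific fields of a new request sock: the address pair
 * taken from the incoming SYN and any IP options saved from it.
 */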
1429 static void tcp_v4_init_req(struct request_sock *req,
1430 			    const struct sock *sk_listener,
1431 			    struct sk_buff *skb)
1432 {
1433 	struct inet_request_sock *ireq = inet_rsk(req);
1434 	struct net *net = sock_net(sk_listener);
1435 
1436 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1437 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1438 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1439 }
1440 
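/* Initialize the request sock, run the security hook and return a route
 * for the SYN-ACK, or NULL if the security hook rejects the request.
 */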
1441 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1442 					  struct sk_buff *skb,
1443 					  struct flowi *fl,
1444 					  struct request_sock *req)
1445 {
1446 	tcp_v4_init_req(req, sk, skb);
1447 
1448 	if (security_inet_conn_request(sk, skb, req))
1449 		return NULL;
1450 
1451 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1452 }
1453 
1454 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1455 	.family		=	PF_INET,
1456 	.obj_size	=	sizeof(struct tcp_request_sock),
1457 	.rtx_syn_ack	=	tcp_rtx_synack,
1458 	.send_ack	=	tcp_v4_reqsk_send_ack,
1459 	.destructor	=	tcp_v4_reqsk_destructor,
1460 	.send_reset	=	tcp_v4_send_reset,
1461 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1462 };
1463 
1464 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1465 	.mss_clamp	=	TCP_MSS_DEFAULT,
1466 #ifdef CONFIG_TCP_MD5SIG
1467 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1468 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1469 #endif
1470 #ifdef CONFIG_SYN_COOKIES
1471 	.cookie_init_seq =	cookie_v4_init_sequence,
1472 #endif
1473 	.route_req	=	tcp_v4_route_req,
1474 	.init_seq	=	tcp_v4_init_seq,
1475 	.init_ts_off	=	tcp_v4_init_ts_off,
1476 	.send_synack	=	tcp_v4_send_synack,
1477 };
1478 
1479 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1480 {
1481 	/* Never answer SYNs sent to broadcast or multicast */
1482 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1483 		goto drop;
1484 
1485 	return tcp_conn_request(&tcp_request_sock_ops,
1486 				&tcp_request_sock_ipv4_ops, sk, skb);
1487 
1488 drop:
1489 	tcp_listendrop(sk);
1490 	return 0;
1491 }
1492 EXPORT_SYMBOL(tcp_v4_conn_request);
1493 
1494 
1495 /*
1496  * The three way handshake has completed - we got a valid synack -
1497  * now create the new socket.
1498  */
1499 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1500 				  struct request_sock *req,
1501 				  struct dst_entry *dst,
1502 				  struct request_sock *req_unhash,
1503 				  bool *own_req)
1504 {
1505 	struct inet_request_sock *ireq;
1506 	bool found_dup_sk = false;
1507 	struct inet_sock *newinet;
1508 	struct tcp_sock *newtp;
1509 	struct sock *newsk;
1510 #ifdef CONFIG_TCP_MD5SIG
1511 	const union tcp_md5_addr *addr;
1512 	struct tcp_md5sig_key *key;
1513 	int l3index;
1514 #endif
1515 	struct ip_options_rcu *inet_opt;
1516 
1517 	if (sk_acceptq_is_full(sk))
1518 		goto exit_overflow;
1519 
1520 	newsk = tcp_create_openreq_child(sk, req, skb);
1521 	if (!newsk)
1522 		goto exit_nonewsk;
1523 
1524 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1525 	inet_sk_rx_dst_set(newsk, skb);
1526 
1527 	newtp		      = tcp_sk(newsk);
1528 	newinet		      = inet_sk(newsk);
1529 	ireq		      = inet_rsk(req);
1530 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1531 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1532 	newsk->sk_bound_dev_if = ireq->ir_iif;
1533 	newinet->inet_saddr   = ireq->ir_loc_addr;
1534 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1535 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1536 	newinet->mc_index     = inet_iif(skb);
1537 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1538 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1539 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1540 	if (inet_opt)
1541 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1542 	newinet->inet_id = get_random_u16();
1543 
1544 	/* Set ToS of the new socket based upon the value of incoming SYN.
1545 	 * ECT bits are set later in tcp_init_transfer().
1546 	 */
1547 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1548 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1549 
1550 	if (!dst) {
1551 		dst = inet_csk_route_child_sock(sk, newsk, req);
1552 		if (!dst)
1553 			goto put_and_exit;
1554 	} else {
1555 		/* syncookie case : see end of cookie_v4_check() */
1556 	}
1557 	sk_setup_caps(newsk, dst);
1558 
1559 	tcp_ca_openreq_child(newsk, dst);
1560 
1561 	tcp_sync_mss(newsk, dst_mtu(dst));
1562 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1563 
1564 	tcp_initialize_rcv_mss(newsk);
1565 
1566 #ifdef CONFIG_TCP_MD5SIG
1567 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1568 	/* Copy over the MD5 key from the original socket */
1569 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1570 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1571 	if (key) {
1572 		/*
1573 		 * We're using one, so create a matching key
1574 		 * on the newsk structure. If we fail to get
1575 		 * memory, then we end up not copying the key
1576 		 * across. Shucks.
1577 		 */
1578 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1579 			       key->key, key->keylen, GFP_ATOMIC);
1580 		sk_gso_disable(newsk);
1581 	}
1582 #endif
1583 
1584 	if (__inet_inherit_port(sk, newsk) < 0)
1585 		goto put_and_exit;
1586 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1587 				       &found_dup_sk);
1588 	if (likely(*own_req)) {
1589 		tcp_move_syn(newtp, req);
1590 		ireq->ireq_opt = NULL;
1591 	} else {
1592 		newinet->inet_opt = NULL;
1593 
1594 		if (!req_unhash && found_dup_sk) {
1595 			/* This code path should only be executed in the
1596 			 * syncookie case
1597 			 */
1598 			bh_unlock_sock(newsk);
1599 			sock_put(newsk);
1600 			newsk = NULL;
1601 		}
1602 	}
1603 	return newsk;
1604 
1605 exit_overflow:
1606 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1607 exit_nonewsk:
1608 	dst_release(dst);
1609 exit:
1610 	tcp_listendrop(sk);
1611 	return NULL;
1612 put_and_exit:
1613 	newinet->inet_opt = NULL;
1614 	inet_csk_prepare_forced_close(newsk);
1615 	tcp_done(newsk);
1616 	goto exit;
1617 }
1618 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1619 
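/* When syncookies are enabled, a non-SYN segment arriving on a listener is
 * handed to cookie_v4_check(), which may return a newly created child socket.
 */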
1620 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1621 {
1622 #ifdef CONFIG_SYN_COOKIES
1623 	const struct tcphdr *th = tcp_hdr(skb);
1624 
1625 	if (!th->syn)
1626 		sk = cookie_v4_check(sk, skb);
1627 #endif
1628 	return sk;
1629 }
1630 
1631 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1632 			 struct tcphdr *th, u32 *cookie)
1633 {
1634 	u16 mss = 0;
1635 #ifdef CONFIG_SYN_COOKIES
1636 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1637 				    &tcp_request_sock_ipv4_ops, sk, th);
1638 	if (mss) {
1639 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1640 		tcp_synq_overflow(sk);
1641 	}
1642 #endif
1643 	return mss;
1644 }
1645 
1646 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1647 							   u32));
1648 /* The socket must have its spinlock held when we get
1649  * here, unless it is a TCP_LISTEN socket.
1650  *
1651  * We have a potential double-lock case here, so even when
1652  * doing backlog processing we use the BH locking scheme.
1653  * This is because we cannot sleep with the original spinlock
1654  * held.
1655  */
1656 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1657 {
1658 	enum skb_drop_reason reason;
1659 	struct sock *rsk;
1660 
1661 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1662 		struct dst_entry *dst;
1663 
1664 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1665 						lockdep_sock_is_held(sk));
1666 
1667 		sock_rps_save_rxhash(sk, skb);
1668 		sk_mark_napi_id(sk, skb);
1669 		if (dst) {
1670 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1671 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1672 					     dst, 0)) {
1673 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1674 				dst_release(dst);
1675 			}
1676 		}
1677 		tcp_rcv_established(sk, skb);
1678 		return 0;
1679 	}
1680 
1681 	reason = SKB_DROP_REASON_NOT_SPECIFIED;
1682 	if (tcp_checksum_complete(skb))
1683 		goto csum_err;
1684 
1685 	if (sk->sk_state == TCP_LISTEN) {
1686 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1687 
1688 		if (!nsk)
1689 			goto discard;
1690 		if (nsk != sk) {
1691 			if (tcp_child_process(sk, nsk, skb)) {
1692 				rsk = nsk;
1693 				goto reset;
1694 			}
1695 			return 0;
1696 		}
1697 	} else
1698 		sock_rps_save_rxhash(sk, skb);
1699 
1700 	if (tcp_rcv_state_process(sk, skb)) {
1701 		rsk = sk;
1702 		goto reset;
1703 	}
1704 	return 0;
1705 
1706 reset:
1707 	tcp_v4_send_reset(rsk, skb);
1708 discard:
1709 	kfree_skb_reason(skb, reason);
1710 	/* Be careful here. If this function gets more complicated and
1711 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1712 	 * might be destroyed here. This current version compiles correctly,
1713 	 * but you have been warned.
1714 	 */
1715 	return 0;
1716 
1717 csum_err:
1718 	reason = SKB_DROP_REASON_TCP_CSUM;
1719 	trace_tcp_bad_csum(skb);
1720 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1721 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1722 	goto discard;
1723 }
1724 EXPORT_SYMBOL(tcp_v4_do_rcv);
1725 
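/* Early demux: called on the IPv4 receive path before the routing decision.
 * If an established socket is found, attach it to the skb and reuse its
 * cached rx dst, saving a full route lookup on the hot path.
 */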
1726 int tcp_v4_early_demux(struct sk_buff *skb)
1727 {
1728 	struct net *net = dev_net(skb->dev);
1729 	const struct iphdr *iph;
1730 	const struct tcphdr *th;
1731 	struct sock *sk;
1732 
1733 	if (skb->pkt_type != PACKET_HOST)
1734 		return 0;
1735 
1736 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1737 		return 0;
1738 
1739 	iph = ip_hdr(skb);
1740 	th = tcp_hdr(skb);
1741 
1742 	if (th->doff < sizeof(struct tcphdr) / 4)
1743 		return 0;
1744 
1745 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1746 				       iph->saddr, th->source,
1747 				       iph->daddr, ntohs(th->dest),
1748 				       skb->skb_iif, inet_sdif(skb));
1749 	if (sk) {
1750 		skb->sk = sk;
1751 		skb->destructor = sock_edemux;
1752 		if (sk_fullsock(sk)) {
1753 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1754 
1755 			if (dst)
1756 				dst = dst_check(dst, 0);
1757 			if (dst &&
1758 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1759 				skb_dst_set_noref(skb, dst);
1760 		}
1761 	}
1762 	return 0;
1763 }
1764 
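/* Called with the socket spinlock held while the socket is owned by user
 * context.  First try to coalesce the segment into the backlog tail to keep
 * the backlog short; otherwise queue it, or drop it (unlocking the socket
 * and returning true with *reason set) if the backlog limit is exceeded.
 */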
1765 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1766 		     enum skb_drop_reason *reason)
1767 {
1768 	u32 limit, tail_gso_size, tail_gso_segs;
1769 	struct skb_shared_info *shinfo;
1770 	const struct tcphdr *th;
1771 	struct tcphdr *thtail;
1772 	struct sk_buff *tail;
1773 	unsigned int hdrlen;
1774 	bool fragstolen;
1775 	u32 gso_segs;
1776 	u32 gso_size;
1777 	int delta;
1778 
1779 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1780 	 * we can fix skb->truesize to its real value to avoid future drops.
1781 	 * This is valid because skb is not yet charged to the socket.
1782 	 * It has been noticed that pure SACK packets were sometimes dropped
1783 	 * (if cooked by drivers without the copybreak feature).
1784 	 */
1785 	skb_condense(skb);
1786 
1787 	skb_dst_drop(skb);
1788 
1789 	if (unlikely(tcp_checksum_complete(skb))) {
1790 		bh_unlock_sock(sk);
1791 		trace_tcp_bad_csum(skb);
1792 		*reason = SKB_DROP_REASON_TCP_CSUM;
1793 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1794 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1795 		return true;
1796 	}
1797 
1798 	/* Attempt coalescing to last skb in backlog, even if we are
1799 	 * above the limits.
1800 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1801 	 */
1802 	th = (const struct tcphdr *)skb->data;
1803 	hdrlen = th->doff * 4;
1804 
1805 	tail = sk->sk_backlog.tail;
1806 	if (!tail)
1807 		goto no_coalesce;
1808 	thtail = (struct tcphdr *)tail->data;
1809 
1810 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1811 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1812 	    ((TCP_SKB_CB(tail)->tcp_flags |
1813 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1814 	    !((TCP_SKB_CB(tail)->tcp_flags &
1815 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1816 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1817 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1818 #ifdef CONFIG_TLS_DEVICE
1819 	    tail->decrypted != skb->decrypted ||
1820 #endif
1821 	    !mptcp_skb_can_collapse(tail, skb) ||
1822 	    thtail->doff != th->doff ||
1823 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1824 		goto no_coalesce;
1825 
1826 	__skb_pull(skb, hdrlen);
1827 
1828 	shinfo = skb_shinfo(skb);
1829 	gso_size = shinfo->gso_size ?: skb->len;
1830 	gso_segs = shinfo->gso_segs ?: 1;
1831 
1832 	shinfo = skb_shinfo(tail);
1833 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1834 	tail_gso_segs = shinfo->gso_segs ?: 1;
1835 
1836 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1837 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1838 
1839 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1840 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1841 			thtail->window = th->window;
1842 		}
1843 
1844 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1845 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1846 		 * is not entered if we append a packet with a FIN.
1847 		 * SYN, RST, URG are not present.
1848 		 * ACK is set on both packets.
1849 		 * PSH : we do not really care in TCP stack,
1850 		 *       at least for 'GRO' packets.
1851 		 */
1852 		thtail->fin |= th->fin;
1853 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1854 
1855 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1856 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1857 			tail->tstamp = skb->tstamp;
1858 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1859 		}
1860 
1861 		/* Not as strict as GRO. We only need to carry mss max value */
1862 		shinfo->gso_size = max(gso_size, tail_gso_size);
1863 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1864 
1865 		sk->sk_backlog.len += delta;
1866 		__NET_INC_STATS(sock_net(sk),
1867 				LINUX_MIB_TCPBACKLOGCOALESCE);
1868 		kfree_skb_partial(skb, fragstolen);
1869 		return false;
1870 	}
1871 	__skb_push(skb, hdrlen);
1872 
1873 no_coalesce:
1874 	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1875 
1876 	/* Only socket owner can try to collapse/prune rx queues
1877 	 * to reduce memory overhead, so add a little headroom here.
1878 	 * Only a few socket backlogs are likely to be non-empty at any given time.
1879 	 */
1880 	limit += 64 * 1024;
1881 
1882 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1883 		bh_unlock_sock(sk);
1884 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1885 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1886 		return true;
1887 	}
1888 	return false;
1889 }
1890 EXPORT_SYMBOL(tcp_add_backlog);
1891 
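/* Run the socket filter (e.g. an attached (e)BPF program), but never allow
 * it to trim the skb below the full TCP header, including options.
 */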
1892 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1893 {
1894 	struct tcphdr *th = (struct tcphdr *)skb->data;
1895 
1896 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1897 }
1898 EXPORT_SYMBOL(tcp_filter);
1899 
1900 static void tcp_v4_restore_cb(struct sk_buff *skb)
1901 {
1902 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1903 		sizeof(struct inet_skb_parm));
1904 }
1905 
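/* tcp_v4_fill_cb() saves the IP control block inside TCP_SKB_CB() and fills
 * in the TCP-specific fields; tcp_v4_restore_cb() above copies IPCB() back
 * before the skb is handed to code that expects the IP control block
 * (e.g. after falling back to a fresh listener lookup).
 */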
1906 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1907 			   const struct tcphdr *th)
1908 {
1909 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1910 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1911 	 */
1912 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1913 		sizeof(struct inet_skb_parm));
1914 	barrier();
1915 
1916 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1917 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1918 				    skb->len - th->doff * 4);
1919 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1920 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1921 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1922 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1923 	TCP_SKB_CB(skb)->sacked	 = 0;
1924 	TCP_SKB_CB(skb)->has_rxtstamp =
1925 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1926 }
1927 
1928 /*
1929  *	From tcp_input.c
1930  */
1931 
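/* Main IPv4 TCP receive entry point, called from the IP layer for every TCP
 * segment addressed to this host: validate header and checksum, look up the
 * owning socket, handle TCP_NEW_SYN_RECV and TIME_WAIT minisocks, then
 * either process the segment directly or queue it on the socket backlog
 * when the socket is owned by user context.
 */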
1932 int tcp_v4_rcv(struct sk_buff *skb)
1933 {
1934 	struct net *net = dev_net(skb->dev);
1935 	enum skb_drop_reason drop_reason;
1936 	int sdif = inet_sdif(skb);
1937 	int dif = inet_iif(skb);
1938 	const struct iphdr *iph;
1939 	const struct tcphdr *th;
1940 	bool refcounted;
1941 	struct sock *sk;
1942 	int ret;
1943 
1944 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1945 	if (skb->pkt_type != PACKET_HOST)
1946 		goto discard_it;
1947 
1948 	/* Count it even if it's bad */
1949 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1950 
1951 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1952 		goto discard_it;
1953 
1954 	th = (const struct tcphdr *)skb->data;
1955 
1956 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1957 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1958 		goto bad_packet;
1959 	}
1960 	if (!pskb_may_pull(skb, th->doff * 4))
1961 		goto discard_it;
1962 
1963 	/* An explanation is required here, I think.
1964 	 * Packet length and doff are validated by header prediction,
1965 	 * provided the case of th->doff == 0 is eliminated.
1966 	 * So, we defer the checks. */
1967 
1968 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1969 		goto csum_error;
1970 
1971 	th = (const struct tcphdr *)skb->data;
1972 	iph = ip_hdr(skb);
1973 lookup:
1974 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
1975 			       skb, __tcp_hdrlen(th), th->source,
1976 			       th->dest, sdif, &refcounted);
1977 	if (!sk)
1978 		goto no_tcp_socket;
1979 
1980 process:
1981 	if (sk->sk_state == TCP_TIME_WAIT)
1982 		goto do_time_wait;
1983 
1984 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1985 		struct request_sock *req = inet_reqsk(sk);
1986 		bool req_stolen = false;
1987 		struct sock *nsk;
1988 
1989 		sk = req->rsk_listener;
1990 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1991 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1992 		else
1993 			drop_reason = tcp_inbound_md5_hash(sk, skb,
1994 						   &iph->saddr, &iph->daddr,
1995 						   AF_INET, dif, sdif);
1996 		if (unlikely(drop_reason)) {
1997 			sk_drops_add(sk, skb);
1998 			reqsk_put(req);
1999 			goto discard_it;
2000 		}
2001 		if (tcp_checksum_complete(skb)) {
2002 			reqsk_put(req);
2003 			goto csum_error;
2004 		}
2005 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2006 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2007 			if (!nsk) {
2008 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2009 				goto lookup;
2010 			}
2011 			sk = nsk;
2012 			/* reuseport_migrate_sock() has already held one sk_refcnt
2013 			 * before returning.
2014 			 */
2015 		} else {
2016 			/* We own a reference on the listener, increase it again
2017 			 * as we might lose it too soon.
2018 			 */
2019 			sock_hold(sk);
2020 		}
2021 		refcounted = true;
2022 		nsk = NULL;
2023 		if (!tcp_filter(sk, skb)) {
2024 			th = (const struct tcphdr *)skb->data;
2025 			iph = ip_hdr(skb);
2026 			tcp_v4_fill_cb(skb, iph, th);
2027 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2028 		} else {
2029 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2030 		}
2031 		if (!nsk) {
2032 			reqsk_put(req);
2033 			if (req_stolen) {
2034 				/* Another cpu got exclusive access to req
2035 				 * and created a full blown socket.
2036 				 * Try to feed this packet to this socket
2037 				 * instead of discarding it.
2038 				 */
2039 				tcp_v4_restore_cb(skb);
2040 				sock_put(sk);
2041 				goto lookup;
2042 			}
2043 			goto discard_and_relse;
2044 		}
2045 		nf_reset_ct(skb);
2046 		if (nsk == sk) {
2047 			reqsk_put(req);
2048 			tcp_v4_restore_cb(skb);
2049 		} else if (tcp_child_process(sk, nsk, skb)) {
2050 			tcp_v4_send_reset(nsk, skb);
2051 			goto discard_and_relse;
2052 		} else {
2053 			sock_put(sk);
2054 			return 0;
2055 		}
2056 	}
2057 
2058 	if (static_branch_unlikely(&ip4_min_ttl)) {
2059 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2060 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2061 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2062 			goto discard_and_relse;
2063 		}
2064 	}
2065 
2066 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2067 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2068 		goto discard_and_relse;
2069 	}
2070 
2071 	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2072 					   &iph->daddr, AF_INET, dif, sdif);
2073 	if (drop_reason)
2074 		goto discard_and_relse;
2075 
2076 	nf_reset_ct(skb);
2077 
2078 	if (tcp_filter(sk, skb)) {
2079 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2080 		goto discard_and_relse;
2081 	}
2082 	th = (const struct tcphdr *)skb->data;
2083 	iph = ip_hdr(skb);
2084 	tcp_v4_fill_cb(skb, iph, th);
2085 
2086 	skb->dev = NULL;
2087 
2088 	if (sk->sk_state == TCP_LISTEN) {
2089 		ret = tcp_v4_do_rcv(sk, skb);
2090 		goto put_and_return;
2091 	}
2092 
2093 	sk_incoming_cpu_update(sk);
2094 
2095 	bh_lock_sock_nested(sk);
2096 	tcp_segs_in(tcp_sk(sk), skb);
2097 	ret = 0;
2098 	if (!sock_owned_by_user(sk)) {
2099 		ret = tcp_v4_do_rcv(sk, skb);
2100 	} else {
2101 		if (tcp_add_backlog(sk, skb, &drop_reason))
2102 			goto discard_and_relse;
2103 	}
2104 	bh_unlock_sock(sk);
2105 
2106 put_and_return:
2107 	if (refcounted)
2108 		sock_put(sk);
2109 
2110 	return ret;
2111 
2112 no_tcp_socket:
2113 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2114 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2115 		goto discard_it;
2116 
2117 	tcp_v4_fill_cb(skb, iph, th);
2118 
2119 	if (tcp_checksum_complete(skb)) {
2120 csum_error:
2121 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2122 		trace_tcp_bad_csum(skb);
2123 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2124 bad_packet:
2125 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2126 	} else {
2127 		tcp_v4_send_reset(NULL, skb);
2128 	}
2129 
2130 discard_it:
2131 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2132 	/* Discard frame. */
2133 	kfree_skb_reason(skb, drop_reason);
2134 	return 0;
2135 
2136 discard_and_relse:
2137 	sk_drops_add(sk, skb);
2138 	if (refcounted)
2139 		sock_put(sk);
2140 	goto discard_it;
2141 
2142 do_time_wait:
2143 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2144 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2145 		inet_twsk_put(inet_twsk(sk));
2146 		goto discard_it;
2147 	}
2148 
2149 	tcp_v4_fill_cb(skb, iph, th);
2150 
2151 	if (tcp_checksum_complete(skb)) {
2152 		inet_twsk_put(inet_twsk(sk));
2153 		goto csum_error;
2154 	}
2155 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2156 	case TCP_TW_SYN: {
2157 		struct sock *sk2 = inet_lookup_listener(net,
2158 							net->ipv4.tcp_death_row.hashinfo,
2159 							skb, __tcp_hdrlen(th),
2160 							iph->saddr, th->source,
2161 							iph->daddr, th->dest,
2162 							inet_iif(skb),
2163 							sdif);
2164 		if (sk2) {
2165 			inet_twsk_deschedule_put(inet_twsk(sk));
2166 			sk = sk2;
2167 			tcp_v4_restore_cb(skb);
2168 			refcounted = false;
2169 			goto process;
2170 		}
2171 	}
2172 		/* to ACK */
2173 		fallthrough;
2174 	case TCP_TW_ACK:
2175 		tcp_v4_timewait_ack(sk, skb);
2176 		break;
2177 	case TCP_TW_RST:
2178 		tcp_v4_send_reset(sk, skb);
2179 		inet_twsk_deschedule_put(inet_twsk(sk));
2180 		goto discard_it;
2181 	case TCP_TW_SUCCESS:;
2182 	}
2183 	goto discard_it;
2184 }
2185 
2186 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2187 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2188 	.twsk_unique	= tcp_twsk_unique,
2189 	.twsk_destructor= tcp_twsk_destructor,
2190 };
2191 
2192 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2193 {
2194 	struct dst_entry *dst = skb_dst(skb);
2195 
2196 	if (dst && dst_hold_safe(dst)) {
2197 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2198 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2199 	}
2200 }
2201 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2202 
2203 const struct inet_connection_sock_af_ops ipv4_specific = {
2204 	.queue_xmit	   = ip_queue_xmit,
2205 	.send_check	   = tcp_v4_send_check,
2206 	.rebuild_header	   = inet_sk_rebuild_header,
2207 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2208 	.conn_request	   = tcp_v4_conn_request,
2209 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2210 	.net_header_len	   = sizeof(struct iphdr),
2211 	.setsockopt	   = ip_setsockopt,
2212 	.getsockopt	   = ip_getsockopt,
2213 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2214 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2215 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2216 };
2217 EXPORT_SYMBOL(ipv4_specific);
2218 
2219 #ifdef CONFIG_TCP_MD5SIG
2220 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2221 	.md5_lookup		= tcp_v4_md5_lookup,
2222 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2223 	.md5_parse		= tcp_v4_parse_md5_keys,
2224 };
2225 #endif
2226 
2227 /* NOTE: A lot of things are set to zero explicitly by the call to
2228  *       sk_alloc(), so they need not be done here.
2229  */
2230 static int tcp_v4_init_sock(struct sock *sk)
2231 {
2232 	struct inet_connection_sock *icsk = inet_csk(sk);
2233 
2234 	tcp_init_sock(sk);
2235 
2236 	icsk->icsk_af_ops = &ipv4_specific;
2237 
2238 #ifdef CONFIG_TCP_MD5SIG
2239 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2240 #endif
2241 
2242 	return 0;
2243 }
2244 
2245 void tcp_v4_destroy_sock(struct sock *sk)
2246 {
2247 	struct tcp_sock *tp = tcp_sk(sk);
2248 
2249 	trace_tcp_destroy_sock(sk);
2250 
2251 	tcp_clear_xmit_timers(sk);
2252 
2253 	tcp_cleanup_congestion_control(sk);
2254 
2255 	tcp_cleanup_ulp(sk);
2256 
2257 	/* Clean up the write buffer. */
2258 	tcp_write_queue_purge(sk);
2259 
2260 	/* Check if we want to disable active TFO */
2261 	tcp_fastopen_active_disable_ofo_check(sk);
2262 
2263 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2264 	skb_rbtree_purge(&tp->out_of_order_queue);
2265 
2266 #ifdef CONFIG_TCP_MD5SIG
2267 	/* Clean up the MD5 key list, if any */
2268 	if (tp->md5sig_info) {
2269 		tcp_clear_md5_list(sk);
2270 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2271 		tp->md5sig_info = NULL;
2272 	}
2273 #endif
2274 
2275 	/* Clean up a referenced TCP bind bucket. */
2276 	if (inet_csk(sk)->icsk_bind_hash)
2277 		inet_put_port(sk);
2278 
2279 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2280 
2281 	/* If socket is aborted during connect operation */
2282 	tcp_free_fastopen_req(tp);
2283 	tcp_fastopen_destroy_cipher(sk);
2284 	tcp_saved_syn_free(tp);
2285 
2286 	sk_sockets_allocated_dec(sk);
2287 }
2288 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2289 
2290 #ifdef CONFIG_PROC_FS
2291 /* Proc filesystem TCP sock list dumping. */
2292 
2293 static unsigned short seq_file_family(const struct seq_file *seq);
2294 
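/* A /proc/net/tcp reader matches only AF_INET sockets in its own netns;
 * the bpf iterator reports AF_UNSPEC here and leaves filtering to the
 * attached bpf program (see seq_file_family() below).
 */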
2295 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2296 {
2297 	unsigned short family = seq_file_family(seq);
2298 
2299 	/* AF_UNSPEC is used as a match all */
2300 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2301 		net_eq(sock_net(sk), seq_file_net(seq)));
2302 }
2303 
2304 /* Find a non-empty bucket (starting from st->bucket)
2305  * and return the first sk from it.
2306  */
2307 static void *listening_get_first(struct seq_file *seq)
2308 {
2309 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2310 	struct tcp_iter_state *st = seq->private;
2311 
2312 	st->offset = 0;
2313 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2314 		struct inet_listen_hashbucket *ilb2;
2315 		struct hlist_nulls_node *node;
2316 		struct sock *sk;
2317 
2318 		ilb2 = &hinfo->lhash2[st->bucket];
2319 		if (hlist_nulls_empty(&ilb2->nulls_head))
2320 			continue;
2321 
2322 		spin_lock(&ilb2->lock);
2323 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2324 			if (seq_sk_match(seq, sk))
2325 				return sk;
2326 		}
2327 		spin_unlock(&ilb2->lock);
2328 	}
2329 
2330 	return NULL;
2331 }
2332 
2333 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2334  * If "cur" is the last one in the st->bucket,
2335  * call listening_get_first() to return the first sk of the next
2336  * non-empty bucket.
2337  */
2338 static void *listening_get_next(struct seq_file *seq, void *cur)
2339 {
2340 	struct tcp_iter_state *st = seq->private;
2341 	struct inet_listen_hashbucket *ilb2;
2342 	struct hlist_nulls_node *node;
2343 	struct inet_hashinfo *hinfo;
2344 	struct sock *sk = cur;
2345 
2346 	++st->num;
2347 	++st->offset;
2348 
2349 	sk = sk_nulls_next(sk);
2350 	sk_nulls_for_each_from(sk, node) {
2351 		if (seq_sk_match(seq, sk))
2352 			return sk;
2353 	}
2354 
2355 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2356 	ilb2 = &hinfo->lhash2[st->bucket];
2357 	spin_unlock(&ilb2->lock);
2358 	++st->bucket;
2359 	return listening_get_first(seq);
2360 }
2361 
2362 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2363 {
2364 	struct tcp_iter_state *st = seq->private;
2365 	void *rc;
2366 
2367 	st->bucket = 0;
2368 	st->offset = 0;
2369 	rc = listening_get_first(seq);
2370 
2371 	while (rc && *pos) {
2372 		rc = listening_get_next(seq, rc);
2373 		--*pos;
2374 	}
2375 	return rc;
2376 }
2377 
2378 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2379 				const struct tcp_iter_state *st)
2380 {
2381 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2382 }
2383 
2384 /*
2385  * Get first established socket starting from bucket given in st->bucket.
2386  * If st->bucket is zero, the very first socket in the hash is returned.
2387  */
2388 static void *established_get_first(struct seq_file *seq)
2389 {
2390 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2391 	struct tcp_iter_state *st = seq->private;
2392 
2393 	st->offset = 0;
2394 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2395 		struct sock *sk;
2396 		struct hlist_nulls_node *node;
2397 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2398 
2399 		/* Lockless fast path for the common case of empty buckets */
2400 		if (empty_bucket(hinfo, st))
2401 			continue;
2402 
2403 		spin_lock_bh(lock);
2404 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2405 			if (seq_sk_match(seq, sk))
2406 				return sk;
2407 		}
2408 		spin_unlock_bh(lock);
2409 	}
2410 
2411 	return NULL;
2412 }
2413 
2414 static void *established_get_next(struct seq_file *seq, void *cur)
2415 {
2416 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2417 	struct tcp_iter_state *st = seq->private;
2418 	struct hlist_nulls_node *node;
2419 	struct sock *sk = cur;
2420 
2421 	++st->num;
2422 	++st->offset;
2423 
2424 	sk = sk_nulls_next(sk);
2425 
2426 	sk_nulls_for_each_from(sk, node) {
2427 		if (seq_sk_match(seq, sk))
2428 			return sk;
2429 	}
2430 
2431 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2432 	++st->bucket;
2433 	return established_get_first(seq);
2434 }
2435 
2436 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2437 {
2438 	struct tcp_iter_state *st = seq->private;
2439 	void *rc;
2440 
2441 	st->bucket = 0;
2442 	rc = established_get_first(seq);
2443 
2444 	while (rc && pos) {
2445 		rc = established_get_next(seq, rc);
2446 		--pos;
2447 	}
2448 	return rc;
2449 }
2450 
2451 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2452 {
2453 	void *rc;
2454 	struct tcp_iter_state *st = seq->private;
2455 
2456 	st->state = TCP_SEQ_STATE_LISTENING;
2457 	rc	  = listening_get_idx(seq, &pos);
2458 
2459 	if (!rc) {
2460 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2461 		rc	  = established_get_idx(seq, pos);
2462 	}
2463 
2464 	return rc;
2465 }
2466 
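/* Resume iteration at the bucket and in-bucket offset saved in
 * tcp_iter_state, so that a sequential read of the seq_file does not
 * rescan the hash tables from the beginning for every chunk.
 */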
2467 static void *tcp_seek_last_pos(struct seq_file *seq)
2468 {
2469 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2470 	struct tcp_iter_state *st = seq->private;
2471 	int bucket = st->bucket;
2472 	int offset = st->offset;
2473 	int orig_num = st->num;
2474 	void *rc = NULL;
2475 
2476 	switch (st->state) {
2477 	case TCP_SEQ_STATE_LISTENING:
2478 		if (st->bucket > hinfo->lhash2_mask)
2479 			break;
2480 		st->state = TCP_SEQ_STATE_LISTENING;
2481 		rc = listening_get_first(seq);
2482 		while (offset-- && rc && bucket == st->bucket)
2483 			rc = listening_get_next(seq, rc);
2484 		if (rc)
2485 			break;
2486 		st->bucket = 0;
2487 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2488 		fallthrough;
2489 	case TCP_SEQ_STATE_ESTABLISHED:
2490 		if (st->bucket > hinfo->ehash_mask)
2491 			break;
2492 		rc = established_get_first(seq);
2493 		while (offset-- && rc && bucket == st->bucket)
2494 			rc = established_get_next(seq, rc);
2495 	}
2496 
2497 	st->num = orig_num;
2498 
2499 	return rc;
2500 }
2501 
2502 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2503 {
2504 	struct tcp_iter_state *st = seq->private;
2505 	void *rc;
2506 
2507 	if (*pos && *pos == st->last_pos) {
2508 		rc = tcp_seek_last_pos(seq);
2509 		if (rc)
2510 			goto out;
2511 	}
2512 
2513 	st->state = TCP_SEQ_STATE_LISTENING;
2514 	st->num = 0;
2515 	st->bucket = 0;
2516 	st->offset = 0;
2517 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2518 
2519 out:
2520 	st->last_pos = *pos;
2521 	return rc;
2522 }
2523 EXPORT_SYMBOL(tcp_seq_start);
2524 
2525 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2526 {
2527 	struct tcp_iter_state *st = seq->private;
2528 	void *rc = NULL;
2529 
2530 	if (v == SEQ_START_TOKEN) {
2531 		rc = tcp_get_idx(seq, 0);
2532 		goto out;
2533 	}
2534 
2535 	switch (st->state) {
2536 	case TCP_SEQ_STATE_LISTENING:
2537 		rc = listening_get_next(seq, v);
2538 		if (!rc) {
2539 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2540 			st->bucket = 0;
2541 			st->offset = 0;
2542 			rc	  = established_get_first(seq);
2543 		}
2544 		break;
2545 	case TCP_SEQ_STATE_ESTABLISHED:
2546 		rc = established_get_next(seq, v);
2547 		break;
2548 	}
2549 out:
2550 	++*pos;
2551 	st->last_pos = *pos;
2552 	return rc;
2553 }
2554 EXPORT_SYMBOL(tcp_seq_next);
2555 
2556 void tcp_seq_stop(struct seq_file *seq, void *v)
2557 {
2558 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2559 	struct tcp_iter_state *st = seq->private;
2560 
2561 	switch (st->state) {
2562 	case TCP_SEQ_STATE_LISTENING:
2563 		if (v != SEQ_START_TOKEN)
2564 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2565 		break;
2566 	case TCP_SEQ_STATE_ESTABLISHED:
2567 		if (v)
2568 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2569 		break;
2570 	}
2571 }
2572 EXPORT_SYMBOL(tcp_seq_stop);
2573 
2574 static void get_openreq4(const struct request_sock *req,
2575 			 struct seq_file *f, int i)
2576 {
2577 	const struct inet_request_sock *ireq = inet_rsk(req);
2578 	long delta = req->rsk_timer.expires - jiffies;
2579 
2580 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2581 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2582 		i,
2583 		ireq->ir_loc_addr,
2584 		ireq->ir_num,
2585 		ireq->ir_rmt_addr,
2586 		ntohs(ireq->ir_rmt_port),
2587 		TCP_SYN_RECV,
2588 		0, 0, /* could print option size, but that is af dependent. */
2589 		1,    /* timers active (only the expire timer) */
2590 		jiffies_delta_to_clock_t(delta),
2591 		req->num_timeout,
2592 		from_kuid_munged(seq_user_ns(f),
2593 				 sock_i_uid(req->rsk_listener)),
2594 		0,  /* non standard timer */
2595 		0, /* open_requests have no inode */
2596 		0,
2597 		req);
2598 }
2599 
2600 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2601 {
2602 	int timer_active;
2603 	unsigned long timer_expires;
2604 	const struct tcp_sock *tp = tcp_sk(sk);
2605 	const struct inet_connection_sock *icsk = inet_csk(sk);
2606 	const struct inet_sock *inet = inet_sk(sk);
2607 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2608 	__be32 dest = inet->inet_daddr;
2609 	__be32 src = inet->inet_rcv_saddr;
2610 	__u16 destp = ntohs(inet->inet_dport);
2611 	__u16 srcp = ntohs(inet->inet_sport);
2612 	int rx_queue;
2613 	int state;
2614 
2615 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2616 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2617 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2618 		timer_active	= 1;
2619 		timer_expires	= icsk->icsk_timeout;
2620 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2621 		timer_active	= 4;
2622 		timer_expires	= icsk->icsk_timeout;
2623 	} else if (timer_pending(&sk->sk_timer)) {
2624 		timer_active	= 2;
2625 		timer_expires	= sk->sk_timer.expires;
2626 	} else {
2627 		timer_active	= 0;
2628 		timer_expires = jiffies;
2629 	}
2630 
2631 	state = inet_sk_state_load(sk);
2632 	if (state == TCP_LISTEN)
2633 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2634 	else
2635 		/* Because we don't lock the socket,
2636 		 * we might find a transient negative value.
2637 		 */
2638 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2639 				      READ_ONCE(tp->copied_seq), 0);
2640 
2641 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2642 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2643 		i, src, srcp, dest, destp, state,
2644 		READ_ONCE(tp->write_seq) - tp->snd_una,
2645 		rx_queue,
2646 		timer_active,
2647 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2648 		icsk->icsk_retransmits,
2649 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2650 		icsk->icsk_probes_out,
2651 		sock_i_ino(sk),
2652 		refcount_read(&sk->sk_refcnt), sk,
2653 		jiffies_to_clock_t(icsk->icsk_rto),
2654 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2655 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2656 		tcp_snd_cwnd(tp),
2657 		state == TCP_LISTEN ?
2658 		    fastopenq->max_qlen :
2659 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2660 }
2661 
2662 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2663 			       struct seq_file *f, int i)
2664 {
2665 	long delta = tw->tw_timer.expires - jiffies;
2666 	__be32 dest, src;
2667 	__u16 destp, srcp;
2668 
2669 	dest  = tw->tw_daddr;
2670 	src   = tw->tw_rcv_saddr;
2671 	destp = ntohs(tw->tw_dport);
2672 	srcp  = ntohs(tw->tw_sport);
2673 
2674 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2675 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2676 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2677 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2678 		refcount_read(&tw->tw_refcnt), tw);
2679 }
2680 
2681 #define TMPSZ 150
2682 
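/* Each /proc/net/tcp entry is padded to a fixed width of TMPSZ - 1
 * characters via seq_setwidth()/seq_pad() below.
 */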
2683 static int tcp4_seq_show(struct seq_file *seq, void *v)
2684 {
2685 	struct tcp_iter_state *st;
2686 	struct sock *sk = v;
2687 
2688 	seq_setwidth(seq, TMPSZ - 1);
2689 	if (v == SEQ_START_TOKEN) {
2690 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2691 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2692 			   "inode");
2693 		goto out;
2694 	}
2695 	st = seq->private;
2696 
2697 	if (sk->sk_state == TCP_TIME_WAIT)
2698 		get_timewait4_sock(v, seq, st->num);
2699 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2700 		get_openreq4(v, seq, st->num);
2701 	else
2702 		get_tcp4_sock(v, seq, st->num);
2703 out:
2704 	seq_pad(seq, '\n');
2705 	return 0;
2706 }
2707 
2708 #ifdef CONFIG_BPF_SYSCALL
2709 struct bpf_tcp_iter_state {
2710 	struct tcp_iter_state state;
2711 	unsigned int cur_sk;
2712 	unsigned int end_sk;
2713 	unsigned int max_sk;
2714 	struct sock **batch;
2715 	bool st_bucket_done;
2716 };
2717 
2718 struct bpf_iter__tcp {
2719 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2720 	__bpf_md_ptr(struct sock_common *, sk_common);
2721 	uid_t uid __aligned(8);
2722 };
2723 
2724 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2725 			     struct sock_common *sk_common, uid_t uid)
2726 {
2727 	struct bpf_iter__tcp ctx;
2728 
2729 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2730 	ctx.meta = meta;
2731 	ctx.sk_common = sk_common;
2732 	ctx.uid = uid;
2733 	return bpf_iter_run_prog(prog, &ctx);
2734 }
2735 
2736 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2737 {
2738 	while (iter->cur_sk < iter->end_sk)
2739 		sock_gen_put(iter->batch[iter->cur_sk++]);
2740 }
2741 
2742 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2743 				      unsigned int new_batch_sz)
2744 {
2745 	struct sock **new_batch;
2746 
2747 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2748 			     GFP_USER | __GFP_NOWARN);
2749 	if (!new_batch)
2750 		return -ENOMEM;
2751 
2752 	bpf_iter_tcp_put_batch(iter);
2753 	kvfree(iter->batch);
2754 	iter->batch = new_batch;
2755 	iter->max_sk = new_batch_sz;
2756 
2757 	return 0;
2758 }
2759 
2760 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2761 						 struct sock *start_sk)
2762 {
2763 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2764 	struct bpf_tcp_iter_state *iter = seq->private;
2765 	struct tcp_iter_state *st = &iter->state;
2766 	struct hlist_nulls_node *node;
2767 	unsigned int expected = 1;
2768 	struct sock *sk;
2769 
2770 	sock_hold(start_sk);
2771 	iter->batch[iter->end_sk++] = start_sk;
2772 
2773 	sk = sk_nulls_next(start_sk);
2774 	sk_nulls_for_each_from(sk, node) {
2775 		if (seq_sk_match(seq, sk)) {
2776 			if (iter->end_sk < iter->max_sk) {
2777 				sock_hold(sk);
2778 				iter->batch[iter->end_sk++] = sk;
2779 			}
2780 			expected++;
2781 		}
2782 	}
2783 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
2784 
2785 	return expected;
2786 }
2787 
2788 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2789 						   struct sock *start_sk)
2790 {
2791 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2792 	struct bpf_tcp_iter_state *iter = seq->private;
2793 	struct tcp_iter_state *st = &iter->state;
2794 	struct hlist_nulls_node *node;
2795 	unsigned int expected = 1;
2796 	struct sock *sk;
2797 
2798 	sock_hold(start_sk);
2799 	iter->batch[iter->end_sk++] = start_sk;
2800 
2801 	sk = sk_nulls_next(start_sk);
2802 	sk_nulls_for_each_from(sk, node) {
2803 		if (seq_sk_match(seq, sk)) {
2804 			if (iter->end_sk < iter->max_sk) {
2805 				sock_hold(sk);
2806 				iter->batch[iter->end_sk++] = sk;
2807 			}
2808 			expected++;
2809 		}
2810 	}
2811 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2812 
2813 	return expected;
2814 }
2815 
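/* Fill iter->batch with every socket in the current bucket that matches
 * this seq_file, growing the batch array and retrying once if the bucket
 * holds more sockets than the batch can currently carry.
 */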
2816 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2817 {
2818 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2819 	struct bpf_tcp_iter_state *iter = seq->private;
2820 	struct tcp_iter_state *st = &iter->state;
2821 	unsigned int expected;
2822 	bool resized = false;
2823 	struct sock *sk;
2824 
2825 	/* The st->bucket is done.  Directly advance to the next
2826 	 * bucket instead of letting tcp_seek_last_pos() skip sockets
2827 	 * one by one in the current bucket, only to find out that
2828 	 * it has to advance to the next bucket anyway.
2829 	 */
2830 	if (iter->st_bucket_done) {
2831 		st->offset = 0;
2832 		st->bucket++;
2833 		if (st->state == TCP_SEQ_STATE_LISTENING &&
2834 		    st->bucket > hinfo->lhash2_mask) {
2835 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2836 			st->bucket = 0;
2837 		}
2838 	}
2839 
2840 again:
2841 	/* Get a new batch */
2842 	iter->cur_sk = 0;
2843 	iter->end_sk = 0;
2844 	iter->st_bucket_done = false;
2845 
2846 	sk = tcp_seek_last_pos(seq);
2847 	if (!sk)
2848 		return NULL; /* Done */
2849 
2850 	if (st->state == TCP_SEQ_STATE_LISTENING)
2851 		expected = bpf_iter_tcp_listening_batch(seq, sk);
2852 	else
2853 		expected = bpf_iter_tcp_established_batch(seq, sk);
2854 
2855 	if (iter->end_sk == expected) {
2856 		iter->st_bucket_done = true;
2857 		return sk;
2858 	}
2859 
2860 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2861 		resized = true;
2862 		goto again;
2863 	}
2864 
2865 	return sk;
2866 }
2867 
2868 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2869 {
2870 	/* bpf iter does not support lseek, so it always
2871 	 * continues from where it was stop()-ped.
2872 	 */
2873 	if (*pos)
2874 		return bpf_iter_tcp_batch(seq);
2875 
2876 	return SEQ_START_TOKEN;
2877 }
2878 
2879 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2880 {
2881 	struct bpf_tcp_iter_state *iter = seq->private;
2882 	struct tcp_iter_state *st = &iter->state;
2883 	struct sock *sk;
2884 
2885 	/* Whenever seq_next() is called, the sk at iter->cur_sk is
2886 	 * done with seq_show(), so advance to the next sk in
2887 	 * the batch.
2888 	 */
2889 	if (iter->cur_sk < iter->end_sk) {
2890 		/* Keeping st->num consistent in tcp_iter_state.
2891 		 * bpf_iter_tcp does not use st->num.
2892 		 * meta.seq_num is used instead.
2893 		 */
2894 		st->num++;
2895 		/* Move st->offset to the next sk in the bucket such that
2896 		 * the future start() will resume at st->offset in
2897 		 * st->bucket.  See tcp_seek_last_pos().
2898 		 */
2899 		st->offset++;
2900 		sock_gen_put(iter->batch[iter->cur_sk++]);
2901 	}
2902 
2903 	if (iter->cur_sk < iter->end_sk)
2904 		sk = iter->batch[iter->cur_sk];
2905 	else
2906 		sk = bpf_iter_tcp_batch(seq);
2907 
2908 	++*pos;
2909 	/* Keeping st->last_pos consistent in tcp_iter_state.
2910 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2911 	 */
2912 	st->last_pos = *pos;
2913 	return sk;
2914 }
2915 
2916 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2917 {
2918 	struct bpf_iter_meta meta;
2919 	struct bpf_prog *prog;
2920 	struct sock *sk = v;
2921 	uid_t uid;
2922 	int ret;
2923 
2924 	if (v == SEQ_START_TOKEN)
2925 		return 0;
2926 
2927 	if (sk_fullsock(sk))
2928 		lock_sock(sk);
2929 
2930 	if (unlikely(sk_unhashed(sk))) {
2931 		ret = SEQ_SKIP;
2932 		goto unlock;
2933 	}
2934 
2935 	if (sk->sk_state == TCP_TIME_WAIT) {
2936 		uid = 0;
2937 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2938 		const struct request_sock *req = v;
2939 
2940 		uid = from_kuid_munged(seq_user_ns(seq),
2941 				       sock_i_uid(req->rsk_listener));
2942 	} else {
2943 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2944 	}
2945 
2946 	meta.seq = seq;
2947 	prog = bpf_iter_get_info(&meta, false);
2948 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
2949 
2950 unlock:
2951 	if (sk_fullsock(sk))
2952 		release_sock(sk);
2953 	return ret;
2954 
2955 }
2956 
2957 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2958 {
2959 	struct bpf_tcp_iter_state *iter = seq->private;
2960 	struct bpf_iter_meta meta;
2961 	struct bpf_prog *prog;
2962 
2963 	if (!v) {
2964 		meta.seq = seq;
2965 		prog = bpf_iter_get_info(&meta, true);
2966 		if (prog)
2967 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2968 	}
2969 
2970 	if (iter->cur_sk < iter->end_sk) {
2971 		bpf_iter_tcp_put_batch(iter);
2972 		iter->st_bucket_done = false;
2973 	}
2974 }
2975 
2976 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2977 	.show		= bpf_iter_tcp_seq_show,
2978 	.start		= bpf_iter_tcp_seq_start,
2979 	.next		= bpf_iter_tcp_seq_next,
2980 	.stop		= bpf_iter_tcp_seq_stop,
2981 };
2982 #endif
2983 static unsigned short seq_file_family(const struct seq_file *seq)
2984 {
2985 	const struct tcp_seq_afinfo *afinfo;
2986 
2987 #ifdef CONFIG_BPF_SYSCALL
2988 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
2989 	if (seq->op == &bpf_iter_tcp_seq_ops)
2990 		return AF_UNSPEC;
2991 #endif
2992 
2993 	/* Iterated from proc fs */
2994 	afinfo = pde_data(file_inode(seq->file));
2995 	return afinfo->family;
2996 }
2997 
2998 static const struct seq_operations tcp4_seq_ops = {
2999 	.show		= tcp4_seq_show,
3000 	.start		= tcp_seq_start,
3001 	.next		= tcp_seq_next,
3002 	.stop		= tcp_seq_stop,
3003 };
3004 
3005 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3006 	.family		= AF_INET,
3007 };
3008 
3009 static int __net_init tcp4_proc_init_net(struct net *net)
3010 {
3011 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3012 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3013 		return -ENOMEM;
3014 	return 0;
3015 }
3016 
3017 static void __net_exit tcp4_proc_exit_net(struct net *net)
3018 {
3019 	remove_proc_entry("tcp", net->proc_net);
3020 }
3021 
3022 static struct pernet_operations tcp4_net_ops = {
3023 	.init = tcp4_proc_init_net,
3024 	.exit = tcp4_proc_exit_net,
3025 };
3026 
3027 int __init tcp4_proc_init(void)
3028 {
3029 	return register_pernet_subsys(&tcp4_net_ops);
3030 }
3031 
3032 void tcp4_proc_exit(void)
3033 {
3034 	unregister_pernet_subsys(&tcp4_net_ops);
3035 }
3036 #endif /* CONFIG_PROC_FS */
3037 
3038 /* @wake is one when sk_stream_write_space() calls us.
3039  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3040  * This mimics the strategy used in sock_def_write_space().
3041  */
3042 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3043 {
3044 	const struct tcp_sock *tp = tcp_sk(sk);
3045 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3046 			    READ_ONCE(tp->snd_nxt);
3047 
3048 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3049 }
3050 EXPORT_SYMBOL(tcp_stream_memory_free);
3051 
3052 struct proto tcp_prot = {
3053 	.name			= "TCP",
3054 	.owner			= THIS_MODULE,
3055 	.close			= tcp_close,
3056 	.pre_connect		= tcp_v4_pre_connect,
3057 	.connect		= tcp_v4_connect,
3058 	.disconnect		= tcp_disconnect,
3059 	.accept			= inet_csk_accept,
3060 	.ioctl			= tcp_ioctl,
3061 	.init			= tcp_v4_init_sock,
3062 	.destroy		= tcp_v4_destroy_sock,
3063 	.shutdown		= tcp_shutdown,
3064 	.setsockopt		= tcp_setsockopt,
3065 	.getsockopt		= tcp_getsockopt,
3066 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3067 	.keepalive		= tcp_set_keepalive,
3068 	.recvmsg		= tcp_recvmsg,
3069 	.sendmsg		= tcp_sendmsg,
3070 	.sendpage		= tcp_sendpage,
3071 	.backlog_rcv		= tcp_v4_do_rcv,
3072 	.release_cb		= tcp_release_cb,
3073 	.hash			= inet_hash,
3074 	.unhash			= inet_unhash,
3075 	.get_port		= inet_csk_get_port,
3076 	.put_port		= inet_put_port,
3077 #ifdef CONFIG_BPF_SYSCALL
3078 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3079 #endif
3080 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3081 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3082 	.stream_memory_free	= tcp_stream_memory_free,
3083 	.sockets_allocated	= &tcp_sockets_allocated,
3084 	.orphan_count		= &tcp_orphan_count,
3085 
3086 	.memory_allocated	= &tcp_memory_allocated,
3087 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3088 
3089 	.memory_pressure	= &tcp_memory_pressure,
3090 	.sysctl_mem		= sysctl_tcp_mem,
3091 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3092 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3093 	.max_header		= MAX_TCP_HEADER,
3094 	.obj_size		= sizeof(struct tcp_sock),
3095 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3096 	.twsk_prot		= &tcp_timewait_sock_ops,
3097 	.rsk_prot		= &tcp_request_sock_ops,
3098 	.h.hashinfo		= NULL,
3099 	.no_autobind		= true,
3100 	.diag_destroy		= tcp_abort,
3101 };
3102 EXPORT_SYMBOL(tcp_prot);
3103 
3104 static void __net_exit tcp_sk_exit(struct net *net)
3105 {
3106 	if (net->ipv4.tcp_congestion_control)
3107 		bpf_module_put(net->ipv4.tcp_congestion_control,
3108 			       net->ipv4.tcp_congestion_control->owner);
3109 }
3110 
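/* Give a child netns its own ehash when the creating netns asked for it via
 * sysctl_tcp_child_ehash_entries; otherwise, or on allocation failure, fall
 * back to the global tcp_hashinfo.
 */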
3111 static void __net_init tcp_set_hashinfo(struct net *net)
3112 {
3113 	struct inet_hashinfo *hinfo;
3114 	unsigned int ehash_entries;
3115 	struct net *old_net;
3116 
3117 	if (net_eq(net, &init_net))
3118 		goto fallback;
3119 
3120 	old_net = current->nsproxy->net_ns;
3121 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3122 	if (!ehash_entries)
3123 		goto fallback;
3124 
3125 	ehash_entries = roundup_pow_of_two(ehash_entries);
3126 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3127 	if (!hinfo) {
3128 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3129 			"for a netns, fallback to the global one\n",
3130 			ehash_entries);
3131 fallback:
3132 		hinfo = &tcp_hashinfo;
3133 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3134 	}
3135 
3136 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3137 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3138 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3139 }
3140 
3141 static int __net_init tcp_sk_init(struct net *net)
3142 {
3143 	net->ipv4.sysctl_tcp_ecn = 2;
3144 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3145 
3146 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3147 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3148 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3149 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3150 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3151 
3152 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3153 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3154 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3155 
3156 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3157 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3158 	net->ipv4.sysctl_tcp_syncookies = 1;
3159 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3160 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3161 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3162 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3163 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3164 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3165 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3166 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3167 
3168 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3169 	tcp_set_hashinfo(net);
3170 
3171 	net->ipv4.sysctl_tcp_sack = 1;
3172 	net->ipv4.sysctl_tcp_window_scaling = 1;
3173 	net->ipv4.sysctl_tcp_timestamps = 1;
3174 	net->ipv4.sysctl_tcp_early_retrans = 3;
3175 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3176 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3177 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3178 	net->ipv4.sysctl_tcp_max_reordering = 300;
3179 	net->ipv4.sysctl_tcp_dsack = 1;
3180 	net->ipv4.sysctl_tcp_app_win = 31;
3181 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3182 	net->ipv4.sysctl_tcp_frto = 2;
3183 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3184 	/* This limits the percentage of the congestion window which we
3185 	 * will allow a single TSO frame to consume.  Building TSO frames
3186 	 * which are too large can cause TCP streams to be bursty.
3187 	 */
3188 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3189 	/* Default TSQ limit of 16 TSO segments */
3190 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3191 
3192 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3193 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3194 
3195 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3196 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3197 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3198 	net->ipv4.sysctl_tcp_autocorking = 1;
3199 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3200 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3201 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3202 	if (net != &init_net) {
3203 		memcpy(net->ipv4.sysctl_tcp_rmem,
3204 		       init_net.ipv4.sysctl_tcp_rmem,
3205 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3206 		memcpy(net->ipv4.sysctl_tcp_wmem,
3207 		       init_net.ipv4.sysctl_tcp_wmem,
3208 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3209 	}
3210 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3211 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3212 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3213 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3214 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3215 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3216 
3217 	/* Reno is always built in */
3218 	if (!net_eq(net, &init_net) &&
3219 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3220 			       init_net.ipv4.tcp_congestion_control->owner))
3221 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3222 	else
3223 		net->ipv4.tcp_congestion_control = &tcp_reno;
3224 
3225 	return 0;
3226 }
3227 
3228 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3229 {
3230 	struct net *net;
3231 
3232 	tcp_twsk_purge(net_exit_list, AF_INET);
3233 
3234 	list_for_each_entry(net, net_exit_list, exit_list) {
3235 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3236 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3237 		tcp_fastopen_ctx_destroy(net);
3238 	}
3239 }
3240 
3241 static struct pernet_operations __net_initdata tcp_sk_ops = {
3242        .init	   = tcp_sk_init,
3243        .exit	   = tcp_sk_exit,
3244        .exit_batch = tcp_sk_exit_batch,
3245 };
3246 
3247 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3248 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3249 		     struct sock_common *sk_common, uid_t uid)
3250 
3251 #define INIT_BATCH_SZ 16
3252 
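/* Set up the per-iterator netns state and an initial socket batch; the
 * batch is grown on demand in bpf_iter_tcp_batch() when a bucket turns out
 * to be larger than the current batch.
 */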
3253 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3254 {
3255 	struct bpf_tcp_iter_state *iter = priv_data;
3256 	int err;
3257 
3258 	err = bpf_iter_init_seq_net(priv_data, aux);
3259 	if (err)
3260 		return err;
3261 
3262 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3263 	if (err) {
3264 		bpf_iter_fini_seq_net(priv_data);
3265 		return err;
3266 	}
3267 
3268 	return 0;
3269 }
3270 
3271 static void bpf_iter_fini_tcp(void *priv_data)
3272 {
3273 	struct bpf_tcp_iter_state *iter = priv_data;
3274 
3275 	bpf_iter_fini_seq_net(priv_data);
3276 	kvfree(iter->batch);
3277 }
3278 
3279 static const struct bpf_iter_seq_info tcp_seq_info = {
3280 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3281 	.init_seq_private	= bpf_iter_init_tcp,
3282 	.fini_seq_private	= bpf_iter_fini_tcp,
3283 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3284 };
3285 
3286 static const struct bpf_func_proto *
3287 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3288 			    const struct bpf_prog *prog)
3289 {
3290 	switch (func_id) {
3291 	case BPF_FUNC_setsockopt:
3292 		return &bpf_sk_setsockopt_proto;
3293 	case BPF_FUNC_getsockopt:
3294 		return &bpf_sk_getsockopt_proto;
3295 	default:
3296 		return NULL;
3297 	}
3298 }
3299 
3300 static struct bpf_iter_reg tcp_reg_info = {
3301 	.target			= "tcp",
3302 	.ctx_arg_info_size	= 1,
3303 	.ctx_arg_info		= {
3304 		{ offsetof(struct bpf_iter__tcp, sk_common),
3305 		  PTR_TO_BTF_ID_OR_NULL },
3306 	},
3307 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3308 	.seq_info		= &tcp_seq_info,
3309 };
3310 
3311 static void __init bpf_iter_register(void)
3312 {
3313 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3314 	if (bpf_iter_reg_target(&tcp_reg_info))
3315 		pr_warn("Warning: could not register bpf iterator tcp\n");
3316 }
3317 
3318 #endif
3319 
3320 void __init tcp_v4_init(void)
3321 {
3322 	int cpu, res;
3323 
3324 	for_each_possible_cpu(cpu) {
3325 		struct sock *sk;
3326 
3327 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3328 					   IPPROTO_TCP, &init_net);
3329 		if (res)
3330 			panic("Failed to create the TCP control socket.\n");
3331 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3332 
3333 		/* Please enforce IP_DF and IPID==0 for RST and
3334 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3335 		 */
3336 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3337 
3338 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3339 	}
3340 	if (register_pernet_subsys(&tcp_sk_ops))
3341 		panic("Failed to create the TCP control socket.\n");
3342 
3343 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3344 	bpf_iter_register();
3345 #endif
3346 }
3347