1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after a
40  *					year-long coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93 
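/* Initial sequence number for a passive open: derived from the reversed
 * 4-tuple of the incoming SYN via secure_tcp_seq().
 */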
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96 	return secure_tcp_seq(ip_hdr(skb)->daddr,
97 			      ip_hdr(skb)->saddr,
98 			      tcp_hdr(skb)->dest,
99 			      tcp_hdr(skb)->source);
100 }
101 
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106 
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
110 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
111 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112 	struct tcp_sock *tp = tcp_sk(sk);
113 
114 	if (reuse == 2) {
115 		/* Still does not detect *everything* that goes through
116 		 * lo, since we require a loopback src or dst address
117 		 * or direct binding to 'lo' interface.
118 		 */
119 		bool loopback = false;
120 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 			loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123 		if (tw->tw_family == AF_INET6) {
124 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128 				loopback = true;
129 		} else
130 #endif
131 		{
132 			if (ipv4_is_loopback(tw->tw_daddr) ||
133 			    ipv4_is_loopback(tw->tw_rcv_saddr))
134 				loopback = true;
135 		}
136 		if (!loopback)
137 			reuse = 0;
138 	}
139 
140 	/* With PAWS, it is safe from the viewpoint
141 	   of data integrity. Even without PAWS it is safe provided sequence
142 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143 
144 	   Actually, the idea is close to VJ's one, only timestamp cache is
145 	   held not per host, but per port pair and TW bucket is used as state
146 	   holder.
147 
148 	   If the TW bucket has already been destroyed we fall back to VJ's scheme
149 	   and use initial timestamp retrieved from peer table.
150 	 */
151 	if (tcptw->tw_ts_recent_stamp &&
152 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
153 					    tcptw->tw_ts_recent_stamp)))) {
154 		/* In case of repair and re-using TIME-WAIT sockets we still
155 		 * want to be sure that it is safe as above but honor the
156 		 * sequence numbers and time stamps set as part of the repair
157 		 * process.
158 		 *
159 		 * Without this check re-using a TIME-WAIT socket with TCP
160 		 * repair would accumulate a -1 on the repair assigned
161 		 * sequence number. The first time it is reused the sequence
162 		 * is -1, the second time -2, etc. This fixes that issue
163 		 * without appearing to create any others.
164 		 */
165 		if (likely(!tp->repair)) {
166 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167 
168 			if (!seq)
169 				seq = 1;
170 			WRITE_ONCE(tp->write_seq, seq);
171 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
172 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 		}
174 		sock_hold(sktw);
175 		return 1;
176 	}
177 
178 	return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181 
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 			      int addr_len)
184 {
185 	/* This check is replicated from tcp_v4_connect() and intended to
186 	 * prevent BPF program called below from accessing bytes that are out
187 	 * of the bound specified by user in addr_len.
188 	 */
189 	if (addr_len < sizeof(struct sockaddr_in))
190 		return -EINVAL;
191 
192 	sock_owned_by_me(sk);
193 
194 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196 
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 	struct inet_sock *inet = inet_sk(sk);
202 	struct tcp_sock *tp = tcp_sk(sk);
203 	__be16 orig_sport, orig_dport;
204 	__be32 daddr, nexthop;
205 	struct flowi4 *fl4;
206 	struct rtable *rt;
207 	int err;
208 	struct ip_options_rcu *inet_opt;
209 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210 
211 	if (addr_len < sizeof(struct sockaddr_in))
212 		return -EINVAL;
213 
214 	if (usin->sin_family != AF_INET)
215 		return -EAFNOSUPPORT;
216 
217 	nexthop = daddr = usin->sin_addr.s_addr;
218 	inet_opt = rcu_dereference_protected(inet->inet_opt,
219 					     lockdep_sock_is_held(sk));
220 	if (inet_opt && inet_opt->opt.srr) {
221 		if (!daddr)
222 			return -EINVAL;
223 		nexthop = inet_opt->opt.faddr;
224 	}
225 
226 	orig_sport = inet->inet_sport;
227 	orig_dport = usin->sin_port;
228 	fl4 = &inet->cork.fl.u.ip4;
229 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231 			      IPPROTO_TCP,
232 			      orig_sport, orig_dport, sk);
233 	if (IS_ERR(rt)) {
234 		err = PTR_ERR(rt);
235 		if (err == -ENETUNREACH)
236 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237 		return err;
238 	}
239 
240 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241 		ip_rt_put(rt);
242 		return -ENETUNREACH;
243 	}
244 
245 	if (!inet_opt || !inet_opt->opt.srr)
246 		daddr = fl4->daddr;
247 
248 	if (!inet->inet_saddr)
249 		inet->inet_saddr = fl4->saddr;
250 	sk_rcv_saddr_set(sk, inet->inet_saddr);
251 
252 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 		/* Reset inherited state */
254 		tp->rx_opt.ts_recent	   = 0;
255 		tp->rx_opt.ts_recent_stamp = 0;
256 		if (likely(!tp->repair))
257 			WRITE_ONCE(tp->write_seq, 0);
258 	}
259 
260 	inet->inet_dport = usin->sin_port;
261 	sk_daddr_set(sk, daddr);
262 
263 	inet_csk(sk)->icsk_ext_hdr_len = 0;
264 	if (inet_opt)
265 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266 
267 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268 
269 	/* Socket identity is still unknown (sport may be zero).
270 	 * However we set state to SYN-SENT and, without releasing the socket
271 	 * lock, select a source port, enter ourselves into the hash tables and
272 	 * complete initialization after this.
273 	 */
274 	tcp_set_state(sk, TCP_SYN_SENT);
275 	err = inet_hash_connect(tcp_death_row, sk);
276 	if (err)
277 		goto failure;
278 
279 	sk_set_txhash(sk);
280 
281 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 			       inet->inet_sport, inet->inet_dport, sk);
283 	if (IS_ERR(rt)) {
284 		err = PTR_ERR(rt);
285 		rt = NULL;
286 		goto failure;
287 	}
288 	/* OK, now commit destination to socket.  */
289 	sk->sk_gso_type = SKB_GSO_TCPV4;
290 	sk_setup_caps(sk, &rt->dst);
291 	rt = NULL;
292 
293 	if (likely(!tp->repair)) {
294 		if (!tp->write_seq)
295 			WRITE_ONCE(tp->write_seq,
296 				   secure_tcp_seq(inet->inet_saddr,
297 						  inet->inet_daddr,
298 						  inet->inet_sport,
299 						  usin->sin_port));
300 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301 						 inet->inet_saddr,
302 						 inet->inet_daddr);
303 	}
304 
305 	inet->inet_id = prandom_u32();
306 
307 	if (tcp_fastopen_defer_connect(sk, &err))
308 		return err;
309 	if (err)
310 		goto failure;
311 
312 	err = tcp_connect(sk);
313 
314 	if (err)
315 		goto failure;
316 
317 	return 0;
318 
319 failure:
320 	/*
321 	 * This unhashes the socket and releases the local port,
322 	 * if necessary.
323 	 */
324 	tcp_set_state(sk, TCP_CLOSE);
325 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
326 		inet_reset_saddr(sk);
327 	ip_rt_put(rt);
328 	sk->sk_route_caps = 0;
329 	inet->inet_dport = 0;
330 	return err;
331 }
332 EXPORT_SYMBOL(tcp_v4_connect);
333 
334 /*
335  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
336  * It can be called through tcp_release_cb() if socket was owned by user
337  * at the time tcp_v4_err() was called to handle ICMP message.
338  */
339 void tcp_v4_mtu_reduced(struct sock *sk)
340 {
341 	struct inet_sock *inet = inet_sk(sk);
342 	struct dst_entry *dst;
343 	u32 mtu;
344 
345 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
346 		return;
347 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
348 	dst = inet_csk_update_pmtu(sk, mtu);
349 	if (!dst)
350 		return;
351 
352 	/* Something is about to go wrong... Remember the soft error
353 	 * in case this connection is not able to recover.
354 	 */
355 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
356 		sk->sk_err_soft = EMSGSIZE;
357 
358 	mtu = dst_mtu(dst);
359 
360 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
361 	    ip_sk_accept_pmtu(sk) &&
362 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
363 		tcp_sync_mss(sk, mtu);
364 
365 		/* Resend the TCP packet because it's
366 		 * clear that the old packet has been
367 		 * dropped. This is the new "fast" path mtu
368 		 * discovery.
369 		 */
370 		tcp_simple_retransmit(sk);
371 	} /* else let the usual retransmit timer handle it */
372 }
373 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
374 
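/* ICMP redirect: hand the skb to the cached route so it can update the
 * next hop for this destination.
 */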
375 static void do_redirect(struct sk_buff *skb, struct sock *sk)
376 {
377 	struct dst_entry *dst = __sk_dst_check(sk, 0);
378 
379 	if (dst)
380 		dst->ops->redirect(dst, sk, skb);
381 }
382 
383 
384 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
385 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
386 {
387 	struct request_sock *req = inet_reqsk(sk);
388 	struct net *net = sock_net(sk);
389 
390 	/* ICMPs are not backlogged, hence we cannot get
391 	 * an established socket here.
392 	 */
393 	if (seq != tcp_rsk(req)->snt_isn) {
394 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
395 	} else if (abort) {
396 		/*
397 		 * Still in SYN_RECV, just remove it silently.
398 		 * There is no good way to pass the error to the newly
399 		 * created socket, and POSIX does not want network
400 		 * errors returned from accept().
401 		 */
402 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
403 		tcp_listendrop(req->rsk_listener);
404 	}
405 	reqsk_put(req);
406 }
407 EXPORT_SYMBOL(tcp_req_err);
408 
409 /* TCP-LD (RFC 6069) logic */
410 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
411 {
412 	struct inet_connection_sock *icsk = inet_csk(sk);
413 	struct tcp_sock *tp = tcp_sk(sk);
414 	struct sk_buff *skb;
415 	s32 remaining;
416 	u32 delta_us;
417 
418 	if (sock_owned_by_user(sk))
419 		return;
420 
421 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
422 	    !icsk->icsk_backoff)
423 		return;
424 
425 	skb = tcp_rtx_queue_head(sk);
426 	if (WARN_ON_ONCE(!skb))
427 		return;
428 
429 	icsk->icsk_backoff--;
430 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
431 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
432 
433 	tcp_mstamp_refresh(tp);
434 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
435 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
436 
437 	if (remaining > 0) {
438 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
439 					  remaining, TCP_RTO_MAX);
440 	} else {
441 		/* RTO revert clocked out retransmission.
442 		 * Will retransmit now.
443 		 */
444 		tcp_retransmit_timer(sk);
445 	}
446 }
447 EXPORT_SYMBOL(tcp_ld_RTO_revert);
448 
449 /*
450  * This routine is called by the ICMP module when it gets some
451  * sort of error condition.  If err < 0 then the socket should
452  * be closed and the error returned to the user.  If err > 0
453  * it's just the icmp type << 8 | icmp code.  After adjustment
454  * header points to the first 8 bytes of the tcp header.  We need
455  * to find the appropriate port.
456  *
457  * The locking strategy used here is very "optimistic". When
458  * someone else accesses the socket the ICMP is just dropped
459  * and for some paths there is no check at all.
460  * A more general error queue to queue errors for later handling
461  * is probably better.
462  *
463  */
464 
465 int tcp_v4_err(struct sk_buff *skb, u32 info)
466 {
467 	const struct iphdr *iph = (const struct iphdr *)skb->data;
468 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
469 	struct tcp_sock *tp;
470 	struct inet_sock *inet;
471 	const int type = icmp_hdr(skb)->type;
472 	const int code = icmp_hdr(skb)->code;
473 	struct sock *sk;
474 	struct request_sock *fastopen;
475 	u32 seq, snd_una;
476 	int err;
477 	struct net *net = dev_net(skb->dev);
478 
479 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
480 				       th->dest, iph->saddr, ntohs(th->source),
481 				       inet_iif(skb), 0);
482 	if (!sk) {
483 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
484 		return -ENOENT;
485 	}
486 	if (sk->sk_state == TCP_TIME_WAIT) {
487 		inet_twsk_put(inet_twsk(sk));
488 		return 0;
489 	}
490 	seq = ntohl(th->seq);
491 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
492 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
493 				     type == ICMP_TIME_EXCEEDED ||
494 				     (type == ICMP_DEST_UNREACH &&
495 				      (code == ICMP_NET_UNREACH ||
496 				       code == ICMP_HOST_UNREACH)));
497 		return 0;
498 	}
499 
500 	bh_lock_sock(sk);
501 	/* If too many ICMPs get dropped on busy
502 	 * servers this needs to be solved differently.
503 	 * We do take care of the PMTU discovery (RFC1191) special case:
504 	 * we can receive locally generated ICMP messages while the socket is held.
505 	 */
506 	if (sock_owned_by_user(sk)) {
507 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
508 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
509 	}
510 	if (sk->sk_state == TCP_CLOSE)
511 		goto out;
512 
513 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
514 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
515 		goto out;
516 	}
517 
518 	tp = tcp_sk(sk);
519 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
520 	fastopen = rcu_dereference(tp->fastopen_rsk);
521 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
522 	if (sk->sk_state != TCP_LISTEN &&
523 	    !between(seq, snd_una, tp->snd_nxt)) {
524 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
525 		goto out;
526 	}
527 
528 	switch (type) {
529 	case ICMP_REDIRECT:
530 		if (!sock_owned_by_user(sk))
531 			do_redirect(skb, sk);
532 		goto out;
533 	case ICMP_SOURCE_QUENCH:
534 		/* Just silently ignore these. */
535 		goto out;
536 	case ICMP_PARAMETERPROB:
537 		err = EPROTO;
538 		break;
539 	case ICMP_DEST_UNREACH:
540 		if (code > NR_ICMP_UNREACH)
541 			goto out;
542 
543 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
544 			/* We are not interested in TCP_LISTEN and open_requests
545 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
546 			 * they should go through unfragmented).
547 			 */
548 			if (sk->sk_state == TCP_LISTEN)
549 				goto out;
550 
551 			WRITE_ONCE(tp->mtu_info, info);
552 			if (!sock_owned_by_user(sk)) {
553 				tcp_v4_mtu_reduced(sk);
554 			} else {
555 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
556 					sock_hold(sk);
557 			}
558 			goto out;
559 		}
560 
561 		err = icmp_err_convert[code].errno;
562 		/* check if this ICMP message allows revert of backoff.
563 		 * (see RFC 6069)
564 		 */
565 		if (!fastopen &&
566 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
567 			tcp_ld_RTO_revert(sk, seq);
568 		break;
569 	case ICMP_TIME_EXCEEDED:
570 		err = EHOSTUNREACH;
571 		break;
572 	default:
573 		goto out;
574 	}
575 
576 	switch (sk->sk_state) {
577 	case TCP_SYN_SENT:
578 	case TCP_SYN_RECV:
579 		/* Only in fast or simultaneous open. If a fast open socket is
580 		 * already accepted it is treated as a connected one below.
581 		 */
582 		if (fastopen && !fastopen->sk)
583 			break;
584 
585 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
586 
587 		if (!sock_owned_by_user(sk)) {
588 			sk->sk_err = err;
589 
590 			sk_error_report(sk);
591 
592 			tcp_done(sk);
593 		} else {
594 			sk->sk_err_soft = err;
595 		}
596 		goto out;
597 	}
598 
599 	/* If we've already connected we will keep trying
600 	 * until we time out, or the user gives up.
601 	 *
602 	 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
603 	 * to be considered hard errors (well, FRAG_FAILED too,
604 	 * but it is obsoleted by pmtu discovery).
605 	 *
606 	 * Note that in the modern internet, where routing is unreliable
607 	 * and broken firewalls sit in every dark corner, sending random
608 	 * errors ordered by their masters, even these two messages finally lose
609 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
610 	 *
611 	 * Now we are in compliance with RFCs.
612 	 *							--ANK (980905)
613 	 */
614 
615 	inet = inet_sk(sk);
616 	if (!sock_owned_by_user(sk) && inet->recverr) {
617 		sk->sk_err = err;
618 		sk_error_report(sk);
619 	} else	{ /* Only an error on timeout */
620 		sk->sk_err_soft = err;
621 	}
622 
623 out:
624 	bh_unlock_sock(sk);
625 	sock_put(sk);
626 	return 0;
627 }
628 
629 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
630 {
631 	struct tcphdr *th = tcp_hdr(skb);
632 
633 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
634 	skb->csum_start = skb_transport_header(skb) - skb->head;
635 	skb->csum_offset = offsetof(struct tcphdr, check);
636 }
637 
638 /* This routine computes an IPv4 TCP checksum. */
639 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
640 {
641 	const struct inet_sock *inet = inet_sk(sk);
642 
643 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
644 }
645 EXPORT_SYMBOL(tcp_v4_send_check);
646 
647 /*
648  *	This routine will send an RST to the other tcp.
649  *
650  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
651  *		      for the reset?
652  *	Answer: if a packet caused an RST, it is not for a socket
653  *		existing in our system; if it is matched to a socket,
654  *		it is just a duplicate segment or a bug in the other side's TCP.
655  *		So we build the reply based only on the parameters that
656  *		arrived with the segment.
657  *	Exception: precedence violation. We do not implement it in any case.
658  */
659 
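/* Option space reserved in the RST reply: an MD5 signature option when
 * CONFIG_TCP_MD5SIG is enabled, otherwise a single 32-bit slot that may
 * carry the MPTCP reset option.
 */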
660 #ifdef CONFIG_TCP_MD5SIG
661 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
662 #else
663 #define OPTION_BYTES sizeof(__be32)
664 #endif
665 
666 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
667 {
668 	const struct tcphdr *th = tcp_hdr(skb);
669 	struct {
670 		struct tcphdr th;
671 		__be32 opt[OPTION_BYTES / sizeof(__be32)];
672 	} rep;
673 	struct ip_reply_arg arg;
674 #ifdef CONFIG_TCP_MD5SIG
675 	struct tcp_md5sig_key *key = NULL;
676 	const __u8 *hash_location = NULL;
677 	unsigned char newhash[16];
678 	int genhash;
679 	struct sock *sk1 = NULL;
680 #endif
681 	u64 transmit_time = 0;
682 	struct sock *ctl_sk;
683 	struct net *net;
684 
685 	/* Never send a reset in response to a reset. */
686 	if (th->rst)
687 		return;
688 
689 	/* If sk is not NULL, it means we did a successful lookup and the incoming
690 	 * route had to be correct. prequeue might have dropped our dst.
691 	 */
692 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
693 		return;
694 
695 	/* Swap the send and the receive. */
696 	memset(&rep, 0, sizeof(rep));
697 	rep.th.dest   = th->source;
698 	rep.th.source = th->dest;
699 	rep.th.doff   = sizeof(struct tcphdr) / 4;
700 	rep.th.rst    = 1;
701 
702 	if (th->ack) {
703 		rep.th.seq = th->ack_seq;
704 	} else {
705 		rep.th.ack = 1;
706 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
707 				       skb->len - (th->doff << 2));
708 	}
709 
710 	memset(&arg, 0, sizeof(arg));
711 	arg.iov[0].iov_base = (unsigned char *)&rep;
712 	arg.iov[0].iov_len  = sizeof(rep.th);
713 
714 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
715 #ifdef CONFIG_TCP_MD5SIG
716 	rcu_read_lock();
717 	hash_location = tcp_parse_md5sig_option(th);
718 	if (sk && sk_fullsock(sk)) {
719 		const union tcp_md5_addr *addr;
720 		int l3index;
721 
722 		/* sdif set, means packet ingressed via a device
723 		 * in an L3 domain and inet_iif is set to it.
724 		 */
725 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
726 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
727 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
728 	} else if (hash_location) {
729 		const union tcp_md5_addr *addr;
730 		int sdif = tcp_v4_sdif(skb);
731 		int dif = inet_iif(skb);
732 		int l3index;
733 
734 		/*
735 		 * The active side is lost. Try to find the listening socket through
736 		 * the source port, and then find the md5 key through the listening socket.
737 		 * We do not lose security here:
738 		 * the incoming packet is checked against the md5 hash of the found key;
739 		 * no RST is generated if the md5 hash doesn't match.
740 		 */
741 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
742 					     ip_hdr(skb)->saddr,
743 					     th->source, ip_hdr(skb)->daddr,
744 					     ntohs(th->source), dif, sdif);
745 		/* don't send rst if it can't find key */
746 		if (!sk1)
747 			goto out;
748 
749 		/* sdif set, means packet ingressed via a device
750 		 * in an L3 domain and dif is set to it.
751 		 */
752 		l3index = sdif ? dif : 0;
753 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
754 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
755 		if (!key)
756 			goto out;
757 
758 
759 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
760 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
761 			goto out;
762 
763 	}
764 
765 	if (key) {
766 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
767 				   (TCPOPT_NOP << 16) |
768 				   (TCPOPT_MD5SIG << 8) |
769 				   TCPOLEN_MD5SIG);
770 		/* Update length and the length the header thinks exists */
771 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
772 		rep.th.doff = arg.iov[0].iov_len / 4;
773 
774 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
775 				     key, ip_hdr(skb)->saddr,
776 				     ip_hdr(skb)->daddr, &rep.th);
777 	}
778 #endif
779 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
780 	if (rep.opt[0] == 0) {
781 		__be32 mrst = mptcp_reset_option(skb);
782 
783 		if (mrst) {
784 			rep.opt[0] = mrst;
785 			arg.iov[0].iov_len += sizeof(mrst);
786 			rep.th.doff = arg.iov[0].iov_len / 4;
787 		}
788 	}
789 
790 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
791 				      ip_hdr(skb)->saddr, /* XXX */
792 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
793 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
794 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
795 
796 	/* When the socket is gone, all binding information is lost and
797 	 * routing might fail. No choice here: if we choose to force the
798 	 * input interface, we will misroute in the case of an asymmetric route.
799 	 */
800 	if (sk) {
801 		arg.bound_dev_if = sk->sk_bound_dev_if;
802 		if (sk_fullsock(sk))
803 			trace_tcp_send_reset(sk, skb);
804 	}
805 
806 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
807 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
808 
809 	arg.tos = ip_hdr(skb)->tos;
810 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
811 	local_bh_disable();
812 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
813 	if (sk) {
814 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
815 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
816 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
817 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
818 		transmit_time = tcp_transmit_time(sk);
819 		xfrm_sk_clone_policy(ctl_sk, sk);
820 	} else {
821 		ctl_sk->sk_mark = 0;
822 		ctl_sk->sk_priority = 0;
823 	}
824 	ip_send_unicast_reply(ctl_sk,
825 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
826 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
827 			      &arg, arg.iov[0].iov_len,
828 			      transmit_time);
829 
830 	xfrm_sk_free_policy(ctl_sk);
831 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
832 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
833 	local_bh_enable();
834 
835 #ifdef CONFIG_TCP_MD5SIG
836 out:
837 	rcu_read_unlock();
838 #endif
839 }
840 
841 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
842    outside of socket context, is certainly ugly. What can I do?
843  */
844 
845 static void tcp_v4_send_ack(const struct sock *sk,
846 			    struct sk_buff *skb, u32 seq, u32 ack,
847 			    u32 win, u32 tsval, u32 tsecr, int oif,
848 			    struct tcp_md5sig_key *key,
849 			    int reply_flags, u8 tos)
850 {
851 	const struct tcphdr *th = tcp_hdr(skb);
852 	struct {
853 		struct tcphdr th;
854 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
855 #ifdef CONFIG_TCP_MD5SIG
856 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
857 #endif
858 			];
859 	} rep;
860 	struct net *net = sock_net(sk);
861 	struct ip_reply_arg arg;
862 	struct sock *ctl_sk;
863 	u64 transmit_time;
864 
865 	memset(&rep.th, 0, sizeof(struct tcphdr));
866 	memset(&arg, 0, sizeof(arg));
867 
868 	arg.iov[0].iov_base = (unsigned char *)&rep;
869 	arg.iov[0].iov_len  = sizeof(rep.th);
870 	if (tsecr) {
871 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
872 				   (TCPOPT_TIMESTAMP << 8) |
873 				   TCPOLEN_TIMESTAMP);
874 		rep.opt[1] = htonl(tsval);
875 		rep.opt[2] = htonl(tsecr);
876 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
877 	}
878 
879 	/* Swap the send and the receive. */
880 	rep.th.dest    = th->source;
881 	rep.th.source  = th->dest;
882 	rep.th.doff    = arg.iov[0].iov_len / 4;
883 	rep.th.seq     = htonl(seq);
884 	rep.th.ack_seq = htonl(ack);
885 	rep.th.ack     = 1;
886 	rep.th.window  = htons(win);
887 
888 #ifdef CONFIG_TCP_MD5SIG
889 	if (key) {
890 		int offset = (tsecr) ? 3 : 0;
891 
892 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
893 					  (TCPOPT_NOP << 16) |
894 					  (TCPOPT_MD5SIG << 8) |
895 					  TCPOLEN_MD5SIG);
896 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
897 		rep.th.doff = arg.iov[0].iov_len/4;
898 
899 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
900 				    key, ip_hdr(skb)->saddr,
901 				    ip_hdr(skb)->daddr, &rep.th);
902 	}
903 #endif
904 	arg.flags = reply_flags;
905 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
906 				      ip_hdr(skb)->saddr, /* XXX */
907 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
908 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
909 	if (oif)
910 		arg.bound_dev_if = oif;
911 	arg.tos = tos;
912 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
913 	local_bh_disable();
914 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
915 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
916 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
917 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
918 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
919 	transmit_time = tcp_transmit_time(sk);
920 	ip_send_unicast_reply(ctl_sk,
921 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
922 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
923 			      &arg, arg.iov[0].iov_len,
924 			      transmit_time);
925 
926 	sock_net_set(ctl_sk, &init_net);
927 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
928 	local_bh_enable();
929 }
930 
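/* Answer a segment received on a TIME-WAIT socket with a duplicate ACK
 * built from the state cached in the timewait sock (rcv_nxt, window,
 * timestamps and, if present, the MD5 key).
 */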
931 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
932 {
933 	struct inet_timewait_sock *tw = inet_twsk(sk);
934 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
935 
936 	tcp_v4_send_ack(sk, skb,
937 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
938 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
939 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
940 			tcptw->tw_ts_recent,
941 			tw->tw_bound_dev_if,
942 			tcp_twsk_md5_key(tcptw),
943 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
944 			tw->tw_tos
945 			);
946 
947 	inet_twsk_put(tw);
948 }
949 
950 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
951 				  struct request_sock *req)
952 {
953 	const union tcp_md5_addr *addr;
954 	int l3index;
955 
956 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
957 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
958 	 */
959 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
960 					     tcp_sk(sk)->snd_nxt;
961 
962 	/* RFC 7323 2.3
963 	 * The window field (SEG.WND) of every outgoing segment, with the
964 	 * exception of <SYN> segments, MUST be right-shifted by
965 	 * Rcv.Wind.Shift bits:
966 	 */
967 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
968 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
969 	tcp_v4_send_ack(sk, skb, seq,
970 			tcp_rsk(req)->rcv_nxt,
971 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
972 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
973 			READ_ONCE(req->ts_recent),
974 			0,
975 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
976 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
977 			ip_hdr(skb)->tos);
978 }
979 
980 /*
981  *	Send a SYN-ACK after having received a SYN.
982  *	This still operates on a request_sock only, not on a big
983  *	socket.
984  */
985 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
986 			      struct flowi *fl,
987 			      struct request_sock *req,
988 			      struct tcp_fastopen_cookie *foc,
989 			      enum tcp_synack_type synack_type,
990 			      struct sk_buff *syn_skb)
991 {
992 	const struct inet_request_sock *ireq = inet_rsk(req);
993 	struct flowi4 fl4;
994 	int err = -1;
995 	struct sk_buff *skb;
996 	u8 tos;
997 
998 	/* First, grab a route. */
999 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1000 		return -1;
1001 
1002 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1003 
1004 	if (skb) {
1005 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1006 
1007 		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1008 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1009 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1010 				inet_sk(sk)->tos;
1011 
1012 		if (!INET_ECN_is_capable(tos) &&
1013 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1014 			tos |= INET_ECN_ECT_0;
1015 
1016 		rcu_read_lock();
1017 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1018 					    ireq->ir_rmt_addr,
1019 					    rcu_dereference(ireq->ireq_opt),
1020 					    tos);
1021 		rcu_read_unlock();
1022 		err = net_xmit_eval(err);
1023 	}
1024 
1025 	return err;
1026 }
1027 
1028 /*
1029  *	IPv4 request_sock destructor.
1030  */
1031 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1032 {
1033 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1034 }
1035 
1036 #ifdef CONFIG_TCP_MD5SIG
1037 /*
1038  * RFC2385 MD5 checksumming requires a mapping of
1039  * IP address->MD5 Key.
1040  * We need to maintain these in the sk structure.
1041  */
1042 
1043 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1044 EXPORT_SYMBOL(tcp_md5_needed);
1045 
1046 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1047 {
1048 	if (!old)
1049 		return true;
1050 
1051 	/* l3index always overrides non-l3index */
1052 	if (old->l3index && new->l3index == 0)
1053 		return false;
1054 	if (old->l3index == 0 && new->l3index)
1055 		return true;
1056 
1057 	return old->prefixlen < new->prefixlen;
1058 }
1059 
1060 /* Find the Key structure for an address.  */
1061 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1062 					   const union tcp_md5_addr *addr,
1063 					   int family)
1064 {
1065 	const struct tcp_sock *tp = tcp_sk(sk);
1066 	struct tcp_md5sig_key *key;
1067 	const struct tcp_md5sig_info *md5sig;
1068 	__be32 mask;
1069 	struct tcp_md5sig_key *best_match = NULL;
1070 	bool match;
1071 
1072 	/* caller either holds rcu_read_lock() or socket lock */
1073 	md5sig = rcu_dereference_check(tp->md5sig_info,
1074 				       lockdep_sock_is_held(sk));
1075 	if (!md5sig)
1076 		return NULL;
1077 
1078 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1079 				 lockdep_sock_is_held(sk)) {
1080 		if (key->family != family)
1081 			continue;
1082 		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1083 			continue;
1084 		if (family == AF_INET) {
1085 			mask = inet_make_mask(key->prefixlen);
1086 			match = (key->addr.a4.s_addr & mask) ==
1087 				(addr->a4.s_addr & mask);
1088 #if IS_ENABLED(CONFIG_IPV6)
1089 		} else if (family == AF_INET6) {
1090 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1091 						  key->prefixlen);
1092 #endif
1093 		} else {
1094 			match = false;
1095 		}
1096 
1097 		if (match && better_md5_match(best_match, key))
1098 			best_match = key;
1099 	}
1100 	return best_match;
1101 }
1102 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1103 
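/* Exact-match lookup: unlike __tcp_md5_do_lookup(), the address, family,
 * prefix length, L3 index and ifindex flag must all match.
 */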
1104 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1105 						      const union tcp_md5_addr *addr,
1106 						      int family, u8 prefixlen,
1107 						      int l3index, u8 flags)
1108 {
1109 	const struct tcp_sock *tp = tcp_sk(sk);
1110 	struct tcp_md5sig_key *key;
1111 	unsigned int size = sizeof(struct in_addr);
1112 	const struct tcp_md5sig_info *md5sig;
1113 
1114 	/* caller either holds rcu_read_lock() or socket lock */
1115 	md5sig = rcu_dereference_check(tp->md5sig_info,
1116 				       lockdep_sock_is_held(sk));
1117 	if (!md5sig)
1118 		return NULL;
1119 #if IS_ENABLED(CONFIG_IPV6)
1120 	if (family == AF_INET6)
1121 		size = sizeof(struct in6_addr);
1122 #endif
1123 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1124 				 lockdep_sock_is_held(sk)) {
1125 		if (key->family != family)
1126 			continue;
1127 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1128 			continue;
1129 		if (key->l3index != l3index)
1130 			continue;
1131 		if (!memcmp(&key->addr, addr, size) &&
1132 		    key->prefixlen == prefixlen)
1133 			return key;
1134 	}
1135 	return NULL;
1136 }
1137 
1138 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1139 					 const struct sock *addr_sk)
1140 {
1141 	const union tcp_md5_addr *addr;
1142 	int l3index;
1143 
1144 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1145 						 addr_sk->sk_bound_dev_if);
1146 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1147 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1148 }
1149 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1150 
1151 /* This can be called on a newly created socket, from other files */
1152 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1153 		   int family, u8 prefixlen, int l3index, u8 flags,
1154 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1155 {
1156 	/* Add Key to the list */
1157 	struct tcp_md5sig_key *key;
1158 	struct tcp_sock *tp = tcp_sk(sk);
1159 	struct tcp_md5sig_info *md5sig;
1160 
1161 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1162 	if (key) {
1163 		/* Pre-existing entry - just update that one.
1164 		 * Note that the key might be used concurrently.
1165 		 * data_race() is telling kcsan that we do not care about
1166 		 * key mismatches, since changing the MD5 key on live flows
1167 		 * can lead to packet drops.
1168 		 */
1169 		data_race(memcpy(key->key, newkey, newkeylen));
1170 
1171 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1172 		 * Also note that a reader could catch new key->keylen value
1173 		 * but old key->key[], this is the reason we use __GFP_ZERO
1174 		 * at sock_kmalloc() time below these lines.
1175 		 */
1176 		WRITE_ONCE(key->keylen, newkeylen);
1177 
1178 		return 0;
1179 	}
1180 
1181 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1182 					   lockdep_sock_is_held(sk));
1183 	if (!md5sig) {
1184 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1185 		if (!md5sig)
1186 			return -ENOMEM;
1187 
1188 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1189 		INIT_HLIST_HEAD(&md5sig->head);
1190 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1191 	}
1192 
1193 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1194 	if (!key)
1195 		return -ENOMEM;
1196 	if (!tcp_alloc_md5sig_pool()) {
1197 		sock_kfree_s(sk, key, sizeof(*key));
1198 		return -ENOMEM;
1199 	}
1200 
1201 	memcpy(key->key, newkey, newkeylen);
1202 	key->keylen = newkeylen;
1203 	key->family = family;
1204 	key->prefixlen = prefixlen;
1205 	key->l3index = l3index;
1206 	key->flags = flags;
1207 	memcpy(&key->addr, addr,
1208 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1209 				      sizeof(struct in_addr));
1210 	hlist_add_head_rcu(&key->node, &md5sig->head);
1211 	return 0;
1212 }
1213 EXPORT_SYMBOL(tcp_md5_do_add);
1214 
1215 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1216 		   u8 prefixlen, int l3index, u8 flags)
1217 {
1218 	struct tcp_md5sig_key *key;
1219 
1220 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1221 	if (!key)
1222 		return -ENOENT;
1223 	hlist_del_rcu(&key->node);
1224 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1225 	kfree_rcu(key, rcu);
1226 	return 0;
1227 }
1228 EXPORT_SYMBOL(tcp_md5_do_del);
1229 
1230 static void tcp_clear_md5_list(struct sock *sk)
1231 {
1232 	struct tcp_sock *tp = tcp_sk(sk);
1233 	struct tcp_md5sig_key *key;
1234 	struct hlist_node *n;
1235 	struct tcp_md5sig_info *md5sig;
1236 
1237 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1238 
1239 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1240 		hlist_del_rcu(&key->node);
1241 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1242 		kfree_rcu(key, rcu);
1243 	}
1244 }
1245 
1246 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1247 				 sockptr_t optval, int optlen)
1248 {
1249 	struct tcp_md5sig cmd;
1250 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1251 	const union tcp_md5_addr *addr;
1252 	u8 prefixlen = 32;
1253 	int l3index = 0;
1254 	u8 flags;
1255 
1256 	if (optlen < sizeof(cmd))
1257 		return -EINVAL;
1258 
1259 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1260 		return -EFAULT;
1261 
1262 	if (sin->sin_family != AF_INET)
1263 		return -EINVAL;
1264 
1265 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1266 
1267 	if (optname == TCP_MD5SIG_EXT &&
1268 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1269 		prefixlen = cmd.tcpm_prefixlen;
1270 		if (prefixlen > 32)
1271 			return -EINVAL;
1272 	}
1273 
1274 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1275 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1276 		struct net_device *dev;
1277 
1278 		rcu_read_lock();
1279 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1280 		if (dev && netif_is_l3_master(dev))
1281 			l3index = dev->ifindex;
1282 
1283 		rcu_read_unlock();
1284 
1285 		/* ok to reference set/not set outside of rcu;
1286 		 * right now device MUST be an L3 master
1287 		 */
1288 		if (!dev || !l3index)
1289 			return -EINVAL;
1290 	}
1291 
1292 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1293 
1294 	if (!cmd.tcpm_keylen)
1295 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1296 
1297 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1298 		return -EINVAL;
1299 
1300 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1301 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1302 }
1303 
1304 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1305 				   __be32 daddr, __be32 saddr,
1306 				   const struct tcphdr *th, int nbytes)
1307 {
1308 	struct tcp4_pseudohdr *bp;
1309 	struct scatterlist sg;
1310 	struct tcphdr *_th;
1311 
1312 	bp = hp->scratch;
1313 	bp->saddr = saddr;
1314 	bp->daddr = daddr;
1315 	bp->pad = 0;
1316 	bp->protocol = IPPROTO_TCP;
1317 	bp->len = cpu_to_be16(nbytes);
1318 
1319 	_th = (struct tcphdr *)(bp + 1);
1320 	memcpy(_th, th, sizeof(*th));
1321 	_th->check = 0;
1322 
1323 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1324 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1325 				sizeof(*bp) + sizeof(*th));
1326 	return crypto_ahash_update(hp->md5_req);
1327 }
1328 
1329 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1330 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1331 {
1332 	struct tcp_md5sig_pool *hp;
1333 	struct ahash_request *req;
1334 
1335 	hp = tcp_get_md5sig_pool();
1336 	if (!hp)
1337 		goto clear_hash_noput;
1338 	req = hp->md5_req;
1339 
1340 	if (crypto_ahash_init(req))
1341 		goto clear_hash;
1342 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1343 		goto clear_hash;
1344 	if (tcp_md5_hash_key(hp, key))
1345 		goto clear_hash;
1346 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1347 	if (crypto_ahash_final(req))
1348 		goto clear_hash;
1349 
1350 	tcp_put_md5sig_pool();
1351 	return 0;
1352 
1353 clear_hash:
1354 	tcp_put_md5sig_pool();
1355 clear_hash_noput:
1356 	memset(md5_hash, 0, 16);
1357 	return 1;
1358 }
1359 
1360 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1361 			const struct sock *sk,
1362 			const struct sk_buff *skb)
1363 {
1364 	struct tcp_md5sig_pool *hp;
1365 	struct ahash_request *req;
1366 	const struct tcphdr *th = tcp_hdr(skb);
1367 	__be32 saddr, daddr;
1368 
1369 	if (sk) { /* valid for establish/request sockets */
1370 		saddr = sk->sk_rcv_saddr;
1371 		daddr = sk->sk_daddr;
1372 	} else {
1373 		const struct iphdr *iph = ip_hdr(skb);
1374 		saddr = iph->saddr;
1375 		daddr = iph->daddr;
1376 	}
1377 
1378 	hp = tcp_get_md5sig_pool();
1379 	if (!hp)
1380 		goto clear_hash_noput;
1381 	req = hp->md5_req;
1382 
1383 	if (crypto_ahash_init(req))
1384 		goto clear_hash;
1385 
1386 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1387 		goto clear_hash;
1388 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1389 		goto clear_hash;
1390 	if (tcp_md5_hash_key(hp, key))
1391 		goto clear_hash;
1392 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1393 	if (crypto_ahash_final(req))
1394 		goto clear_hash;
1395 
1396 	tcp_put_md5sig_pool();
1397 	return 0;
1398 
1399 clear_hash:
1400 	tcp_put_md5sig_pool();
1401 clear_hash_noput:
1402 	memset(md5_hash, 0, 16);
1403 	return 1;
1404 }
1405 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1406 
1407 #endif
1408 
1409 /* Called with rcu_read_lock() */
1410 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1411 				    const struct sk_buff *skb,
1412 				    int dif, int sdif)
1413 {
1414 #ifdef CONFIG_TCP_MD5SIG
1415 	/*
1416 	 * This gets called for each TCP segment that arrives
1417 	 * so we want to be efficient.
1418 	 * We have 3 drop cases:
1419 	 * o No MD5 hash and one expected.
1420 	 * o MD5 hash and we're not expecting one.
1421 	 * o MD5 hash and it's wrong.
1422 	 */
1423 	const __u8 *hash_location = NULL;
1424 	struct tcp_md5sig_key *hash_expected;
1425 	const struct iphdr *iph = ip_hdr(skb);
1426 	const struct tcphdr *th = tcp_hdr(skb);
1427 	const union tcp_md5_addr *addr;
1428 	unsigned char newhash[16];
1429 	int genhash, l3index;
1430 
1431 	/* sdif set, means packet ingressed via a device
1432 	 * in an L3 domain and dif is set to the l3mdev
1433 	 */
1434 	l3index = sdif ? dif : 0;
1435 
1436 	addr = (union tcp_md5_addr *)&iph->saddr;
1437 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1438 	hash_location = tcp_parse_md5sig_option(th);
1439 
1440 	/* We've parsed the options - do we have a hash? */
1441 	if (!hash_expected && !hash_location)
1442 		return false;
1443 
1444 	if (hash_expected && !hash_location) {
1445 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1446 		return true;
1447 	}
1448 
1449 	if (!hash_expected && hash_location) {
1450 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1451 		return true;
1452 	}
1453 
1454 	/* Okay, so this is hash_expected and hash_location -
1455 	 * so we need to calculate the checksum.
1456 	 */
1457 	genhash = tcp_v4_md5_hash_skb(newhash,
1458 				      hash_expected,
1459 				      NULL, skb);
1460 
1461 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1462 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1463 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1464 				     &iph->saddr, ntohs(th->source),
1465 				     &iph->daddr, ntohs(th->dest),
1466 				     genhash ? " tcp_v4_calc_md5_hash failed"
1467 				     : "", l3index);
1468 		return true;
1469 	}
1470 	return false;
1471 #endif
1472 	return false;
1473 }
1474 
1475 static void tcp_v4_init_req(struct request_sock *req,
1476 			    const struct sock *sk_listener,
1477 			    struct sk_buff *skb)
1478 {
1479 	struct inet_request_sock *ireq = inet_rsk(req);
1480 	struct net *net = sock_net(sk_listener);
1481 
1482 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1483 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1484 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1485 }
1486 
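/* Fill in the request sock from the incoming SYN, run the LSM hook, and
 * route the SYN-ACK reply.
 */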
1487 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1488 					  struct sk_buff *skb,
1489 					  struct flowi *fl,
1490 					  struct request_sock *req)
1491 {
1492 	tcp_v4_init_req(req, sk, skb);
1493 
1494 	if (security_inet_conn_request(sk, skb, req))
1495 		return NULL;
1496 
1497 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1498 }
1499 
1500 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1501 	.family		=	PF_INET,
1502 	.obj_size	=	sizeof(struct tcp_request_sock),
1503 	.rtx_syn_ack	=	tcp_rtx_synack,
1504 	.send_ack	=	tcp_v4_reqsk_send_ack,
1505 	.destructor	=	tcp_v4_reqsk_destructor,
1506 	.send_reset	=	tcp_v4_send_reset,
1507 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1508 };
1509 
1510 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1511 	.mss_clamp	=	TCP_MSS_DEFAULT,
1512 #ifdef CONFIG_TCP_MD5SIG
1513 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1514 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1515 #endif
1516 #ifdef CONFIG_SYN_COOKIES
1517 	.cookie_init_seq =	cookie_v4_init_sequence,
1518 #endif
1519 	.route_req	=	tcp_v4_route_req,
1520 	.init_seq	=	tcp_v4_init_seq,
1521 	.init_ts_off	=	tcp_v4_init_ts_off,
1522 	.send_synack	=	tcp_v4_send_synack,
1523 };
1524 
1525 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1526 {
1527 	/* Never answer SYNs sent to broadcast or multicast */
1528 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1529 		goto drop;
1530 
1531 	return tcp_conn_request(&tcp_request_sock_ops,
1532 				&tcp_request_sock_ipv4_ops, sk, skb);
1533 
1534 drop:
1535 	tcp_listendrop(sk);
1536 	return 0;
1537 }
1538 EXPORT_SYMBOL(tcp_v4_conn_request);
1539 
1540 
1541 /*
1542  * The three way handshake has completed - we got a valid synack -
1543  * now create the new socket.
1544  */
1545 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1546 				  struct request_sock *req,
1547 				  struct dst_entry *dst,
1548 				  struct request_sock *req_unhash,
1549 				  bool *own_req)
1550 {
1551 	struct inet_request_sock *ireq;
1552 	bool found_dup_sk = false;
1553 	struct inet_sock *newinet;
1554 	struct tcp_sock *newtp;
1555 	struct sock *newsk;
1556 #ifdef CONFIG_TCP_MD5SIG
1557 	const union tcp_md5_addr *addr;
1558 	struct tcp_md5sig_key *key;
1559 	int l3index;
1560 #endif
1561 	struct ip_options_rcu *inet_opt;
1562 
1563 	if (sk_acceptq_is_full(sk))
1564 		goto exit_overflow;
1565 
1566 	newsk = tcp_create_openreq_child(sk, req, skb);
1567 	if (!newsk)
1568 		goto exit_nonewsk;
1569 
1570 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1571 	inet_sk_rx_dst_set(newsk, skb);
1572 
1573 	newtp		      = tcp_sk(newsk);
1574 	newinet		      = inet_sk(newsk);
1575 	ireq		      = inet_rsk(req);
1576 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1577 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1578 	newsk->sk_bound_dev_if = ireq->ir_iif;
1579 	newinet->inet_saddr   = ireq->ir_loc_addr;
1580 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1581 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1582 	newinet->mc_index     = inet_iif(skb);
1583 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1584 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1585 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1586 	if (inet_opt)
1587 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1588 	newinet->inet_id = prandom_u32();
1589 
1590 	/* Set ToS of the new socket based upon the value of incoming SYN.
1591 	 * ECT bits are set later in tcp_init_transfer().
1592 	 */
1593 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1594 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1595 
1596 	if (!dst) {
1597 		dst = inet_csk_route_child_sock(sk, newsk, req);
1598 		if (!dst)
1599 			goto put_and_exit;
1600 	} else {
1601 		/* syncookie case : see end of cookie_v4_check() */
1602 	}
1603 	sk_setup_caps(newsk, dst);
1604 
1605 	tcp_ca_openreq_child(newsk, dst);
1606 
1607 	tcp_sync_mss(newsk, dst_mtu(dst));
1608 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1609 
1610 	tcp_initialize_rcv_mss(newsk);
1611 
1612 #ifdef CONFIG_TCP_MD5SIG
1613 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1614 	/* Copy over the MD5 key from the original socket */
1615 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1616 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1617 	if (key) {
1618 		/*
1619 		 * We're using one, so create a matching key
1620 		 * on the newsk structure. If we fail to get
1621 		 * memory, then we end up not copying the key
1622 		 * across. Shucks.
1623 		 */
1624 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1625 			       key->key, key->keylen, GFP_ATOMIC);
1626 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1627 	}
1628 #endif
1629 
1630 	if (__inet_inherit_port(sk, newsk) < 0)
1631 		goto put_and_exit;
1632 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1633 				       &found_dup_sk);
1634 	if (likely(*own_req)) {
1635 		tcp_move_syn(newtp, req);
1636 		ireq->ireq_opt = NULL;
1637 	} else {
1638 		newinet->inet_opt = NULL;
1639 
1640 		if (!req_unhash && found_dup_sk) {
1641 			/* This code path should only be executed in the
1642 			 * syncookie case.
1643 			 */
1644 			bh_unlock_sock(newsk);
1645 			sock_put(newsk);
1646 			newsk = NULL;
1647 		}
1648 	}
1649 	return newsk;
1650 
1651 exit_overflow:
1652 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1653 exit_nonewsk:
1654 	dst_release(dst);
1655 exit:
1656 	tcp_listendrop(sk);
1657 	return NULL;
1658 put_and_exit:
1659 	newinet->inet_opt = NULL;
1660 	inet_csk_prepare_forced_close(newsk);
1661 	tcp_done(newsk);
1662 	goto exit;
1663 }
1664 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1665 
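/* A non-SYN segment arriving on a listener may be the ACK that completes
 * a SYN-cookie handshake; cookie_v4_check() returns either the listener
 * itself or a newly created child socket (NULL means drop).
 */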
1666 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1667 {
1668 #ifdef CONFIG_SYN_COOKIES
1669 	const struct tcphdr *th = tcp_hdr(skb);
1670 
1671 	if (!th->syn)
1672 		sk = cookie_v4_check(sk, skb);
1673 #endif
1674 	return sk;
1675 }
1676 
1677 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1678 			 struct tcphdr *th, u32 *cookie)
1679 {
1680 	u16 mss = 0;
1681 #ifdef CONFIG_SYN_COOKIES
1682 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1683 				    &tcp_request_sock_ipv4_ops, sk, th);
1684 	if (mss) {
1685 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1686 		tcp_synq_overflow(sk);
1687 	}
1688 #endif
1689 	return mss;
1690 }
1691 
1692 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1693 							   u32));
1694 /* The socket must have its spinlock held when we get
1695  * here, unless it is a TCP_LISTEN socket.
1696  *
1697  * We have a potential double-lock case here, so even when
1698  * doing backlog processing we use the BH locking scheme.
1699  * This is because we cannot sleep with the original spinlock
1700  * held.
1701  */
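/* Minimal sketch of the usual non-listen call pattern (see tcp_v4_rcv()
 * below); tcp_add_backlog() unlocks the socket itself when it drops the skb:
 *
 *	bh_lock_sock_nested(sk);
 *	if (!sock_owned_by_user(sk))
 *		ret = tcp_v4_do_rcv(sk, skb);
 *	else if (tcp_add_backlog(sk, skb))
 *		goto discard_and_relse;
 *	bh_unlock_sock(sk);
 *
 * Backlogged skbs are later replayed through sk_backlog_rcv() (this
 * function) when the socket owner calls release_sock().
 */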
1702 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1703 {
1704 	struct sock *rsk;
1705 
1706 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1707 		struct dst_entry *dst;
1708 
1709 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1710 						lockdep_sock_is_held(sk));
1711 
1712 		sock_rps_save_rxhash(sk, skb);
1713 		sk_mark_napi_id(sk, skb);
1714 		if (dst) {
1715 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1716 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1717 					     dst, 0)) {
1718 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1719 				dst_release(dst);
1720 			}
1721 		}
1722 		tcp_rcv_established(sk, skb);
1723 		return 0;
1724 	}
1725 
1726 	if (tcp_checksum_complete(skb))
1727 		goto csum_err;
1728 
1729 	if (sk->sk_state == TCP_LISTEN) {
1730 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1731 
1732 		if (!nsk)
1733 			goto discard;
1734 		if (nsk != sk) {
1735 			if (tcp_child_process(sk, nsk, skb)) {
1736 				rsk = nsk;
1737 				goto reset;
1738 			}
1739 			return 0;
1740 		}
1741 	} else
1742 		sock_rps_save_rxhash(sk, skb);
1743 
1744 	if (tcp_rcv_state_process(sk, skb)) {
1745 		rsk = sk;
1746 		goto reset;
1747 	}
1748 	return 0;
1749 
1750 reset:
1751 	tcp_v4_send_reset(rsk, skb);
1752 discard:
1753 	kfree_skb(skb);
1754 	/* Be careful here. If this function gets more complicated and
1755 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1756 	 * might be destroyed here. This current version compiles correctly,
1757 	 * but you have been warned.
1758 	 */
1759 	return 0;
1760 
1761 csum_err:
1762 	trace_tcp_bad_csum(skb);
1763 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1764 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1765 	goto discard;
1766 }
1767 EXPORT_SYMBOL(tcp_v4_do_rcv);
1768 
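/* Early demux: at IP receive time, look up the established socket for this
 * 4-tuple so that its cached rx dst (when still valid for the incoming
 * interface) can be attached to the skb, saving a full route lookup later.
 */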
1769 int tcp_v4_early_demux(struct sk_buff *skb)
1770 {
1771 	const struct iphdr *iph;
1772 	const struct tcphdr *th;
1773 	struct sock *sk;
1774 
1775 	if (skb->pkt_type != PACKET_HOST)
1776 		return 0;
1777 
1778 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1779 		return 0;
1780 
1781 	iph = ip_hdr(skb);
1782 	th = tcp_hdr(skb);
1783 
1784 	if (th->doff < sizeof(struct tcphdr) / 4)
1785 		return 0;
1786 
1787 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1788 				       iph->saddr, th->source,
1789 				       iph->daddr, ntohs(th->dest),
1790 				       skb->skb_iif, inet_sdif(skb));
1791 	if (sk) {
1792 		skb->sk = sk;
1793 		skb->destructor = sock_edemux;
1794 		if (sk_fullsock(sk)) {
1795 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1796 
1797 			if (dst)
1798 				dst = dst_check(dst, 0);
1799 			if (dst &&
1800 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1801 				skb_dst_set_noref(skb, dst);
1802 		}
1803 	}
1804 	return 0;
1805 }
1806 
1807 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1808 {
1809 	u32 limit, tail_gso_size, tail_gso_segs;
1810 	struct skb_shared_info *shinfo;
1811 	const struct tcphdr *th;
1812 	struct tcphdr *thtail;
1813 	struct sk_buff *tail;
1814 	unsigned int hdrlen;
1815 	bool fragstolen;
1816 	u32 gso_segs;
1817 	u32 gso_size;
1818 	int delta;
1819 
1820 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1821 	 * we can fix skb->truesize to its real value to avoid future drops.
1822 	 * This is valid because skb is not yet charged to the socket.
1823 	 * It has been noticed that pure SACK packets were sometimes dropped
1824 	 * (if cooked by drivers without the copybreak feature).
1825 	 */
1826 	skb_condense(skb);
1827 
1828 	skb_dst_drop(skb);
1829 
1830 	if (unlikely(tcp_checksum_complete(skb))) {
1831 		bh_unlock_sock(sk);
1832 		trace_tcp_bad_csum(skb);
1833 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1834 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1835 		return true;
1836 	}
1837 
1838 	/* Attempt coalescing to last skb in backlog, even if we are
1839 	 * above the limits.
1840 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1841 	 */
1842 	th = (const struct tcphdr *)skb->data;
1843 	hdrlen = th->doff * 4;
1844 
1845 	tail = sk->sk_backlog.tail;
1846 	if (!tail)
1847 		goto no_coalesce;
1848 	thtail = (struct tcphdr *)tail->data;
1849 
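	/* Coalescing is attempted only when the new skb directly follows the
	 * tail in sequence space, the IP DS fields match, neither skb carries
	 * SYN/RST/URG, both carry ACK, the ECE/CWR flags match, TLS decryption
	 * status and MPTCP constraints allow it, and the TCP options are
	 * byte-for-byte identical.
	 */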
1850 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1851 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1852 	    ((TCP_SKB_CB(tail)->tcp_flags |
1853 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1854 	    !((TCP_SKB_CB(tail)->tcp_flags &
1855 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1856 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1857 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1858 #ifdef CONFIG_TLS_DEVICE
1859 	    tail->decrypted != skb->decrypted ||
1860 #endif
1861 	    !mptcp_skb_can_collapse(tail, skb) ||
1862 	    thtail->doff != th->doff ||
1863 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1864 		goto no_coalesce;
1865 
1866 	__skb_pull(skb, hdrlen);
1867 
1868 	shinfo = skb_shinfo(skb);
1869 	gso_size = shinfo->gso_size ?: skb->len;
1870 	gso_segs = shinfo->gso_segs ?: 1;
1871 
1872 	shinfo = skb_shinfo(tail);
1873 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1874 	tail_gso_segs = shinfo->gso_segs ?: 1;
1875 
1876 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1877 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1878 
1879 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1880 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1881 			thtail->window = th->window;
1882 		}
1883 
1884 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1885 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1886 		 * is not entered if we append a packet with a FIN.
1887 		 * SYN, RST, URG are not present.
1888 		 * ACK is set on both packets.
1889 		 * PSH: we do not really care about it in the TCP stack,
1890 		 *       at least for 'GRO' packets.
1891 		 */
1892 		thtail->fin |= th->fin;
1893 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1894 
1895 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1896 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1897 			tail->tstamp = skb->tstamp;
1898 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1899 		}
1900 
1901 		/* Not as strict as GRO. We only need to carry the max mss value */
1902 		shinfo->gso_size = max(gso_size, tail_gso_size);
1903 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1904 
1905 		sk->sk_backlog.len += delta;
1906 		__NET_INC_STATS(sock_net(sk),
1907 				LINUX_MIB_TCPBACKLOGCOALESCE);
1908 		kfree_skb_partial(skb, fragstolen);
1909 		return false;
1910 	}
1911 	__skb_push(skb, hdrlen);
1912 
1913 no_coalesce:
1914 	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1915 
1916 	/* Only the socket owner can try to collapse/prune rx queues
1917 	 * to reduce memory overhead, so add a little headroom here.
1918 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1919 	 */
1920 	limit += 64 * 1024;
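	/* Rough example, assuming default buffer sizes of sk_rcvbuf == 131072
	 * and sk_sndbuf == 16384: limit = 131072 + 8192 + 65536 = 204800 bytes
	 * of backlog before further packets are dropped.
	 */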
1921 
1922 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1923 		bh_unlock_sock(sk);
1924 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1925 		return true;
1926 	}
1927 	return false;
1928 }
1929 EXPORT_SYMBOL(tcp_add_backlog);
1930 
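/* Run the attached socket filter, if any; the cap argument ensures a filter
 * verdict cannot trim the skb below the TCP header (th->doff * 4 bytes).
 */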
1931 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1932 {
1933 	struct tcphdr *th = (struct tcphdr *)skb->data;
1934 
1935 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1936 }
1937 EXPORT_SYMBOL(tcp_filter);
1938 
1939 static void tcp_v4_restore_cb(struct sk_buff *skb)
1940 {
1941 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1942 		sizeof(struct inet_skb_parm));
1943 }
1944 
1945 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1946 			   const struct tcphdr *th)
1947 {
1948 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1949 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1950 	 */
1951 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1952 		sizeof(struct inet_skb_parm));
1953 	barrier();
1954 
1955 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
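	/* SYN and FIN each consume one unit of sequence space, hence the
	 * th->syn and th->fin flag bits are added to the payload length
	 * when computing end_seq below.
	 */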
1956 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1957 				    skb->len - th->doff * 4);
1958 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1959 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1960 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1961 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1962 	TCP_SKB_CB(skb)->sacked	 = 0;
1963 	TCP_SKB_CB(skb)->has_rxtstamp =
1964 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1965 }
1966 
1967 /*
1968  *	From tcp_input.c
1969  */
1970 
1971 int tcp_v4_rcv(struct sk_buff *skb)
1972 {
1973 	struct net *net = dev_net(skb->dev);
1974 	struct sk_buff *skb_to_free;
1975 	int sdif = inet_sdif(skb);
1976 	int dif = inet_iif(skb);
1977 	const struct iphdr *iph;
1978 	const struct tcphdr *th;
1979 	bool refcounted;
1980 	struct sock *sk;
1981 	int drop_reason;
1982 	int ret;
1983 
1984 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1985 	if (skb->pkt_type != PACKET_HOST)
1986 		goto discard_it;
1987 
1988 	/* Count it even if it's bad */
1989 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1990 
1991 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1992 		goto discard_it;
1993 
1994 	th = (const struct tcphdr *)skb->data;
1995 
1996 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1997 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1998 		goto bad_packet;
1999 	}
2000 	if (!pskb_may_pull(skb, th->doff * 4))
2001 		goto discard_it;
2002 
2003 	/* An explanation is required here, I think.
2004 	 * Packet length and doff are validated by header prediction,
2005 	 * provided the th->doff == 0 case has been eliminated (see above).
2006 	 * So, we defer the checks. */
2007 
2008 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2009 		goto csum_error;
2010 
2011 	th = (const struct tcphdr *)skb->data;
2012 	iph = ip_hdr(skb);
2013 lookup:
2014 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
2015 			       th->dest, sdif, &refcounted);
2016 	if (!sk)
2017 		goto no_tcp_socket;
2018 
2019 process:
2020 	if (sk->sk_state == TCP_TIME_WAIT)
2021 		goto do_time_wait;
2022 
2023 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2024 		struct request_sock *req = inet_reqsk(sk);
2025 		bool req_stolen = false;
2026 		struct sock *nsk;
2027 
2028 		sk = req->rsk_listener;
2029 		if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
2030 			     tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2031 			sk_drops_add(sk, skb);
2032 			reqsk_put(req);
2033 			goto discard_it;
2034 		}
2035 		if (tcp_checksum_complete(skb)) {
2036 			reqsk_put(req);
2037 			goto csum_error;
2038 		}
2039 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2040 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2041 			if (!nsk) {
2042 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2043 				goto lookup;
2044 			}
2045 			sk = nsk;
2046 			/* reuseport_migrate_sock() already holds one sk_refcnt
2047 			 * on the socket it returns.
2048 			 */
2049 		} else {
2050 			/* We own a reference on the listener, increase it again
2051 			 * as we might lose it too soon.
2052 			 */
2053 			sock_hold(sk);
2054 		}
2055 		refcounted = true;
2056 		nsk = NULL;
2057 		if (!tcp_filter(sk, skb)) {
2058 			th = (const struct tcphdr *)skb->data;
2059 			iph = ip_hdr(skb);
2060 			tcp_v4_fill_cb(skb, iph, th);
2061 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2062 		}
2063 		if (!nsk) {
2064 			reqsk_put(req);
2065 			if (req_stolen) {
2066 				/* Another cpu got exclusive access to req
2067 				 * and created a full blown socket.
2068 				 * Try to feed this packet to this socket
2069 				 * instead of discarding it.
2070 				 */
2071 				tcp_v4_restore_cb(skb);
2072 				sock_put(sk);
2073 				goto lookup;
2074 			}
2075 			goto discard_and_relse;
2076 		}
2077 		nf_reset_ct(skb);
2078 		if (nsk == sk) {
2079 			reqsk_put(req);
2080 			tcp_v4_restore_cb(skb);
2081 		} else if (tcp_child_process(sk, nsk, skb)) {
2082 			tcp_v4_send_reset(nsk, skb);
2083 			goto discard_and_relse;
2084 		} else {
2085 			sock_put(sk);
2086 			return 0;
2087 		}
2088 	}
2089 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2090 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2091 		goto discard_and_relse;
2092 	}
2093 
2094 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2095 		goto discard_and_relse;
2096 
2097 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2098 		goto discard_and_relse;
2099 
2100 	nf_reset_ct(skb);
2101 
2102 	if (tcp_filter(sk, skb)) {
2103 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2104 		goto discard_and_relse;
2105 	}
2106 	th = (const struct tcphdr *)skb->data;
2107 	iph = ip_hdr(skb);
2108 	tcp_v4_fill_cb(skb, iph, th);
2109 
2110 	skb->dev = NULL;
2111 
2112 	if (sk->sk_state == TCP_LISTEN) {
2113 		ret = tcp_v4_do_rcv(sk, skb);
2114 		goto put_and_return;
2115 	}
2116 
2117 	sk_incoming_cpu_update(sk);
2118 
2119 	bh_lock_sock_nested(sk);
2120 	tcp_segs_in(tcp_sk(sk), skb);
2121 	ret = 0;
2122 	if (!sock_owned_by_user(sk)) {
2123 		skb_to_free = sk->sk_rx_skb_cache;
2124 		sk->sk_rx_skb_cache = NULL;
2125 		ret = tcp_v4_do_rcv(sk, skb);
2126 	} else {
2127 		if (tcp_add_backlog(sk, skb))
2128 			goto discard_and_relse;
2129 		skb_to_free = NULL;
2130 	}
2131 	bh_unlock_sock(sk);
2132 	if (skb_to_free)
2133 		__kfree_skb(skb_to_free);
2134 
2135 put_and_return:
2136 	if (refcounted)
2137 		sock_put(sk);
2138 
2139 	return ret;
2140 
2141 no_tcp_socket:
2142 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2143 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2144 		goto discard_it;
2145 
2146 	tcp_v4_fill_cb(skb, iph, th);
2147 
2148 	if (tcp_checksum_complete(skb)) {
2149 csum_error:
2150 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2151 		trace_tcp_bad_csum(skb);
2152 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2153 bad_packet:
2154 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2155 	} else {
2156 		tcp_v4_send_reset(NULL, skb);
2157 	}
2158 
2159 discard_it:
2160 	/* Discard frame. */
2161 	kfree_skb_reason(skb, drop_reason);
2162 	return 0;
2163 
2164 discard_and_relse:
2165 	sk_drops_add(sk, skb);
2166 	if (refcounted)
2167 		sock_put(sk);
2168 	goto discard_it;
2169 
2170 do_time_wait:
2171 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2172 		inet_twsk_put(inet_twsk(sk));
2173 		goto discard_it;
2174 	}
2175 
2176 	tcp_v4_fill_cb(skb, iph, th);
2177 
2178 	if (tcp_checksum_complete(skb)) {
2179 		inet_twsk_put(inet_twsk(sk));
2180 		goto csum_error;
2181 	}
2182 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2183 	case TCP_TW_SYN: {
2184 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2185 							&tcp_hashinfo, skb,
2186 							__tcp_hdrlen(th),
2187 							iph->saddr, th->source,
2188 							iph->daddr, th->dest,
2189 							inet_iif(skb),
2190 							sdif);
2191 		if (sk2) {
2192 			inet_twsk_deschedule_put(inet_twsk(sk));
2193 			sk = sk2;
2194 			tcp_v4_restore_cb(skb);
2195 			refcounted = false;
2196 			goto process;
2197 		}
2198 	}
2199 		/* to ACK */
2200 		fallthrough;
2201 	case TCP_TW_ACK:
2202 		tcp_v4_timewait_ack(sk, skb);
2203 		break;
2204 	case TCP_TW_RST:
2205 		tcp_v4_send_reset(sk, skb);
2206 		inet_twsk_deschedule_put(inet_twsk(sk));
2207 		goto discard_it;
2208 	case TCP_TW_SUCCESS:;
2209 	}
2210 	goto discard_it;
2211 }
2212 
2213 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2214 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2215 	.twsk_unique	= tcp_twsk_unique,
2216 	.twsk_destructor= tcp_twsk_destructor,
2217 };
2218 
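/* Cache the input route on the socket so the established-state fast path
 * (tcp_v4_do_rcv() / early demux above) can reuse it; dst_hold_safe() makes
 * sure we only cache a dst we actually obtained a reference on.
 */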
2219 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2220 {
2221 	struct dst_entry *dst = skb_dst(skb);
2222 
2223 	if (dst && dst_hold_safe(dst)) {
2224 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2225 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2226 	}
2227 }
2228 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2229 
2230 const struct inet_connection_sock_af_ops ipv4_specific = {
2231 	.queue_xmit	   = ip_queue_xmit,
2232 	.send_check	   = tcp_v4_send_check,
2233 	.rebuild_header	   = inet_sk_rebuild_header,
2234 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2235 	.conn_request	   = tcp_v4_conn_request,
2236 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2237 	.net_header_len	   = sizeof(struct iphdr),
2238 	.setsockopt	   = ip_setsockopt,
2239 	.getsockopt	   = ip_getsockopt,
2240 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2241 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2242 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2243 };
2244 EXPORT_SYMBOL(ipv4_specific);
2245 
2246 #ifdef CONFIG_TCP_MD5SIG
2247 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2248 	.md5_lookup		= tcp_v4_md5_lookup,
2249 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2250 	.md5_parse		= tcp_v4_parse_md5_keys,
2251 };
2252 #endif
2253 
2254 /* NOTE: A lot of things are set to zero explicitly by the call to
2255  *       sk_alloc(), so they need not be done here.
2256  */
2257 static int tcp_v4_init_sock(struct sock *sk)
2258 {
2259 	struct inet_connection_sock *icsk = inet_csk(sk);
2260 
2261 	tcp_init_sock(sk);
2262 
2263 	icsk->icsk_af_ops = &ipv4_specific;
2264 
2265 #ifdef CONFIG_TCP_MD5SIG
2266 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2267 #endif
2268 
2269 	return 0;
2270 }
2271 
2272 void tcp_v4_destroy_sock(struct sock *sk)
2273 {
2274 	struct tcp_sock *tp = tcp_sk(sk);
2275 
2276 	trace_tcp_destroy_sock(sk);
2277 
2278 	tcp_clear_xmit_timers(sk);
2279 
2280 	tcp_cleanup_congestion_control(sk);
2281 
2282 	tcp_cleanup_ulp(sk);
2283 
2284 	/* Cleanup up the write buffer. */
2285 	tcp_write_queue_purge(sk);
2286 
2287 	/* Check if we want to disable active TFO */
2288 	tcp_fastopen_active_disable_ofo_check(sk);
2289 
2290 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2291 	skb_rbtree_purge(&tp->out_of_order_queue);
2292 
2293 #ifdef CONFIG_TCP_MD5SIG
2294 	/* Clean up the MD5 key list, if any */
2295 	if (tp->md5sig_info) {
2296 		tcp_clear_md5_list(sk);
2297 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2298 		tp->md5sig_info = NULL;
2299 	}
2300 #endif
2301 
2302 	/* Clean up a referenced TCP bind bucket. */
2303 	if (inet_csk(sk)->icsk_bind_hash)
2304 		inet_put_port(sk);
2305 
2306 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2307 
2308 	/* If socket is aborted during connect operation */
2309 	tcp_free_fastopen_req(tp);
2310 	tcp_fastopen_destroy_cipher(sk);
2311 	tcp_saved_syn_free(tp);
2312 
2313 	sk_sockets_allocated_dec(sk);
2314 }
2315 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2316 
2317 #ifdef CONFIG_PROC_FS
2318 /* Proc filesystem TCP sock list dumping. */
2319 
2320 static unsigned short seq_file_family(const struct seq_file *seq);
2321 
2322 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2323 {
2324 	unsigned short family = seq_file_family(seq);
2325 
2326 	/* AF_UNSPEC is used as a match all */
2327 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2328 		net_eq(sock_net(sk), seq_file_net(seq)));
2329 }
2330 
2331 /* Find a non empty bucket (starting from st->bucket)
2332  * and return the first sk from it.
2333  */
2334 static void *listening_get_first(struct seq_file *seq)
2335 {
2336 	struct tcp_iter_state *st = seq->private;
2337 
2338 	st->offset = 0;
2339 	for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2340 		struct inet_listen_hashbucket *ilb2;
2341 		struct inet_connection_sock *icsk;
2342 		struct sock *sk;
2343 
2344 		ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2345 		if (hlist_empty(&ilb2->head))
2346 			continue;
2347 
2348 		spin_lock(&ilb2->lock);
2349 		inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2350 			sk = (struct sock *)icsk;
2351 			if (seq_sk_match(seq, sk))
2352 				return sk;
2353 		}
2354 		spin_unlock(&ilb2->lock);
2355 	}
2356 
2357 	return NULL;
2358 }
2359 
2360 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2361  * If "cur" is the last one in the st->bucket,
2362  * call listening_get_first() to return the first sk of the next
2363  * non empty bucket.
2364  */
listening_get_next(struct seq_file * seq,void * cur)2365 static void *listening_get_next(struct seq_file *seq, void *cur)
2366 {
2367 	struct tcp_iter_state *st = seq->private;
2368 	struct inet_listen_hashbucket *ilb2;
2369 	struct inet_connection_sock *icsk;
2370 	struct sock *sk = cur;
2371 
2372 	++st->num;
2373 	++st->offset;
2374 
2375 	icsk = inet_csk(sk);
2376 	inet_lhash2_for_each_icsk_continue(icsk) {
2377 		sk = (struct sock *)icsk;
2378 		if (seq_sk_match(seq, sk))
2379 			return sk;
2380 	}
2381 
2382 	ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2383 	spin_unlock(&ilb2->lock);
2384 	++st->bucket;
2385 	return listening_get_first(seq);
2386 }
2387 
2388 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2389 {
2390 	struct tcp_iter_state *st = seq->private;
2391 	void *rc;
2392 
2393 	st->bucket = 0;
2394 	st->offset = 0;
2395 	rc = listening_get_first(seq);
2396 
2397 	while (rc && *pos) {
2398 		rc = listening_get_next(seq, rc);
2399 		--*pos;
2400 	}
2401 	return rc;
2402 }
2403 
2404 static inline bool empty_bucket(const struct tcp_iter_state *st)
2405 {
2406 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2407 }
2408 
2409 /*
2410  * Get first established socket starting from bucket given in st->bucket.
2411  * If st->bucket is zero, the very first socket in the hash is returned.
2412  */
2413 static void *established_get_first(struct seq_file *seq)
2414 {
2415 	struct tcp_iter_state *st = seq->private;
2416 
2417 	st->offset = 0;
2418 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2419 		struct sock *sk;
2420 		struct hlist_nulls_node *node;
2421 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2422 
2423 		/* Lockless fast path for the common case of empty buckets */
2424 		if (empty_bucket(st))
2425 			continue;
2426 
2427 		spin_lock_bh(lock);
2428 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2429 			if (seq_sk_match(seq, sk))
2430 				return sk;
2431 		}
2432 		spin_unlock_bh(lock);
2433 	}
2434 
2435 	return NULL;
2436 }
2437 
2438 static void *established_get_next(struct seq_file *seq, void *cur)
2439 {
2440 	struct sock *sk = cur;
2441 	struct hlist_nulls_node *node;
2442 	struct tcp_iter_state *st = seq->private;
2443 
2444 	++st->num;
2445 	++st->offset;
2446 
2447 	sk = sk_nulls_next(sk);
2448 
2449 	sk_nulls_for_each_from(sk, node) {
2450 		if (seq_sk_match(seq, sk))
2451 			return sk;
2452 	}
2453 
2454 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2455 	++st->bucket;
2456 	return established_get_first(seq);
2457 }
2458 
2459 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2460 {
2461 	struct tcp_iter_state *st = seq->private;
2462 	void *rc;
2463 
2464 	st->bucket = 0;
2465 	rc = established_get_first(seq);
2466 
2467 	while (rc && pos) {
2468 		rc = established_get_next(seq, rc);
2469 		--pos;
2470 	}
2471 	return rc;
2472 }
2473 
2474 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2475 {
2476 	void *rc;
2477 	struct tcp_iter_state *st = seq->private;
2478 
2479 	st->state = TCP_SEQ_STATE_LISTENING;
2480 	rc	  = listening_get_idx(seq, &pos);
2481 
2482 	if (!rc) {
2483 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2484 		rc	  = established_get_idx(seq, pos);
2485 	}
2486 
2487 	return rc;
2488 }
2489 
2490 static void *tcp_seek_last_pos(struct seq_file *seq)
2491 {
2492 	struct tcp_iter_state *st = seq->private;
2493 	int bucket = st->bucket;
2494 	int offset = st->offset;
2495 	int orig_num = st->num;
2496 	void *rc = NULL;
2497 
2498 	switch (st->state) {
2499 	case TCP_SEQ_STATE_LISTENING:
2500 		if (st->bucket > tcp_hashinfo.lhash2_mask)
2501 			break;
2502 		st->state = TCP_SEQ_STATE_LISTENING;
2503 		rc = listening_get_first(seq);
2504 		while (offset-- && rc && bucket == st->bucket)
2505 			rc = listening_get_next(seq, rc);
2506 		if (rc)
2507 			break;
2508 		st->bucket = 0;
2509 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2510 		fallthrough;
2511 	case TCP_SEQ_STATE_ESTABLISHED:
2512 		if (st->bucket > tcp_hashinfo.ehash_mask)
2513 			break;
2514 		rc = established_get_first(seq);
2515 		while (offset-- && rc && bucket == st->bucket)
2516 			rc = established_get_next(seq, rc);
2517 	}
2518 
2519 	st->num = orig_num;
2520 
2521 	return rc;
2522 }
2523 
2524 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2525 {
2526 	struct tcp_iter_state *st = seq->private;
2527 	void *rc;
2528 
2529 	if (*pos && *pos == st->last_pos) {
2530 		rc = tcp_seek_last_pos(seq);
2531 		if (rc)
2532 			goto out;
2533 	}
2534 
2535 	st->state = TCP_SEQ_STATE_LISTENING;
2536 	st->num = 0;
2537 	st->bucket = 0;
2538 	st->offset = 0;
2539 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2540 
2541 out:
2542 	st->last_pos = *pos;
2543 	return rc;
2544 }
2545 EXPORT_SYMBOL(tcp_seq_start);
2546 
2547 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2548 {
2549 	struct tcp_iter_state *st = seq->private;
2550 	void *rc = NULL;
2551 
2552 	if (v == SEQ_START_TOKEN) {
2553 		rc = tcp_get_idx(seq, 0);
2554 		goto out;
2555 	}
2556 
2557 	switch (st->state) {
2558 	case TCP_SEQ_STATE_LISTENING:
2559 		rc = listening_get_next(seq, v);
2560 		if (!rc) {
2561 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2562 			st->bucket = 0;
2563 			st->offset = 0;
2564 			rc	  = established_get_first(seq);
2565 		}
2566 		break;
2567 	case TCP_SEQ_STATE_ESTABLISHED:
2568 		rc = established_get_next(seq, v);
2569 		break;
2570 	}
2571 out:
2572 	++*pos;
2573 	st->last_pos = *pos;
2574 	return rc;
2575 }
2576 EXPORT_SYMBOL(tcp_seq_next);
2577 
2578 void tcp_seq_stop(struct seq_file *seq, void *v)
2579 {
2580 	struct tcp_iter_state *st = seq->private;
2581 
2582 	switch (st->state) {
2583 	case TCP_SEQ_STATE_LISTENING:
2584 		if (v != SEQ_START_TOKEN)
2585 			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2586 		break;
2587 	case TCP_SEQ_STATE_ESTABLISHED:
2588 		if (v)
2589 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2590 		break;
2591 	}
2592 }
2593 EXPORT_SYMBOL(tcp_seq_stop);
2594 
2595 static void get_openreq4(const struct request_sock *req,
2596 			 struct seq_file *f, int i)
2597 {
2598 	const struct inet_request_sock *ireq = inet_rsk(req);
2599 	long delta = req->rsk_timer.expires - jiffies;
2600 
2601 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2602 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2603 		i,
2604 		ireq->ir_loc_addr,
2605 		ireq->ir_num,
2606 		ireq->ir_rmt_addr,
2607 		ntohs(ireq->ir_rmt_port),
2608 		TCP_SYN_RECV,
2609 		0, 0, /* could print option size, but that is af dependent. */
2610 		1,    /* timers active (only the expire timer) */
2611 		jiffies_delta_to_clock_t(delta),
2612 		req->num_timeout,
2613 		from_kuid_munged(seq_user_ns(f),
2614 				 sock_i_uid(req->rsk_listener)),
2615 		0,  /* non standard timer */
2616 		0, /* open_requests have no inode */
2617 		0,
2618 		req);
2619 }
2620 
2621 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2622 {
2623 	int timer_active;
2624 	unsigned long timer_expires;
2625 	const struct tcp_sock *tp = tcp_sk(sk);
2626 	const struct inet_connection_sock *icsk = inet_csk(sk);
2627 	const struct inet_sock *inet = inet_sk(sk);
2628 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2629 	__be32 dest = inet->inet_daddr;
2630 	__be32 src = inet->inet_rcv_saddr;
2631 	__u16 destp = ntohs(inet->inet_dport);
2632 	__u16 srcp = ntohs(inet->inet_sport);
2633 	int rx_queue;
2634 	int state;
2635 
2636 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2637 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2638 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2639 		timer_active	= 1;
2640 		timer_expires	= icsk->icsk_timeout;
2641 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2642 		timer_active	= 4;
2643 		timer_expires	= icsk->icsk_timeout;
2644 	} else if (timer_pending(&sk->sk_timer)) {
2645 		timer_active	= 2;
2646 		timer_expires	= sk->sk_timer.expires;
2647 	} else {
2648 		timer_active	= 0;
2649 		timer_expires = jiffies;
2650 	}
2651 
2652 	state = inet_sk_state_load(sk);
2653 	if (state == TCP_LISTEN)
2654 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2655 	else
2656 		/* Because we don't lock the socket,
2657 		 * we might find a transient negative value.
2658 		 */
2659 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2660 				      READ_ONCE(tp->copied_seq), 0);
2661 
2662 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2663 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2664 		i, src, srcp, dest, destp, state,
2665 		READ_ONCE(tp->write_seq) - tp->snd_una,
2666 		rx_queue,
2667 		timer_active,
2668 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2669 		icsk->icsk_retransmits,
2670 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2671 		icsk->icsk_probes_out,
2672 		sock_i_ino(sk),
2673 		refcount_read(&sk->sk_refcnt), sk,
2674 		jiffies_to_clock_t(icsk->icsk_rto),
2675 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2676 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2677 		tcp_snd_cwnd(tp),
2678 		state == TCP_LISTEN ?
2679 		    fastopenq->max_qlen :
2680 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2681 }
2682 
2683 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2684 			       struct seq_file *f, int i)
2685 {
2686 	long delta = tw->tw_timer.expires - jiffies;
2687 	__be32 dest, src;
2688 	__u16 destp, srcp;
2689 
2690 	dest  = tw->tw_daddr;
2691 	src   = tw->tw_rcv_saddr;
2692 	destp = ntohs(tw->tw_dport);
2693 	srcp  = ntohs(tw->tw_sport);
2694 
2695 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2696 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2697 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2698 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2699 		refcount_read(&tw->tw_refcnt), tw);
2700 }
2701 
2702 #define TMPSZ 150
2703 
2704 static int tcp4_seq_show(struct seq_file *seq, void *v)
2705 {
2706 	struct tcp_iter_state *st;
2707 	struct sock *sk = v;
2708 
2709 	seq_setwidth(seq, TMPSZ - 1);
2710 	if (v == SEQ_START_TOKEN) {
2711 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2712 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2713 			   "inode");
2714 		goto out;
2715 	}
2716 	st = seq->private;
2717 
2718 	if (sk->sk_state == TCP_TIME_WAIT)
2719 		get_timewait4_sock(v, seq, st->num);
2720 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2721 		get_openreq4(v, seq, st->num);
2722 	else
2723 		get_tcp4_sock(v, seq, st->num);
2724 out:
2725 	seq_pad(seq, '\n');
2726 	return 0;
2727 }
2728 
2729 #ifdef CONFIG_BPF_SYSCALL
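/* Batching state for the bpf tcp iterator: all matching sockets of one hash
 * bucket are grabbed (each with a reference held) into @batch, so they can
 * be shown one by one without keeping the bucket lock held across the
 * bpf program invocations.
 */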
2730 struct bpf_tcp_iter_state {
2731 	struct tcp_iter_state state;
2732 	unsigned int cur_sk;
2733 	unsigned int end_sk;
2734 	unsigned int max_sk;
2735 	struct sock **batch;
2736 	bool st_bucket_done;
2737 };
2738 
2739 struct bpf_iter__tcp {
2740 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2741 	__bpf_md_ptr(struct sock_common *, sk_common);
2742 	uid_t uid __aligned(8);
2743 };
2744 
2745 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2746 			     struct sock_common *sk_common, uid_t uid)
2747 {
2748 	struct bpf_iter__tcp ctx;
2749 
2750 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2751 	ctx.meta = meta;
2752 	ctx.sk_common = sk_common;
2753 	ctx.uid = uid;
2754 	return bpf_iter_run_prog(prog, &ctx);
2755 }
2756 
2757 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2758 {
2759 	while (iter->cur_sk < iter->end_sk)
2760 		sock_gen_put(iter->batch[iter->cur_sk++]);
2761 }
2762 
2763 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2764 				      unsigned int new_batch_sz)
2765 {
2766 	struct sock **new_batch;
2767 
2768 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2769 			     GFP_USER | __GFP_NOWARN);
2770 	if (!new_batch)
2771 		return -ENOMEM;
2772 
2773 	bpf_iter_tcp_put_batch(iter);
2774 	kvfree(iter->batch);
2775 	iter->batch = new_batch;
2776 	iter->max_sk = new_batch_sz;
2777 
2778 	return 0;
2779 }
2780 
2781 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2782 						 struct sock *start_sk)
2783 {
2784 	struct bpf_tcp_iter_state *iter = seq->private;
2785 	struct tcp_iter_state *st = &iter->state;
2786 	struct inet_connection_sock *icsk;
2787 	unsigned int expected = 1;
2788 	struct sock *sk;
2789 
2790 	sock_hold(start_sk);
2791 	iter->batch[iter->end_sk++] = start_sk;
2792 
2793 	icsk = inet_csk(start_sk);
2794 	inet_lhash2_for_each_icsk_continue(icsk) {
2795 		sk = (struct sock *)icsk;
2796 		if (seq_sk_match(seq, sk)) {
2797 			if (iter->end_sk < iter->max_sk) {
2798 				sock_hold(sk);
2799 				iter->batch[iter->end_sk++] = sk;
2800 			}
2801 			expected++;
2802 		}
2803 	}
2804 	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2805 
2806 	return expected;
2807 }
2808 
2809 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2810 						   struct sock *start_sk)
2811 {
2812 	struct bpf_tcp_iter_state *iter = seq->private;
2813 	struct tcp_iter_state *st = &iter->state;
2814 	struct hlist_nulls_node *node;
2815 	unsigned int expected = 1;
2816 	struct sock *sk;
2817 
2818 	sock_hold(start_sk);
2819 	iter->batch[iter->end_sk++] = start_sk;
2820 
2821 	sk = sk_nulls_next(start_sk);
2822 	sk_nulls_for_each_from(sk, node) {
2823 		if (seq_sk_match(seq, sk)) {
2824 			if (iter->end_sk < iter->max_sk) {
2825 				sock_hold(sk);
2826 				iter->batch[iter->end_sk++] = sk;
2827 			}
2828 			expected++;
2829 		}
2830 	}
2831 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2832 
2833 	return expected;
2834 }
2835 
2836 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2837 {
2838 	struct bpf_tcp_iter_state *iter = seq->private;
2839 	struct tcp_iter_state *st = &iter->state;
2840 	unsigned int expected;
2841 	bool resized = false;
2842 	struct sock *sk;
2843 
2844 	/* The st->bucket is done.  Directly advance to the next
2845 	 * bucket instead of having tcp_seek_last_pos() skip the
2846 	 * sockets in the current bucket one by one only to find out
2847 	 * that it has to advance to the next bucket anyway.
2848 	 */
2849 	if (iter->st_bucket_done) {
2850 		st->offset = 0;
2851 		st->bucket++;
2852 		if (st->state == TCP_SEQ_STATE_LISTENING &&
2853 		    st->bucket > tcp_hashinfo.lhash2_mask) {
2854 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2855 			st->bucket = 0;
2856 		}
2857 	}
2858 
2859 again:
2860 	/* Get a new batch */
2861 	iter->cur_sk = 0;
2862 	iter->end_sk = 0;
2863 	iter->st_bucket_done = false;
2864 
2865 	sk = tcp_seek_last_pos(seq);
2866 	if (!sk)
2867 		return NULL; /* Done */
2868 
2869 	if (st->state == TCP_SEQ_STATE_LISTENING)
2870 		expected = bpf_iter_tcp_listening_batch(seq, sk);
2871 	else
2872 		expected = bpf_iter_tcp_established_batch(seq, sk);
2873 
2874 	if (iter->end_sk == expected) {
2875 		iter->st_bucket_done = true;
2876 		return sk;
2877 	}
2878 
2879 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2880 		resized = true;
2881 		goto again;
2882 	}
2883 
2884 	return sk;
2885 }
2886 
2887 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2888 {
2889 	/* bpf iter does not support lseek, so it always
2890 	 * continues from where it was stop()-ped.
2891 	 */
2892 	if (*pos)
2893 		return bpf_iter_tcp_batch(seq);
2894 
2895 	return SEQ_START_TOKEN;
2896 }
2897 
2898 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2899 {
2900 	struct bpf_tcp_iter_state *iter = seq->private;
2901 	struct tcp_iter_state *st = &iter->state;
2902 	struct sock *sk;
2903 
2904 	/* Whenever seq_next() is called, the sk at iter->cur_sk has
2905 	 * been handled by seq_show(), so advance to the next sk in
2906 	 * the batch.
2907 	 */
2908 	if (iter->cur_sk < iter->end_sk) {
2909 		/* Keeping st->num consistent in tcp_iter_state.
2910 		 * bpf_iter_tcp does not use st->num.
2911 		 * meta.seq_num is used instead.
2912 		 */
2913 		st->num++;
2914 		/* Move st->offset to the next sk in the bucket such that
2915 		 * the future start() will resume at st->offset in
2916 		 * st->bucket.  See tcp_seek_last_pos().
2917 		 */
2918 		st->offset++;
2919 		sock_gen_put(iter->batch[iter->cur_sk++]);
2920 	}
2921 
2922 	if (iter->cur_sk < iter->end_sk)
2923 		sk = iter->batch[iter->cur_sk];
2924 	else
2925 		sk = bpf_iter_tcp_batch(seq);
2926 
2927 	++*pos;
2928 	/* Keeping st->last_pos consistent in tcp_iter_state.
2929 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2930 	 */
2931 	st->last_pos = *pos;
2932 	return sk;
2933 }
2934 
2935 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2936 {
2937 	struct bpf_iter_meta meta;
2938 	struct bpf_prog *prog;
2939 	struct sock *sk = v;
2940 	bool slow;
2941 	uid_t uid;
2942 	int ret;
2943 
2944 	if (v == SEQ_START_TOKEN)
2945 		return 0;
2946 
2947 	if (sk_fullsock(sk))
2948 		slow = lock_sock_fast(sk);
2949 
2950 	if (unlikely(sk_unhashed(sk))) {
2951 		ret = SEQ_SKIP;
2952 		goto unlock;
2953 	}
2954 
2955 	if (sk->sk_state == TCP_TIME_WAIT) {
2956 		uid = 0;
2957 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2958 		const struct request_sock *req = v;
2959 
2960 		uid = from_kuid_munged(seq_user_ns(seq),
2961 				       sock_i_uid(req->rsk_listener));
2962 	} else {
2963 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2964 	}
2965 
2966 	meta.seq = seq;
2967 	prog = bpf_iter_get_info(&meta, false);
2968 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
2969 
2970 unlock:
2971 	if (sk_fullsock(sk))
2972 		unlock_sock_fast(sk, slow);
2973 	return ret;
2974 
2975 }
2976 
2977 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2978 {
2979 	struct bpf_tcp_iter_state *iter = seq->private;
2980 	struct bpf_iter_meta meta;
2981 	struct bpf_prog *prog;
2982 
2983 	if (!v) {
2984 		meta.seq = seq;
2985 		prog = bpf_iter_get_info(&meta, true);
2986 		if (prog)
2987 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2988 	}
2989 
2990 	if (iter->cur_sk < iter->end_sk) {
2991 		bpf_iter_tcp_put_batch(iter);
2992 		iter->st_bucket_done = false;
2993 	}
2994 }
2995 
2996 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2997 	.show		= bpf_iter_tcp_seq_show,
2998 	.start		= bpf_iter_tcp_seq_start,
2999 	.next		= bpf_iter_tcp_seq_next,
3000 	.stop		= bpf_iter_tcp_seq_stop,
3001 };
3002 #endif
3003 static unsigned short seq_file_family(const struct seq_file *seq)
3004 {
3005 	const struct tcp_seq_afinfo *afinfo;
3006 
3007 #ifdef CONFIG_BPF_SYSCALL
3008 	/* Iterated from bpf_iter.  Let the bpf prog do the filtering instead. */
3009 	if (seq->op == &bpf_iter_tcp_seq_ops)
3010 		return AF_UNSPEC;
3011 #endif
3012 
3013 	/* Iterated from proc fs */
3014 	afinfo = PDE_DATA(file_inode(seq->file));
3015 	return afinfo->family;
3016 }
3017 
3018 static const struct seq_operations tcp4_seq_ops = {
3019 	.show		= tcp4_seq_show,
3020 	.start		= tcp_seq_start,
3021 	.next		= tcp_seq_next,
3022 	.stop		= tcp_seq_stop,
3023 };
3024 
3025 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3026 	.family		= AF_INET,
3027 };
3028 
3029 static int __net_init tcp4_proc_init_net(struct net *net)
3030 {
3031 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3032 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3033 		return -ENOMEM;
3034 	return 0;
3035 }
3036 
3037 static void __net_exit tcp4_proc_exit_net(struct net *net)
3038 {
3039 	remove_proc_entry("tcp", net->proc_net);
3040 }
3041 
3042 static struct pernet_operations tcp4_net_ops = {
3043 	.init = tcp4_proc_init_net,
3044 	.exit = tcp4_proc_exit_net,
3045 };
3046 
3047 int __init tcp4_proc_init(void)
3048 {
3049 	return register_pernet_subsys(&tcp4_net_ops);
3050 }
3051 
3052 void tcp4_proc_exit(void)
3053 {
3054 	unregister_pernet_subsys(&tcp4_net_ops);
3055 }
3056 #endif /* CONFIG_PROC_FS */
3057 
3058 /* @wake is one when sk_stream_write_space() calls us.
3059  * This sends EPOLLOUT only once notsent_bytes drops below half the limit.
3060  * This mimics the strategy used in sock_def_write_space().
3061  */
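/* Example: with a notsent_lowat of 128 KB and @wake == 1, writers are woken
 * once fewer than 64 KB remain unsent (the shift by @wake halves the
 * effective threshold).
 */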
3062 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3063 {
3064 	const struct tcp_sock *tp = tcp_sk(sk);
3065 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3066 			    READ_ONCE(tp->snd_nxt);
3067 
3068 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3069 }
3070 EXPORT_SYMBOL(tcp_stream_memory_free);
3071 
3072 struct proto tcp_prot = {
3073 	.name			= "TCP",
3074 	.owner			= THIS_MODULE,
3075 	.close			= tcp_close,
3076 	.pre_connect		= tcp_v4_pre_connect,
3077 	.connect		= tcp_v4_connect,
3078 	.disconnect		= tcp_disconnect,
3079 	.accept			= inet_csk_accept,
3080 	.ioctl			= tcp_ioctl,
3081 	.init			= tcp_v4_init_sock,
3082 	.destroy		= tcp_v4_destroy_sock,
3083 	.shutdown		= tcp_shutdown,
3084 	.setsockopt		= tcp_setsockopt,
3085 	.getsockopt		= tcp_getsockopt,
3086 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3087 	.keepalive		= tcp_set_keepalive,
3088 	.recvmsg		= tcp_recvmsg,
3089 	.sendmsg		= tcp_sendmsg,
3090 	.sendpage		= tcp_sendpage,
3091 	.backlog_rcv		= tcp_v4_do_rcv,
3092 	.release_cb		= tcp_release_cb,
3093 	.hash			= inet_hash,
3094 	.unhash			= inet_unhash,
3095 	.get_port		= inet_csk_get_port,
3096 #ifdef CONFIG_BPF_SYSCALL
3097 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3098 #endif
3099 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3100 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3101 	.stream_memory_free	= tcp_stream_memory_free,
3102 	.sockets_allocated	= &tcp_sockets_allocated,
3103 	.orphan_count		= &tcp_orphan_count,
3104 	.memory_allocated	= &tcp_memory_allocated,
3105 	.memory_pressure	= &tcp_memory_pressure,
3106 	.sysctl_mem		= sysctl_tcp_mem,
3107 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3108 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3109 	.max_header		= MAX_TCP_HEADER,
3110 	.obj_size		= sizeof(struct tcp_sock),
3111 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3112 	.twsk_prot		= &tcp_timewait_sock_ops,
3113 	.rsk_prot		= &tcp_request_sock_ops,
3114 	.h.hashinfo		= &tcp_hashinfo,
3115 	.no_autobind		= true,
3116 	.diag_destroy		= tcp_abort,
3117 };
3118 EXPORT_SYMBOL(tcp_prot);
3119 
3120 static void __net_exit tcp_sk_exit(struct net *net)
3121 {
3122 	int cpu;
3123 
3124 	if (net->ipv4.tcp_congestion_control)
3125 		bpf_module_put(net->ipv4.tcp_congestion_control,
3126 			       net->ipv4.tcp_congestion_control->owner);
3127 
3128 	for_each_possible_cpu(cpu)
3129 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
3130 	free_percpu(net->ipv4.tcp_sk);
3131 }
3132 
3133 static int __net_init tcp_sk_init(struct net *net)
3134 {
3135 	int res, cpu, cnt;
3136 
3137 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
3138 	if (!net->ipv4.tcp_sk)
3139 		return -ENOMEM;
3140 
3141 	for_each_possible_cpu(cpu) {
3142 		struct sock *sk;
3143 
3144 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3145 					   IPPROTO_TCP, net);
3146 		if (res)
3147 			goto fail;
3148 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3149 
3150 		/* Please enforce IP_DF and IPID==0 for RST and
3151 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3152 		 */
3153 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3154 
3155 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
3156 	}
3157 
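	/* Per-netns defaults for the net.ipv4.tcp_* sysctls; a new network
	 * namespace starts from these values (tcp_rmem/tcp_wmem are inherited
	 * from init_net further below).
	 */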
3158 	net->ipv4.sysctl_tcp_ecn = 2;
3159 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3160 
3161 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3162 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3163 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3164 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3165 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3166 
3167 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3168 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3169 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3170 
3171 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3172 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3173 	net->ipv4.sysctl_tcp_syncookies = 1;
3174 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3175 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3176 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3177 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3178 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3179 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3180 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3181 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3182 
3183 	cnt = tcp_hashinfo.ehash_mask + 1;
3184 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3185 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
3186 
3187 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3188 	net->ipv4.sysctl_tcp_sack = 1;
3189 	net->ipv4.sysctl_tcp_window_scaling = 1;
3190 	net->ipv4.sysctl_tcp_timestamps = 1;
3191 	net->ipv4.sysctl_tcp_early_retrans = 3;
3192 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3193 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3194 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3195 	net->ipv4.sysctl_tcp_max_reordering = 300;
3196 	net->ipv4.sysctl_tcp_dsack = 1;
3197 	net->ipv4.sysctl_tcp_app_win = 31;
3198 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3199 	net->ipv4.sysctl_tcp_frto = 2;
3200 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3201 	/* This limits the percentage of the congestion window which we
3202 	 * will allow a single TSO frame to consume.  Building TSO frames
3203 	 * which are too large can cause TCP streams to be bursty.
3204 	 */
3205 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3206 	/* Default TSQ limit of 16 TSO segments */
3207 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3208 	/* rfc5961 challenge ack rate limiting */
3209 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3210 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3211 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3212 	net->ipv4.sysctl_tcp_autocorking = 1;
3213 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3214 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3215 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3216 	if (net != &init_net) {
3217 		memcpy(net->ipv4.sysctl_tcp_rmem,
3218 		       init_net.ipv4.sysctl_tcp_rmem,
3219 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3220 		memcpy(net->ipv4.sysctl_tcp_wmem,
3221 		       init_net.ipv4.sysctl_tcp_wmem,
3222 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3223 	}
3224 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3225 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3226 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3227 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3228 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3229 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3230 
3231 	/* Reno is always built in */
3232 	if (!net_eq(net, &init_net) &&
3233 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3234 			       init_net.ipv4.tcp_congestion_control->owner))
3235 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3236 	else
3237 		net->ipv4.tcp_congestion_control = &tcp_reno;
3238 
3239 	return 0;
3240 fail:
3241 	tcp_sk_exit(net);
3242 
3243 	return res;
3244 }
3245 
3246 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3247 {
3248 	struct net *net;
3249 
3250 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
3251 
3252 	list_for_each_entry(net, net_exit_list, exit_list)
3253 		tcp_fastopen_ctx_destroy(net);
3254 }
3255 
3256 static struct pernet_operations __net_initdata tcp_sk_ops = {
3257        .init	   = tcp_sk_init,
3258        .exit	   = tcp_sk_exit,
3259        .exit_batch = tcp_sk_exit_batch,
3260 };
3261 
3262 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3263 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3264 		     struct sock_common *sk_common, uid_t uid)
3265 
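/* Initial batch capacity; bpf_iter_tcp_batch() grows it to 1.5x the bucket
 * size via bpf_iter_tcp_realloc_batch() when a bucket does not fit.
 */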
3266 #define INIT_BATCH_SZ 16
3267 
3268 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3269 {
3270 	struct bpf_tcp_iter_state *iter = priv_data;
3271 	int err;
3272 
3273 	err = bpf_iter_init_seq_net(priv_data, aux);
3274 	if (err)
3275 		return err;
3276 
3277 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3278 	if (err) {
3279 		bpf_iter_fini_seq_net(priv_data);
3280 		return err;
3281 	}
3282 
3283 	return 0;
3284 }
3285 
3286 static void bpf_iter_fini_tcp(void *priv_data)
3287 {
3288 	struct bpf_tcp_iter_state *iter = priv_data;
3289 
3290 	bpf_iter_fini_seq_net(priv_data);
3291 	kvfree(iter->batch);
3292 }
3293 
3294 static const struct bpf_iter_seq_info tcp_seq_info = {
3295 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3296 	.init_seq_private	= bpf_iter_init_tcp,
3297 	.fini_seq_private	= bpf_iter_fini_tcp,
3298 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3299 };
3300 
3301 static const struct bpf_func_proto *
3302 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3303 			    const struct bpf_prog *prog)
3304 {
3305 	switch (func_id) {
3306 	case BPF_FUNC_setsockopt:
3307 		return &bpf_sk_setsockopt_proto;
3308 	case BPF_FUNC_getsockopt:
3309 		return &bpf_sk_getsockopt_proto;
3310 	default:
3311 		return NULL;
3312 	}
3313 }
3314 
3315 static struct bpf_iter_reg tcp_reg_info = {
3316 	.target			= "tcp",
3317 	.ctx_arg_info_size	= 1,
3318 	.ctx_arg_info		= {
3319 		{ offsetof(struct bpf_iter__tcp, sk_common),
3320 		  PTR_TO_BTF_ID_OR_NULL },
3321 	},
3322 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3323 	.seq_info		= &tcp_seq_info,
3324 };
3325 
3326 static void __init bpf_iter_register(void)
3327 {
3328 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3329 	if (bpf_iter_reg_target(&tcp_reg_info))
3330 		pr_warn("Warning: could not register bpf iterator tcp\n");
3331 }
3332 
3333 #endif
3334 
3335 void __init tcp_v4_init(void)
3336 {
3337 	if (register_pernet_subsys(&tcp_sk_ops))
3338 		panic("Failed to create the TCP control socket.\n");
3339 
3340 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3341 	bpf_iter_register();
3342 #endif
3343 }
3344