1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84 
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87 
88 #ifdef CONFIG_TCP_MD5SIG
89 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
90 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 #endif
92 
93 struct inet_hashinfo tcp_hashinfo;
94 EXPORT_SYMBOL(tcp_hashinfo);
95 
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98 	return secure_tcp_seq(ip_hdr(skb)->daddr,
99 			      ip_hdr(skb)->saddr,
100 			      tcp_hdr(skb)->dest,
101 			      tcp_hdr(skb)->source);
102 }
103 
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108 
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112 	struct tcp_sock *tp = tcp_sk(sk);
113 
114 	/* With PAWS, it is safe from the viewpoint
115 	   of data integrity. Even without PAWS it is safe provided sequence
116 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
117 
118 	   Actually, the idea is close to VJ's one, only timestamp cache is
119 	   held not per host, but per port pair and TW bucket is used as state
120 	   holder.
121 
122 	   If TW bucket has been already destroyed we fall back to VJ's scheme
123 	   and use initial timestamp retrieved from peer table.
124 	 */
125 	if (tcptw->tw_ts_recent_stamp &&
126 	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
127 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129 		if (tp->write_seq == 0)
130 			tp->write_seq = 1;
131 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
132 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133 		sock_hold(sktw);
134 		return 1;
135 	}
136 
137 	return 0;
138 }
139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
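/* Editorial example (not part of the kernel source): the reuse path above is
 * gated by the net.ipv4.tcp_tw_reuse sysctl read via
 * sock_net(sk)->ipv4.sysctl_tcp_tw_reuse. A minimal userspace sketch that
 * toggles it through procfs, assuming root/CAP_NET_ADMIN privileges:
 */
#include <stdio.h>

static int set_tcp_tw_reuse(int val)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", val);	/* 0 disables, 1 allows reuse for outgoing connections */
	return fclose(f);
}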
140 
141 /* This will initiate an outgoing connection. */
142 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
143 {
144 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
145 	struct inet_sock *inet = inet_sk(sk);
146 	struct tcp_sock *tp = tcp_sk(sk);
147 	__be16 orig_sport, orig_dport;
148 	__be32 daddr, nexthop;
149 	struct flowi4 *fl4;
150 	struct rtable *rt;
151 	int err;
152 	struct ip_options_rcu *inet_opt;
153 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
154 
155 	if (addr_len < sizeof(struct sockaddr_in))
156 		return -EINVAL;
157 
158 	if (usin->sin_family != AF_INET)
159 		return -EAFNOSUPPORT;
160 
161 	nexthop = daddr = usin->sin_addr.s_addr;
162 	inet_opt = rcu_dereference_protected(inet->inet_opt,
163 					     lockdep_sock_is_held(sk));
164 	if (inet_opt && inet_opt->opt.srr) {
165 		if (!daddr)
166 			return -EINVAL;
167 		nexthop = inet_opt->opt.faddr;
168 	}
169 
170 	orig_sport = inet->inet_sport;
171 	orig_dport = usin->sin_port;
172 	fl4 = &inet->cork.fl.u.ip4;
173 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
174 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
175 			      IPPROTO_TCP,
176 			      orig_sport, orig_dport, sk);
177 	if (IS_ERR(rt)) {
178 		err = PTR_ERR(rt);
179 		if (err == -ENETUNREACH)
180 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
181 		return err;
182 	}
183 
184 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
185 		ip_rt_put(rt);
186 		return -ENETUNREACH;
187 	}
188 
189 	if (!inet_opt || !inet_opt->opt.srr)
190 		daddr = fl4->daddr;
191 
192 	if (!inet->inet_saddr)
193 		inet->inet_saddr = fl4->saddr;
194 	sk_rcv_saddr_set(sk, inet->inet_saddr);
195 
196 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
197 		/* Reset inherited state */
198 		tp->rx_opt.ts_recent	   = 0;
199 		tp->rx_opt.ts_recent_stamp = 0;
200 		if (likely(!tp->repair))
201 			tp->write_seq	   = 0;
202 	}
203 
204 	inet->inet_dport = usin->sin_port;
205 	sk_daddr_set(sk, daddr);
206 
207 	inet_csk(sk)->icsk_ext_hdr_len = 0;
208 	if (inet_opt)
209 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
210 
211 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
212 
213 	/* Socket identity is still unknown (sport may be zero).
214 	 * However, we set the state to SYN-SENT and, without releasing the
215 	 * socket lock, select a source port, enter ourselves into the hash
216 	 * tables and complete initialization after this.
217 	 */
218 	tcp_set_state(sk, TCP_SYN_SENT);
219 	err = inet_hash_connect(tcp_death_row, sk);
220 	if (err)
221 		goto failure;
222 
223 	sk_set_txhash(sk);
224 
225 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
226 			       inet->inet_sport, inet->inet_dport, sk);
227 	if (IS_ERR(rt)) {
228 		err = PTR_ERR(rt);
229 		rt = NULL;
230 		goto failure;
231 	}
232 	/* OK, now commit destination to socket.  */
233 	sk->sk_gso_type = SKB_GSO_TCPV4;
234 	sk_setup_caps(sk, &rt->dst);
235 	rt = NULL;
236 
237 	if (likely(!tp->repair)) {
238 		if (!tp->write_seq)
239 			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
240 						       inet->inet_daddr,
241 						       inet->inet_sport,
242 						       usin->sin_port);
243 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
244 						 inet->inet_saddr,
245 						 inet->inet_daddr);
246 	}
247 
248 	inet->inet_id = prandom_u32();
249 
250 	if (tcp_fastopen_defer_connect(sk, &err))
251 		return err;
252 	if (err)
253 		goto failure;
254 
255 	err = tcp_connect(sk);
256 
257 	if (err)
258 		goto failure;
259 
260 	return 0;
261 
262 failure:
263 	/*
264 	 * This unhashes the socket and releases the local port,
265 	 * if necessary.
266 	 */
267 	tcp_set_state(sk, TCP_CLOSE);
268 	ip_rt_put(rt);
269 	sk->sk_route_caps = 0;
270 	inet->inet_dport = 0;
271 	return err;
272 }
273 EXPORT_SYMBOL(tcp_v4_connect);
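/* Editorial example (not part of the kernel source): tcp_v4_connect() runs when
 * userspace calls connect() on an AF_INET stream socket. A minimal sketch; the
 * helper name and any address passed to it are illustrative only.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

static int connect_v4(const char *ip, unsigned short port)
{
	struct sockaddr_in dst = { .sin_family = AF_INET, .sin_port = htons(port) };
	int fd;

	if (inet_pton(AF_INET, ip, &dst.sin_addr) != 1)
		return -1;
	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return -1;
	/* the connect() below ends up in tcp_v4_connect() for this socket */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}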
274 
275 /*
276  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
277  * It can be called through tcp_release_cb() if socket was owned by user
278  * at the time tcp_v4_err() was called to handle ICMP message.
279  */
280 void tcp_v4_mtu_reduced(struct sock *sk)
281 {
282 	struct inet_sock *inet = inet_sk(sk);
283 	struct dst_entry *dst;
284 	u32 mtu;
285 
286 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
287 		return;
288 	mtu = tcp_sk(sk)->mtu_info;
289 	dst = inet_csk_update_pmtu(sk, mtu);
290 	if (!dst)
291 		return;
292 
293 	/* Something is about to go wrong... Remember the soft error
294 	 * in case this connection is not able to recover.
295 	 */
296 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
297 		sk->sk_err_soft = EMSGSIZE;
298 
299 	mtu = dst_mtu(dst);
300 
301 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
302 	    ip_sk_accept_pmtu(sk) &&
303 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
304 		tcp_sync_mss(sk, mtu);
305 
306 		/* Resend the TCP packet because it's
307 		 * clear that the old packet has been
308 		 * dropped. This is the new "fast" path mtu
309 		 * discovery.
310 		 */
311 		tcp_simple_retransmit(sk);
312 	} /* else let the usual retransmit timer handle it */
313 }
314 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
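/* Editorial example (not part of the kernel source): whether the PMTU update
 * above is honoured depends on inet->pmtudisc, which userspace sets with the
 * IP_MTU_DISCOVER socket option; IP_MTU reads back the cached route MTU.
 * A sketch on a connected TCP socket fd:
 */
#include <netinet/in.h>
#include <sys/socket.h>

static int force_pmtu_discovery(int fd)
{
	int val = IP_PMTUDISC_DO;	/* always set DF; ICMP_FRAG_NEEDED then drives tcp_v4_mtu_reduced() */

	return setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
}

static int current_path_mtu(int fd)
{
	int mtu;
	socklen_t len = sizeof(mtu);

	return getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len) < 0 ? -1 : mtu;
}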
315 
316 static void do_redirect(struct sk_buff *skb, struct sock *sk)
317 {
318 	struct dst_entry *dst = __sk_dst_check(sk, 0);
319 
320 	if (dst)
321 		dst->ops->redirect(dst, sk, skb);
322 }
323 
324 
325 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
326 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
327 {
328 	struct request_sock *req = inet_reqsk(sk);
329 	struct net *net = sock_net(sk);
330 
331 	/* ICMPs are not backlogged, hence we cannot get
332 	 * an established socket here.
333 	 */
334 	if (seq != tcp_rsk(req)->snt_isn) {
335 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
336 	} else if (abort) {
337 		/*
338 		 * Still in SYN_RECV, just remove it silently.
339 		 * There is no good way to pass the error to the newly
340 		 * created socket, and POSIX does not want network
341 		 * errors returned from accept().
342 		 */
343 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
344 		tcp_listendrop(req->rsk_listener);
345 	}
346 	reqsk_put(req);
347 }
348 EXPORT_SYMBOL(tcp_req_err);
349 
350 /*
351  * This routine is called by the ICMP module when it gets some
352  * sort of error condition.  If err < 0 then the socket should
353  * be closed and the error returned to the user.  If err > 0
354  * it's just the icmp type << 8 | icmp code.  After adjustment
355  * header points to the first 8 bytes of the tcp header.  We need
356  * to find the appropriate port.
357  *
358  * The locking strategy used here is very "optimistic". When
359  * someone else accesses the socket the ICMP is just dropped
360  * and for some paths there is no check at all.
361  * A more general error queue to queue errors for later handling
362  * is probably better.
363  *
364  */
365 
366 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
367 {
368 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
369 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
370 	struct inet_connection_sock *icsk;
371 	struct tcp_sock *tp;
372 	struct inet_sock *inet;
373 	const int type = icmp_hdr(icmp_skb)->type;
374 	const int code = icmp_hdr(icmp_skb)->code;
375 	struct sock *sk;
376 	struct sk_buff *skb;
377 	struct request_sock *fastopen;
378 	u32 seq, snd_una;
379 	s32 remaining;
380 	u32 delta_us;
381 	int err;
382 	struct net *net = dev_net(icmp_skb->dev);
383 
384 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
385 				       th->dest, iph->saddr, ntohs(th->source),
386 				       inet_iif(icmp_skb), 0);
387 	if (!sk) {
388 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
389 		return;
390 	}
391 	if (sk->sk_state == TCP_TIME_WAIT) {
392 		inet_twsk_put(inet_twsk(sk));
393 		return;
394 	}
395 	seq = ntohl(th->seq);
396 	if (sk->sk_state == TCP_NEW_SYN_RECV)
397 		return tcp_req_err(sk, seq,
398 				  type == ICMP_PARAMETERPROB ||
399 				  type == ICMP_TIME_EXCEEDED ||
400 				  (type == ICMP_DEST_UNREACH &&
401 				   (code == ICMP_NET_UNREACH ||
402 				    code == ICMP_HOST_UNREACH)));
403 
404 	bh_lock_sock(sk);
405 	/* If too many ICMPs get dropped on busy
406 	 * servers this needs to be solved differently.
407 	 * We do take care of PMTU discovery (RFC1191) special case :
408 	 * we can receive locally generated ICMP messages while socket is held.
409 	 */
410 	if (sock_owned_by_user(sk)) {
411 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
412 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
413 	}
414 	if (sk->sk_state == TCP_CLOSE)
415 		goto out;
416 
417 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
418 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
419 		goto out;
420 	}
421 
422 	icsk = inet_csk(sk);
423 	tp = tcp_sk(sk);
424 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
425 	fastopen = tp->fastopen_rsk;
426 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
427 	if (sk->sk_state != TCP_LISTEN &&
428 	    !between(seq, snd_una, tp->snd_nxt)) {
429 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
430 		goto out;
431 	}
432 
433 	switch (type) {
434 	case ICMP_REDIRECT:
435 		if (!sock_owned_by_user(sk))
436 			do_redirect(icmp_skb, sk);
437 		goto out;
438 	case ICMP_SOURCE_QUENCH:
439 		/* Just silently ignore these. */
440 		goto out;
441 	case ICMP_PARAMETERPROB:
442 		err = EPROTO;
443 		break;
444 	case ICMP_DEST_UNREACH:
445 		if (code > NR_ICMP_UNREACH)
446 			goto out;
447 
448 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
449 			/* We are not interested in TCP_LISTEN and open_requests
450 			 * (SYN-ACKs sent out by Linux are always <576 bytes so
451 			 * they should go through unfragmented).
452 			 */
453 			if (sk->sk_state == TCP_LISTEN)
454 				goto out;
455 
456 			tp->mtu_info = info;
457 			if (!sock_owned_by_user(sk)) {
458 				tcp_v4_mtu_reduced(sk);
459 			} else {
460 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
461 					sock_hold(sk);
462 			}
463 			goto out;
464 		}
465 
466 		err = icmp_err_convert[code].errno;
467 		/* check if icmp_skb allows revert of backoff
468 		 * (see draft-zimmermann-tcp-lcd) */
469 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
470 			break;
471 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
472 		    !icsk->icsk_backoff || fastopen)
473 			break;
474 
475 		if (sock_owned_by_user(sk))
476 			break;
477 
478 		skb = tcp_write_queue_head(sk);
479 		if (WARN_ON_ONCE(!skb))
480 			break;
481 
482 		icsk->icsk_backoff--;
483 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
484 					       TCP_TIMEOUT_INIT;
485 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
486 
487 		tcp_mstamp_refresh(tp);
488 		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
489 		remaining = icsk->icsk_rto -
490 			    usecs_to_jiffies(delta_us);
491 
492 		if (remaining > 0) {
493 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
494 						  remaining, TCP_RTO_MAX);
495 		} else {
496 			/* RTO revert clocked out retransmission.
497 			 * Will retransmit now */
498 			tcp_retransmit_timer(sk);
499 		}
500 
501 		break;
502 	case ICMP_TIME_EXCEEDED:
503 		err = EHOSTUNREACH;
504 		break;
505 	default:
506 		goto out;
507 	}
508 
509 	switch (sk->sk_state) {
510 	case TCP_SYN_SENT:
511 	case TCP_SYN_RECV:
512 		/* Only in fast or simultaneous open. If a fast open socket is
513 		 * already accepted it is treated as a connected one below.
514 		 */
515 		if (fastopen && !fastopen->sk)
516 			break;
517 
518 		if (!sock_owned_by_user(sk)) {
519 			sk->sk_err = err;
520 
521 			sk->sk_error_report(sk);
522 
523 			tcp_done(sk);
524 		} else {
525 			sk->sk_err_soft = err;
526 		}
527 		goto out;
528 	}
529 
530 	/* If we've already connected we will keep trying
531 	 * until we time out, or the user gives up.
532 	 *
533 	 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
534 	 * considered hard errors (well, FRAG_FAILED too, but it is obsoleted
535 	 * by PMTU discovery).
536 	 *
537 	 * Note that in the modern internet, where routing is unreliable and
538 	 * broken firewalls sit in every dark corner sending random errors
539 	 * ordered by their masters, even these two messages have finally lost
540 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
541 	 *
542 	 * Now we are in compliance with RFCs.
543 	 *							--ANK (980905)
544 	 */
545 
546 	inet = inet_sk(sk);
547 	if (!sock_owned_by_user(sk) && inet->recverr) {
548 		sk->sk_err = err;
549 		sk->sk_error_report(sk);
550 	} else	{ /* Only an error on timeout */
551 		sk->sk_err_soft = err;
552 	}
553 
554 out:
555 	bh_unlock_sock(sk);
556 	sock_put(sk);
557 }
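/* Editorial example (not part of the kernel source): the inet->recverr test at
 * the end of tcp_v4_err() corresponds to the IP_RECVERR socket option; with it
 * enabled, queued ICMP errors can also be read from the error queue. A sketch:
 */
#include <linux/errqueue.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int enable_icmp_error_reporting(int fd)
{
	int one = 1;

	return setsockopt(fd, IPPROTO_IP, IP_RECVERR, &one, sizeof(one));
}

static int drain_one_icmp_error(int fd)
{
	char control[512];
	struct msghdr msg = { .msg_control = control, .msg_controllen = sizeof(control) };
	struct cmsghdr *cmsg;

	if (recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) < 0)
		return -1;
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_RECVERR) {
			struct sock_extended_err ee;

			memcpy(&ee, CMSG_DATA(cmsg), sizeof(ee));
			return ee.ee_errno;	/* e.g. EHOSTUNREACH as mapped above */
		}
	}
	return 0;
}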
558 
559 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
560 {
561 	struct tcphdr *th = tcp_hdr(skb);
562 
563 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
564 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
565 		skb->csum_start = skb_transport_header(skb) - skb->head;
566 		skb->csum_offset = offsetof(struct tcphdr, check);
567 	} else {
568 		th->check = tcp_v4_check(skb->len, saddr, daddr,
569 					 csum_partial(th,
570 						      th->doff << 2,
571 						      skb->csum));
572 	}
573 }
574 
575 /* This routine computes an IPv4 TCP checksum. */
576 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
577 {
578 	const struct inet_sock *inet = inet_sk(sk);
579 
580 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
581 }
582 EXPORT_SYMBOL(tcp_v4_send_check);
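/* Editorial example (not part of the kernel source): a plain-C illustration of
 * the pseudo-header one's-complement checksum that tcp_v4_check() and
 * csum_partial() compute above. Names are illustrative; the layout follows
 * RFC 793 (saddr, daddr, zero, protocol, TCP length).
 */
#include <stddef.h>
#include <stdint.h>

static uint32_t csum_accumulate(uint32_t sum, const uint8_t *p, size_t len)
{
	while (len > 1) {			/* sum 16-bit big-endian words */
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)				/* odd trailing byte is zero-padded */
		sum += (uint32_t)p[0] << 8;
	return sum;
}

static uint16_t tcp4_checksum_example(const uint8_t saddr[4], const uint8_t daddr[4],
				      const uint8_t *segment, size_t len)
{
	uint8_t pseudo[12] = { saddr[0], saddr[1], saddr[2], saddr[3],
			       daddr[0], daddr[1], daddr[2], daddr[3],
			       0, 6 /* IPPROTO_TCP */,
			       (uint8_t)(len >> 8), (uint8_t)len };
	uint32_t sum = csum_accumulate(0, pseudo, sizeof(pseudo));

	sum = csum_accumulate(sum, segment, len);	/* checksum field must be zero in the input */
	while (sum >> 16)				/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}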
583 
584 /*
585  *	This routine will send an RST to the other tcp.
586  *
587  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
588  *		      for the reset?
589  *	Answer: if a packet caused the RST, it is not for a socket
590  *		existing in our system; if it is matched to a socket,
591  *		it is just a duplicate segment or a bug in the other side's TCP.
592  *		So we build the reply based only on the parameters that
593  *		arrived with the segment.
594  *	Exception: precedence violation. We do not implement it in any case.
595  */
596 
597 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
598 {
599 	const struct tcphdr *th = tcp_hdr(skb);
600 	struct {
601 		struct tcphdr th;
602 #ifdef CONFIG_TCP_MD5SIG
603 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
604 #endif
605 	} rep;
606 	struct ip_reply_arg arg;
607 #ifdef CONFIG_TCP_MD5SIG
608 	struct tcp_md5sig_key *key = NULL;
609 	const __u8 *hash_location = NULL;
610 	unsigned char newhash[16];
611 	int genhash;
612 	struct sock *sk1 = NULL;
613 #endif
614 	struct net *net;
615 
616 	/* Never send a reset in response to a reset. */
617 	if (th->rst)
618 		return;
619 
620 	/* If sk is not NULL, it means we did a successful lookup and the
621 	 * incoming route had to be correct. prequeue might have dropped our dst.
622 	 */
623 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
624 		return;
625 
626 	/* Swap the send and the receive. */
627 	memset(&rep, 0, sizeof(rep));
628 	rep.th.dest   = th->source;
629 	rep.th.source = th->dest;
630 	rep.th.doff   = sizeof(struct tcphdr) / 4;
631 	rep.th.rst    = 1;
632 
633 	if (th->ack) {
634 		rep.th.seq = th->ack_seq;
635 	} else {
636 		rep.th.ack = 1;
637 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
638 				       skb->len - (th->doff << 2));
639 	}
640 
641 	memset(&arg, 0, sizeof(arg));
642 	arg.iov[0].iov_base = (unsigned char *)&rep;
643 	arg.iov[0].iov_len  = sizeof(rep.th);
644 
645 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
646 #ifdef CONFIG_TCP_MD5SIG
647 	rcu_read_lock();
648 	hash_location = tcp_parse_md5sig_option(th);
649 	if (sk && sk_fullsock(sk)) {
650 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
651 					&ip_hdr(skb)->saddr, AF_INET);
652 	} else if (hash_location) {
653 		/*
654 		 * The active side is lost. Try to find the listening socket through
655 		 * the source port, and then find the md5 key through the listening
656 		 * socket. We do not loosen security here:
657 		 * the incoming packet is checked against the md5 hash of the found
658 		 * key, and no RST is generated if the md5 hash doesn't match.
659 		 */
660 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
661 					     ip_hdr(skb)->saddr,
662 					     th->source, ip_hdr(skb)->daddr,
663 					     ntohs(th->source), inet_iif(skb),
664 					     tcp_v4_sdif(skb));
665 		/* don't send rst if it can't find key */
666 		if (!sk1)
667 			goto out;
668 
669 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
670 					&ip_hdr(skb)->saddr, AF_INET);
671 		if (!key)
672 			goto out;
673 
674 
675 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
676 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
677 			goto out;
678 
679 	}
680 
681 	if (key) {
682 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
683 				   (TCPOPT_NOP << 16) |
684 				   (TCPOPT_MD5SIG << 8) |
685 				   TCPOLEN_MD5SIG);
686 		/* Update length and the length the header thinks exists */
687 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
688 		rep.th.doff = arg.iov[0].iov_len / 4;
689 
690 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
691 				     key, ip_hdr(skb)->saddr,
692 				     ip_hdr(skb)->daddr, &rep.th);
693 	}
694 #endif
695 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
696 				      ip_hdr(skb)->saddr, /* XXX */
697 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
698 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
699 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
700 
701 	/* When the socket is gone, all binding information is lost and
702 	 * routing might fail in this case. No choice here: if we choose to force
703 	 * the input interface, we will misroute in the case of an asymmetric route.
704 	 */
705 	if (sk)
706 		arg.bound_dev_if = sk->sk_bound_dev_if;
707 
708 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
709 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
710 
711 	arg.tos = ip_hdr(skb)->tos;
712 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
713 	local_bh_disable();
714 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
715 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
716 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
717 			      &arg, arg.iov[0].iov_len);
718 
719 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
720 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
721 	local_bh_enable();
722 
723 #ifdef CONFIG_TCP_MD5SIG
724 out:
725 	rcu_read_unlock();
726 #endif
727 }
728 
729 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
730    outside of socket context, is certainly ugly. What can I do?
731  */
732 
733 static void tcp_v4_send_ack(const struct sock *sk,
734 			    struct sk_buff *skb, u32 seq, u32 ack,
735 			    u32 win, u32 tsval, u32 tsecr, int oif,
736 			    struct tcp_md5sig_key *key,
737 			    int reply_flags, u8 tos)
738 {
739 	const struct tcphdr *th = tcp_hdr(skb);
740 	struct {
741 		struct tcphdr th;
742 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
743 #ifdef CONFIG_TCP_MD5SIG
744 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
745 #endif
746 			];
747 	} rep;
748 	struct net *net = sock_net(sk);
749 	struct ip_reply_arg arg;
750 
751 	memset(&rep.th, 0, sizeof(struct tcphdr));
752 	memset(&arg, 0, sizeof(arg));
753 
754 	arg.iov[0].iov_base = (unsigned char *)&rep;
755 	arg.iov[0].iov_len  = sizeof(rep.th);
756 	if (tsecr) {
757 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
758 				   (TCPOPT_TIMESTAMP << 8) |
759 				   TCPOLEN_TIMESTAMP);
760 		rep.opt[1] = htonl(tsval);
761 		rep.opt[2] = htonl(tsecr);
762 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
763 	}
764 
765 	/* Swap the send and the receive. */
766 	rep.th.dest    = th->source;
767 	rep.th.source  = th->dest;
768 	rep.th.doff    = arg.iov[0].iov_len / 4;
769 	rep.th.seq     = htonl(seq);
770 	rep.th.ack_seq = htonl(ack);
771 	rep.th.ack     = 1;
772 	rep.th.window  = htons(win);
773 
774 #ifdef CONFIG_TCP_MD5SIG
775 	if (key) {
776 		int offset = (tsecr) ? 3 : 0;
777 
778 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
779 					  (TCPOPT_NOP << 16) |
780 					  (TCPOPT_MD5SIG << 8) |
781 					  TCPOLEN_MD5SIG);
782 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
783 		rep.th.doff = arg.iov[0].iov_len/4;
784 
785 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
786 				    key, ip_hdr(skb)->saddr,
787 				    ip_hdr(skb)->daddr, &rep.th);
788 	}
789 #endif
790 	arg.flags = reply_flags;
791 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
792 				      ip_hdr(skb)->saddr, /* XXX */
793 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
794 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
795 	if (oif)
796 		arg.bound_dev_if = oif;
797 	arg.tos = tos;
798 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
799 	local_bh_disable();
800 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
801 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
802 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
803 			      &arg, arg.iov[0].iov_len);
804 
805 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
806 	local_bh_enable();
807 }
808 
809 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
810 {
811 	struct inet_timewait_sock *tw = inet_twsk(sk);
812 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
813 
814 	tcp_v4_send_ack(sk, skb,
815 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
816 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
817 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
818 			tcptw->tw_ts_recent,
819 			tw->tw_bound_dev_if,
820 			tcp_twsk_md5_key(tcptw),
821 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
822 			tw->tw_tos
823 			);
824 
825 	inet_twsk_put(tw);
826 }
827 
828 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
829 				  struct request_sock *req)
830 {
831 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
832 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
833 	 */
834 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
835 					     tcp_sk(sk)->snd_nxt;
836 
837 	/* RFC 7323 2.3
838 	 * The window field (SEG.WND) of every outgoing segment, with the
839 	 * exception of <SYN> segments, MUST be right-shifted by
840 	 * Rcv.Wind.Shift bits:
841 	 */
842 	tcp_v4_send_ack(sk, skb, seq,
843 			tcp_rsk(req)->rcv_nxt,
844 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
845 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
846 			req->ts_recent,
847 			0,
848 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
849 					  AF_INET),
850 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
851 			ip_hdr(skb)->tos);
852 }
853 
854 /*
855  *	Send a SYN-ACK after having received a SYN.
856  *	This still operates on a request_sock only, not on a big
857  *	socket.
858  */
859 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
860 			      struct flowi *fl,
861 			      struct request_sock *req,
862 			      struct tcp_fastopen_cookie *foc,
863 			      enum tcp_synack_type synack_type)
864 {
865 	const struct inet_request_sock *ireq = inet_rsk(req);
866 	struct flowi4 fl4;
867 	int err = -1;
868 	struct sk_buff *skb;
869 
870 	/* First, grab a route. */
871 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
872 		return -1;
873 
874 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
875 
876 	if (skb) {
877 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
878 
879 		rcu_read_lock();
880 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
881 					    ireq->ir_rmt_addr,
882 					    rcu_dereference(ireq->ireq_opt));
883 		rcu_read_unlock();
884 		err = net_xmit_eval(err);
885 	}
886 
887 	return err;
888 }
889 
890 /*
891  *	IPv4 request_sock destructor.
892  */
893 static void tcp_v4_reqsk_destructor(struct request_sock *req)
894 {
895 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
896 }
897 
898 #ifdef CONFIG_TCP_MD5SIG
899 /*
900  * RFC2385 MD5 checksumming requires a mapping of
901  * IP address->MD5 Key.
902  * We need to maintain these in the sk structure.
903  */
904 
905 /* Find the Key structure for an address.  */
906 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
907 					 const union tcp_md5_addr *addr,
908 					 int family)
909 {
910 	const struct tcp_sock *tp = tcp_sk(sk);
911 	struct tcp_md5sig_key *key;
912 	const struct tcp_md5sig_info *md5sig;
913 	__be32 mask;
914 	struct tcp_md5sig_key *best_match = NULL;
915 	bool match;
916 
917 	/* caller either holds rcu_read_lock() or socket lock */
918 	md5sig = rcu_dereference_check(tp->md5sig_info,
919 				       lockdep_sock_is_held(sk));
920 	if (!md5sig)
921 		return NULL;
922 
923 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
924 		if (key->family != family)
925 			continue;
926 
927 		if (family == AF_INET) {
928 			mask = inet_make_mask(key->prefixlen);
929 			match = (key->addr.a4.s_addr & mask) ==
930 				(addr->a4.s_addr & mask);
931 #if IS_ENABLED(CONFIG_IPV6)
932 		} else if (family == AF_INET6) {
933 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
934 						  key->prefixlen);
935 #endif
936 		} else {
937 			match = false;
938 		}
939 
940 		if (match && (!best_match ||
941 			      key->prefixlen > best_match->prefixlen))
942 			best_match = key;
943 	}
944 	return best_match;
945 }
946 EXPORT_SYMBOL(tcp_md5_do_lookup);
947 
948 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
949 						      const union tcp_md5_addr *addr,
950 						      int family, u8 prefixlen)
951 {
952 	const struct tcp_sock *tp = tcp_sk(sk);
953 	struct tcp_md5sig_key *key;
954 	unsigned int size = sizeof(struct in_addr);
955 	const struct tcp_md5sig_info *md5sig;
956 
957 	/* caller either holds rcu_read_lock() or socket lock */
958 	md5sig = rcu_dereference_check(tp->md5sig_info,
959 				       lockdep_sock_is_held(sk));
960 	if (!md5sig)
961 		return NULL;
962 #if IS_ENABLED(CONFIG_IPV6)
963 	if (family == AF_INET6)
964 		size = sizeof(struct in6_addr);
965 #endif
966 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
967 		if (key->family != family)
968 			continue;
969 		if (!memcmp(&key->addr, addr, size) &&
970 		    key->prefixlen == prefixlen)
971 			return key;
972 	}
973 	return NULL;
974 }
975 
976 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
977 					 const struct sock *addr_sk)
978 {
979 	const union tcp_md5_addr *addr;
980 
981 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
982 	return tcp_md5_do_lookup(sk, addr, AF_INET);
983 }
984 EXPORT_SYMBOL(tcp_v4_md5_lookup);
985 
986 /* This can be called on a newly created socket, from other files */
987 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
988 		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
989 		   gfp_t gfp)
990 {
991 	/* Add Key to the list */
992 	struct tcp_md5sig_key *key;
993 	struct tcp_sock *tp = tcp_sk(sk);
994 	struct tcp_md5sig_info *md5sig;
995 
996 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
997 	if (key) {
998 		/* Pre-existing entry - just update that one. */
999 		memcpy(key->key, newkey, newkeylen);
1000 		key->keylen = newkeylen;
1001 		return 0;
1002 	}
1003 
1004 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1005 					   lockdep_sock_is_held(sk));
1006 	if (!md5sig) {
1007 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1008 		if (!md5sig)
1009 			return -ENOMEM;
1010 
1011 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1012 		INIT_HLIST_HEAD(&md5sig->head);
1013 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1014 	}
1015 
1016 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1017 	if (!key)
1018 		return -ENOMEM;
1019 	if (!tcp_alloc_md5sig_pool()) {
1020 		sock_kfree_s(sk, key, sizeof(*key));
1021 		return -ENOMEM;
1022 	}
1023 
1024 	memcpy(key->key, newkey, newkeylen);
1025 	key->keylen = newkeylen;
1026 	key->family = family;
1027 	key->prefixlen = prefixlen;
1028 	memcpy(&key->addr, addr,
1029 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1030 				      sizeof(struct in_addr));
1031 	hlist_add_head_rcu(&key->node, &md5sig->head);
1032 	return 0;
1033 }
1034 EXPORT_SYMBOL(tcp_md5_do_add);
1035 
1036 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1037 		   u8 prefixlen)
1038 {
1039 	struct tcp_md5sig_key *key;
1040 
1041 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1042 	if (!key)
1043 		return -ENOENT;
1044 	hlist_del_rcu(&key->node);
1045 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1046 	kfree_rcu(key, rcu);
1047 	return 0;
1048 }
1049 EXPORT_SYMBOL(tcp_md5_do_del);
1050 
1051 static void tcp_clear_md5_list(struct sock *sk)
1052 {
1053 	struct tcp_sock *tp = tcp_sk(sk);
1054 	struct tcp_md5sig_key *key;
1055 	struct hlist_node *n;
1056 	struct tcp_md5sig_info *md5sig;
1057 
1058 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1059 
1060 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1061 		hlist_del_rcu(&key->node);
1062 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1063 		kfree_rcu(key, rcu);
1064 	}
1065 }
1066 
1067 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1068 				 char __user *optval, int optlen)
1069 {
1070 	struct tcp_md5sig cmd;
1071 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1072 	u8 prefixlen = 32;
1073 
1074 	if (optlen < sizeof(cmd))
1075 		return -EINVAL;
1076 
1077 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1078 		return -EFAULT;
1079 
1080 	if (sin->sin_family != AF_INET)
1081 		return -EINVAL;
1082 
1083 	if (optname == TCP_MD5SIG_EXT &&
1084 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1085 		prefixlen = cmd.tcpm_prefixlen;
1086 		if (prefixlen > 32)
1087 			return -EINVAL;
1088 	}
1089 
1090 	if (!cmd.tcpm_keylen)
1091 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1092 				      AF_INET, prefixlen);
1093 
1094 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1095 		return -EINVAL;
1096 
1097 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1098 			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1099 			      GFP_KERNEL);
1100 }
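/* Editorial example (not part of the kernel source): the parser above services
 * the TCP_MD5SIG / TCP_MD5SIG_EXT socket options. A minimal userspace sketch
 * that installs an RFC 2385 key for one IPv4 peer; the peer address and key
 * passed by the caller are placeholders.
 */
#include <linux/tcp.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int install_md5_key(int fd, const struct sockaddr_in *peer,
			   const void *key, int keylen)
{
	struct tcp_md5sig md5;

	if (keylen > TCP_MD5SIG_MAXKEYLEN)
		return -1;
	memset(&md5, 0, sizeof(md5));
	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
	md5.tcpm_keylen = keylen;
	memcpy(md5.tcpm_key, key, keylen);
	/* a zero tcpm_keylen would delete the key instead, as handled above */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}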
1101 
1102 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1103 				   __be32 daddr, __be32 saddr,
1104 				   const struct tcphdr *th, int nbytes)
1105 {
1106 	struct tcp4_pseudohdr *bp;
1107 	struct scatterlist sg;
1108 	struct tcphdr *_th;
1109 
1110 	bp = hp->scratch;
1111 	bp->saddr = saddr;
1112 	bp->daddr = daddr;
1113 	bp->pad = 0;
1114 	bp->protocol = IPPROTO_TCP;
1115 	bp->len = cpu_to_be16(nbytes);
1116 
1117 	_th = (struct tcphdr *)(bp + 1);
1118 	memcpy(_th, th, sizeof(*th));
1119 	_th->check = 0;
1120 
1121 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1122 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1123 				sizeof(*bp) + sizeof(*th));
1124 	return crypto_ahash_update(hp->md5_req);
1125 }
1126 
1127 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1128 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1129 {
1130 	struct tcp_md5sig_pool *hp;
1131 	struct ahash_request *req;
1132 
1133 	hp = tcp_get_md5sig_pool();
1134 	if (!hp)
1135 		goto clear_hash_noput;
1136 	req = hp->md5_req;
1137 
1138 	if (crypto_ahash_init(req))
1139 		goto clear_hash;
1140 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1141 		goto clear_hash;
1142 	if (tcp_md5_hash_key(hp, key))
1143 		goto clear_hash;
1144 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1145 	if (crypto_ahash_final(req))
1146 		goto clear_hash;
1147 
1148 	tcp_put_md5sig_pool();
1149 	return 0;
1150 
1151 clear_hash:
1152 	tcp_put_md5sig_pool();
1153 clear_hash_noput:
1154 	memset(md5_hash, 0, 16);
1155 	return 1;
1156 }
1157 
1158 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1159 			const struct sock *sk,
1160 			const struct sk_buff *skb)
1161 {
1162 	struct tcp_md5sig_pool *hp;
1163 	struct ahash_request *req;
1164 	const struct tcphdr *th = tcp_hdr(skb);
1165 	__be32 saddr, daddr;
1166 
1167 	if (sk) { /* valid for establish/request sockets */
1168 		saddr = sk->sk_rcv_saddr;
1169 		daddr = sk->sk_daddr;
1170 	} else {
1171 		const struct iphdr *iph = ip_hdr(skb);
1172 		saddr = iph->saddr;
1173 		daddr = iph->daddr;
1174 	}
1175 
1176 	hp = tcp_get_md5sig_pool();
1177 	if (!hp)
1178 		goto clear_hash_noput;
1179 	req = hp->md5_req;
1180 
1181 	if (crypto_ahash_init(req))
1182 		goto clear_hash;
1183 
1184 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1185 		goto clear_hash;
1186 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1187 		goto clear_hash;
1188 	if (tcp_md5_hash_key(hp, key))
1189 		goto clear_hash;
1190 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1191 	if (crypto_ahash_final(req))
1192 		goto clear_hash;
1193 
1194 	tcp_put_md5sig_pool();
1195 	return 0;
1196 
1197 clear_hash:
1198 	tcp_put_md5sig_pool();
1199 clear_hash_noput:
1200 	memset(md5_hash, 0, 16);
1201 	return 1;
1202 }
1203 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1204 
1205 #endif
1206 
1207 /* Called with rcu_read_lock() */
1208 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1209 				    const struct sk_buff *skb)
1210 {
1211 #ifdef CONFIG_TCP_MD5SIG
1212 	/*
1213 	 * This gets called for each TCP segment that arrives
1214 	 * so we want to be efficient.
1215 	 * We have 3 drop cases:
1216 	 * o No MD5 hash and one expected.
1217 	 * o MD5 hash and we're not expecting one.
1218 	 * o MD5 hash and it's wrong.
1219 	 */
1220 	const __u8 *hash_location = NULL;
1221 	struct tcp_md5sig_key *hash_expected;
1222 	const struct iphdr *iph = ip_hdr(skb);
1223 	const struct tcphdr *th = tcp_hdr(skb);
1224 	int genhash;
1225 	unsigned char newhash[16];
1226 
1227 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1228 					  AF_INET);
1229 	hash_location = tcp_parse_md5sig_option(th);
1230 
1231 	/* We've parsed the options - do we have a hash? */
1232 	if (!hash_expected && !hash_location)
1233 		return false;
1234 
1235 	if (hash_expected && !hash_location) {
1236 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1237 		return true;
1238 	}
1239 
1240 	if (!hash_expected && hash_location) {
1241 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1242 		return true;
1243 	}
1244 
1245 	/* Okay, so this is hash_expected and hash_location -
1246 	 * so we need to calculate the checksum.
1247 	 */
1248 	genhash = tcp_v4_md5_hash_skb(newhash,
1249 				      hash_expected,
1250 				      NULL, skb);
1251 
1252 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1253 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1254 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1255 				     &iph->saddr, ntohs(th->source),
1256 				     &iph->daddr, ntohs(th->dest),
1257 				     genhash ? " tcp_v4_calc_md5_hash failed"
1258 				     : "");
1259 		return true;
1260 	}
1261 	return false;
1262 #endif
1263 	return false;
1264 }
1265 
1266 static void tcp_v4_init_req(struct request_sock *req,
1267 			    const struct sock *sk_listener,
1268 			    struct sk_buff *skb)
1269 {
1270 	struct inet_request_sock *ireq = inet_rsk(req);
1271 	struct net *net = sock_net(sk_listener);
1272 
1273 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1274 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1275 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1276 }
1277 
1278 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1279 					  struct flowi *fl,
1280 					  const struct request_sock *req)
1281 {
1282 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1283 }
1284 
1285 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1286 	.family		=	PF_INET,
1287 	.obj_size	=	sizeof(struct tcp_request_sock),
1288 	.rtx_syn_ack	=	tcp_rtx_synack,
1289 	.send_ack	=	tcp_v4_reqsk_send_ack,
1290 	.destructor	=	tcp_v4_reqsk_destructor,
1291 	.send_reset	=	tcp_v4_send_reset,
1292 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1293 };
1294 
1295 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1296 	.mss_clamp	=	TCP_MSS_DEFAULT,
1297 #ifdef CONFIG_TCP_MD5SIG
1298 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1299 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1300 #endif
1301 	.init_req	=	tcp_v4_init_req,
1302 #ifdef CONFIG_SYN_COOKIES
1303 	.cookie_init_seq =	cookie_v4_init_sequence,
1304 #endif
1305 	.route_req	=	tcp_v4_route_req,
1306 	.init_seq	=	tcp_v4_init_seq,
1307 	.init_ts_off	=	tcp_v4_init_ts_off,
1308 	.send_synack	=	tcp_v4_send_synack,
1309 };
1310 
1311 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1312 {
1313 	/* Never answer SYNs sent to broadcast or multicast */
1314 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1315 		goto drop;
1316 
1317 	return tcp_conn_request(&tcp_request_sock_ops,
1318 				&tcp_request_sock_ipv4_ops, sk, skb);
1319 
1320 drop:
1321 	tcp_listendrop(sk);
1322 	return 0;
1323 }
1324 EXPORT_SYMBOL(tcp_v4_conn_request);
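/* Editorial example (not part of the kernel source): incoming SYNs are handed
 * to tcp_v4_conn_request() on a listening socket that userspace sets up roughly
 * as below; the port and backlog values are illustrative only.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

static int make_listener(unsigned short port, int backlog)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(port),
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, backlog) < 0) {		/* SYNs now reach tcp_v4_conn_request() */
		close(fd);
		return -1;
	}
	return fd;
}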
1325 
1326 
1327 /*
1328  * The three way handshake has completed - we got a valid synack -
1329  * now create the new socket.
1330  */
1331 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1332 				  struct request_sock *req,
1333 				  struct dst_entry *dst,
1334 				  struct request_sock *req_unhash,
1335 				  bool *own_req)
1336 {
1337 	struct inet_request_sock *ireq;
1338 	struct inet_sock *newinet;
1339 	struct tcp_sock *newtp;
1340 	struct sock *newsk;
1341 #ifdef CONFIG_TCP_MD5SIG
1342 	struct tcp_md5sig_key *key;
1343 #endif
1344 	struct ip_options_rcu *inet_opt;
1345 
1346 	if (sk_acceptq_is_full(sk))
1347 		goto exit_overflow;
1348 
1349 	newsk = tcp_create_openreq_child(sk, req, skb);
1350 	if (!newsk)
1351 		goto exit_nonewsk;
1352 
1353 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1354 	inet_sk_rx_dst_set(newsk, skb);
1355 
1356 	newtp		      = tcp_sk(newsk);
1357 	newinet		      = inet_sk(newsk);
1358 	ireq		      = inet_rsk(req);
1359 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1360 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1361 	newsk->sk_bound_dev_if = ireq->ir_iif;
1362 	newinet->inet_saddr   = ireq->ir_loc_addr;
1363 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1364 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1365 	newinet->mc_index     = inet_iif(skb);
1366 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1367 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1368 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1369 	if (inet_opt)
1370 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1371 	newinet->inet_id = prandom_u32();
1372 
1373 	if (!dst) {
1374 		dst = inet_csk_route_child_sock(sk, newsk, req);
1375 		if (!dst)
1376 			goto put_and_exit;
1377 	} else {
1378 		/* syncookie case : see end of cookie_v4_check() */
1379 	}
1380 	sk_setup_caps(newsk, dst);
1381 
1382 	tcp_ca_openreq_child(newsk, dst);
1383 
1384 	tcp_sync_mss(newsk, dst_mtu(dst));
1385 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1386 
1387 	tcp_initialize_rcv_mss(newsk);
1388 
1389 #ifdef CONFIG_TCP_MD5SIG
1390 	/* Copy over the MD5 key from the original socket */
1391 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1392 				AF_INET);
1393 	if (key) {
1394 		/*
1395 		 * We're using one, so create a matching key
1396 		 * on the newsk structure. If we fail to get
1397 		 * memory, then we end up not copying the key
1398 		 * across. Shucks.
1399 		 */
1400 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1401 			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1402 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1403 	}
1404 #endif
1405 
1406 	if (__inet_inherit_port(sk, newsk) < 0)
1407 		goto put_and_exit;
1408 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1409 	if (likely(*own_req)) {
1410 		tcp_move_syn(newtp, req);
1411 		ireq->ireq_opt = NULL;
1412 	} else {
1413 		newinet->inet_opt = NULL;
1414 	}
1415 	return newsk;
1416 
1417 exit_overflow:
1418 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1419 exit_nonewsk:
1420 	dst_release(dst);
1421 exit:
1422 	tcp_listendrop(sk);
1423 	return NULL;
1424 put_and_exit:
1425 	newinet->inet_opt = NULL;
1426 	inet_csk_prepare_forced_close(newsk);
1427 	tcp_done(newsk);
1428 	goto exit;
1429 }
1430 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
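/* Editorial example (not part of the kernel source): the child socket created
 * above is what userspace eventually receives from accept() on the listener.
 * A sketch pairing with make_listener() from the earlier editorial example:
 */
#include <netinet/in.h>
#include <sys/socket.h>

static int accept_one(int listen_fd, struct sockaddr_in *peer)
{
	socklen_t len = sizeof(*peer);

	/* blocks until the three-way handshake has completed for some pending SYN */
	return accept(listen_fd, (struct sockaddr *)peer, &len);
}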
1431 
1432 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1433 {
1434 #ifdef CONFIG_SYN_COOKIES
1435 	const struct tcphdr *th = tcp_hdr(skb);
1436 
1437 	if (!th->syn)
1438 		sk = cookie_v4_check(sk, skb);
1439 #endif
1440 	return sk;
1441 }
1442 
1443 /* The socket must have its spinlock held when we get
1444  * here, unless it is a TCP_LISTEN socket.
1445  *
1446  * We have a potential double-lock case here, so even when
1447  * doing backlog processing we use the BH locking scheme.
1448  * This is because we cannot sleep with the original spinlock
1449  * held.
1450  */
1451 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1452 {
1453 	struct sock *rsk;
1454 
1455 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1456 		struct dst_entry *dst = sk->sk_rx_dst;
1457 
1458 		sock_rps_save_rxhash(sk, skb);
1459 		sk_mark_napi_id(sk, skb);
1460 		if (dst) {
1461 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1462 			    !dst->ops->check(dst, 0)) {
1463 				dst_release(dst);
1464 				sk->sk_rx_dst = NULL;
1465 			}
1466 		}
1467 		tcp_rcv_established(sk, skb, tcp_hdr(skb));
1468 		return 0;
1469 	}
1470 
1471 	if (tcp_checksum_complete(skb))
1472 		goto csum_err;
1473 
1474 	if (sk->sk_state == TCP_LISTEN) {
1475 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1476 
1477 		if (!nsk)
1478 			goto discard;
1479 		if (nsk != sk) {
1480 			if (tcp_child_process(sk, nsk, skb)) {
1481 				rsk = nsk;
1482 				goto reset;
1483 			}
1484 			return 0;
1485 		}
1486 	} else
1487 		sock_rps_save_rxhash(sk, skb);
1488 
1489 	if (tcp_rcv_state_process(sk, skb)) {
1490 		rsk = sk;
1491 		goto reset;
1492 	}
1493 	return 0;
1494 
1495 reset:
1496 	tcp_v4_send_reset(rsk, skb);
1497 discard:
1498 	kfree_skb(skb);
1499 	/* Be careful here. If this function gets more complicated and
1500 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1501 	 * might be destroyed here. This current version compiles correctly,
1502 	 * but you have been warned.
1503 	 */
1504 	return 0;
1505 
1506 csum_err:
1507 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1508 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1509 	goto discard;
1510 }
1511 EXPORT_SYMBOL(tcp_v4_do_rcv);
1512 
1513 int tcp_v4_early_demux(struct sk_buff *skb)
1514 {
1515 	const struct iphdr *iph;
1516 	const struct tcphdr *th;
1517 	struct sock *sk;
1518 
1519 	if (skb->pkt_type != PACKET_HOST)
1520 		return 0;
1521 
1522 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1523 		return 0;
1524 
1525 	iph = ip_hdr(skb);
1526 	th = tcp_hdr(skb);
1527 
1528 	if (th->doff < sizeof(struct tcphdr) / 4)
1529 		return 0;
1530 
1531 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1532 				       iph->saddr, th->source,
1533 				       iph->daddr, ntohs(th->dest),
1534 				       skb->skb_iif, inet_sdif(skb));
1535 	if (sk) {
1536 		skb->sk = sk;
1537 		skb->destructor = sock_edemux;
1538 		if (sk_fullsock(sk)) {
1539 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1540 
1541 			if (dst)
1542 				dst = dst_check(dst, 0);
1543 			if (dst &&
1544 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1545 				skb_dst_set_noref(skb, dst);
1546 		}
1547 	}
1548 	return 0;
1549 }
1550 
1551 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1552 {
1553 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1554 
1555 	/* Only the socket owner can try to collapse/prune rx queues
1556 	 * to reduce memory overhead, so add a little headroom here.
1557 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1558 	 */
1559 	limit += 64*1024;
1560 
1561 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1562 	 * we can fix skb->truesize to its real value to avoid future drops.
1563 	 * This is valid because skb is not yet charged to the socket.
1564 	 * It has been noticed pure SACK packets were sometimes dropped
1565 	 * (if cooked by drivers without copybreak feature).
1566 	 */
1567 	skb_condense(skb);
1568 
1569 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1570 		bh_unlock_sock(sk);
1571 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1572 		return true;
1573 	}
1574 	return false;
1575 }
1576 EXPORT_SYMBOL(tcp_add_backlog);
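/* Editorial example (not part of the kernel source): the backlog limit above is
 * derived from the socket's receive and send buffer sizes, which userspace can
 * inspect (and request changes to, within sysctl limits) via SO_RCVBUF/SO_SNDBUF.
 * A sketch:
 */
#include <sys/socket.h>

static int socket_buffer_budget(int fd)
{
	int rcv = 0, snd = 0;
	socklen_t len = sizeof(int);

	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcv, &len) < 0 ||
	    getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &snd, &len) < 0)
		return -1;
	return rcv + snd;	/* the kernel adds 64 KB of headroom on top of this */
}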
1577 
1578 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1579 {
1580 	struct tcphdr *th = (struct tcphdr *)skb->data;
1581 
1582 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1583 }
1584 EXPORT_SYMBOL(tcp_filter);
1585 
1586 static void tcp_v4_restore_cb(struct sk_buff *skb)
1587 {
1588 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1589 		sizeof(struct inet_skb_parm));
1590 }
1591 
1592 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1593 			   const struct tcphdr *th)
1594 {
1595 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1596 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1597 	 */
1598 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1599 		sizeof(struct inet_skb_parm));
1600 	barrier();
1601 
1602 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1603 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1604 				    skb->len - th->doff * 4);
1605 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1606 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1607 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1608 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1609 	TCP_SKB_CB(skb)->sacked	 = 0;
1610 	TCP_SKB_CB(skb)->has_rxtstamp =
1611 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1612 }
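/* Editorial example (not part of the kernel source): a small illustration of
 * the end_seq arithmetic above. SYN and FIN each consume one unit of sequence
 * space in addition to the payload bytes; the values in the comment are made up.
 */
#include <stdint.h>

static uint32_t tcp_end_seq_example(uint32_t seq, int syn, int fin, uint32_t payload_len)
{
	/* e.g. seq=1000, syn=1, fin=0, 10 payload bytes -> end_seq = 1011 */
	return seq + syn + fin + payload_len;
}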
1613 
1614 /*
1615  *	From tcp_input.c
1616  */
1617 
1618 int tcp_v4_rcv(struct sk_buff *skb)
1619 {
1620 	struct net *net = dev_net(skb->dev);
1621 	int sdif = inet_sdif(skb);
1622 	const struct iphdr *iph;
1623 	const struct tcphdr *th;
1624 	bool refcounted;
1625 	struct sock *sk;
1626 	int ret;
1627 
1628 	if (skb->pkt_type != PACKET_HOST)
1629 		goto discard_it;
1630 
1631 	/* Count it even if it's bad */
1632 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1633 
1634 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1635 		goto discard_it;
1636 
1637 	th = (const struct tcphdr *)skb->data;
1638 
1639 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1640 		goto bad_packet;
1641 	if (!pskb_may_pull(skb, th->doff * 4))
1642 		goto discard_it;
1643 
1644 	/* An explanation is required here, I think.
1645 	 * Packet length and doff are validated by header prediction,
1646 	 * provided the case of th->doff==0 is eliminated.
1647 	 * So, we defer the checks. */
1648 
1649 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1650 		goto csum_error;
1651 
1652 	th = (const struct tcphdr *)skb->data;
1653 	iph = ip_hdr(skb);
1654 lookup:
1655 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1656 			       th->dest, sdif, &refcounted);
1657 	if (!sk)
1658 		goto no_tcp_socket;
1659 
1660 process:
1661 	if (sk->sk_state == TCP_TIME_WAIT)
1662 		goto do_time_wait;
1663 
1664 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1665 		struct request_sock *req = inet_reqsk(sk);
1666 		struct sock *nsk;
1667 
1668 		sk = req->rsk_listener;
1669 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1670 			sk_drops_add(sk, skb);
1671 			reqsk_put(req);
1672 			goto discard_it;
1673 		}
1674 		if (tcp_checksum_complete(skb)) {
1675 			reqsk_put(req);
1676 			goto csum_error;
1677 		}
1678 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1679 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1680 			goto lookup;
1681 		}
1682 		/* We own a reference on the listener, increase it again
1683 		 * as we might lose it too soon.
1684 		 */
1685 		sock_hold(sk);
1686 		refcounted = true;
1687 		nsk = NULL;
1688 		if (!tcp_filter(sk, skb)) {
1689 			th = (const struct tcphdr *)skb->data;
1690 			iph = ip_hdr(skb);
1691 			tcp_v4_fill_cb(skb, iph, th);
1692 			nsk = tcp_check_req(sk, skb, req, false);
1693 		}
1694 		if (!nsk) {
1695 			reqsk_put(req);
1696 			goto discard_and_relse;
1697 		}
1698 		if (nsk == sk) {
1699 			reqsk_put(req);
1700 			tcp_v4_restore_cb(skb);
1701 		} else if (tcp_child_process(sk, nsk, skb)) {
1702 			tcp_v4_send_reset(nsk, skb);
1703 			goto discard_and_relse;
1704 		} else {
1705 			sock_put(sk);
1706 			return 0;
1707 		}
1708 	}
1709 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1710 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1711 		goto discard_and_relse;
1712 	}
1713 
1714 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1715 		goto discard_and_relse;
1716 
1717 	if (tcp_v4_inbound_md5_hash(sk, skb))
1718 		goto discard_and_relse;
1719 
1720 	nf_reset(skb);
1721 
1722 	if (tcp_filter(sk, skb))
1723 		goto discard_and_relse;
1724 	th = (const struct tcphdr *)skb->data;
1725 	iph = ip_hdr(skb);
1726 	tcp_v4_fill_cb(skb, iph, th);
1727 
1728 	skb->dev = NULL;
1729 
1730 	if (sk->sk_state == TCP_LISTEN) {
1731 		ret = tcp_v4_do_rcv(sk, skb);
1732 		goto put_and_return;
1733 	}
1734 
1735 	sk_incoming_cpu_update(sk);
1736 
1737 	bh_lock_sock_nested(sk);
1738 	tcp_segs_in(tcp_sk(sk), skb);
1739 	ret = 0;
1740 	if (!sock_owned_by_user(sk)) {
1741 		ret = tcp_v4_do_rcv(sk, skb);
1742 	} else if (tcp_add_backlog(sk, skb)) {
1743 		goto discard_and_relse;
1744 	}
1745 	bh_unlock_sock(sk);
1746 
1747 put_and_return:
1748 	if (refcounted)
1749 		sock_put(sk);
1750 
1751 	return ret;
1752 
1753 no_tcp_socket:
1754 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1755 		goto discard_it;
1756 
1757 	tcp_v4_fill_cb(skb, iph, th);
1758 
1759 	if (tcp_checksum_complete(skb)) {
1760 csum_error:
1761 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1762 bad_packet:
1763 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1764 	} else {
1765 		tcp_v4_send_reset(NULL, skb);
1766 	}
1767 
1768 discard_it:
1769 	/* Discard frame. */
1770 	kfree_skb(skb);
1771 	return 0;
1772 
1773 discard_and_relse:
1774 	sk_drops_add(sk, skb);
1775 	if (refcounted)
1776 		sock_put(sk);
1777 	goto discard_it;
1778 
1779 do_time_wait:
1780 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1781 		inet_twsk_put(inet_twsk(sk));
1782 		goto discard_it;
1783 	}
1784 
1785 	tcp_v4_fill_cb(skb, iph, th);
1786 
1787 	if (tcp_checksum_complete(skb)) {
1788 		inet_twsk_put(inet_twsk(sk));
1789 		goto csum_error;
1790 	}
1791 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1792 	case TCP_TW_SYN: {
1793 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1794 							&tcp_hashinfo, skb,
1795 							__tcp_hdrlen(th),
1796 							iph->saddr, th->source,
1797 							iph->daddr, th->dest,
1798 							inet_iif(skb),
1799 							sdif);
1800 		if (sk2) {
1801 			inet_twsk_deschedule_put(inet_twsk(sk));
1802 			sk = sk2;
1803 			tcp_v4_restore_cb(skb);
1804 			refcounted = false;
1805 			goto process;
1806 		}
1807 		/* Fall through to ACK */
1808 	}
1809 	case TCP_TW_ACK:
1810 		tcp_v4_timewait_ack(sk, skb);
1811 		break;
1812 	case TCP_TW_RST:
1813 		tcp_v4_send_reset(sk, skb);
1814 		inet_twsk_deschedule_put(inet_twsk(sk));
1815 		goto discard_it;
1816 	case TCP_TW_SUCCESS:;
1817 	}
1818 	goto discard_it;
1819 }
1820 
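/*
 * A minimal sketch of the lookup and refcount discipline tcp_v4_rcv()
 * relies on above: __inet_lookup_skb() reports via 'refcounted' whether
 * it took a reference on the socket it returned, and sock_put() must
 * only be paired with such a reference.  The helper name below is
 * hypothetical and exists purely for illustration.
 */
static inline void tcp_v4_lookup_sketch(struct sk_buff *skb, int sdif)
{
	const struct tcphdr *th = (const struct tcphdr *)skb->data;
	bool refcounted;
	struct sock *sk;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th),
			       th->source, th->dest, sdif, &refcounted);
	if (!sk)
		return;		/* no matching socket */
	/* ... hand the segment to the socket here ... */
	if (refcounted)
		sock_put(sk);	/* drop only the reference we actually hold */
}
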
1821 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1822 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1823 	.twsk_unique	= tcp_twsk_unique,
1824 	.twsk_destructor = tcp_twsk_destructor,
1825 };
1826 
1827 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1828 {
1829 	struct dst_entry *dst = skb_dst(skb);
1830 
1831 	if (dst && dst_hold_safe(dst)) {
1832 		sk->sk_rx_dst = dst;
1833 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1834 	}
1835 }
1836 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1837 
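/*
 * The cached rx dst installed above is only a hint: consumers revalidate
 * it and drop it when it no longer matches the incoming interface or has
 * been obsoleted.  A minimal sketch of that check, loosely modelled on
 * the established fast path in tcp_v4_do_rcv(); the helper name is
 * hypothetical.
 */
static inline void inet_sk_rx_dst_check_sketch(struct sock *sk,
					       const struct sk_buff *skb)
{
	struct dst_entry *dst = sk->sk_rx_dst;

	if (dst &&
	    (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
	     !dst->ops->check(dst, 0))) {
		dst_release(dst);
		sk->sk_rx_dst = NULL;
	}
}
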
1838 const struct inet_connection_sock_af_ops ipv4_specific = {
1839 	.queue_xmit	   = ip_queue_xmit,
1840 	.send_check	   = tcp_v4_send_check,
1841 	.rebuild_header	   = inet_sk_rebuild_header,
1842 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1843 	.conn_request	   = tcp_v4_conn_request,
1844 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1845 	.net_header_len	   = sizeof(struct iphdr),
1846 	.setsockopt	   = ip_setsockopt,
1847 	.getsockopt	   = ip_getsockopt,
1848 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1849 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1850 #ifdef CONFIG_COMPAT
1851 	.compat_setsockopt = compat_ip_setsockopt,
1852 	.compat_getsockopt = compat_ip_getsockopt,
1853 #endif
1854 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1855 };
1856 EXPORT_SYMBOL(ipv4_specific);
1857 
1858 #ifdef CONFIG_TCP_MD5SIG
1859 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1860 	.md5_lookup		= tcp_v4_md5_lookup,
1861 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1862 	.md5_parse		= tcp_v4_parse_md5_keys,
1863 };
1864 #endif
1865 
1866 /* NOTE: A lot of things are set to zero explicitly by the call to
1867  *       sk_alloc(), so they need not be done here.
1868  */
1869 static int tcp_v4_init_sock(struct sock *sk)
1870 {
1871 	struct inet_connection_sock *icsk = inet_csk(sk);
1872 
1873 	tcp_init_sock(sk);
1874 
1875 	icsk->icsk_af_ops = &ipv4_specific;
1876 
1877 #ifdef CONFIG_TCP_MD5SIG
1878 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1879 #endif
1880 
1881 	return 0;
1882 }
1883 
1884 void tcp_v4_destroy_sock(struct sock *sk)
1885 {
1886 	struct tcp_sock *tp = tcp_sk(sk);
1887 
1888 	tcp_clear_xmit_timers(sk);
1889 
1890 	tcp_cleanup_congestion_control(sk);
1891 
1892 	tcp_cleanup_ulp(sk);
1893 
1894 	/* Clean up the write buffer. */
1895 	tcp_write_queue_purge(sk);
1896 
1897 	/* Check if we want to disable active TFO */
1898 	tcp_fastopen_active_disable_ofo_check(sk);
1899 
1900 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1901 	skb_rbtree_purge(&tp->out_of_order_queue);
1902 
1903 #ifdef CONFIG_TCP_MD5SIG
1904 	/* Clean up the MD5 key list, if any */
1905 	if (tp->md5sig_info) {
1906 		tcp_clear_md5_list(sk);
1907 		kfree_rcu(tp->md5sig_info, rcu);
1908 		tp->md5sig_info = NULL;
1909 	}
1910 #endif
1911 
1912 	/* Clean up a referenced TCP bind bucket. */
1913 	if (inet_csk(sk)->icsk_bind_hash)
1914 		inet_put_port(sk);
1915 
1916 	BUG_ON(tp->fastopen_rsk);
1917 
1918 	/* If socket is aborted during connect operation */
1919 	tcp_free_fastopen_req(tp);
1920 	tcp_saved_syn_free(tp);
1921 
1922 	sk_sockets_allocated_dec(sk);
1923 }
1924 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1925 
1926 #ifdef CONFIG_PROC_FS
1927 /* Proc filesystem TCP sock list dumping. */
1928 
1929 /*
1930  * Get the next listener socket following cur.  If cur is NULL, get the
1931  * first socket starting from the bucket given in st->bucket; when
1932  * st->bucket is zero, the very first socket in the hash table is returned.
1933  */
1934 static void *listening_get_next(struct seq_file *seq, void *cur)
1935 {
1936 	struct tcp_iter_state *st = seq->private;
1937 	struct net *net = seq_file_net(seq);
1938 	struct inet_listen_hashbucket *ilb;
1939 	struct hlist_nulls_node *node;
1940 	struct sock *sk = cur;
1941 
1942 	if (!sk) {
1943 get_head:
1944 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1945 		spin_lock(&ilb->lock);
1946 		sk = sk_nulls_head(&ilb->nulls_head);
1947 		st->offset = 0;
1948 		goto get_sk;
1949 	}
1950 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1951 	++st->num;
1952 	++st->offset;
1953 
1954 	sk = sk_nulls_next(sk);
1955 get_sk:
1956 	sk_nulls_for_each_from(sk, node) {
1957 		if (!net_eq(sock_net(sk), net))
1958 			continue;
1959 		if (sk->sk_family == st->family)
1960 			return sk;
1961 	}
1962 	spin_unlock(&ilb->lock);
1963 	st->offset = 0;
1964 	if (++st->bucket < INET_LHTABLE_SIZE)
1965 		goto get_head;
1966 	return NULL;
1967 }
1968 
1969 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1970 {
1971 	struct tcp_iter_state *st = seq->private;
1972 	void *rc;
1973 
1974 	st->bucket = 0;
1975 	st->offset = 0;
1976 	rc = listening_get_next(seq, NULL);
1977 
1978 	while (rc && *pos) {
1979 		rc = listening_get_next(seq, rc);
1980 		--*pos;
1981 	}
1982 	return rc;
1983 }
1984 
1985 static inline bool empty_bucket(const struct tcp_iter_state *st)
1986 {
1987 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1988 }
1989 
1990 /*
1991  * Get the first established socket, starting from the bucket given in
1992  * st->bucket.  If st->bucket is zero, the very first socket is returned.
1993  */
1994 static void *established_get_first(struct seq_file *seq)
1995 {
1996 	struct tcp_iter_state *st = seq->private;
1997 	struct net *net = seq_file_net(seq);
1998 	void *rc = NULL;
1999 
2000 	st->offset = 0;
2001 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2002 		struct sock *sk;
2003 		struct hlist_nulls_node *node;
2004 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2005 
2006 		/* Lockless fast path for the common case of empty buckets */
2007 		if (empty_bucket(st))
2008 			continue;
2009 
2010 		spin_lock_bh(lock);
2011 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2012 			if (sk->sk_family != st->family ||
2013 			    !net_eq(sock_net(sk), net)) {
2014 				continue;
2015 			}
2016 			rc = sk;
2017 			goto out;
2018 		}
2019 		spin_unlock_bh(lock);
2020 	}
2021 out:
2022 	return rc;
2023 }
2024 
2025 static void *established_get_next(struct seq_file *seq, void *cur)
2026 {
2027 	struct sock *sk = cur;
2028 	struct hlist_nulls_node *node;
2029 	struct tcp_iter_state *st = seq->private;
2030 	struct net *net = seq_file_net(seq);
2031 
2032 	++st->num;
2033 	++st->offset;
2034 
2035 	sk = sk_nulls_next(sk);
2036 
2037 	sk_nulls_for_each_from(sk, node) {
2038 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2039 			return sk;
2040 	}
2041 
2042 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2043 	++st->bucket;
2044 	return established_get_first(seq);
2045 }
2046 
2047 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2048 {
2049 	struct tcp_iter_state *st = seq->private;
2050 	void *rc;
2051 
2052 	st->bucket = 0;
2053 	rc = established_get_first(seq);
2054 
2055 	while (rc && pos) {
2056 		rc = established_get_next(seq, rc);
2057 		--pos;
2058 	}
2059 	return rc;
2060 }
2061 
2062 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2063 {
2064 	void *rc;
2065 	struct tcp_iter_state *st = seq->private;
2066 
2067 	st->state = TCP_SEQ_STATE_LISTENING;
2068 	rc	  = listening_get_idx(seq, &pos);
2069 
2070 	if (!rc) {
2071 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2072 		rc	  = established_get_idx(seq, pos);
2073 	}
2074 
2075 	return rc;
2076 }
2077 
2078 static void *tcp_seek_last_pos(struct seq_file *seq)
2079 {
2080 	struct tcp_iter_state *st = seq->private;
2081 	int offset = st->offset;
2082 	int orig_num = st->num;
2083 	void *rc = NULL;
2084 
2085 	switch (st->state) {
2086 	case TCP_SEQ_STATE_LISTENING:
2087 		if (st->bucket >= INET_LHTABLE_SIZE)
2088 			break;
2089 		st->state = TCP_SEQ_STATE_LISTENING;
2090 		rc = listening_get_next(seq, NULL);
2091 		while (offset-- && rc)
2092 			rc = listening_get_next(seq, rc);
2093 		if (rc)
2094 			break;
2095 		st->bucket = 0;
2096 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2097 		/* Fallthrough */
2098 	case TCP_SEQ_STATE_ESTABLISHED:
2099 		if (st->bucket > tcp_hashinfo.ehash_mask)
2100 			break;
2101 		rc = established_get_first(seq);
2102 		while (offset-- && rc)
2103 			rc = established_get_next(seq, rc);
2104 	}
2105 
2106 	st->num = orig_num;
2107 
2108 	return rc;
2109 }
2110 
2111 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2112 {
2113 	struct tcp_iter_state *st = seq->private;
2114 	void *rc;
2115 
2116 	if (*pos && *pos == st->last_pos) {
2117 		rc = tcp_seek_last_pos(seq);
2118 		if (rc)
2119 			goto out;
2120 	}
2121 
2122 	st->state = TCP_SEQ_STATE_LISTENING;
2123 	st->num = 0;
2124 	st->bucket = 0;
2125 	st->offset = 0;
2126 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2127 
2128 out:
2129 	st->last_pos = *pos;
2130 	return rc;
2131 }
2132 
2133 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2134 {
2135 	struct tcp_iter_state *st = seq->private;
2136 	void *rc = NULL;
2137 
2138 	if (v == SEQ_START_TOKEN) {
2139 		rc = tcp_get_idx(seq, 0);
2140 		goto out;
2141 	}
2142 
2143 	switch (st->state) {
2144 	case TCP_SEQ_STATE_LISTENING:
2145 		rc = listening_get_next(seq, v);
2146 		if (!rc) {
2147 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2148 			st->bucket = 0;
2149 			st->offset = 0;
2150 			rc	  = established_get_first(seq);
2151 		}
2152 		break;
2153 	case TCP_SEQ_STATE_ESTABLISHED:
2154 		rc = established_get_next(seq, v);
2155 		break;
2156 	}
2157 out:
2158 	++*pos;
2159 	st->last_pos = *pos;
2160 	return rc;
2161 }
2162 
2163 static void tcp_seq_stop(struct seq_file *seq, void *v)
2164 {
2165 	struct tcp_iter_state *st = seq->private;
2166 
2167 	switch (st->state) {
2168 	case TCP_SEQ_STATE_LISTENING:
2169 		if (v != SEQ_START_TOKEN)
2170 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2171 		break;
2172 	case TCP_SEQ_STATE_ESTABLISHED:
2173 		if (v)
2174 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2175 		break;
2176 	}
2177 }
2178 
2179 int tcp_seq_open(struct inode *inode, struct file *file)
2180 {
2181 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2182 	struct tcp_iter_state *s;
2183 	int err;
2184 
2185 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2186 			  sizeof(struct tcp_iter_state));
2187 	if (err < 0)
2188 		return err;
2189 
2190 	s = ((struct seq_file *)file->private_data)->private;
2191 	s->family		= afinfo->family;
2192 	s->last_pos		= 0;
2193 	return 0;
2194 }
2195 EXPORT_SYMBOL(tcp_seq_open);
2196 
2197 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2198 {
2199 	int rc = 0;
2200 	struct proc_dir_entry *p;
2201 
2202 	afinfo->seq_ops.start		= tcp_seq_start;
2203 	afinfo->seq_ops.next		= tcp_seq_next;
2204 	afinfo->seq_ops.stop		= tcp_seq_stop;
2205 
2206 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2207 			     afinfo->seq_fops, afinfo);
2208 	if (!p)
2209 		rc = -ENOMEM;
2210 	return rc;
2211 }
2212 EXPORT_SYMBOL(tcp_proc_register);
2213 
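/*
 * A minimal sketch of how an address family wires up tcp_proc_register()
 * and tcp_proc_unregister(), mirroring tcp4_seq_afinfo and
 * tcp4_proc_init_net() defined later in this file.  The names
 * my_seq_afinfo, my_seq_show() and my_proc_*_net() are hypothetical and
 * only illustrate the wiring, hence the block is compiled out.
 */
#if 0
static struct tcp_seq_afinfo my_seq_afinfo = {
	.name		= "tcp_example",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show	= my_seq_show,		/* hypothetical ->show() */
	},
};

static int __net_init my_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &my_seq_afinfo);
}

static void __net_exit my_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &my_seq_afinfo);
}
#endif
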
2214 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2215 {
2216 	remove_proc_entry(afinfo->name, net->proc_net);
2217 }
2218 EXPORT_SYMBOL(tcp_proc_unregister);
2219 
2220 static void get_openreq4(const struct request_sock *req,
2221 			 struct seq_file *f, int i)
2222 {
2223 	const struct inet_request_sock *ireq = inet_rsk(req);
2224 	long delta = req->rsk_timer.expires - jiffies;
2225 
2226 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2227 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2228 		i,
2229 		ireq->ir_loc_addr,
2230 		ireq->ir_num,
2231 		ireq->ir_rmt_addr,
2232 		ntohs(ireq->ir_rmt_port),
2233 		TCP_SYN_RECV,
2234 		0, 0, /* could print option size, but that is af dependent. */
2235 		1,    /* timers active (only the expire timer) */
2236 		jiffies_delta_to_clock_t(delta),
2237 		req->num_timeout,
2238 		from_kuid_munged(seq_user_ns(f),
2239 				 sock_i_uid(req->rsk_listener)),
2240 		0,  /* non standard timer */
2241 		0, /* open_requests have no inode */
2242 		0,
2243 		req);
2244 }
2245 
2246 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2247 {
2248 	int timer_active;
2249 	unsigned long timer_expires;
2250 	const struct tcp_sock *tp = tcp_sk(sk);
2251 	const struct inet_connection_sock *icsk = inet_csk(sk);
2252 	const struct inet_sock *inet = inet_sk(sk);
2253 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2254 	__be32 dest = inet->inet_daddr;
2255 	__be32 src = inet->inet_rcv_saddr;
2256 	__u16 destp = ntohs(inet->inet_dport);
2257 	__u16 srcp = ntohs(inet->inet_sport);
2258 	int rx_queue;
2259 	int state;
2260 
2261 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2262 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2263 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2264 		timer_active	= 1;
2265 		timer_expires	= icsk->icsk_timeout;
2266 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2267 		timer_active	= 4;
2268 		timer_expires	= icsk->icsk_timeout;
2269 	} else if (timer_pending(&sk->sk_timer)) {
2270 		timer_active	= 2;
2271 		timer_expires	= sk->sk_timer.expires;
2272 	} else {
2273 		timer_active	= 0;
2274 		timer_expires = jiffies;
2275 	}
2276 
2277 	state = sk_state_load(sk);
2278 	if (state == TCP_LISTEN)
2279 		rx_queue = sk->sk_ack_backlog;
2280 	else
2281 		/* Because we don't lock the socket,
2282 		 * we might find a transient negative value.
2283 		 */
2284 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2285 
2286 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2287 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2288 		i, src, srcp, dest, destp, state,
2289 		tp->write_seq - tp->snd_una,
2290 		rx_queue,
2291 		timer_active,
2292 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2293 		icsk->icsk_retransmits,
2294 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2295 		icsk->icsk_probes_out,
2296 		sock_i_ino(sk),
2297 		refcount_read(&sk->sk_refcnt), sk,
2298 		jiffies_to_clock_t(icsk->icsk_rto),
2299 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2300 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2301 		tp->snd_cwnd,
2302 		state == TCP_LISTEN ?
2303 		    fastopenq->max_qlen :
2304 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2305 }
2306 
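/*
 * The tx_queue and rx_queue columns emitted by get_tcp4_sock() are plain
 * sequence-number arithmetic on the tcp_sock: bytes queued but not yet
 * acknowledged, and bytes received but not yet read (clamped at zero
 * because the socket is sampled without its lock).  Minimal helpers with
 * hypothetical names, shown only to make that arithmetic explicit.
 */
static inline u32 tcp4_tx_queue_sketch(const struct tcp_sock *tp)
{
	return tp->write_seq - tp->snd_una;
}

static inline int tcp4_rx_queue_sketch(const struct tcp_sock *tp)
{
	return max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
}
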
2307 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2308 			       struct seq_file *f, int i)
2309 {
2310 	long delta = tw->tw_timer.expires - jiffies;
2311 	__be32 dest, src;
2312 	__u16 destp, srcp;
2313 
2314 	dest  = tw->tw_daddr;
2315 	src   = tw->tw_rcv_saddr;
2316 	destp = ntohs(tw->tw_dport);
2317 	srcp  = ntohs(tw->tw_sport);
2318 
2319 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2320 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2321 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2322 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2323 		refcount_read(&tw->tw_refcnt), tw);
2324 }
2325 
2326 #define TMPSZ 150
2327 
2328 static int tcp4_seq_show(struct seq_file *seq, void *v)
2329 {
2330 	struct tcp_iter_state *st;
2331 	struct sock *sk = v;
2332 
2333 	seq_setwidth(seq, TMPSZ - 1);
2334 	if (v == SEQ_START_TOKEN) {
2335 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2336 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2337 			   "inode");
2338 		goto out;
2339 	}
2340 	st = seq->private;
2341 
2342 	if (sk->sk_state == TCP_TIME_WAIT)
2343 		get_timewait4_sock(v, seq, st->num);
2344 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2345 		get_openreq4(v, seq, st->num);
2346 	else
2347 		get_tcp4_sock(v, seq, st->num);
2348 out:
2349 	seq_pad(seq, '\n');
2350 	return 0;
2351 }
2352 
2353 static const struct file_operations tcp_afinfo_seq_fops = {
2354 	.owner   = THIS_MODULE,
2355 	.open    = tcp_seq_open,
2356 	.read    = seq_read,
2357 	.llseek  = seq_lseek,
2358 	.release = seq_release_net
2359 };
2360 
2361 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2362 	.name		= "tcp",
2363 	.family		= AF_INET,
2364 	.seq_fops	= &tcp_afinfo_seq_fops,
2365 	.seq_ops	= {
2366 		.show		= tcp4_seq_show,
2367 	},
2368 };
2369 
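/*
 * Registering this afinfo per network namespace (see tcp4_proc_init_net()
 * below) is what creates /proc/net/tcp.  A minimal user-space sketch of
 * consuming that file, illustrative only and not kernel code:
 *
 *	FILE *f = fopen("/proc/net/tcp", "r");
 *	char line[256];
 *
 *	while (f && fgets(line, sizeof(line), f))
 *		fputs(line, stdout);	// one socket per line, cf. tcp4_seq_show()
 *	if (f)
 *		fclose(f);
 */
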
2370 static int __net_init tcp4_proc_init_net(struct net *net)
2371 {
2372 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2373 }
2374 
2375 static void __net_exit tcp4_proc_exit_net(struct net *net)
2376 {
2377 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2378 }
2379 
2380 static struct pernet_operations tcp4_net_ops = {
2381 	.init = tcp4_proc_init_net,
2382 	.exit = tcp4_proc_exit_net,
2383 };
2384 
2385 int __init tcp4_proc_init(void)
2386 {
2387 	return register_pernet_subsys(&tcp4_net_ops);
2388 }
2389 
2390 void tcp4_proc_exit(void)
2391 {
2392 	unregister_pernet_subsys(&tcp4_net_ops);
2393 }
2394 #endif /* CONFIG_PROC_FS */
2395 
2396 struct proto tcp_prot = {
2397 	.name			= "TCP",
2398 	.owner			= THIS_MODULE,
2399 	.close			= tcp_close,
2400 	.connect		= tcp_v4_connect,
2401 	.disconnect		= tcp_disconnect,
2402 	.accept			= inet_csk_accept,
2403 	.ioctl			= tcp_ioctl,
2404 	.init			= tcp_v4_init_sock,
2405 	.destroy		= tcp_v4_destroy_sock,
2406 	.shutdown		= tcp_shutdown,
2407 	.setsockopt		= tcp_setsockopt,
2408 	.getsockopt		= tcp_getsockopt,
2409 	.keepalive		= tcp_set_keepalive,
2410 	.recvmsg		= tcp_recvmsg,
2411 	.sendmsg		= tcp_sendmsg,
2412 	.sendpage		= tcp_sendpage,
2413 	.backlog_rcv		= tcp_v4_do_rcv,
2414 	.release_cb		= tcp_release_cb,
2415 	.hash			= inet_hash,
2416 	.unhash			= inet_unhash,
2417 	.get_port		= inet_csk_get_port,
2418 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2419 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2420 	.stream_memory_free	= tcp_stream_memory_free,
2421 	.sockets_allocated	= &tcp_sockets_allocated,
2422 	.orphan_count		= &tcp_orphan_count,
2423 	.memory_allocated	= &tcp_memory_allocated,
2424 	.memory_pressure	= &tcp_memory_pressure,
2425 	.sysctl_mem		= sysctl_tcp_mem,
2426 	.sysctl_wmem		= sysctl_tcp_wmem,
2427 	.sysctl_rmem		= sysctl_tcp_rmem,
2428 	.max_header		= MAX_TCP_HEADER,
2429 	.obj_size		= sizeof(struct tcp_sock),
2430 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2431 	.twsk_prot		= &tcp_timewait_sock_ops,
2432 	.rsk_prot		= &tcp_request_sock_ops,
2433 	.h.hashinfo		= &tcp_hashinfo,
2434 	.no_autobind		= true,
2435 #ifdef CONFIG_COMPAT
2436 	.compat_setsockopt	= compat_tcp_setsockopt,
2437 	.compat_getsockopt	= compat_tcp_getsockopt,
2438 #endif
2439 	.diag_destroy		= tcp_abort,
2440 };
2441 EXPORT_SYMBOL(tcp_prot);
2442 
2443 static void __net_exit tcp_sk_exit(struct net *net)
2444 {
2445 	int cpu;
2446 
2447 	for_each_possible_cpu(cpu)
2448 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2449 	free_percpu(net->ipv4.tcp_sk);
2450 }
2451 
2452 static int __net_init tcp_sk_init(struct net *net)
2453 {
2454 	int res, cpu, cnt;
2455 
2456 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2457 	if (!net->ipv4.tcp_sk)
2458 		return -ENOMEM;
2459 
2460 	for_each_possible_cpu(cpu) {
2461 		struct sock *sk;
2462 
2463 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2464 					   IPPROTO_TCP, net);
2465 		if (res)
2466 			goto fail;
2467 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2468 
2469 		/* Enforce IP_DF and IPID==0 for RSTs and
2470 		 * ACKs sent in SYN-RECV and TIME-WAIT state.
2471 		 */
2472 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2473 
2474 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2475 	}
2476 
2477 	net->ipv4.sysctl_tcp_ecn = 2;
2478 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2479 
2480 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2481 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2482 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2483 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2484 
2485 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2486 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2487 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2488 
2489 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2490 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2491 	net->ipv4.sysctl_tcp_syncookies = 1;
2492 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2493 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2494 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2495 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2496 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2497 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2498 	net->ipv4.sysctl_tcp_tw_reuse = 0;
2499 
2500 	cnt = tcp_hashinfo.ehash_mask + 1;
2501 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2502 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2503 
2504 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2505 	net->ipv4.sysctl_tcp_sack = 1;
2506 	net->ipv4.sysctl_tcp_window_scaling = 1;
2507 	net->ipv4.sysctl_tcp_timestamps = 1;
2508 
2509 	return 0;
2510 fail:
2511 	tcp_sk_exit(net);
2512 
2513 	return res;
2514 }
2515 
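/*
 * The per-CPU control sockets allocated in tcp_sk_init() are used
 * elsewhere in this file (e.g. by tcp_v4_send_reset() and
 * tcp_v4_send_ack()) to emit segments that are not tied to any user
 * socket.  A minimal sketch of the access pattern, with a hypothetical
 * helper name, shown only to make the per-CPU layout explicit.
 */
static inline struct sock *tcp_ctl_sock_sketch(struct net *net)
{
	/* Softirq context: preemption is already disabled when the
	 * RST/ACK paths in this file pick their per-CPU socket.
	 */
	return *this_cpu_ptr(net->ipv4.tcp_sk);
}
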
2516 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2517 {
2518 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2519 }
2520 
2521 static struct pernet_operations __net_initdata tcp_sk_ops = {
2522        .init	   = tcp_sk_init,
2523        .exit	   = tcp_sk_exit,
2524        .exit_batch = tcp_sk_exit_batch,
2525 };
2526 
2527 void __init tcp_v4_init(void)
2528 {
2529 	if (register_pernet_subsys(&tcp_sk_ops))
2530 		panic("Failed to create the TCP control socket.\n");
2531 }
2532