1  /*
2   * INET		An implementation of the TCP/IP protocol suite for the LINUX
3   *		operating system.  INET is implemented using the  BSD Socket
4   *		interface as the means of communication with the user level.
5   *
6   *		Implementation of the Transmission Control Protocol(TCP).
7   *
8   *		IPv4 specific functions
9   *
10   *
11   *		code split from:
12   *		linux/ipv4/tcp.c
13   *		linux/ipv4/tcp_input.c
14   *		linux/ipv4/tcp_output.c
15   *
16   *		See tcp.c for author information
17   *
18   *	This program is free software; you can redistribute it and/or
19   *      modify it under the terms of the GNU General Public License
20   *      as published by the Free Software Foundation; either version
21   *      2 of the License, or (at your option) any later version.
22   */
23  
24  /*
25   * Changes:
26   *		David S. Miller	:	New socket lookup architecture.
27   *					This code is dedicated to John Dyson.
28   *		David S. Miller :	Change semantics of established hash,
29   *					half is devoted to TIME_WAIT sockets
30   *					and the rest go in the other half.
31   *		Andi Kleen :		Add support for syncookies and fixed
32   *					some bugs: ip options weren't passed to
33   *					the TCP layer, missed a check for an
34   *					ACK bit.
35   *		Andi Kleen :		Implemented fast path mtu discovery.
36   *	     				Fixed many serious bugs in the
37   *					request_sock handling and moved
38   *					most of it into the af independent code.
39   *					Added tail drop and some other bugfixes.
40   *					Added new listen semantics.
41   *		Mike McLagan	:	Routing by source
42   *	Juan Jose Ciarlante:		ip_dynaddr bits
43   *		Andi Kleen:		various fixes.
44   *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45   *					coma.
46   *	Andi Kleen		:	Fix new listen.
47   *	Andi Kleen		:	Fix accept error reporting.
48   *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49   *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50   *					a single port at the same time.
51   */
52  
53  #define pr_fmt(fmt) "TCP: " fmt
54  
55  #include <linux/bottom_half.h>
56  #include <linux/types.h>
57  #include <linux/fcntl.h>
58  #include <linux/module.h>
59  #include <linux/random.h>
60  #include <linux/cache.h>
61  #include <linux/jhash.h>
62  #include <linux/init.h>
63  #include <linux/times.h>
64  #include <linux/slab.h>
65  
66  #include <net/net_namespace.h>
67  #include <net/icmp.h>
68  #include <net/inet_hashtables.h>
69  #include <net/tcp.h>
70  #include <net/transp_v6.h>
71  #include <net/ipv6.h>
72  #include <net/inet_common.h>
73  #include <net/timewait_sock.h>
74  #include <net/xfrm.h>
75  #include <net/netdma.h>
76  #include <net/secure_seq.h>
77  #include <net/tcp_memcontrol.h>
78  
79  #include <linux/inet.h>
80  #include <linux/ipv6.h>
81  #include <linux/stddef.h>
82  #include <linux/proc_fs.h>
83  #include <linux/seq_file.h>
84  
85  #include <linux/crypto.h>
86  #include <linux/scatterlist.h>
87  
88  int sysctl_tcp_tw_reuse __read_mostly;
89  int sysctl_tcp_low_latency __read_mostly;
90  EXPORT_SYMBOL(sysctl_tcp_low_latency);
91  
92  
93  #ifdef CONFIG_TCP_MD5SIG
94  static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95  			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
96  #endif
97  
98  struct inet_hashinfo tcp_hashinfo;
99  EXPORT_SYMBOL(tcp_hashinfo);
100  
101  static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
102  {
103  	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
104  					  ip_hdr(skb)->saddr,
105  					  tcp_hdr(skb)->dest,
106  					  tcp_hdr(skb)->source);
107  }
108  
109  int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110  {
111  	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112  	struct tcp_sock *tp = tcp_sk(sk);
113  
114  	/* With PAWS, it is safe from the viewpoint
115  	   of data integrity. Even without PAWS it is safe provided sequence
116  	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
117  
118  	   Actually, the idea is close to VJ's: the only difference is that
119  	   the timestamp cache is held not per host but per port pair, and
120  	   the TW bucket is used as the state holder.
121  
122  	   If TW bucket has been already destroyed we fall back to VJ's scheme
123  	   and use initial timestamp retrieved from peer table.
124  	 */
125  	if (tcptw->tw_ts_recent_stamp &&
126  	    (twp == NULL || (sysctl_tcp_tw_reuse &&
127  			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
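		/* Start the new sequence space beyond the old connection's
		 * send window so that stray segments from it cannot be
		 * mistaken for ours.
		 */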
128  		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129  		if (tp->write_seq == 0)
130  			tp->write_seq = 1;
131  		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
132  		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133  		sock_hold(sktw);
134  		return 1;
135  	}
136  
137  	return 0;
138  }
139  EXPORT_SYMBOL_GPL(tcp_twsk_unique);
140  
141  /* This will initiate an outgoing connection. */
142  int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
143  {
144  	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
145  	struct inet_sock *inet = inet_sk(sk);
146  	struct tcp_sock *tp = tcp_sk(sk);
147  	__be16 orig_sport, orig_dport;
148  	__be32 daddr, nexthop;
149  	struct flowi4 *fl4;
150  	struct rtable *rt;
151  	int err;
152  	struct ip_options_rcu *inet_opt;
153  
154  	if (addr_len < sizeof(struct sockaddr_in))
155  		return -EINVAL;
156  
157  	if (usin->sin_family != AF_INET)
158  		return -EAFNOSUPPORT;
159  
160  	nexthop = daddr = usin->sin_addr.s_addr;
161  	inet_opt = rcu_dereference_protected(inet->inet_opt,
162  					     sock_owned_by_user(sk));
163  	if (inet_opt && inet_opt->opt.srr) {
164  		if (!daddr)
165  			return -EINVAL;
166  		nexthop = inet_opt->opt.faddr;
167  	}
168  
169  	orig_sport = inet->inet_sport;
170  	orig_dport = usin->sin_port;
171  	fl4 = &inet->cork.fl.u.ip4;
172  	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
173  			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
174  			      IPPROTO_TCP,
175  			      orig_sport, orig_dport, sk, true);
176  	if (IS_ERR(rt)) {
177  		err = PTR_ERR(rt);
178  		if (err == -ENETUNREACH)
179  			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
180  		return err;
181  	}
182  
183  	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
184  		ip_rt_put(rt);
185  		return -ENETUNREACH;
186  	}
187  
188  	if (!inet_opt || !inet_opt->opt.srr)
189  		daddr = fl4->daddr;
190  
191  	if (!inet->inet_saddr)
192  		inet->inet_saddr = fl4->saddr;
193  	inet->inet_rcv_saddr = inet->inet_saddr;
194  
195  	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
196  		/* Reset inherited state */
197  		tp->rx_opt.ts_recent	   = 0;
198  		tp->rx_opt.ts_recent_stamp = 0;
199  		if (likely(!tp->repair))
200  			tp->write_seq	   = 0;
201  	}
202  
203  	if (tcp_death_row.sysctl_tw_recycle &&
204  	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
205  		tcp_fetch_timewait_stamp(sk, &rt->dst);
206  
207  	inet->inet_dport = usin->sin_port;
208  	inet->inet_daddr = daddr;
209  
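	/* Record the IP options length so that later MSS/MTU calculations
	 * leave room for them.
	 */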
210  	inet_csk(sk)->icsk_ext_hdr_len = 0;
211  	if (inet_opt)
212  		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
213  
214  	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
215  
216  	/* Socket identity is still unknown (sport may be zero).
217  	 * However we set state to SYN-SENT and, without releasing the socket
218  	 * lock, select a source port, enter ourselves into the hash tables and
219  	 * complete initialization after this.
220  	 */
221  	tcp_set_state(sk, TCP_SYN_SENT);
222  	err = inet_hash_connect(&tcp_death_row, sk);
223  	if (err)
224  		goto failure;
225  
226  	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227  			       inet->inet_sport, inet->inet_dport, sk);
228  	if (IS_ERR(rt)) {
229  		err = PTR_ERR(rt);
230  		rt = NULL;
231  		goto failure;
232  	}
233  	/* OK, now commit destination to socket.  */
234  	sk->sk_gso_type = SKB_GSO_TCPV4;
235  	sk_setup_caps(sk, &rt->dst);
236  
237  	if (!tp->write_seq && likely(!tp->repair))
238  		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239  							   inet->inet_daddr,
240  							   inet->inet_sport,
241  							   usin->sin_port);
242  
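	/* Seed the IP identification counter from the initial sequence
	 * number and the current jiffies value.
	 */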
243  	inet->inet_id = tp->write_seq ^ jiffies;
244  
245  	err = tcp_connect(sk);
246  
247  	rt = NULL;
248  	if (err)
249  		goto failure;
250  
251  	return 0;
252  
253  failure:
254  	/*
255  	 * This unhashes the socket and releases the local port,
256  	 * if necessary.
257  	 */
258  	tcp_set_state(sk, TCP_CLOSE);
259  	ip_rt_put(rt);
260  	sk->sk_route_caps = 0;
261  	inet->inet_dport = 0;
262  	return err;
263  }
264  EXPORT_SYMBOL(tcp_v4_connect);
265  
266  /*
267   * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268   * It can be called through tcp_release_cb() if socket was owned by user
269   * at the time tcp_v4_err() was called to handle ICMP message.
270   */
271  static void tcp_v4_mtu_reduced(struct sock *sk)
272  {
273  	struct dst_entry *dst;
274  	struct inet_sock *inet = inet_sk(sk);
275  	u32 mtu = tcp_sk(sk)->mtu_info;
276  
277  	dst = inet_csk_update_pmtu(sk, mtu);
278  	if (!dst)
279  		return;
280  
281  	/* Something is about to go wrong... Remember the soft error
282  	 * in case this connection is not able to recover.
283  	 */
284  	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285  		sk->sk_err_soft = EMSGSIZE;
286  
287  	mtu = dst_mtu(dst);
288  
289  	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290  	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
291  		tcp_sync_mss(sk, mtu);
292  
293  		/* Resend the TCP packet because it's
294  		 * clear that the old packet has been
295  		 * dropped. This is the new "fast" path mtu
296  		 * discovery.
297  		 */
298  		tcp_simple_retransmit(sk);
299  	} /* else let the usual retransmit timer handle it */
300  }
301  
302  static void do_redirect(struct sk_buff *skb, struct sock *sk)
303  {
304  	struct dst_entry *dst = __sk_dst_check(sk, 0);
305  
306  	if (dst)
307  		dst->ops->redirect(dst, sk, skb);
308  }
309  
310  /*
311   * This routine is called by the ICMP module when it gets some
312   * sort of error condition.  If err < 0 then the socket should
313   * be closed and the error returned to the user.  If err > 0
314   * it's just the icmp type << 8 | icmp code.  After adjustment
315   * header points to the first 8 bytes of the tcp header.  We need
316   * to find the appropriate port.
317   *
318   * The locking strategy used here is very "optimistic". When
319   * someone else accesses the socket the ICMP is just dropped
320   * and for some paths there is no check at all.
321   * A more general error queue to queue errors for later handling
322   * is probably better.
323   *
324   */
325  
326  void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
327  {
328  	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
329  	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
330  	struct inet_connection_sock *icsk;
331  	struct tcp_sock *tp;
332  	struct inet_sock *inet;
333  	const int type = icmp_hdr(icmp_skb)->type;
334  	const int code = icmp_hdr(icmp_skb)->code;
335  	struct sock *sk;
336  	struct sk_buff *skb;
337  	struct request_sock *req;
338  	__u32 seq;
339  	__u32 remaining;
340  	int err;
341  	struct net *net = dev_net(icmp_skb->dev);
342  
343  	if (icmp_skb->len < (iph->ihl << 2) + 8) {
344  		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
345  		return;
346  	}
347  
348  	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
349  			iph->saddr, th->source, inet_iif(icmp_skb));
350  	if (!sk) {
351  		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
352  		return;
353  	}
354  	if (sk->sk_state == TCP_TIME_WAIT) {
355  		inet_twsk_put(inet_twsk(sk));
356  		return;
357  	}
358  
359  	bh_lock_sock(sk);
360  	/* If too many ICMPs get dropped on busy
361  	 * servers this needs to be solved differently.
362  	 * We do take care of the PMTU discovery (RFC1191) special case:
363  	 * we can receive locally generated ICMP messages while socket is held.
364  	 */
365  	if (sock_owned_by_user(sk)) {
366  		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
367  			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
368  	}
369  	if (sk->sk_state == TCP_CLOSE)
370  		goto out;
371  
372  	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
373  		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
374  		goto out;
375  	}
376  
377  	icsk = inet_csk(sk);
378  	tp = tcp_sk(sk);
379  	req = tp->fastopen_rsk;
380  	seq = ntohl(th->seq);
381  	if (sk->sk_state != TCP_LISTEN &&
382  	    !between(seq, tp->snd_una, tp->snd_nxt) &&
383  	    (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
384  		/* For a Fast Open socket, allow seq to be snt_isn. */
385  		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
386  		goto out;
387  	}
388  
389  	switch (type) {
390  	case ICMP_REDIRECT:
391  		do_redirect(icmp_skb, sk);
392  		goto out;
393  	case ICMP_SOURCE_QUENCH:
394  		/* Just silently ignore these. */
395  		goto out;
396  	case ICMP_PARAMETERPROB:
397  		err = EPROTO;
398  		break;
399  	case ICMP_DEST_UNREACH:
400  		if (code > NR_ICMP_UNREACH)
401  			goto out;
402  
403  		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
404  			/* We are not interested in TCP_LISTEN and open_requests
405  			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
406  			 * they should go through unfragmented).
407  			 */
408  			if (sk->sk_state == TCP_LISTEN)
409  				goto out;
410  
411  			tp->mtu_info = info;
412  			if (!sock_owned_by_user(sk)) {
413  				tcp_v4_mtu_reduced(sk);
414  			} else {
415  				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
416  					sock_hold(sk);
417  			}
418  			goto out;
419  		}
420  
421  		err = icmp_err_convert[code].errno;
422  		/* check if icmp_skb allows revert of backoff
423  		 * (see draft-zimmermann-tcp-lcd) */
424  		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
425  			break;
426  		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
427  		    !icsk->icsk_backoff)
428  			break;
429  
430  		/* XXX (TFO) - revisit the following logic for TFO */
431  
432  		if (sock_owned_by_user(sk))
433  			break;
434  
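		/* Undo one step of exponential backoff and recompute the RTO
		 * from the current SRTT estimate (or from the initial timeout
		 * if no RTT sample exists yet).
		 */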
435  		icsk->icsk_backoff--;
436  		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
437  			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
438  		tcp_bound_rto(sk);
439  
440  		skb = tcp_write_queue_head(sk);
441  		BUG_ON(!skb);
442  
443  		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
444  				tcp_time_stamp - TCP_SKB_CB(skb)->when);
445  
446  		if (remaining) {
447  			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
448  						  remaining, TCP_RTO_MAX);
449  		} else {
450  			/* The reverted RTO has already elapsed for this
451  			 * retransmission, so retransmit now. */
452  			tcp_retransmit_timer(sk);
453  		}
454  
455  		break;
456  	case ICMP_TIME_EXCEEDED:
457  		err = EHOSTUNREACH;
458  		break;
459  	default:
460  		goto out;
461  	}
462  
463  	/* XXX (TFO) - if it's a TFO socket and has been accepted, rather
464  	 * than following the TCP_SYN_RECV case and closing the socket,
465  	 * we ignore the ICMP error and keep trying like a fully established
466  	 * socket. Is this the right thing to do?
467  	 */
468  	if (req && req->sk == NULL)
469  		goto out;
470  
471  	switch (sk->sk_state) {
472  		struct request_sock *req, **prev;
473  	case TCP_LISTEN:
474  		if (sock_owned_by_user(sk))
475  			goto out;
476  
477  		req = inet_csk_search_req(sk, &prev, th->dest,
478  					  iph->daddr, iph->saddr);
479  		if (!req)
480  			goto out;
481  
482  		/* ICMPs are not backlogged, hence we cannot get
483  		   an established socket here.
484  		 */
485  		WARN_ON(req->sk);
486  
487  		if (seq != tcp_rsk(req)->snt_isn) {
488  			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
489  			goto out;
490  		}
491  
492  		/*
493  		 * Still in SYN_RECV, just remove it silently.
494  		 * There is no good way to pass the error to the newly
495  		 * created socket, and POSIX does not want network
496  		 * errors returned from accept().
497  		 */
498  		inet_csk_reqsk_queue_drop(sk, req, prev);
499  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
500  		goto out;
501  
502  	case TCP_SYN_SENT:
503  	case TCP_SYN_RECV:  /* Normally cannot happen here;
504  			       it can, for example, if SYNs crossed
505  			       or with Fast Open.
506  			     */
507  		if (!sock_owned_by_user(sk)) {
508  			sk->sk_err = err;
509  
510  			sk->sk_error_report(sk);
511  
512  			tcp_done(sk);
513  		} else {
514  			sk->sk_err_soft = err;
515  		}
516  		goto out;
517  	}
518  
519  	/* If we've already connected we will keep trying
520  	 * until we time out, or the user gives up.
521  	 *
522  	 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
523  	 * to be considered hard errors (well, FRAG_FAILED too,
524  	 * but it is obsoleted by pmtu discovery).
525  	 *
526  	 * Note that in the modern internet, where routing is unreliable
527  	 * and broken firewalls sit in every dark corner sending random
528  	 * errors as ordered by their masters, even these two messages have
529  	 * lost their original sense (even Linux sends invalid PORT_UNREACHs)
530  	 *
531  	 * Now we are in compliance with RFCs.
532  	 *							--ANK (980905)
533  	 */
534  
535  	inet = inet_sk(sk);
536  	if (!sock_owned_by_user(sk) && inet->recverr) {
537  		sk->sk_err = err;
538  		sk->sk_error_report(sk);
539  	} else	{ /* Only an error on timeout */
540  		sk->sk_err_soft = err;
541  	}
542  
543  out:
544  	bh_unlock_sock(sk);
545  	sock_put(sk);
546  }
547  
548  static void __tcp_v4_send_check(struct sk_buff *skb,
549  				__be32 saddr, __be32 daddr)
550  {
551  	struct tcphdr *th = tcp_hdr(skb);
552  
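	/* With checksum offload (CHECKSUM_PARTIAL), store only the
	 * pseudo-header sum and tell the device where the checksum field
	 * lives; otherwise compute the full checksum in software.
	 */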
553  	if (skb->ip_summed == CHECKSUM_PARTIAL) {
554  		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
555  		skb->csum_start = skb_transport_header(skb) - skb->head;
556  		skb->csum_offset = offsetof(struct tcphdr, check);
557  	} else {
558  		th->check = tcp_v4_check(skb->len, saddr, daddr,
559  					 csum_partial(th,
560  						      th->doff << 2,
561  						      skb->csum));
562  	}
563  }
564  
565  /* This routine computes an IPv4 TCP checksum. */
566  void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
567  {
568  	const struct inet_sock *inet = inet_sk(sk);
569  
570  	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
571  }
572  EXPORT_SYMBOL(tcp_v4_send_check);
573  
574  int tcp_v4_gso_send_check(struct sk_buff *skb)
575  {
576  	const struct iphdr *iph;
577  	struct tcphdr *th;
578  
579  	if (!pskb_may_pull(skb, sizeof(*th)))
580  		return -EINVAL;
581  
582  	iph = ip_hdr(skb);
583  	th = tcp_hdr(skb);
584  
585  	th->check = 0;
586  	skb->ip_summed = CHECKSUM_PARTIAL;
587  	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
588  	return 0;
589  }
590  
591  /*
592   *	This routine will send an RST to the other tcp.
593   *
594   *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
595   *		      for the reset?
596   *	Answer: if a packet caused an RST, it is not for a socket
597   *		existing in our system; if it is matched to a socket,
598   *		it is just a duplicate segment or a bug in the other side's TCP.
599   *		So we build the reply based only on the parameters
600   *		that arrived with the segment.
601   *	Exception: precedence violation. We do not implement it in any case.
602   */
603  
604  static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
605  {
606  	const struct tcphdr *th = tcp_hdr(skb);
607  	struct {
608  		struct tcphdr th;
609  #ifdef CONFIG_TCP_MD5SIG
610  		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
611  #endif
612  	} rep;
613  	struct ip_reply_arg arg;
614  #ifdef CONFIG_TCP_MD5SIG
615  	struct tcp_md5sig_key *key;
616  	const __u8 *hash_location = NULL;
617  	unsigned char newhash[16];
618  	int genhash;
619  	struct sock *sk1 = NULL;
620  #endif
621  	struct net *net;
622  
623  	/* Never send a reset in response to a reset. */
624  	if (th->rst)
625  		return;
626  
627  	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
628  		return;
629  
630  	/* Swap the send and the receive. */
631  	memset(&rep, 0, sizeof(rep));
632  	rep.th.dest   = th->source;
633  	rep.th.source = th->dest;
634  	rep.th.doff   = sizeof(struct tcphdr) / 4;
635  	rep.th.rst    = 1;
636  
637  	if (th->ack) {
638  		rep.th.seq = th->ack_seq;
639  	} else {
640  		rep.th.ack = 1;
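		/* Acknowledge everything the peer sent: its SEQ plus the
		 * payload length, with SYN and FIN each counting as one
		 * sequence number.
		 */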
641  		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
642  				       skb->len - (th->doff << 2));
643  	}
644  
645  	memset(&arg, 0, sizeof(arg));
646  	arg.iov[0].iov_base = (unsigned char *)&rep;
647  	arg.iov[0].iov_len  = sizeof(rep.th);
648  
649  	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
650  #ifdef CONFIG_TCP_MD5SIG
651  	hash_location = tcp_parse_md5sig_option(th);
652  	if (!sk && hash_location) {
653  		/*
654  		 * The active side is lost. Try to find the listening socket
655  		 * via the source port, and then find the md5 key through it.
656  		 * We do not lose security here:
657  		 * the incoming packet is checked against the md5 hash of the
658  		 * key we find, and no RST is generated if the hash doesn't match.
659  		 */
660  		sk1 = __inet_lookup_listener(net,
661  					     &tcp_hashinfo, ip_hdr(skb)->saddr,
662  					     th->source, ip_hdr(skb)->daddr,
663  					     ntohs(th->source), inet_iif(skb));
664  		/* don't send rst if it can't find key */
665  		if (!sk1)
666  			return;
667  		rcu_read_lock();
668  		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
669  					&ip_hdr(skb)->saddr, AF_INET);
670  		if (!key)
671  			goto release_sk1;
672  
673  		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
674  		if (genhash || memcmp(hash_location, newhash, 16) != 0)
675  			goto release_sk1;
676  	} else {
677  		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
678  					     &ip_hdr(skb)->saddr,
679  					     AF_INET) : NULL;
680  	}
681  
682  	if (key) {
683  		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
684  				   (TCPOPT_NOP << 16) |
685  				   (TCPOPT_MD5SIG << 8) |
686  				   TCPOLEN_MD5SIG);
687  		/* Update length and the length the header thinks exists */
688  		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
689  		rep.th.doff = arg.iov[0].iov_len / 4;
690  
691  		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
692  				     key, ip_hdr(skb)->saddr,
693  				     ip_hdr(skb)->daddr, &rep.th);
694  	}
695  #endif
696  	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
697  				      ip_hdr(skb)->saddr, /* XXX */
698  				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
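	/* Offset of the TCP checksum field within the header, in 16-bit
	 * words, so the reply path knows where to store the final checksum.
	 */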
699  	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
700  	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
701  	/* When the socket is gone, all binding information is lost and
702  	 * routing might fail. No choice here: if we force the input
703  	 * interface, we will misroute in case of an asymmetric route.
704  	 */
705  	if (sk)
706  		arg.bound_dev_if = sk->sk_bound_dev_if;
707  
708  	arg.tos = ip_hdr(skb)->tos;
709  	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
710  	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
711  			      skb, ip_hdr(skb)->saddr,
712  			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
713  
714  	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
715  	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
716  
717  #ifdef CONFIG_TCP_MD5SIG
718  release_sk1:
719  	if (sk1) {
720  		rcu_read_unlock();
721  		sock_put(sk1);
722  	}
723  #endif
724  }
725  
726  /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
727     outside of socket context, is certainly ugly. What can I do?
728   */
729  
730  static void tcp_v4_send_ack(const struct sock *sk, struct sk_buff *skb,
731  			    u32 seq, u32 ack,
732  			    u32 win, u32 tsval, u32 tsecr, int oif,
733  			    struct tcp_md5sig_key *key,
734  			    int reply_flags, u8 tos)
735  {
736  	const struct tcphdr *th = tcp_hdr(skb);
737  	struct {
738  		struct tcphdr th;
739  		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
740  #ifdef CONFIG_TCP_MD5SIG
741  			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
742  #endif
743  			];
744  	} rep;
745  	struct ip_reply_arg arg;
746  	struct net *net = sock_net(sk);
747  
748  	memset(&rep.th, 0, sizeof(struct tcphdr));
749  	memset(&arg, 0, sizeof(arg));
750  
751  	arg.iov[0].iov_base = (unsigned char *)&rep;
752  	arg.iov[0].iov_len  = sizeof(rep.th);
753  	if (tsecr) {
754  		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
755  				   (TCPOPT_TIMESTAMP << 8) |
756  				   TCPOLEN_TIMESTAMP);
757  		rep.opt[1] = htonl(tsval);
758  		rep.opt[2] = htonl(tsecr);
759  		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
760  	}
761  
762  	/* Swap the send and the receive. */
763  	rep.th.dest    = th->source;
764  	rep.th.source  = th->dest;
765  	rep.th.doff    = arg.iov[0].iov_len / 4;
766  	rep.th.seq     = htonl(seq);
767  	rep.th.ack_seq = htonl(ack);
768  	rep.th.ack     = 1;
769  	rep.th.window  = htons(win);
770  
771  #ifdef CONFIG_TCP_MD5SIG
772  	if (key) {
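		/* The MD5 option is placed after the three 32-bit words
		 * used by the timestamp option, when one is present.
		 */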
773  		int offset = (tsecr) ? 3 : 0;
774  
775  		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
776  					  (TCPOPT_NOP << 16) |
777  					  (TCPOPT_MD5SIG << 8) |
778  					  TCPOLEN_MD5SIG);
779  		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
780  		rep.th.doff = arg.iov[0].iov_len/4;
781  
782  		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
783  				    key, ip_hdr(skb)->saddr,
784  				    ip_hdr(skb)->daddr, &rep.th);
785  	}
786  #endif
787  	arg.flags = reply_flags;
788  	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
789  				      ip_hdr(skb)->saddr, /* XXX */
790  				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
791  	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
792  	if (oif)
793  		arg.bound_dev_if = oif;
794  	arg.tos = tos;
795  	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
796  	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
797  			      skb, ip_hdr(skb)->saddr,
798  			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
799  
800  	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
801  }
802  
803  static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
804  {
805  	struct inet_timewait_sock *tw = inet_twsk(sk);
806  	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
807  
808  	tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
809  			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
810  			tcp_time_stamp + tcptw->tw_ts_offset,
811  			tcptw->tw_ts_recent,
812  			tw->tw_bound_dev_if,
813  			tcp_twsk_md5_key(tcptw),
814  			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
815  			tw->tw_tos
816  			);
817  
818  	inet_twsk_put(tw);
819  }
820  
821  static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
822  				  struct request_sock *req)
823  {
824  	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
825  	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
826  	 */
827  	tcp_v4_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
828  			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
829  			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
830  			tcp_time_stamp,
831  			req->ts_recent,
832  			0,
833  			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
834  					  AF_INET),
835  			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
836  			ip_hdr(skb)->tos);
837  }
838  
839  /*
840   *	Send a SYN-ACK after having received a SYN.
841   *	This still operates on a request_sock only, not on a big
842   *	socket.
843   */
844  static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
845  			      struct request_sock *req,
846  			      u16 queue_mapping,
847  			      bool nocache)
848  {
849  	const struct inet_request_sock *ireq = inet_rsk(req);
850  	struct flowi4 fl4;
851  	int err = -1;
852  	struct sk_buff * skb;
853  
854  	/* First, grab a route. */
855  	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
856  		return -1;
857  
858  	skb = tcp_make_synack(sk, dst, req, NULL);
859  
860  	if (skb) {
861  		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
862  
863  		skb_set_queue_mapping(skb, queue_mapping);
864  		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
865  					    ireq->rmt_addr,
866  					    ireq->opt);
867  		err = net_xmit_eval(err);
868  		if (!tcp_rsk(req)->snt_synack && !err)
869  			tcp_rsk(req)->snt_synack = tcp_time_stamp;
870  	}
871  
872  	return err;
873  }
874  
875  static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
876  {
877  	int res = tcp_v4_send_synack(sk, NULL, req, 0, false);
878  
879  	if (!res)
880  		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
881  	return res;
882  }
883  
884  /*
885   *	IPv4 request_sock destructor.
886   */
887  static void tcp_v4_reqsk_destructor(struct request_sock *req)
888  {
889  	kfree(inet_rsk(req)->opt);
890  }
891  
892  /*
893   * Return true if a syncookie should be sent
894   */
895  bool tcp_syn_flood_action(struct sock *sk,
896  			 const struct sk_buff *skb,
897  			 const char *proto)
898  {
899  	const char *msg = "Dropping request";
900  	bool want_cookie = false;
901  	struct listen_sock *lopt;
902  
903  
904  
905  #ifdef CONFIG_SYN_COOKIES
906  	if (sysctl_tcp_syncookies) {
907  		msg = "Sending cookies";
908  		want_cookie = true;
909  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
910  	} else
911  #endif
912  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
913  
914  	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
915  	if (!lopt->synflood_warned) {
916  		lopt->synflood_warned = 1;
917  		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
918  			proto, ntohs(tcp_hdr(skb)->dest), msg);
919  	}
920  	return want_cookie;
921  }
922  EXPORT_SYMBOL(tcp_syn_flood_action);
923  
924  /*
925   * Save and compile IPv4 options into the request_sock if needed.
926   */
927  static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
928  {
929  	const struct ip_options *opt = &(IPCB(skb)->opt);
930  	struct ip_options_rcu *dopt = NULL;
931  
932  	if (opt && opt->optlen) {
933  		int opt_size = sizeof(*dopt) + opt->optlen;
934  
935  		dopt = kmalloc(opt_size, GFP_ATOMIC);
936  		if (dopt) {
937  			if (ip_options_echo(&dopt->opt, skb)) {
938  				kfree(dopt);
939  				dopt = NULL;
940  			}
941  		}
942  	}
943  	return dopt;
944  }
945  
946  #ifdef CONFIG_TCP_MD5SIG
947  /*
948   * RFC2385 MD5 checksumming requires a mapping of
949   * IP address->MD5 Key.
950   * We need to maintain these in the sk structure.
951   */
952  
953  /* Find the Key structure for an address.  */
954  struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
955  					 const union tcp_md5_addr *addr,
956  					 int family)
957  {
958  	struct tcp_sock *tp = tcp_sk(sk);
959  	struct tcp_md5sig_key *key;
960  	unsigned int size = sizeof(struct in_addr);
961  	struct tcp_md5sig_info *md5sig;
962  
963  	/* caller either holds rcu_read_lock() or socket lock */
964  	md5sig = rcu_dereference_check(tp->md5sig_info,
965  				       sock_owned_by_user(sk) ||
966  				       lockdep_is_held(&sk->sk_lock.slock));
967  	if (!md5sig)
968  		return NULL;
969  #if IS_ENABLED(CONFIG_IPV6)
970  	if (family == AF_INET6)
971  		size = sizeof(struct in6_addr);
972  #endif
973  	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
974  		if (key->family != family)
975  			continue;
976  		if (!memcmp(&key->addr, addr, size))
977  			return key;
978  	}
979  	return NULL;
980  }
981  EXPORT_SYMBOL(tcp_md5_do_lookup);
982  
983  struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
984  					 struct sock *addr_sk)
985  {
986  	union tcp_md5_addr *addr;
987  
988  	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
989  	return tcp_md5_do_lookup(sk, addr, AF_INET);
990  }
991  EXPORT_SYMBOL(tcp_v4_md5_lookup);
992  
993  static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
994  						      struct request_sock *req)
995  {
996  	union tcp_md5_addr *addr;
997  
998  	addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
999  	return tcp_md5_do_lookup(sk, addr, AF_INET);
1000  }
1001  
1002  /* This can be called on a newly created socket, from other files */
1003  int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1004  		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
1005  {
1006  	/* Add Key to the list */
1007  	struct tcp_md5sig_key *key;
1008  	struct tcp_sock *tp = tcp_sk(sk);
1009  	struct tcp_md5sig_info *md5sig;
1010  
1011  	key = tcp_md5_do_lookup(sk, addr, family);
1012  	if (key) {
1013  		/* Pre-existing entry - just update that one. */
1014  		memcpy(key->key, newkey, newkeylen);
1015  		key->keylen = newkeylen;
1016  		return 0;
1017  	}
1018  
1019  	md5sig = rcu_dereference_protected(tp->md5sig_info,
1020  					   sock_owned_by_user(sk));
1021  	if (!md5sig) {
1022  		md5sig = kmalloc(sizeof(*md5sig), gfp);
1023  		if (!md5sig)
1024  			return -ENOMEM;
1025  
1026  		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1027  		INIT_HLIST_HEAD(&md5sig->head);
1028  		rcu_assign_pointer(tp->md5sig_info, md5sig);
1029  	}
1030  
1031  	key = sock_kmalloc(sk, sizeof(*key), gfp);
1032  	if (!key)
1033  		return -ENOMEM;
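	/* The first key added to a socket requires the per-cpu MD5
	 * signature pool to be allocated.
	 */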
1034  	if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1035  		sock_kfree_s(sk, key, sizeof(*key));
1036  		return -ENOMEM;
1037  	}
1038  
1039  	memcpy(key->key, newkey, newkeylen);
1040  	key->keylen = newkeylen;
1041  	key->family = family;
1042  	memcpy(&key->addr, addr,
1043  	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1044  				      sizeof(struct in_addr));
1045  	hlist_add_head_rcu(&key->node, &md5sig->head);
1046  	return 0;
1047  }
1048  EXPORT_SYMBOL(tcp_md5_do_add);
1049  
1050  int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1051  {
1052  	struct tcp_sock *tp = tcp_sk(sk);
1053  	struct tcp_md5sig_key *key;
1054  	struct tcp_md5sig_info *md5sig;
1055  
1056  	key = tcp_md5_do_lookup(sk, addr, family);
1057  	if (!key)
1058  		return -ENOENT;
1059  	hlist_del_rcu(&key->node);
1060  	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1061  	kfree_rcu(key, rcu);
1062  	md5sig = rcu_dereference_protected(tp->md5sig_info,
1063  					   sock_owned_by_user(sk));
1064  	if (hlist_empty(&md5sig->head))
1065  		tcp_free_md5sig_pool();
1066  	return 0;
1067  }
1068  EXPORT_SYMBOL(tcp_md5_do_del);
1069  
1070  static void tcp_clear_md5_list(struct sock *sk)
1071  {
1072  	struct tcp_sock *tp = tcp_sk(sk);
1073  	struct tcp_md5sig_key *key;
1074  	struct hlist_node *n;
1075  	struct tcp_md5sig_info *md5sig;
1076  
1077  	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1078  
1079  	if (!hlist_empty(&md5sig->head))
1080  		tcp_free_md5sig_pool();
1081  	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1082  		hlist_del_rcu(&key->node);
1083  		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1084  		kfree_rcu(key, rcu);
1085  	}
1086  }
1087  
1088  static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1089  				 int optlen)
1090  {
1091  	struct tcp_md5sig cmd;
1092  	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1093  
1094  	if (optlen < sizeof(cmd))
1095  		return -EINVAL;
1096  
1097  	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1098  		return -EFAULT;
1099  
1100  	if (sin->sin_family != AF_INET)
1101  		return -EINVAL;
1102  
1103  	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1104  		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1105  				      AF_INET);
1106  
1107  	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1108  		return -EINVAL;
1109  
1110  	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1111  			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1112  			      GFP_KERNEL);
1113  }
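/* For reference, a minimal userspace sketch of how this setsockopt is
 * driven (illustrative only; error handling omitted):
 *
 *	struct tcp_md5sig md5 = {};
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */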
1114  
1115  static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1116  					__be32 daddr, __be32 saddr, int nbytes)
1117  {
1118  	struct tcp4_pseudohdr *bp;
1119  	struct scatterlist sg;
1120  
1121  	bp = &hp->md5_blk.ip4;
1122  
1123  	/*
1124  	 * 1. the TCP pseudo-header (in the order: source IP address,
1125  	 * destination IP address, zero-padded protocol number, and
1126  	 * segment length)
1127  	 */
1128  	bp->saddr = saddr;
1129  	bp->daddr = daddr;
1130  	bp->pad = 0;
1131  	bp->protocol = IPPROTO_TCP;
1132  	bp->len = cpu_to_be16(nbytes);
1133  
1134  	sg_init_one(&sg, bp, sizeof(*bp));
1135  	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1136  }
1137  
1138  static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1139  			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1140  {
1141  	struct tcp_md5sig_pool *hp;
1142  	struct hash_desc *desc;
1143  
1144  	hp = tcp_get_md5sig_pool();
1145  	if (!hp)
1146  		goto clear_hash_noput;
1147  	desc = &hp->md5_desc;
1148  
1149  	if (crypto_hash_init(desc))
1150  		goto clear_hash;
1151  	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1152  		goto clear_hash;
1153  	if (tcp_md5_hash_header(hp, th))
1154  		goto clear_hash;
1155  	if (tcp_md5_hash_key(hp, key))
1156  		goto clear_hash;
1157  	if (crypto_hash_final(desc, md5_hash))
1158  		goto clear_hash;
1159  
1160  	tcp_put_md5sig_pool();
1161  	return 0;
1162  
1163  clear_hash:
1164  	tcp_put_md5sig_pool();
1165  clear_hash_noput:
1166  	memset(md5_hash, 0, 16);
1167  	return 1;
1168  }
1169  
1170  int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1171  			const struct sock *sk, const struct request_sock *req,
1172  			const struct sk_buff *skb)
1173  {
1174  	struct tcp_md5sig_pool *hp;
1175  	struct hash_desc *desc;
1176  	const struct tcphdr *th = tcp_hdr(skb);
1177  	__be32 saddr, daddr;
1178  
1179  	if (sk) {
1180  		saddr = inet_sk(sk)->inet_saddr;
1181  		daddr = inet_sk(sk)->inet_daddr;
1182  	} else if (req) {
1183  		saddr = inet_rsk(req)->loc_addr;
1184  		daddr = inet_rsk(req)->rmt_addr;
1185  	} else {
1186  		const struct iphdr *iph = ip_hdr(skb);
1187  		saddr = iph->saddr;
1188  		daddr = iph->daddr;
1189  	}
1190  
1191  	hp = tcp_get_md5sig_pool();
1192  	if (!hp)
1193  		goto clear_hash_noput;
1194  	desc = &hp->md5_desc;
1195  
1196  	if (crypto_hash_init(desc))
1197  		goto clear_hash;
1198  
1199  	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1200  		goto clear_hash;
1201  	if (tcp_md5_hash_header(hp, th))
1202  		goto clear_hash;
1203  	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1204  		goto clear_hash;
1205  	if (tcp_md5_hash_key(hp, key))
1206  		goto clear_hash;
1207  	if (crypto_hash_final(desc, md5_hash))
1208  		goto clear_hash;
1209  
1210  	tcp_put_md5sig_pool();
1211  	return 0;
1212  
1213  clear_hash:
1214  	tcp_put_md5sig_pool();
1215  clear_hash_noput:
1216  	memset(md5_hash, 0, 16);
1217  	return 1;
1218  }
1219  EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1220  
1221  static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1222  {
1223  	/*
1224  	 * This gets called for each TCP segment that arrives
1225  	 * so we want to be efficient.
1226  	 * We have 3 drop cases:
1227  	 * o No MD5 hash and one expected.
1228  	 * o MD5 hash and we're not expecting one.
1229  	 * o MD5 hash and its wrong.
1230  	 */
1231  	const __u8 *hash_location = NULL;
1232  	struct tcp_md5sig_key *hash_expected;
1233  	const struct iphdr *iph = ip_hdr(skb);
1234  	const struct tcphdr *th = tcp_hdr(skb);
1235  	int genhash;
1236  	unsigned char newhash[16];
1237  
1238  	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1239  					  AF_INET);
1240  	hash_location = tcp_parse_md5sig_option(th);
1241  
1242  	/* We've parsed the options - do we have a hash? */
1243  	if (!hash_expected && !hash_location)
1244  		return false;
1245  
1246  	if (hash_expected && !hash_location) {
1247  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1248  		return true;
1249  	}
1250  
1251  	if (!hash_expected && hash_location) {
1252  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1253  		return true;
1254  	}
1255  
1256  	/* Okay, so this is hash_expected and hash_location -
1257  	 * so we need to calculate the checksum.
1258  	 */
1259  	genhash = tcp_v4_md5_hash_skb(newhash,
1260  				      hash_expected,
1261  				      NULL, NULL, skb);
1262  
1263  	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1264  		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1265  				     &iph->saddr, ntohs(th->source),
1266  				     &iph->daddr, ntohs(th->dest),
1267  				     genhash ? " tcp_v4_calc_md5_hash failed"
1268  				     : "");
1269  		return true;
1270  	}
1271  	return false;
1272  }
1273  
1274  #endif
1275  
1276  struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1277  	.family		=	PF_INET,
1278  	.obj_size	=	sizeof(struct tcp_request_sock),
1279  	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1280  	.send_ack	=	tcp_v4_reqsk_send_ack,
1281  	.destructor	=	tcp_v4_reqsk_destructor,
1282  	.send_reset	=	tcp_v4_send_reset,
1283  	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1284  };
1285  
1286  #ifdef CONFIG_TCP_MD5SIG
1287  static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1288  	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1289  	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1290  };
1291  #endif
1292  
1293  static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1294  			       struct request_sock *req,
1295  			       struct tcp_fastopen_cookie *foc,
1296  			       struct tcp_fastopen_cookie *valid_foc)
1297  {
1298  	bool skip_cookie = false;
1299  	struct fastopen_queue *fastopenq;
1300  
1301  	if (likely(!fastopen_cookie_present(foc))) {
1302  		/* See include/net/tcp.h for the meaning of these knobs */
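		/* TCP_SKB_CB(skb)->end_seq counts the SYN flag, so
		 * end_seq != seq + 1 means the SYN carries data.
		 */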
1303  		if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1304  		    ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1305  		    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1306  			skip_cookie = true; /* no cookie to validate */
1307  		else
1308  			return false;
1309  	}
1310  	fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1311  	/* A FO option is present; bump the counter. */
1312  	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1313  
1314  	/* Make sure the listener has enabled fastopen, and we don't
1315  	 * exceed the max # of pending TFO requests allowed before trying
1316  	 * to validate the cookie, in order to avoid burning CPU cycles
1317  	 * unnecessarily.
1318  	 *
1319  	 * XXX (TFO) - The implication of checking the max_qlen before
1320  	 * processing a cookie request is that clients can't differentiate
1321  	 * between qlen overflow causing Fast Open to be disabled
1322  	 * temporarily vs a server not supporting Fast Open at all.
1323  	 */
1324  	if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1325  	    fastopenq == NULL || fastopenq->max_qlen == 0)
1326  		return false;
1327  
1328  	if (fastopenq->qlen >= fastopenq->max_qlen) {
1329  		struct request_sock *req1;
1330  		spin_lock(&fastopenq->lock);
1331  		req1 = fastopenq->rskq_rst_head;
1332  		if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1333  			spin_unlock(&fastopenq->lock);
1334  			NET_INC_STATS_BH(sock_net(sk),
1335  			    LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1336  			/* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1337  			foc->len = -1;
1338  			return false;
1339  		}
1340  		fastopenq->rskq_rst_head = req1->dl_next;
1341  		fastopenq->qlen--;
1342  		spin_unlock(&fastopenq->lock);
1343  		reqsk_free(req1);
1344  	}
1345  	if (skip_cookie) {
1346  		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1347  		return true;
1348  	}
1349  	if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1350  		if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1351  			tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1352  			if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1353  			    memcmp(&foc->val[0], &valid_foc->val[0],
1354  			    TCP_FASTOPEN_COOKIE_SIZE) != 0)
1355  				return false;
1356  			valid_foc->len = -1;
1357  		}
1358  		/* Acknowledge the data received from the peer. */
1359  		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1360  		return true;
1361  	} else if (foc->len == 0) { /* Client requesting a cookie */
1362  		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1363  		NET_INC_STATS_BH(sock_net(sk),
1364  		    LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1365  	} else {
1366  		/* Client sent a cookie of the wrong size. Treat it
1367  		 * the same as an invalid one and return a valid cookie.
1368  		 */
1369  		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1370  	}
1371  	return false;
1372  }
1373  
1374  static int tcp_v4_conn_req_fastopen(struct sock *sk,
1375  				    struct sk_buff *skb,
1376  				    struct sk_buff *skb_synack,
1377  				    struct request_sock *req)
1378  {
1379  	struct tcp_sock *tp = tcp_sk(sk);
1380  	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1381  	const struct inet_request_sock *ireq = inet_rsk(req);
1382  	struct sock *child;
1383  	int err;
1384  
1385  	req->num_retrans = 0;
1386  	req->num_timeout = 0;
1387  	req->sk = NULL;
1388  
1389  	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1390  	if (child == NULL) {
1391  		NET_INC_STATS_BH(sock_net(sk),
1392  				 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1393  		kfree_skb(skb_synack);
1394  		return -1;
1395  	}
1396  	err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1397  				    ireq->rmt_addr, ireq->opt);
1398  	err = net_xmit_eval(err);
1399  	if (!err)
1400  		tcp_rsk(req)->snt_synack = tcp_time_stamp;
1401  	/* XXX (TFO) - is it ok to ignore error and continue? */
1402  
1403  	spin_lock(&queue->fastopenq->lock);
1404  	queue->fastopenq->qlen++;
1405  	spin_unlock(&queue->fastopenq->lock);
1406  
1407  	/* Initialize the child socket. Have to fix some values to take
1408  	 * into account the child is a Fast Open socket and is created
1409  	 * only out of the bits carried in the SYN packet.
1410  	 */
1411  	tp = tcp_sk(child);
1412  
1413  	tp->fastopen_rsk = req;
1414  	/* Take a hold on the listener sk so that if the listener is being
1415  	 * closed, the child that has been accepted can live on and still
1416  	 * access listen_lock.
1417  	 */
1418  	sock_hold(sk);
1419  	tcp_rsk(req)->listener = sk;
1420  
1421  	/* RFC1323: The window in SYN & SYN/ACK segments is never
1422  	 * scaled. So correct it appropriately.
1423  	 */
1424  	tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1425  
1426  	/* Activate the retrans timer so that SYNACK can be retransmitted.
1427  	 * The request socket is not added to the SYN table of the parent
1428  	 * because it's been added to the accept queue directly.
1429  	 */
1430  	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1431  	    TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1432  
1433  	/* Add the child socket directly into the accept queue */
1434  	inet_csk_reqsk_queue_add(sk, req, child);
1435  
1436  	/* Now finish processing the fastopen child socket. */
1437  	inet_csk(child)->icsk_af_ops->rebuild_header(child);
1438  	tcp_init_congestion_control(child);
1439  	tcp_mtup_init(child);
1440  	tcp_init_buffer_space(child);
1441  	tcp_init_metrics(child);
1442  
1443  	/* Queue the data carried in the SYN packet. We need to first
1444  	 * bump skb's refcnt because the caller will attempt to free it.
1445  	 *
1446  	 * XXX (TFO) - we honor a zero-payload TFO request for now.
1447  	 * (Any reason not to?)
1448  	 */
1449  	if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1450  		/* Don't queue the skb if there is no payload in SYN.
1451  		 * XXX (TFO) - How about SYN+FIN?
1452  		 */
1453  		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1454  	} else {
1455  		skb = skb_get(skb);
1456  		skb_dst_drop(skb);
1457  		__skb_pull(skb, tcp_hdr(skb)->doff * 4);
1458  		skb_set_owner_r(skb, child);
1459  		__skb_queue_tail(&child->sk_receive_queue, skb);
1460  		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1461  		tp->syn_data_acked = 1;
1462  	}
1463  	sk->sk_data_ready(sk, 0);
1464  	bh_unlock_sock(child);
1465  	sock_put(child);
1466  	WARN_ON(req->sk == NULL);
1467  	return 0;
1468  }
1469  
1470  int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1471  {
1472  	struct tcp_options_received tmp_opt;
1473  	struct request_sock *req;
1474  	struct inet_request_sock *ireq;
1475  	struct tcp_sock *tp = tcp_sk(sk);
1476  	struct dst_entry *dst = NULL;
1477  	__be32 saddr = ip_hdr(skb)->saddr;
1478  	__be32 daddr = ip_hdr(skb)->daddr;
1479  	__u32 isn = TCP_SKB_CB(skb)->when;
1480  	bool want_cookie = false;
1481  	struct flowi4 fl4;
1482  	struct tcp_fastopen_cookie foc = { .len = -1 };
1483  	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1484  	struct sk_buff *skb_synack;
1485  	int do_fastopen;
1486  
1487  	/* Never answer SYNs sent to broadcast or multicast addresses */
1488  	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1489  		goto drop;
1490  
1491  	/* TW buckets are converted to open requests without
1492  	 * limitation; they conserve resources and the peer is
1493  	 * evidently a real one.
1494  	 */
1495  	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1496  		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1497  		if (!want_cookie)
1498  			goto drop;
1499  	}
1500  
1501  	/* The accept backlog is full. If we have already queued enough
1502  	 * warm entries in the syn queue, drop the request. That is better
1503  	 * than clogging the syn queue with openreqs whose timeouts increase
1504  	 * exponentially.
1505  	 */
1506  	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
1507  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1508  		goto drop;
1509  	}
1510  
1511  	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1512  	if (!req)
1513  		goto drop;
1514  
1515  #ifdef CONFIG_TCP_MD5SIG
1516  	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1517  #endif
1518  
1519  	tcp_clear_options(&tmp_opt);
1520  	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1521  	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1522  	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
1523  
1524  	if (want_cookie && !tmp_opt.saw_tstamp)
1525  		tcp_clear_options(&tmp_opt);
1526  
1527  	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1528  	tcp_openreq_init(req, &tmp_opt, skb);
1529  
1530  	ireq = inet_rsk(req);
1531  	ireq->loc_addr = daddr;
1532  	ireq->rmt_addr = saddr;
1533  	ireq->no_srccheck = inet_sk(sk)->transparent;
1534  	ireq->opt = tcp_v4_save_options(skb);
1535  	ireq->ir_mark = inet_request_mark(sk, skb);
1536  
1537  	if (security_inet_conn_request(sk, skb, req))
1538  		goto drop_and_free;
1539  
1540  	if (!want_cookie || tmp_opt.tstamp_ok)
1541  		TCP_ECN_create_request(req, skb, sock_net(sk));
1542  
1543  	if (want_cookie) {
1544  		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1545  		req->cookie_ts = tmp_opt.tstamp_ok;
1546  	} else if (!isn) {
1547  		/* VJ's idea. We save last timestamp seen
1548  		 * from the destination in peer table, when entering
1549  		 * state TIME-WAIT, and check against it before
1550  		 * accepting new connection request.
1551  		 *
1552  		 * If "isn" is not zero, this request hit alive
1553  		 * timewait bucket, so that all the necessary checks
1554  		 * are made in the function processing timewait state.
1555  		 */
1556  		if (tmp_opt.saw_tstamp &&
1557  		    tcp_death_row.sysctl_tw_recycle &&
1558  		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1559  		    fl4.daddr == saddr) {
1560  			if (!tcp_peer_is_proven(req, dst, true)) {
1561  				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1562  				goto drop_and_release;
1563  			}
1564  		}
1565  		/* Kill the following clause, if you dislike this way. */
1566  		else if (!sysctl_tcp_syncookies &&
1567  			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1568  			  (sysctl_max_syn_backlog >> 2)) &&
1569  			 !tcp_peer_is_proven(req, dst, false)) {
1570  			/* Without syncookies, the last quarter of the
1571  			 * backlog is reserved for destinations proven
1572  			 * to be alive.
1573  			 * This means that we keep communicating with
1574  			 * destinations that were already remembered
1575  			 * by the time of the synflood.
1576  			 */
1577  			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1578  				       &saddr, ntohs(tcp_hdr(skb)->source));
1579  			goto drop_and_release;
1580  		}
1581  
1582  		isn = tcp_v4_init_sequence(skb);
1583  	}
1584  	tcp_rsk(req)->snt_isn = isn;
1585  
1586  	if (dst == NULL) {
1587  		dst = inet_csk_route_req(sk, &fl4, req);
1588  		if (dst == NULL)
1589  			goto drop_and_free;
1590  	}
1591  	do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1592  
1593  	/* We don't call tcp_v4_send_synack() directly because we need
1594  	 * to make sure a child socket can be created successfully before
1595  	 * sending back synack!
1596  	 *
1597  	 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1598  	 * (or better yet, call tcp_send_synack() in the child context
1599  	 * directly, but will have to fix bunch of other code first)
1600  	 * after syn_recv_sock() except one will need to first fix the
1601  	 * latter to remove its dependency on the current implementation
1602  	 * of tcp_v4_send_synack()->tcp_select_initial_window().
1603  	 */
1604  	skb_synack = tcp_make_synack(sk, dst, req,
1605  	    fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1606  
1607  	if (skb_synack) {
1608  		__tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1609  		skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1610  	} else
1611  		goto drop_and_free;
1612  
1613  	if (likely(!do_fastopen)) {
1614  		int err;
1615  		err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1616  		     ireq->rmt_addr, ireq->opt);
1617  		err = net_xmit_eval(err);
1618  		if (err || want_cookie)
1619  			goto drop_and_free;
1620  
1621  		tcp_rsk(req)->snt_synack = tcp_time_stamp;
1622  		tcp_rsk(req)->listener = NULL;
1623  		/* Add the request_sock to the SYN table */
1624  		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1625  		if (fastopen_cookie_present(&foc) && foc.len != 0)
1626  			NET_INC_STATS_BH(sock_net(sk),
1627  			    LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1628  	} else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
1629  		goto drop_and_free;
1630  
1631  	return 0;
1632  
1633  drop_and_release:
1634  	dst_release(dst);
1635  drop_and_free:
1636  	reqsk_free(req);
1637  drop:
1638  	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1639  	return 0;
1640  }
1641  EXPORT_SYMBOL(tcp_v4_conn_request);
1642  
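/*
 * Illustrative sketch: the backlog pressure test in tcp_v4_conn_request()
 * above drops a request when, without syncookies, fewer than a quarter of
 * the sysctl_max_syn_backlog slots remain free and the peer is not proven
 * alive.  The same arithmetic, restated with hypothetical values:
 */
#if 0	/* example only -- never built */
/* Returns non-zero when the SYN backlog is more than three quarters full,
 * i.e. only the quarter reserved for proven-alive peers is left.
 */
static int syn_backlog_nearly_full(int max_syn_backlog, int queue_len)
{
	/* e.g. max_syn_backlog = 1024, queue_len = 800: 224 < 256 -> full */
	return (max_syn_backlog - queue_len) < (max_syn_backlog >> 2);
}
#endif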
1643  
1644  /*
1645   * The three-way handshake has completed - we received the peer's final
1646   * ACK - now create the new socket.
1647   */
1648  struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1649  				  struct request_sock *req,
1650  				  struct dst_entry *dst)
1651  {
1652  	struct inet_request_sock *ireq;
1653  	struct inet_sock *newinet;
1654  	struct tcp_sock *newtp;
1655  	struct sock *newsk;
1656  #ifdef CONFIG_TCP_MD5SIG
1657  	struct tcp_md5sig_key *key;
1658  #endif
1659  	struct ip_options_rcu *inet_opt;
1660  
1661  	if (sk_acceptq_is_full(sk))
1662  		goto exit_overflow;
1663  
1664  	newsk = tcp_create_openreq_child(sk, req, skb);
1665  	if (!newsk)
1666  		goto exit_nonewsk;
1667  
1668  	newsk->sk_gso_type = SKB_GSO_TCPV4;
1669  	inet_sk_rx_dst_set(newsk, skb);
1670  
1671  	newtp		      = tcp_sk(newsk);
1672  	newinet		      = inet_sk(newsk);
1673  	ireq		      = inet_rsk(req);
1674  	newinet->inet_daddr   = ireq->rmt_addr;
1675  	newinet->inet_rcv_saddr = ireq->loc_addr;
1676  	newinet->inet_saddr	      = ireq->loc_addr;
1677  	inet_opt	      = ireq->opt;
1678  	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1679  	ireq->opt	      = NULL;
1680  	newinet->mc_index     = inet_iif(skb);
1681  	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1682  	newinet->rcv_tos      = ip_hdr(skb)->tos;
1683  	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1684  	if (inet_opt)
1685  		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1686  	newinet->inet_id = newtp->write_seq ^ jiffies;
1687  
1688  	if (!dst) {
1689  		dst = inet_csk_route_child_sock(sk, newsk, req);
1690  		if (!dst)
1691  			goto put_and_exit;
1692  	} else {
1693  		/* syncookie case : see end of cookie_v4_check() */
1694  	}
1695  	sk_setup_caps(newsk, dst);
1696  
1697  	tcp_mtup_init(newsk);
1698  	tcp_sync_mss(newsk, dst_mtu(dst));
1699  	newtp->advmss = dst_metric_advmss(dst);
1700  	if (tcp_sk(sk)->rx_opt.user_mss &&
1701  	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1702  		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1703  
1704  	tcp_initialize_rcv_mss(newsk);
1705  	tcp_synack_rtt_meas(newsk, req);
1706  	newtp->total_retrans = req->num_retrans;
1707  
1708  #ifdef CONFIG_TCP_MD5SIG
1709  	/* Copy over the MD5 key from the original socket */
1710  	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1711  				AF_INET);
1712  	if (key != NULL) {
1713  		/*
1714  		 * We're using one, so create a matching key
1715  		 * on the newsk structure. If we fail to get
1716  		 * memory, then we end up not copying the key
1717  		 * across. Shucks.
1718  		 */
1719  		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1720  			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1721  		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1722  	}
1723  #endif
1724  
1725  	if (__inet_inherit_port(sk, newsk) < 0)
1726  		goto put_and_exit;
1727  	__inet_hash_nolisten(newsk, NULL);
1728  
1729  	return newsk;
1730  
1731  exit_overflow:
1732  	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1733  exit_nonewsk:
1734  	dst_release(dst);
1735  exit:
1736  	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1737  	return NULL;
1738  put_and_exit:
1739  	inet_csk_prepare_forced_close(newsk);
1740  	tcp_done(newsk);
1741  	goto exit;
1742  }
1743  EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
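
/*
 * Illustrative user-space sketch: the MD5 key copied onto the child socket
 * above originates from a key installed via the TCP_MD5SIG socket option.
 * A minimal example; the function and parameter names are hypothetical.
 */
#if 0	/* example only -- never built */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>		/* struct tcp_md5sig, TCP_MD5SIG */

static int set_tcp_md5_key(int fd, const struct sockaddr_in *peer,
			   const char *secret, size_t secret_len)
{
	struct tcp_md5sig md5;

	if (secret_len > TCP_MD5SIG_MAXKEYLEN)
		return -1;
	memset(&md5, 0, sizeof(md5));
	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
	md5.tcpm_keylen = secret_len;
	memcpy(md5.tcpm_key, secret, secret_len);

	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif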
1744  
1745  static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1746  {
1747  	struct tcphdr *th = tcp_hdr(skb);
1748  	const struct iphdr *iph = ip_hdr(skb);
1749  	struct sock *nsk;
1750  	struct request_sock **prev;
1751  	/* Find possible connection requests. */
1752  	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1753  						       iph->saddr, iph->daddr);
1754  	if (req)
1755  		return tcp_check_req(sk, skb, req, prev, false);
1756  
1757  	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1758  			th->source, iph->daddr, th->dest, inet_iif(skb));
1759  
1760  	if (nsk) {
1761  		if (nsk->sk_state != TCP_TIME_WAIT) {
1762  			bh_lock_sock(nsk);
1763  			return nsk;
1764  		}
1765  		inet_twsk_put(inet_twsk(nsk));
1766  		return NULL;
1767  	}
1768  
1769  #ifdef CONFIG_SYN_COOKIES
1770  	if (!th->syn)
1771  		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1772  #endif
1773  	return sk;
1774  }
1775  
1776  static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1777  {
1778  	const struct iphdr *iph = ip_hdr(skb);
1779  
1780  	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1781  		if (!tcp_v4_check(skb->len, iph->saddr,
1782  				  iph->daddr, skb->csum)) {
1783  			skb->ip_summed = CHECKSUM_UNNECESSARY;
1784  			return 0;
1785  		}
1786  	}
1787  
1788  	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1789  				       skb->len, IPPROTO_TCP, 0);
1790  
1791  	if (skb->len <= 76) {
1792  		return __skb_checksum_complete(skb);
1793  	}
1794  	return 0;
1795  }
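
/*
 * Illustrative sketch: the helper above relies on the standard Internet
 * checksum property that the ones'-complement sum of the IPv4 pseudo-header
 * plus the whole TCP segment (checksum field included) folds to 0xffff for
 * an intact segment.  A self-contained user-space version of that fold:
 */
#if 0	/* example only -- never built */
#include <stdint.h>
#include <stddef.h>

/* Ones'-complement sum of 'len' bytes, folded to 16 bits.  Feeding it the
 * pseudo-header followed by the TCP segment should yield 0xffff when the
 * segment's checksum is correct.
 */
static uint16_t inet_csum_fold(const uint8_t *data, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += ((uint32_t)data[i] << 8) | data[i + 1];
	if (len & 1)				/* pad an odd trailing byte */
		sum += (uint32_t)data[len - 1] << 8;
	while (sum >> 16)			/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}
#endif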
1796  
1797  
1798  /* The socket must have its spinlock held when we get
1799   * here.
1800   *
1801   * We have a potential double-lock case here, so even when
1802   * doing backlog processing we use the BH locking scheme.
1803   * This is because we cannot sleep with the original spinlock
1804   * held.
1805   */
1806  int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1807  {
1808  	struct sock *rsk;
1809  #ifdef CONFIG_TCP_MD5SIG
1810  	/*
1811  	 * We really want to reject the packet as early as possible
1812  	 * if:
1813  	 *  o We're expecting an MD5'd packet and the packet has no MD5 TCP option
1814  	 *  o There is an MD5 option and we're not expecting one
1815  	 */
1816  	if (tcp_v4_inbound_md5_hash(sk, skb))
1817  		goto discard;
1818  #endif
1819  
1820  	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1821  		struct dst_entry *dst = sk->sk_rx_dst;
1822  
1823  		sock_rps_save_rxhash(sk, skb);
1824  		if (dst) {
1825  			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1826  			    dst->ops->check(dst, 0) == NULL) {
1827  				dst_release(dst);
1828  				sk->sk_rx_dst = NULL;
1829  			}
1830  		}
1831  		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1832  			rsk = sk;
1833  			goto reset;
1834  		}
1835  		return 0;
1836  	}
1837  
1838  	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1839  		goto csum_err;
1840  
1841  	if (sk->sk_state == TCP_LISTEN) {
1842  		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1843  		if (!nsk)
1844  			goto discard;
1845  
1846  		if (nsk != sk) {
1847  			sock_rps_save_rxhash(nsk, skb);
1848  			if (tcp_child_process(sk, nsk, skb)) {
1849  				rsk = nsk;
1850  				goto reset;
1851  			}
1852  			return 0;
1853  		}
1854  	} else
1855  		sock_rps_save_rxhash(sk, skb);
1856  
1857  	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1858  		rsk = sk;
1859  		goto reset;
1860  	}
1861  	return 0;
1862  
1863  reset:
1864  	tcp_v4_send_reset(rsk, skb);
1865  discard:
1866  	kfree_skb(skb);
1867  	/* Be careful here. If this function gets more complicated and
1868  	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1869  	 * might be destroyed here. This current version compiles correctly,
1870  	 * but you have been warned.
1871  	 */
1872  	return 0;
1873  
1874  csum_err:
1875  	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1876  	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1877  	goto discard;
1878  }
1879  EXPORT_SYMBOL(tcp_v4_do_rcv);
1880  
1881  void tcp_v4_early_demux(struct sk_buff *skb)
1882  {
1883  	const struct iphdr *iph;
1884  	const struct tcphdr *th;
1885  	struct sock *sk;
1886  
1887  	if (skb->pkt_type != PACKET_HOST)
1888  		return;
1889  
1890  	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1891  		return;
1892  
1893  	iph = ip_hdr(skb);
1894  	th = tcp_hdr(skb);
1895  
1896  	if (th->doff < sizeof(struct tcphdr) / 4)
1897  		return;
1898  
1899  	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1900  				       iph->saddr, th->source,
1901  				       iph->daddr, ntohs(th->dest),
1902  				       skb->skb_iif);
1903  	if (sk) {
1904  		skb->sk = sk;
1905  		skb->destructor = sock_edemux;
1906  		if (sk->sk_state != TCP_TIME_WAIT) {
1907  			struct dst_entry *dst = sk->sk_rx_dst;
1908  
1909  			if (dst)
1910  				dst = dst_check(dst, 0);
1911  			if (dst &&
1912  			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1913  				skb_dst_set_noref(skb, dst);
1914  		}
1915  	}
1916  }
1917  
1918  /* Packet is added to VJ-style prequeue for processing in process
1919   * context, if a reader task is waiting. Apparently, this exciting
1920   * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1921   * failed somewhere. Latency? Burstiness? Well, at least now we will
1922   * see, why it failed. 8)8)				  --ANK
1923   *
1924   */
1925  bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1926  {
1927  	struct tcp_sock *tp = tcp_sk(sk);
1928  
1929  	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1930  		return false;
1931  
1932  	if (skb->len <= tcp_hdrlen(skb) &&
1933  	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1934  		return false;
1935  
1936  	skb_dst_force(skb);
1937  	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1938  	tp->ucopy.memory += skb->truesize;
1939  	if (tp->ucopy.memory > sk->sk_rcvbuf) {
1940  		struct sk_buff *skb1;
1941  
1942  		BUG_ON(sock_owned_by_user(sk));
1943  
1944  		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1945  			sk_backlog_rcv(sk, skb1);
1946  			NET_INC_STATS_BH(sock_net(sk),
1947  					 LINUX_MIB_TCPPREQUEUEDROPPED);
1948  		}
1949  
1950  		tp->ucopy.memory = 0;
1951  	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1952  		wake_up_interruptible_sync_poll(sk_sleep(sk),
1953  					   POLLIN | POLLRDNORM | POLLRDBAND);
1954  		if (!inet_csk_ack_scheduled(sk))
1955  			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1956  						  (3 * tcp_rto_min(sk)) / 4,
1957  						  TCP_RTO_MAX);
1958  	}
1959  	return true;
1960  }
1961  EXPORT_SYMBOL(tcp_prequeue);
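
/*
 * Worked example (illustrative): with the default TCP_RTO_MIN of HZ/5
 * (200 ms) and no per-route "rto_min" override, the delayed-ACK timer armed
 * above fires after (3 * 200 ms) / 4 = 150 ms.  Setting the
 * net.ipv4.tcp_low_latency sysctl to a non-zero value makes tcp_prequeue()
 * return false, so packets bypass the prequeue entirely.
 */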
1962  
1963  /*
1964   *	From tcp_input.c
1965   */
1966  
1967  int tcp_v4_rcv(struct sk_buff *skb)
1968  {
1969  	const struct iphdr *iph;
1970  	const struct tcphdr *th;
1971  	struct sock *sk;
1972  	int ret;
1973  	struct net *net = dev_net(skb->dev);
1974  
1975  	if (skb->pkt_type != PACKET_HOST)
1976  		goto discard_it;
1977  
1978  	/* Count it even if it's bad */
1979  	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1980  
1981  	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1982  		goto discard_it;
1983  
1984  	th = tcp_hdr(skb);
1985  
1986  	if (th->doff < sizeof(struct tcphdr) / 4)
1987  		goto bad_packet;
1988  	if (!pskb_may_pull(skb, th->doff * 4))
1989  		goto discard_it;
1990  
1991  	/* An explanation is required here, I think.
1992  	 * Packet length and doff are validated by header prediction,
1993  	 * provided the case of th->doff == 0 is eliminated.
1994  	 * So, we defer the checks. */
1995  	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1996  		goto csum_error;
1997  
1998  	th = tcp_hdr(skb);
1999  	iph = ip_hdr(skb);
2000  	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2001  	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2002  				    skb->len - th->doff * 4);
2003  	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2004  	TCP_SKB_CB(skb)->when	 = 0;
2005  	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2006  	TCP_SKB_CB(skb)->sacked	 = 0;
2007  
2008  	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
2009  	if (!sk)
2010  		goto no_tcp_socket;
2011  
2012  process:
2013  	if (sk->sk_state == TCP_TIME_WAIT)
2014  		goto do_time_wait;
2015  
2016  	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2017  		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
2018  		goto discard_and_relse;
2019  	}
2020  
2021  	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2022  		goto discard_and_relse;
2023  	nf_reset(skb);
2024  
2025  	if (sk_filter(sk, skb))
2026  		goto discard_and_relse;
2027  
2028  	skb->dev = NULL;
2029  
2030  	bh_lock_sock_nested(sk);
2031  	ret = 0;
2032  	if (!sock_owned_by_user(sk)) {
2033  #ifdef CONFIG_NET_DMA
2034  		struct tcp_sock *tp = tcp_sk(sk);
2035  		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2036  			tp->ucopy.dma_chan = net_dma_find_channel();
2037  		if (tp->ucopy.dma_chan)
2038  			ret = tcp_v4_do_rcv(sk, skb);
2039  		else
2040  #endif
2041  		{
2042  			if (!tcp_prequeue(sk, skb))
2043  				ret = tcp_v4_do_rcv(sk, skb);
2044  		}
2045  	} else if (unlikely(sk_add_backlog(sk, skb,
2046  					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
2047  		bh_unlock_sock(sk);
2048  		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2049  		goto discard_and_relse;
2050  	}
2051  	bh_unlock_sock(sk);
2052  
2053  	sock_put(sk);
2054  
2055  	return ret;
2056  
2057  no_tcp_socket:
2058  	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2059  		goto discard_it;
2060  
2061  	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2062  csum_error:
2063  		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
2064  bad_packet:
2065  		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2066  	} else {
2067  		tcp_v4_send_reset(NULL, skb);
2068  	}
2069  
2070  discard_it:
2071  	/* Discard frame. */
2072  	kfree_skb(skb);
2073  	return 0;
2074  
2075  discard_and_relse:
2076  	sock_put(sk);
2077  	goto discard_it;
2078  
2079  do_time_wait:
2080  	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2081  		inet_twsk_put(inet_twsk(sk));
2082  		goto discard_it;
2083  	}
2084  
2085  	if (skb->len < (th->doff << 2)) {
2086  		inet_twsk_put(inet_twsk(sk));
2087  		goto bad_packet;
2088  	}
2089  	if (tcp_checksum_complete(skb)) {
2090  		inet_twsk_put(inet_twsk(sk));
2091  		goto csum_error;
2092  	}
2093  	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2094  	case TCP_TW_SYN: {
2095  		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2096  							&tcp_hashinfo,
2097  							iph->saddr, th->source,
2098  							iph->daddr, th->dest,
2099  							inet_iif(skb));
2100  		if (sk2) {
2101  			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
2102  			inet_twsk_put(inet_twsk(sk));
2103  			sk = sk2;
2104  			goto process;
2105  		}
2106  		/* Fall through to ACK */
2107  	}
2108  	case TCP_TW_ACK:
2109  		tcp_v4_timewait_ack(sk, skb);
2110  		break;
2111  	case TCP_TW_RST:
2112  		goto no_tcp_socket;
2113  	case TCP_TW_SUCCESS:;
2114  	}
2115  	goto discard_it;
2116  }
2117  
2118  static struct timewait_sock_ops tcp_timewait_sock_ops = {
2119  	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2120  	.twsk_unique	= tcp_twsk_unique,
2121  	.twsk_destructor= tcp_twsk_destructor,
2122  };
2123  
2124  void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2125  {
2126  	struct dst_entry *dst = skb_dst(skb);
2127  
2128  	dst_hold(dst);
2129  	sk->sk_rx_dst = dst;
2130  	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2131  }
2132  EXPORT_SYMBOL(inet_sk_rx_dst_set);
2133  
2134  const struct inet_connection_sock_af_ops ipv4_specific = {
2135  	.queue_xmit	   = ip_queue_xmit,
2136  	.send_check	   = tcp_v4_send_check,
2137  	.rebuild_header	   = inet_sk_rebuild_header,
2138  	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2139  	.conn_request	   = tcp_v4_conn_request,
2140  	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2141  	.net_header_len	   = sizeof(struct iphdr),
2142  	.setsockopt	   = ip_setsockopt,
2143  	.getsockopt	   = ip_getsockopt,
2144  	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2145  	.sockaddr_len	   = sizeof(struct sockaddr_in),
2146  	.bind_conflict	   = inet_csk_bind_conflict,
2147  #ifdef CONFIG_COMPAT
2148  	.compat_setsockopt = compat_ip_setsockopt,
2149  	.compat_getsockopt = compat_ip_getsockopt,
2150  #endif
2151  };
2152  EXPORT_SYMBOL(ipv4_specific);
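
/*
 * Illustrative sketch: address-family independent TCP code reaches the
 * handlers in the table above through icsk->icsk_af_ops, which
 * tcp_v4_init_sock() below points at ipv4_specific.  Roughly:
 */
#if 0	/* example only -- never built */
static void example_af_dispatch(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_connection_sock_af_ops *ops =
		inet_csk(sk)->icsk_af_ops;

	/* e.g. a listening socket handing a SYN to the IPv4 handler */
	ops->conn_request(sk, skb);
}
#endif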
2153  
2154  #ifdef CONFIG_TCP_MD5SIG
2155  static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2156  	.md5_lookup		= tcp_v4_md5_lookup,
2157  	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2158  	.md5_parse		= tcp_v4_parse_md5_keys,
2159  };
2160  #endif
2161  
2162  /* NOTE: A lot of things are set to zero explicitly by the call to
2163   *       sk_alloc(), so they need not be done here.
2164   */
2165  static int tcp_v4_init_sock(struct sock *sk)
2166  {
2167  	struct inet_connection_sock *icsk = inet_csk(sk);
2168  
2169  	tcp_init_sock(sk);
2170  
2171  	icsk->icsk_af_ops = &ipv4_specific;
2172  
2173  #ifdef CONFIG_TCP_MD5SIG
2174  	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2175  #endif
2176  
2177  	return 0;
2178  }
2179  
2180  void tcp_v4_destroy_sock(struct sock *sk)
2181  {
2182  	struct tcp_sock *tp = tcp_sk(sk);
2183  
2184  	tcp_clear_xmit_timers(sk);
2185  
2186  	tcp_cleanup_congestion_control(sk);
2187  
2188  	/* Clean up the write buffer. */
2189  	tcp_write_queue_purge(sk);
2190  
2191  	/* Cleans up our, hopefully empty, out_of_order_queue. */
2192  	__skb_queue_purge(&tp->out_of_order_queue);
2193  
2194  #ifdef CONFIG_TCP_MD5SIG
2195  	/* Clean up the MD5 key list, if any */
2196  	if (tp->md5sig_info) {
2197  		tcp_clear_md5_list(sk);
2198  		kfree_rcu(tp->md5sig_info, rcu);
2199  		tp->md5sig_info = NULL;
2200  	}
2201  #endif
2202  
2203  #ifdef CONFIG_NET_DMA
2204  	/* Cleans up our sk_async_wait_queue */
2205  	__skb_queue_purge(&sk->sk_async_wait_queue);
2206  #endif
2207  
2208  	/* Clean up the prequeue; it really should be empty already. */
2209  	__skb_queue_purge(&tp->ucopy.prequeue);
2210  
2211  	/* Clean up a referenced TCP bind bucket. */
2212  	if (inet_csk(sk)->icsk_bind_hash)
2213  		inet_put_port(sk);
2214  
2215  	BUG_ON(tp->fastopen_rsk != NULL);
2216  
2217  	/* If socket is aborted during connect operation */
2218  	tcp_free_fastopen_req(tp);
2219  
2220  	sk_sockets_allocated_dec(sk);
2221  	sock_release_memcg(sk);
2222  }
2223  EXPORT_SYMBOL(tcp_v4_destroy_sock);
2224  
2225  #ifdef CONFIG_PROC_FS
2226  /* Proc filesystem TCP sock list dumping. */
2227  
2228  static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
2229  {
2230  	return hlist_nulls_empty(head) ? NULL :
2231  		list_entry(head->first, struct inet_timewait_sock, tw_node);
2232  }
2233  
2234  static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2235  {
2236  	return !is_a_nulls(tw->tw_node.next) ?
2237  		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2238  }
2239  
2240  /*
2241   * Get the next listener socket following cur.  If cur is NULL, get the
2242   * first socket starting from the bucket given in st->bucket; when
2243   * st->bucket is zero the very first socket in the hash table is returned.
2244   */
2245  static void *listening_get_next(struct seq_file *seq, void *cur)
2246  {
2247  	struct inet_connection_sock *icsk;
2248  	struct hlist_nulls_node *node;
2249  	struct sock *sk = cur;
2250  	struct inet_listen_hashbucket *ilb;
2251  	struct tcp_iter_state *st = seq->private;
2252  	struct net *net = seq_file_net(seq);
2253  
2254  	if (!sk) {
2255  		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2256  		spin_lock_bh(&ilb->lock);
2257  		sk = sk_nulls_head(&ilb->head);
2258  		st->offset = 0;
2259  		goto get_sk;
2260  	}
2261  	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2262  	++st->num;
2263  	++st->offset;
2264  
2265  	if (st->state == TCP_SEQ_STATE_OPENREQ) {
2266  		struct request_sock *req = cur;
2267  
2268  		icsk = inet_csk(st->syn_wait_sk);
2269  		req = req->dl_next;
2270  		while (1) {
2271  			while (req) {
2272  				if (req->rsk_ops->family == st->family) {
2273  					cur = req;
2274  					goto out;
2275  				}
2276  				req = req->dl_next;
2277  			}
2278  			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2279  				break;
2280  get_req:
2281  			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2282  		}
2283  		sk	  = sk_nulls_next(st->syn_wait_sk);
2284  		st->state = TCP_SEQ_STATE_LISTENING;
2285  		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2286  	} else {
2287  		icsk = inet_csk(sk);
2288  		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2289  		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2290  			goto start_req;
2291  		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2292  		sk = sk_nulls_next(sk);
2293  	}
2294  get_sk:
2295  	sk_nulls_for_each_from(sk, node) {
2296  		if (!net_eq(sock_net(sk), net))
2297  			continue;
2298  		if (sk->sk_family == st->family) {
2299  			cur = sk;
2300  			goto out;
2301  		}
2302  		icsk = inet_csk(sk);
2303  		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2304  		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2305  start_req:
2306  			st->uid		= sock_i_uid(sk);
2307  			st->syn_wait_sk = sk;
2308  			st->state	= TCP_SEQ_STATE_OPENREQ;
2309  			st->sbucket	= 0;
2310  			goto get_req;
2311  		}
2312  		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2313  	}
2314  	spin_unlock_bh(&ilb->lock);
2315  	st->offset = 0;
2316  	if (++st->bucket < INET_LHTABLE_SIZE) {
2317  		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2318  		spin_lock_bh(&ilb->lock);
2319  		sk = sk_nulls_head(&ilb->head);
2320  		goto get_sk;
2321  	}
2322  	cur = NULL;
2323  out:
2324  	return cur;
2325  }
2326  
2327  static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2328  {
2329  	struct tcp_iter_state *st = seq->private;
2330  	void *rc;
2331  
2332  	st->bucket = 0;
2333  	st->offset = 0;
2334  	rc = listening_get_next(seq, NULL);
2335  
2336  	while (rc && *pos) {
2337  		rc = listening_get_next(seq, rc);
2338  		--*pos;
2339  	}
2340  	return rc;
2341  }
2342  
2343  static inline bool empty_bucket(struct tcp_iter_state *st)
2344  {
2345  	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2346  		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2347  }
2348  
2349  /*
2350   * Get the first established socket, starting from the bucket given in st->bucket.
2351   * If st->bucket is zero, the very first socket in the hash is returned.
2352   */
2353  static void *established_get_first(struct seq_file *seq)
2354  {
2355  	struct tcp_iter_state *st = seq->private;
2356  	struct net *net = seq_file_net(seq);
2357  	void *rc = NULL;
2358  
2359  	st->offset = 0;
2360  	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2361  		struct sock *sk;
2362  		struct hlist_nulls_node *node;
2363  		struct inet_timewait_sock *tw;
2364  		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2365  
2366  		/* Lockless fast path for the common case of empty buckets */
2367  		if (empty_bucket(st))
2368  			continue;
2369  
2370  		spin_lock_bh(lock);
2371  		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2372  			if (sk->sk_family != st->family ||
2373  			    !net_eq(sock_net(sk), net)) {
2374  				continue;
2375  			}
2376  			rc = sk;
2377  			goto out;
2378  		}
2379  		st->state = TCP_SEQ_STATE_TIME_WAIT;
2380  		inet_twsk_for_each(tw, node,
2381  				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2382  			if (tw->tw_family != st->family ||
2383  			    !net_eq(twsk_net(tw), net)) {
2384  				continue;
2385  			}
2386  			rc = tw;
2387  			goto out;
2388  		}
2389  		spin_unlock_bh(lock);
2390  		st->state = TCP_SEQ_STATE_ESTABLISHED;
2391  	}
2392  out:
2393  	return rc;
2394  }
2395  
2396  static void *established_get_next(struct seq_file *seq, void *cur)
2397  {
2398  	struct sock *sk = cur;
2399  	struct inet_timewait_sock *tw;
2400  	struct hlist_nulls_node *node;
2401  	struct tcp_iter_state *st = seq->private;
2402  	struct net *net = seq_file_net(seq);
2403  
2404  	++st->num;
2405  	++st->offset;
2406  
2407  	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2408  		tw = cur;
2409  		tw = tw_next(tw);
2410  get_tw:
2411  		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2412  			tw = tw_next(tw);
2413  		}
2414  		if (tw) {
2415  			cur = tw;
2416  			goto out;
2417  		}
2418  		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2419  		st->state = TCP_SEQ_STATE_ESTABLISHED;
2420  
2421  		/* Look for the next non-empty bucket */
2422  		st->offset = 0;
2423  		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2424  				empty_bucket(st))
2425  			;
2426  		if (st->bucket > tcp_hashinfo.ehash_mask)
2427  			return NULL;
2428  
2429  		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2430  		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2431  	} else
2432  		sk = sk_nulls_next(sk);
2433  
2434  	sk_nulls_for_each_from(sk, node) {
2435  		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2436  			goto found;
2437  	}
2438  
2439  	st->state = TCP_SEQ_STATE_TIME_WAIT;
2440  	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2441  	goto get_tw;
2442  found:
2443  	cur = sk;
2444  out:
2445  	return cur;
2446  }
2447  
2448  static void *established_get_idx(struct seq_file *seq, loff_t pos)
2449  {
2450  	struct tcp_iter_state *st = seq->private;
2451  	void *rc;
2452  
2453  	st->bucket = 0;
2454  	rc = established_get_first(seq);
2455  
2456  	while (rc && pos) {
2457  		rc = established_get_next(seq, rc);
2458  		--pos;
2459  	}
2460  	return rc;
2461  }
2462  
2463  static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2464  {
2465  	void *rc;
2466  	struct tcp_iter_state *st = seq->private;
2467  
2468  	st->state = TCP_SEQ_STATE_LISTENING;
2469  	rc	  = listening_get_idx(seq, &pos);
2470  
2471  	if (!rc) {
2472  		st->state = TCP_SEQ_STATE_ESTABLISHED;
2473  		rc	  = established_get_idx(seq, pos);
2474  	}
2475  
2476  	return rc;
2477  }
2478  
2479  static void *tcp_seek_last_pos(struct seq_file *seq)
2480  {
2481  	struct tcp_iter_state *st = seq->private;
2482  	int offset = st->offset;
2483  	int orig_num = st->num;
2484  	void *rc = NULL;
2485  
2486  	switch (st->state) {
2487  	case TCP_SEQ_STATE_OPENREQ:
2488  	case TCP_SEQ_STATE_LISTENING:
2489  		if (st->bucket >= INET_LHTABLE_SIZE)
2490  			break;
2491  		st->state = TCP_SEQ_STATE_LISTENING;
2492  		rc = listening_get_next(seq, NULL);
2493  		while (offset-- && rc)
2494  			rc = listening_get_next(seq, rc);
2495  		if (rc)
2496  			break;
2497  		st->bucket = 0;
2498  		/* Fallthrough */
2499  	case TCP_SEQ_STATE_ESTABLISHED:
2500  	case TCP_SEQ_STATE_TIME_WAIT:
2501  		st->state = TCP_SEQ_STATE_ESTABLISHED;
2502  		if (st->bucket > tcp_hashinfo.ehash_mask)
2503  			break;
2504  		rc = established_get_first(seq);
2505  		while (offset-- && rc)
2506  			rc = established_get_next(seq, rc);
2507  	}
2508  
2509  	st->num = orig_num;
2510  
2511  	return rc;
2512  }
2513  
2514  static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2515  {
2516  	struct tcp_iter_state *st = seq->private;
2517  	void *rc;
2518  
2519  	if (*pos && *pos == st->last_pos) {
2520  		rc = tcp_seek_last_pos(seq);
2521  		if (rc)
2522  			goto out;
2523  	}
2524  
2525  	st->state = TCP_SEQ_STATE_LISTENING;
2526  	st->num = 0;
2527  	st->bucket = 0;
2528  	st->offset = 0;
2529  	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2530  
2531  out:
2532  	st->last_pos = *pos;
2533  	return rc;
2534  }
2535  
2536  static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2537  {
2538  	struct tcp_iter_state *st = seq->private;
2539  	void *rc = NULL;
2540  
2541  	if (v == SEQ_START_TOKEN) {
2542  		rc = tcp_get_idx(seq, 0);
2543  		goto out;
2544  	}
2545  
2546  	switch (st->state) {
2547  	case TCP_SEQ_STATE_OPENREQ:
2548  	case TCP_SEQ_STATE_LISTENING:
2549  		rc = listening_get_next(seq, v);
2550  		if (!rc) {
2551  			st->state = TCP_SEQ_STATE_ESTABLISHED;
2552  			st->bucket = 0;
2553  			st->offset = 0;
2554  			rc	  = established_get_first(seq);
2555  		}
2556  		break;
2557  	case TCP_SEQ_STATE_ESTABLISHED:
2558  	case TCP_SEQ_STATE_TIME_WAIT:
2559  		rc = established_get_next(seq, v);
2560  		break;
2561  	}
2562  out:
2563  	++*pos;
2564  	st->last_pos = *pos;
2565  	return rc;
2566  }
2567  
2568  static void tcp_seq_stop(struct seq_file *seq, void *v)
2569  {
2570  	struct tcp_iter_state *st = seq->private;
2571  
2572  	switch (st->state) {
2573  	case TCP_SEQ_STATE_OPENREQ:
2574  		if (v) {
2575  			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2576  			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2577  		}
2578  	case TCP_SEQ_STATE_LISTENING:
2579  		if (v != SEQ_START_TOKEN)
2580  			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2581  		break;
2582  	case TCP_SEQ_STATE_TIME_WAIT:
2583  	case TCP_SEQ_STATE_ESTABLISHED:
2584  		if (v)
2585  			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2586  		break;
2587  	}
2588  }
2589  
2590  int tcp_seq_open(struct inode *inode, struct file *file)
2591  {
2592  	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2593  	struct tcp_iter_state *s;
2594  	int err;
2595  
2596  	err = seq_open_net(inode, file, &afinfo->seq_ops,
2597  			  sizeof(struct tcp_iter_state));
2598  	if (err < 0)
2599  		return err;
2600  
2601  	s = ((struct seq_file *)file->private_data)->private;
2602  	s->family		= afinfo->family;
2603  	s->last_pos 		= 0;
2604  	return 0;
2605  }
2606  EXPORT_SYMBOL(tcp_seq_open);
2607  
2608  int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2609  {
2610  	int rc = 0;
2611  	struct proc_dir_entry *p;
2612  
2613  	afinfo->seq_ops.start		= tcp_seq_start;
2614  	afinfo->seq_ops.next		= tcp_seq_next;
2615  	afinfo->seq_ops.stop		= tcp_seq_stop;
2616  
2617  	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2618  			     afinfo->seq_fops, afinfo);
2619  	if (!p)
2620  		rc = -ENOMEM;
2621  	return rc;
2622  }
2623  EXPORT_SYMBOL(tcp_proc_register);
2624  
2625  void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2626  {
2627  	remove_proc_entry(afinfo->name, net->proc_net);
2628  }
2629  EXPORT_SYMBOL(tcp_proc_unregister);
2630  
2631  static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2632  			 struct seq_file *f, int i, kuid_t uid, int *len)
2633  {
2634  	const struct inet_request_sock *ireq = inet_rsk(req);
2635  	long delta = req->expires - jiffies;
2636  
2637  	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2638  		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2639  		i,
2640  		ireq->loc_addr,
2641  		ntohs(inet_sk(sk)->inet_sport),
2642  		ireq->rmt_addr,
2643  		ntohs(ireq->rmt_port),
2644  		TCP_SYN_RECV,
2645  		0, 0, /* could print option size, but that is af dependent. */
2646  		1,    /* timers active (only the expire timer) */
2647  		jiffies_delta_to_clock_t(delta),
2648  		req->num_timeout,
2649  		from_kuid_munged(seq_user_ns(f), uid),
2650  		0,  /* non standard timer */
2651  		0, /* open_requests have no inode */
2652  		atomic_read(&sk->sk_refcnt),
2653  		req,
2654  		len);
2655  }
2656  
2657  static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2658  {
2659  	int timer_active;
2660  	unsigned long timer_expires;
2661  	const struct tcp_sock *tp = tcp_sk(sk);
2662  	const struct inet_connection_sock *icsk = inet_csk(sk);
2663  	const struct inet_sock *inet = inet_sk(sk);
2664  	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2665  	__be32 dest = inet->inet_daddr;
2666  	__be32 src = inet->inet_rcv_saddr;
2667  	__u16 destp = ntohs(inet->inet_dport);
2668  	__u16 srcp = ntohs(inet->inet_sport);
2669  	int rx_queue;
2670  
2671  	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2672  	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2673  	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2674  		timer_active	= 1;
2675  		timer_expires	= icsk->icsk_timeout;
2676  	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2677  		timer_active	= 4;
2678  		timer_expires	= icsk->icsk_timeout;
2679  	} else if (timer_pending(&sk->sk_timer)) {
2680  		timer_active	= 2;
2681  		timer_expires	= sk->sk_timer.expires;
2682  	} else {
2683  		timer_active	= 0;
2684  		timer_expires = jiffies;
2685  	}
2686  
2687  	if (sk->sk_state == TCP_LISTEN)
2688  		rx_queue = sk->sk_ack_backlog;
2689  	else
2690  		/*
2691  		 * Because we don't lock the socket, we might find a transient negative value.
2692  		 */
2693  		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2694  
2695  	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2696  			"%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2697  		i, src, srcp, dest, destp, sk->sk_state,
2698  		tp->write_seq - tp->snd_una,
2699  		rx_queue,
2700  		timer_active,
2701  		jiffies_delta_to_clock_t(timer_expires - jiffies),
2702  		icsk->icsk_retransmits,
2703  		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2704  		icsk->icsk_probes_out,
2705  		sock_i_ino(sk),
2706  		atomic_read(&sk->sk_refcnt), sk,
2707  		jiffies_to_clock_t(icsk->icsk_rto),
2708  		jiffies_to_clock_t(icsk->icsk_ack.ato),
2709  		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2710  		tp->snd_cwnd,
2711  		sk->sk_state == TCP_LISTEN ?
2712  		    (fastopenq ? fastopenq->max_qlen : 0) :
2713  		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2714  		len);
2715  }
2716  
2717  static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2718  			       struct seq_file *f, int i, int *len)
2719  {
2720  	__be32 dest, src;
2721  	__u16 destp, srcp;
2722  	long delta = tw->tw_ttd - jiffies;
2723  
2724  	dest  = tw->tw_daddr;
2725  	src   = tw->tw_rcv_saddr;
2726  	destp = ntohs(tw->tw_dport);
2727  	srcp  = ntohs(tw->tw_sport);
2728  
2729  	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2730  		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2731  		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2732  		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2733  		atomic_read(&tw->tw_refcnt), tw, len);
2734  }
2735  
2736  #define TMPSZ 150
2737  
2738  static int tcp4_seq_show(struct seq_file *seq, void *v)
2739  {
2740  	struct tcp_iter_state *st;
2741  	int len;
2742  
2743  	if (v == SEQ_START_TOKEN) {
2744  		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2745  			   "  sl  local_address rem_address   st tx_queue "
2746  			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2747  			   "inode");
2748  		goto out;
2749  	}
2750  	st = seq->private;
2751  
2752  	switch (st->state) {
2753  	case TCP_SEQ_STATE_LISTENING:
2754  	case TCP_SEQ_STATE_ESTABLISHED:
2755  		get_tcp4_sock(v, seq, st->num, &len);
2756  		break;
2757  	case TCP_SEQ_STATE_OPENREQ:
2758  		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2759  		break;
2760  	case TCP_SEQ_STATE_TIME_WAIT:
2761  		get_timewait4_sock(v, seq, st->num, &len);
2762  		break;
2763  	}
2764  	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2765  out:
2766  	return 0;
2767  }
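
/*
 * Illustrative user-space sketch: parsing the lines that tcp4_seq_show()/
 * get_tcp4_sock() above emit into /proc/net/tcp.  Addresses are printed by
 * the kernel as raw __be32 values in hex, so on the same machine they can
 * be handed to inet_ntoa() unchanged.  Names below are hypothetical.
 */
#if 0	/* example only -- never built */
#include <stdio.h>
#include <arpa/inet.h>

static void dump_proc_net_tcp(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return;
	fgets(line, sizeof(line), f);		/* skip the header line */
	while (fgets(line, sizeof(line), f)) {
		unsigned int laddr, raddr, lport, rport, state;
		struct in_addr la, ra;

		if (sscanf(line, " %*d: %8X:%4X %8X:%4X %2X",
			   &laddr, &lport, &raddr, &rport, &state) != 5)
			continue;
		la.s_addr = laddr;
		ra.s_addr = raddr;
		printf("%s:%u -> ", inet_ntoa(la), lport);
		printf("%s:%u state %02X\n", inet_ntoa(ra), rport, state);
	}
	fclose(f);
}
#endif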
2768  
2769  static const struct file_operations tcp_afinfo_seq_fops = {
2770  	.owner   = THIS_MODULE,
2771  	.open    = tcp_seq_open,
2772  	.read    = seq_read,
2773  	.llseek  = seq_lseek,
2774  	.release = seq_release_net
2775  };
2776  
2777  static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2778  	.name		= "tcp",
2779  	.family		= AF_INET,
2780  	.seq_fops	= &tcp_afinfo_seq_fops,
2781  	.seq_ops	= {
2782  		.show		= tcp4_seq_show,
2783  	},
2784  };
2785  
2786  static int __net_init tcp4_proc_init_net(struct net *net)
2787  {
2788  	return tcp_proc_register(net, &tcp4_seq_afinfo);
2789  }
2790  
2791  static void __net_exit tcp4_proc_exit_net(struct net *net)
2792  {
2793  	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2794  }
2795  
2796  static struct pernet_operations tcp4_net_ops = {
2797  	.init = tcp4_proc_init_net,
2798  	.exit = tcp4_proc_exit_net,
2799  };
2800  
2801  int __init tcp4_proc_init(void)
2802  {
2803  	return register_pernet_subsys(&tcp4_net_ops);
2804  }
2805  
2806  void tcp4_proc_exit(void)
2807  {
2808  	unregister_pernet_subsys(&tcp4_net_ops);
2809  }
2810  #endif /* CONFIG_PROC_FS */
2811  
2812  struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2813  {
2814  	const struct iphdr *iph = skb_gro_network_header(skb);
2815  	__wsum wsum;
2816  	__sum16 sum;
2817  
2818  	switch (skb->ip_summed) {
2819  	case CHECKSUM_COMPLETE:
2820  		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2821  				  skb->csum)) {
2822  			skb->ip_summed = CHECKSUM_UNNECESSARY;
2823  			break;
2824  		}
2825  flush:
2826  		NAPI_GRO_CB(skb)->flush = 1;
2827  		return NULL;
2828  
2829  	case CHECKSUM_NONE:
2830  		wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
2831  					  skb_gro_len(skb), IPPROTO_TCP, 0);
2832  		sum = csum_fold(skb_checksum(skb,
2833  					     skb_gro_offset(skb),
2834  					     skb_gro_len(skb),
2835  					     wsum));
2836  		if (sum)
2837  			goto flush;
2838  
2839  		skb->ip_summed = CHECKSUM_UNNECESSARY;
2840  		break;
2841  	}
2842  
2843  	return tcp_gro_receive(head, skb);
2844  }
2845  
2846  int tcp4_gro_complete(struct sk_buff *skb)
2847  {
2848  	const struct iphdr *iph = ip_hdr(skb);
2849  	struct tcphdr *th = tcp_hdr(skb);
2850  
2851  	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2852  				  iph->saddr, iph->daddr, 0);
2853  	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2854  
2855  	return tcp_gro_complete(skb);
2856  }
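
/*
 * Note (illustrative): the complemented pseudo-header checksum written into
 * th->check above is the usual seed for checksum offload; tcp_gro_complete()
 * is expected to flag the aggregated skb so that the checksum over the TCP
 * header and payload is completed later (in hardware or software), starting
 * from this partial value.
 */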
2857  
2858  struct proto tcp_prot = {
2859  	.name			= "TCP",
2860  	.owner			= THIS_MODULE,
2861  	.close			= tcp_close,
2862  	.connect		= tcp_v4_connect,
2863  	.disconnect		= tcp_disconnect,
2864  	.accept			= inet_csk_accept,
2865  	.ioctl			= tcp_ioctl,
2866  	.init			= tcp_v4_init_sock,
2867  	.destroy		= tcp_v4_destroy_sock,
2868  	.shutdown		= tcp_shutdown,
2869  	.setsockopt		= tcp_setsockopt,
2870  	.getsockopt		= tcp_getsockopt,
2871  	.recvmsg		= tcp_recvmsg,
2872  	.sendmsg		= tcp_sendmsg,
2873  	.sendpage		= tcp_sendpage,
2874  	.backlog_rcv		= tcp_v4_do_rcv,
2875  	.release_cb		= tcp_release_cb,
2876  	.mtu_reduced		= tcp_v4_mtu_reduced,
2877  	.hash			= inet_hash,
2878  	.unhash			= inet_unhash,
2879  	.get_port		= inet_csk_get_port,
2880  	.enter_memory_pressure	= tcp_enter_memory_pressure,
2881  	.sockets_allocated	= &tcp_sockets_allocated,
2882  	.orphan_count		= &tcp_orphan_count,
2883  	.memory_allocated	= &tcp_memory_allocated,
2884  	.memory_pressure	= &tcp_memory_pressure,
2885  	.sysctl_wmem		= sysctl_tcp_wmem,
2886  	.sysctl_rmem		= sysctl_tcp_rmem,
2887  	.max_header		= MAX_TCP_HEADER,
2888  	.obj_size		= sizeof(struct tcp_sock),
2889  	.slab_flags		= SLAB_DESTROY_BY_RCU,
2890  	.twsk_prot		= &tcp_timewait_sock_ops,
2891  	.rsk_prot		= &tcp_request_sock_ops,
2892  	.h.hashinfo		= &tcp_hashinfo,
2893  	.no_autobind		= true,
2894  #ifdef CONFIG_COMPAT
2895  	.compat_setsockopt	= compat_tcp_setsockopt,
2896  	.compat_getsockopt	= compat_tcp_getsockopt,
2897  #endif
2898  #ifdef CONFIG_MEMCG_KMEM
2899  	.init_cgroup		= tcp_init_cgroup,
2900  	.destroy_cgroup		= tcp_destroy_cgroup,
2901  	.proto_cgroup		= tcp_proto_cgroup,
2902  #endif
2903  	.diag_destroy		= tcp_abort,
2904  };
2905  EXPORT_SYMBOL(tcp_prot);
2906  
2907  static void __net_exit tcp_sk_exit(struct net *net)
2908  {
2909  	int cpu;
2910  
2911  	for_each_possible_cpu(cpu)
2912  		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2913  	free_percpu(net->ipv4.tcp_sk);
2914  }
2915  
2916  static int __net_init tcp_sk_init(struct net *net)
2917  {
2918  	int res, cpu;
2919  
2920  	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2921  	if (!net->ipv4.tcp_sk)
2922  		return -ENOMEM;
2923  
2924  	for_each_possible_cpu(cpu) {
2925  		struct sock *sk;
2926  
2927  		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2928  					   IPPROTO_TCP, net);
2929  		if (res)
2930  			goto fail;
2931  		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2932  	}
2933  	net->ipv4.sysctl_tcp_ecn = 2;
2934  	return 0;
2935  
2936  fail:
2937  	tcp_sk_exit(net);
2938  
2939  	return res;
2940  }
2941  
2942  static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2943  {
2944  	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2945  }
2946  
2947  static struct pernet_operations __net_initdata tcp_sk_ops = {
2948         .init	   = tcp_sk_init,
2949         .exit	   = tcp_sk_exit,
2950         .exit_batch = tcp_sk_exit_batch,
2951  };
2952  
2953  void __init tcp_v4_init(void)
2954  {
2955  	inet_hashinfo_init(&tcp_hashinfo);
2956  	if (register_pernet_subsys(&tcp_sk_ops))
2957  		panic("Failed to create the TCP control socket.\n");
2958  }
2959