• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /*
2   * INET		An implementation of the TCP/IP protocol suite for the LINUX
3   *		operating system.  INET is implemented using the  BSD Socket
4   *		interface as the means of communication with the user level.
5   *
6   *		Implementation of the Transmission Control Protocol(TCP).
7   *
8   * Authors:	Ross Biro
9   *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10   *		Mark Evans, <evansmp@uhura.aston.ac.uk>
11   *		Corey Minyard <wf-rch!minyard@relay.EU.net>
12   *		Florian La Roche, <flla@stud.uni-sb.de>
13   *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
14   *		Linus Torvalds, <torvalds@cs.helsinki.fi>
15   *		Alan Cox, <gw4pts@gw4pts.ampr.org>
16   *		Matthew Dillon, <dillon@apollo.west.oic.com>
17   *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18   *		Jorge Cwik, <jorge@laser.satlink.net>
19   */
20  
21  /*
22   * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
23   *				:	Fragmentation on mtu decrease
24   *				:	Segment collapse on retransmit
25   *				:	AF independence
26   *
27   *		Linus Torvalds	:	send_delayed_ack
28   *		David S. Miller	:	Charge memory using the right skb
29   *					during syn/ack processing.
30   *		David S. Miller :	Output engine completely rewritten.
31   *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
32   *		Cacophonix Gaul :	draft-minshall-nagle-01
33   *		J Hadi Salim	:	ECN support
34   *
35   */
36  
37  #define pr_fmt(fmt) "TCP: " fmt
38  
39  #include <net/tcp.h>
40  
41  #include <linux/compiler.h>
42  #include <linux/gfp.h>
43  #include <linux/module.h>
44  
45  /* People can turn this off for buggy TCP's found in printers etc. */
46  int sysctl_tcp_retrans_collapse __read_mostly = 1;
47  
48  /* People can turn this on to work with those rare, broken TCPs that
49   * interpret the window field as a signed quantity.
50   */
51  int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
52  
53  /* Default TSQ limit of two TSO segments */
54  int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
55  
56  /* This limits the percentage of the congestion window which we
57   * will allow a single TSO frame to consume.  Building TSO frames
58   * which are too large can cause TCP streams to be bursty.
59   */
60  int sysctl_tcp_tso_win_divisor __read_mostly = 3;
61  
62  int sysctl_tcp_mtu_probing __read_mostly = 0;
63  int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
64  
65  /* By default, RFC2861 behavior.  */
66  int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
67  
68  unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
69  EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
70  
71  static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
72  			   int push_one, gfp_t gfp);
73  
74  /* Account for new data that has been sent to the network. */
tcp_event_new_data_sent(struct sock * sk,const struct sk_buff * skb)75  static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
76  {
77  	struct inet_connection_sock *icsk = inet_csk(sk);
78  	struct tcp_sock *tp = tcp_sk(sk);
79  	unsigned int prior_packets = tp->packets_out;
80  
81  	tcp_advance_send_head(sk, skb);
82  	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
83  
84  	tp->packets_out += tcp_skb_pcount(skb);
85  	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
86  	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
87  		tcp_rearm_rto(sk);
88  	}
89  
90  	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
91  		      tcp_skb_pcount(skb));
92  }
93  
94  /* SND.NXT, if window was not shrunk.
95   * If window has been shrunk, what should we make? It is not clear at all.
96   * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
97   * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
98   * invalid. OK, let's make this for now:
99   */
tcp_acceptable_seq(const struct sock * sk)100  static inline __u32 tcp_acceptable_seq(const struct sock *sk)
101  {
102  	const struct tcp_sock *tp = tcp_sk(sk);
103  
104  	if (!before(tcp_wnd_end(tp), tp->snd_nxt))
105  		return tp->snd_nxt;
106  	else
107  		return tcp_wnd_end(tp);
108  }
109  
110  /* Calculate mss to advertise in SYN segment.
111   * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
112   *
113   * 1. It is independent of path mtu.
114   * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
115   * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
116   *    attached devices, because some buggy hosts are confused by
117   *    large MSS.
118   * 4. We do not make 3, we advertise MSS, calculated from first
119   *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
120   *    This may be overridden via information stored in routing table.
121   * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
122   *    probably even Jumbo".
123   */
tcp_advertise_mss(struct sock * sk)124  static __u16 tcp_advertise_mss(struct sock *sk)
125  {
126  	struct tcp_sock *tp = tcp_sk(sk);
127  	const struct dst_entry *dst = __sk_dst_get(sk);
128  	int mss = tp->advmss;
129  
130  	if (dst) {
131  		unsigned int metric = dst_metric_advmss(dst);
132  
133  		if (metric < mss) {
134  			mss = metric;
135  			tp->advmss = mss;
136  		}
137  	}
138  
139  	return (__u16)mss;
140  }
141  
142  /* RFC2861. Reset CWND after idle period longer RTO to "restart window".
143   * This is the first part of cwnd validation mechanism. */
tcp_cwnd_restart(struct sock * sk,const struct dst_entry * dst)144  static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst)
145  {
146  	struct tcp_sock *tp = tcp_sk(sk);
147  	s32 delta = tcp_time_stamp - tp->lsndtime;
148  	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
149  	u32 cwnd = tp->snd_cwnd;
150  
151  	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
152  
153  	tp->snd_ssthresh = tcp_current_ssthresh(sk);
154  	restart_cwnd = min(restart_cwnd, cwnd);
155  
156  	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
157  		cwnd >>= 1;
158  	tp->snd_cwnd = max(cwnd, restart_cwnd);
159  	tp->snd_cwnd_stamp = tcp_time_stamp;
160  	tp->snd_cwnd_used = 0;
161  }
162  
163  /* Congestion state accounting after a packet has been sent. */
tcp_event_data_sent(struct tcp_sock * tp,struct sock * sk)164  static void tcp_event_data_sent(struct tcp_sock *tp,
165  				struct sock *sk)
166  {
167  	struct inet_connection_sock *icsk = inet_csk(sk);
168  	const u32 now = tcp_time_stamp;
169  	const struct dst_entry *dst = __sk_dst_get(sk);
170  
171  	if (sysctl_tcp_slow_start_after_idle &&
172  	    (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
173  		tcp_cwnd_restart(sk, __sk_dst_get(sk));
174  
175  	tp->lsndtime = now;
176  
177  	/* If it is a reply for ato after last received
178  	 * packet, enter pingpong mode.
179  	 */
180  	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato &&
181  	    (!dst || !dst_metric(dst, RTAX_QUICKACK)))
182  			icsk->icsk_ack.pingpong = 1;
183  }
184  
185  /* Account for an ACK we sent. */
tcp_event_ack_sent(struct sock * sk,unsigned int pkts)186  static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
187  {
188  	tcp_dec_quickack_mode(sk, pkts);
189  	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
190  }
191  
192  
tcp_default_init_rwnd(u32 mss)193  u32 tcp_default_init_rwnd(u32 mss)
194  {
195  	/* Initial receive window should be twice of TCP_INIT_CWND to
196  	 * enable proper sending of new unsent data during fast recovery
197  	 * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
198  	 * limit when mss is larger than 1460.
199  	 */
200  	u32 init_rwnd = sysctl_tcp_default_init_rwnd;
201  
202  	if (mss > 1460)
203  		init_rwnd = max((1460 * init_rwnd) / mss, 2U);
204  	return init_rwnd;
205  }
206  
207  /* Determine a window scaling and initial window to offer.
208   * Based on the assumption that the given amount of space
209   * will be offered. Store the results in the tp structure.
210   * NOTE: for smooth operation initial space offering should
211   * be a multiple of mss if possible. We assume here that mss >= 1.
212   * This MUST be enforced by all callers.
213   */
tcp_select_initial_window(int __space,__u32 mss,__u32 * rcv_wnd,__u32 * window_clamp,int wscale_ok,__u8 * rcv_wscale,__u32 init_rcv_wnd)214  void tcp_select_initial_window(int __space, __u32 mss,
215  			       __u32 *rcv_wnd, __u32 *window_clamp,
216  			       int wscale_ok, __u8 *rcv_wscale,
217  			       __u32 init_rcv_wnd)
218  {
219  	unsigned int space = (__space < 0 ? 0 : __space);
220  
221  	/* If no clamp set the clamp to the max possible scaled window */
222  	if (*window_clamp == 0)
223  		(*window_clamp) = (65535 << 14);
224  	space = min(*window_clamp, space);
225  
226  	/* Quantize space offering to a multiple of mss if possible. */
227  	if (space > mss)
228  		space = (space / mss) * mss;
229  
230  	/* NOTE: offering an initial window larger than 32767
231  	 * will break some buggy TCP stacks. If the admin tells us
232  	 * it is likely we could be speaking with such a buggy stack
233  	 * we will truncate our initial window offering to 32K-1
234  	 * unless the remote has sent us a window scaling option,
235  	 * which we interpret as a sign the remote TCP is not
236  	 * misinterpreting the window field as a signed quantity.
237  	 */
238  	if (sysctl_tcp_workaround_signed_windows)
239  		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
240  	else
241  		(*rcv_wnd) = space;
242  
243  	(*rcv_wscale) = 0;
244  	if (wscale_ok) {
245  		/* Set window scaling on max possible window
246  		 * See RFC1323 for an explanation of the limit to 14
247  		 */
248  		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
249  		space = min_t(u32, space, *window_clamp);
250  		while (space > 65535 && (*rcv_wscale) < 14) {
251  			space >>= 1;
252  			(*rcv_wscale)++;
253  		}
254  	}
255  
256  	if (mss > (1 << *rcv_wscale)) {
257  		if (!init_rcv_wnd) /* Use default unless specified otherwise */
258  			init_rcv_wnd = tcp_default_init_rwnd(mss);
259  		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
260  	}
261  
262  	/* Set the clamp no higher than max representable value */
263  	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
264  }
265  EXPORT_SYMBOL(tcp_select_initial_window);
266  
267  /* Chose a new window to advertise, update state in tcp_sock for the
268   * socket, and return result with RFC1323 scaling applied.  The return
269   * value can be stuffed directly into th->window for an outgoing
270   * frame.
271   */
tcp_select_window(struct sock * sk)272  static u16 tcp_select_window(struct sock *sk)
273  {
274  	struct tcp_sock *tp = tcp_sk(sk);
275  	u32 old_win = tp->rcv_wnd;
276  	u32 cur_win = tcp_receive_window(tp);
277  	u32 new_win = __tcp_select_window(sk);
278  
279  	/* Never shrink the offered window */
280  	if (new_win < cur_win) {
281  		/* Danger Will Robinson!
282  		 * Don't update rcv_wup/rcv_wnd here or else
283  		 * we will not be able to advertise a zero
284  		 * window in time.  --DaveM
285  		 *
286  		 * Relax Will Robinson.
287  		 */
288  		if (new_win == 0)
289  			NET_INC_STATS(sock_net(sk),
290  				      LINUX_MIB_TCPWANTZEROWINDOWADV);
291  		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
292  	}
293  	tp->rcv_wnd = new_win;
294  	tp->rcv_wup = tp->rcv_nxt;
295  
296  	/* Make sure we do not exceed the maximum possible
297  	 * scaled window.
298  	 */
299  	if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
300  		new_win = min(new_win, MAX_TCP_WINDOW);
301  	else
302  		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
303  
304  	/* RFC1323 scaling applied */
305  	new_win >>= tp->rx_opt.rcv_wscale;
306  
307  	/* If we advertise zero window, disable fast path. */
308  	if (new_win == 0) {
309  		tp->pred_flags = 0;
310  		if (old_win)
311  			NET_INC_STATS(sock_net(sk),
312  				      LINUX_MIB_TCPTOZEROWINDOWADV);
313  	} else if (old_win == 0) {
314  		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
315  	}
316  
317  	return new_win;
318  }
319  
320  /* Packet ECN state for a SYN-ACK */
tcp_ecn_send_synack(struct sock * sk,struct sk_buff * skb)321  static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
322  {
323  	const struct tcp_sock *tp = tcp_sk(sk);
324  
325  	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
326  	if (!(tp->ecn_flags & TCP_ECN_OK))
327  		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
328  	else if (tcp_ca_needs_ecn(sk))
329  		INET_ECN_xmit(sk);
330  }
331  
332  /* Packet ECN state for a SYN.  */
tcp_ecn_send_syn(struct sock * sk,struct sk_buff * skb)333  static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
334  {
335  	struct tcp_sock *tp = tcp_sk(sk);
336  
337  	tp->ecn_flags = 0;
338  	if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
339  	    tcp_ca_needs_ecn(sk)) {
340  		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
341  		tp->ecn_flags = TCP_ECN_OK;
342  		if (tcp_ca_needs_ecn(sk))
343  			INET_ECN_xmit(sk);
344  	}
345  }
346  
347  static void
tcp_ecn_make_synack(const struct request_sock * req,struct tcphdr * th,struct sock * sk)348  tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th,
349  		    struct sock *sk)
350  {
351  	if (inet_rsk(req)->ecn_ok) {
352  		th->ece = 1;
353  		if (tcp_ca_needs_ecn(sk))
354  			INET_ECN_xmit(sk);
355  	}
356  }
357  
358  /* Set up ECN state for a packet on a ESTABLISHED socket that is about to
359   * be sent.
360   */
tcp_ecn_send(struct sock * sk,struct sk_buff * skb,int tcp_header_len)361  static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
362  				int tcp_header_len)
363  {
364  	struct tcp_sock *tp = tcp_sk(sk);
365  
366  	if (tp->ecn_flags & TCP_ECN_OK) {
367  		/* Not-retransmitted data segment: set ECT and inject CWR. */
368  		if (skb->len != tcp_header_len &&
369  		    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
370  			INET_ECN_xmit(sk);
371  			if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
372  				tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
373  				tcp_hdr(skb)->cwr = 1;
374  				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
375  			}
376  		} else if (!tcp_ca_needs_ecn(sk)) {
377  			/* ACK or retransmitted segment: clear ECT|CE */
378  			INET_ECN_dontxmit(sk);
379  		}
380  		if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
381  			tcp_hdr(skb)->ece = 1;
382  	}
383  }
384  
385  /* Constructs common control bits of non-data skb. If SYN/FIN is present,
386   * auto increment end seqno.
387   */
tcp_init_nondata_skb(struct sk_buff * skb,u32 seq,u8 flags)388  static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
389  {
390  	struct skb_shared_info *shinfo = skb_shinfo(skb);
391  
392  	skb->ip_summed = CHECKSUM_PARTIAL;
393  	skb->csum = 0;
394  
395  	TCP_SKB_CB(skb)->tcp_flags = flags;
396  	TCP_SKB_CB(skb)->sacked = 0;
397  
398  	tcp_skb_pcount_set(skb, 1);
399  	shinfo->gso_size = 0;
400  	shinfo->gso_type = 0;
401  
402  	TCP_SKB_CB(skb)->seq = seq;
403  	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
404  		seq++;
405  	TCP_SKB_CB(skb)->end_seq = seq;
406  }
407  
tcp_urg_mode(const struct tcp_sock * tp)408  static inline bool tcp_urg_mode(const struct tcp_sock *tp)
409  {
410  	return tp->snd_una != tp->snd_up;
411  }
412  
413  #define OPTION_SACK_ADVERTISE	(1 << 0)
414  #define OPTION_TS		(1 << 1)
415  #define OPTION_MD5		(1 << 2)
416  #define OPTION_WSCALE		(1 << 3)
417  #define OPTION_FAST_OPEN_COOKIE	(1 << 8)
418  
419  struct tcp_out_options {
420  	u16 options;		/* bit field of OPTION_* */
421  	u16 mss;		/* 0 to disable */
422  	u8 ws;			/* window scale, 0 to disable */
423  	u8 num_sack_blocks;	/* number of SACK blocks to include */
424  	u8 hash_size;		/* bytes in hash_location */
425  	__u8 *hash_location;	/* temporary pointer, overloaded */
426  	__u32 tsval, tsecr;	/* need to include OPTION_TS */
427  	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
428  };
429  
430  /* Write previously computed TCP options to the packet.
431   *
432   * Beware: Something in the Internet is very sensitive to the ordering of
433   * TCP options, we learned this through the hard way, so be careful here.
434   * Luckily we can at least blame others for their non-compliance but from
435   * inter-operability perspective it seems that we're somewhat stuck with
436   * the ordering which we have been using if we want to keep working with
437   * those broken things (not that it currently hurts anybody as there isn't
438   * particular reason why the ordering would need to be changed).
439   *
440   * At least SACK_PERM as the first option is known to lead to a disaster
441   * (but it may well be that other scenarios fail similarly).
442   */
tcp_options_write(__be32 * ptr,struct tcp_sock * tp,struct tcp_out_options * opts)443  static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
444  			      struct tcp_out_options *opts)
445  {
446  	u16 options = opts->options;	/* mungable copy */
447  
448  	if (unlikely(OPTION_MD5 & options)) {
449  		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
450  			       (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
451  		/* overload cookie hash location */
452  		opts->hash_location = (__u8 *)ptr;
453  		ptr += 4;
454  	}
455  
456  	if (unlikely(opts->mss)) {
457  		*ptr++ = htonl((TCPOPT_MSS << 24) |
458  			       (TCPOLEN_MSS << 16) |
459  			       opts->mss);
460  	}
461  
462  	if (likely(OPTION_TS & options)) {
463  		if (unlikely(OPTION_SACK_ADVERTISE & options)) {
464  			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
465  				       (TCPOLEN_SACK_PERM << 16) |
466  				       (TCPOPT_TIMESTAMP << 8) |
467  				       TCPOLEN_TIMESTAMP);
468  			options &= ~OPTION_SACK_ADVERTISE;
469  		} else {
470  			*ptr++ = htonl((TCPOPT_NOP << 24) |
471  				       (TCPOPT_NOP << 16) |
472  				       (TCPOPT_TIMESTAMP << 8) |
473  				       TCPOLEN_TIMESTAMP);
474  		}
475  		*ptr++ = htonl(opts->tsval);
476  		*ptr++ = htonl(opts->tsecr);
477  	}
478  
479  	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
480  		*ptr++ = htonl((TCPOPT_NOP << 24) |
481  			       (TCPOPT_NOP << 16) |
482  			       (TCPOPT_SACK_PERM << 8) |
483  			       TCPOLEN_SACK_PERM);
484  	}
485  
486  	if (unlikely(OPTION_WSCALE & options)) {
487  		*ptr++ = htonl((TCPOPT_NOP << 24) |
488  			       (TCPOPT_WINDOW << 16) |
489  			       (TCPOLEN_WINDOW << 8) |
490  			       opts->ws);
491  	}
492  
493  	if (unlikely(opts->num_sack_blocks)) {
494  		struct tcp_sack_block *sp = tp->rx_opt.dsack ?
495  			tp->duplicate_sack : tp->selective_acks;
496  		int this_sack;
497  
498  		*ptr++ = htonl((TCPOPT_NOP  << 24) |
499  			       (TCPOPT_NOP  << 16) |
500  			       (TCPOPT_SACK <<  8) |
501  			       (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
502  						     TCPOLEN_SACK_PERBLOCK)));
503  
504  		for (this_sack = 0; this_sack < opts->num_sack_blocks;
505  		     ++this_sack) {
506  			*ptr++ = htonl(sp[this_sack].start_seq);
507  			*ptr++ = htonl(sp[this_sack].end_seq);
508  		}
509  
510  		tp->rx_opt.dsack = 0;
511  	}
512  
513  	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
514  		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
515  
516  		*ptr++ = htonl((TCPOPT_EXP << 24) |
517  			       ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
518  			       TCPOPT_FASTOPEN_MAGIC);
519  
520  		memcpy(ptr, foc->val, foc->len);
521  		if ((foc->len & 3) == 2) {
522  			u8 *align = ((u8 *)ptr) + foc->len;
523  			align[0] = align[1] = TCPOPT_NOP;
524  		}
525  		ptr += (foc->len + 3) >> 2;
526  	}
527  }
528  
529  /* Compute TCP options for SYN packets. This is not the final
530   * network wire format yet.
531   */
tcp_syn_options(struct sock * sk,struct sk_buff * skb,struct tcp_out_options * opts,struct tcp_md5sig_key ** md5)532  static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
533  				struct tcp_out_options *opts,
534  				struct tcp_md5sig_key **md5)
535  {
536  	struct tcp_sock *tp = tcp_sk(sk);
537  	unsigned int remaining = MAX_TCP_OPTION_SPACE;
538  	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
539  
540  #ifdef CONFIG_TCP_MD5SIG
541  	*md5 = tp->af_specific->md5_lookup(sk, sk);
542  	if (*md5) {
543  		opts->options |= OPTION_MD5;
544  		remaining -= TCPOLEN_MD5SIG_ALIGNED;
545  	}
546  #else
547  	*md5 = NULL;
548  #endif
549  
550  	/* We always get an MSS option.  The option bytes which will be seen in
551  	 * normal data packets should timestamps be used, must be in the MSS
552  	 * advertised.  But we subtract them from tp->mss_cache so that
553  	 * calculations in tcp_sendmsg are simpler etc.  So account for this
554  	 * fact here if necessary.  If we don't do this correctly, as a
555  	 * receiver we won't recognize data packets as being full sized when we
556  	 * should, and thus we won't abide by the delayed ACK rules correctly.
557  	 * SACKs don't matter, we never delay an ACK when we have any of those
558  	 * going out.  */
559  	opts->mss = tcp_advertise_mss(sk);
560  	remaining -= TCPOLEN_MSS_ALIGNED;
561  
562  	if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
563  		opts->options |= OPTION_TS;
564  		opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
565  		opts->tsecr = tp->rx_opt.ts_recent;
566  		remaining -= TCPOLEN_TSTAMP_ALIGNED;
567  	}
568  	if (likely(sysctl_tcp_window_scaling)) {
569  		opts->ws = tp->rx_opt.rcv_wscale;
570  		opts->options |= OPTION_WSCALE;
571  		remaining -= TCPOLEN_WSCALE_ALIGNED;
572  	}
573  	if (likely(sysctl_tcp_sack)) {
574  		opts->options |= OPTION_SACK_ADVERTISE;
575  		if (unlikely(!(OPTION_TS & opts->options)))
576  			remaining -= TCPOLEN_SACKPERM_ALIGNED;
577  	}
578  
579  	if (fastopen && fastopen->cookie.len >= 0) {
580  		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
581  		need = (need + 3) & ~3U;  /* Align to 32 bits */
582  		if (remaining >= need) {
583  			opts->options |= OPTION_FAST_OPEN_COOKIE;
584  			opts->fastopen_cookie = &fastopen->cookie;
585  			remaining -= need;
586  			tp->syn_fastopen = 1;
587  		}
588  	}
589  
590  	return MAX_TCP_OPTION_SPACE - remaining;
591  }
592  
593  /* Set up TCP options for SYN-ACKs. */
tcp_synack_options(struct sock * sk,struct request_sock * req,unsigned int mss,struct sk_buff * skb,struct tcp_out_options * opts,struct tcp_md5sig_key ** md5,struct tcp_fastopen_cookie * foc)594  static unsigned int tcp_synack_options(struct sock *sk,
595  				   struct request_sock *req,
596  				   unsigned int mss, struct sk_buff *skb,
597  				   struct tcp_out_options *opts,
598  				   struct tcp_md5sig_key **md5,
599  				   struct tcp_fastopen_cookie *foc)
600  {
601  	struct inet_request_sock *ireq = inet_rsk(req);
602  	unsigned int remaining = MAX_TCP_OPTION_SPACE;
603  
604  #ifdef CONFIG_TCP_MD5SIG
605  	*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
606  	if (*md5) {
607  		opts->options |= OPTION_MD5;
608  		remaining -= TCPOLEN_MD5SIG_ALIGNED;
609  
610  		/* We can't fit any SACK blocks in a packet with MD5 + TS
611  		 * options. There was discussion about disabling SACK
612  		 * rather than TS in order to fit in better with old,
613  		 * buggy kernels, but that was deemed to be unnecessary.
614  		 */
615  		ireq->tstamp_ok &= !ireq->sack_ok;
616  	}
617  #else
618  	*md5 = NULL;
619  #endif
620  
621  	/* We always send an MSS option. */
622  	opts->mss = mss;
623  	remaining -= TCPOLEN_MSS_ALIGNED;
624  
625  	if (likely(ireq->wscale_ok)) {
626  		opts->ws = ireq->rcv_wscale;
627  		opts->options |= OPTION_WSCALE;
628  		remaining -= TCPOLEN_WSCALE_ALIGNED;
629  	}
630  	if (likely(ireq->tstamp_ok)) {
631  		opts->options |= OPTION_TS;
632  		opts->tsval = tcp_skb_timestamp(skb);
633  		opts->tsecr = req->ts_recent;
634  		remaining -= TCPOLEN_TSTAMP_ALIGNED;
635  	}
636  	if (likely(ireq->sack_ok)) {
637  		opts->options |= OPTION_SACK_ADVERTISE;
638  		if (unlikely(!ireq->tstamp_ok))
639  			remaining -= TCPOLEN_SACKPERM_ALIGNED;
640  	}
641  	if (foc != NULL && foc->len >= 0) {
642  		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
643  		need = (need + 3) & ~3U;  /* Align to 32 bits */
644  		if (remaining >= need) {
645  			opts->options |= OPTION_FAST_OPEN_COOKIE;
646  			opts->fastopen_cookie = foc;
647  			remaining -= need;
648  		}
649  	}
650  
651  	return MAX_TCP_OPTION_SPACE - remaining;
652  }
653  
654  /* Compute TCP options for ESTABLISHED sockets. This is not the
655   * final wire format yet.
656   */
tcp_established_options(struct sock * sk,struct sk_buff * skb,struct tcp_out_options * opts,struct tcp_md5sig_key ** md5)657  static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
658  					struct tcp_out_options *opts,
659  					struct tcp_md5sig_key **md5)
660  {
661  	struct tcp_sock *tp = tcp_sk(sk);
662  	unsigned int size = 0;
663  	unsigned int eff_sacks;
664  
665  	opts->options = 0;
666  
667  #ifdef CONFIG_TCP_MD5SIG
668  	*md5 = tp->af_specific->md5_lookup(sk, sk);
669  	if (unlikely(*md5)) {
670  		opts->options |= OPTION_MD5;
671  		size += TCPOLEN_MD5SIG_ALIGNED;
672  	}
673  #else
674  	*md5 = NULL;
675  #endif
676  
677  	if (likely(tp->rx_opt.tstamp_ok)) {
678  		opts->options |= OPTION_TS;
679  		opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
680  		opts->tsecr = tp->rx_opt.ts_recent;
681  		size += TCPOLEN_TSTAMP_ALIGNED;
682  	}
683  
684  	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
685  	if (unlikely(eff_sacks)) {
686  		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
687  		opts->num_sack_blocks =
688  			min_t(unsigned int, eff_sacks,
689  			      (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
690  			      TCPOLEN_SACK_PERBLOCK);
691  		size += TCPOLEN_SACK_BASE_ALIGNED +
692  			opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
693  	}
694  
695  	return size;
696  }
697  
698  
699  /* TCP SMALL QUEUES (TSQ)
700   *
701   * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
702   * to reduce RTT and bufferbloat.
703   * We do this using a special skb destructor (tcp_wfree).
704   *
705   * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
706   * needs to be reallocated in a driver.
707   * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
708   *
709   * Since transmit from skb destructor is forbidden, we use a tasklet
710   * to process all sockets that eventually need to send more skbs.
711   * We use one tasklet per cpu, with its own queue of sockets.
712   */
713  struct tsq_tasklet {
714  	struct tasklet_struct	tasklet;
715  	struct list_head	head; /* queue of tcp sockets */
716  };
717  static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
718  
tcp_tsq_handler(struct sock * sk)719  static void tcp_tsq_handler(struct sock *sk)
720  {
721  	if ((1 << sk->sk_state) &
722  	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
723  	     TCPF_CLOSE_WAIT  | TCPF_LAST_ACK))
724  		tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
725  			       0, GFP_ATOMIC);
726  }
727  /*
728   * One tasklet per cpu tries to send more skbs.
729   * We run in tasklet context but need to disable irqs when
730   * transferring tsq->head because tcp_wfree() might
731   * interrupt us (non NAPI drivers)
732   */
tcp_tasklet_func(unsigned long data)733  static void tcp_tasklet_func(unsigned long data)
734  {
735  	struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
736  	LIST_HEAD(list);
737  	unsigned long flags;
738  	struct list_head *q, *n;
739  	struct tcp_sock *tp;
740  	struct sock *sk;
741  
742  	local_irq_save(flags);
743  	list_splice_init(&tsq->head, &list);
744  	local_irq_restore(flags);
745  
746  	list_for_each_safe(q, n, &list) {
747  		tp = list_entry(q, struct tcp_sock, tsq_node);
748  		list_del(&tp->tsq_node);
749  
750  		sk = (struct sock *)tp;
751  		bh_lock_sock(sk);
752  
753  		if (!sock_owned_by_user(sk)) {
754  			tcp_tsq_handler(sk);
755  		} else {
756  			/* defer the work to tcp_release_cb() */
757  			set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
758  		}
759  		bh_unlock_sock(sk);
760  
761  		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
762  		sk_free(sk);
763  	}
764  }
765  
766  #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) |		\
767  			  (1UL << TCP_WRITE_TIMER_DEFERRED) |	\
768  			  (1UL << TCP_DELACK_TIMER_DEFERRED) |	\
769  			  (1UL << TCP_MTU_REDUCED_DEFERRED))
770  /**
771   * tcp_release_cb - tcp release_sock() callback
772   * @sk: socket
773   *
774   * called from release_sock() to perform protocol dependent
775   * actions before socket release.
776   */
tcp_release_cb(struct sock * sk)777  void tcp_release_cb(struct sock *sk)
778  {
779  	struct tcp_sock *tp = tcp_sk(sk);
780  	unsigned long flags, nflags;
781  
782  	/* perform an atomic operation only if at least one flag is set */
783  	do {
784  		flags = tp->tsq_flags;
785  		if (!(flags & TCP_DEFERRED_ALL))
786  			return;
787  		nflags = flags & ~TCP_DEFERRED_ALL;
788  	} while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
789  
790  	if (flags & (1UL << TCP_TSQ_DEFERRED))
791  		tcp_tsq_handler(sk);
792  
793  	/* Here begins the tricky part :
794  	 * We are called from release_sock() with :
795  	 * 1) BH disabled
796  	 * 2) sk_lock.slock spinlock held
797  	 * 3) socket owned by us (sk->sk_lock.owned == 1)
798  	 *
799  	 * But following code is meant to be called from BH handlers,
800  	 * so we should keep BH disabled, but early release socket ownership
801  	 */
802  	sock_release_ownership(sk);
803  
804  	if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
805  		tcp_write_timer_handler(sk);
806  		__sock_put(sk);
807  	}
808  	if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
809  		tcp_delack_timer_handler(sk);
810  		__sock_put(sk);
811  	}
812  	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
813  		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
814  		__sock_put(sk);
815  	}
816  }
817  EXPORT_SYMBOL(tcp_release_cb);
818  
tcp_tasklet_init(void)819  void __init tcp_tasklet_init(void)
820  {
821  	int i;
822  
823  	for_each_possible_cpu(i) {
824  		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
825  
826  		INIT_LIST_HEAD(&tsq->head);
827  		tasklet_init(&tsq->tasklet,
828  			     tcp_tasklet_func,
829  			     (unsigned long)tsq);
830  	}
831  }
832  
833  /*
834   * Write buffer destructor automatically called from kfree_skb.
835   * We can't xmit new skbs from this context, as we might already
836   * hold qdisc lock.
837   */
tcp_wfree(struct sk_buff * skb)838  void tcp_wfree(struct sk_buff *skb)
839  {
840  	struct sock *sk = skb->sk;
841  	struct tcp_sock *tp = tcp_sk(sk);
842  	int wmem;
843  
844  	/* Keep one reference on sk_wmem_alloc.
845  	 * Will be released by sk_free() from here or tcp_tasklet_func()
846  	 */
847  	wmem = atomic_sub_return(skb->truesize - 1, &sk->sk_wmem_alloc);
848  
849  	/* If this softirq is serviced by ksoftirqd, we are likely under stress.
850  	 * Wait until our queues (qdisc + devices) are drained.
851  	 * This gives :
852  	 * - less callbacks to tcp_write_xmit(), reducing stress (batches)
853  	 * - chance for incoming ACK (processed by another cpu maybe)
854  	 *   to migrate this flow (skb->ooo_okay will be eventually set)
855  	 */
856  	if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
857  		goto out;
858  
859  	if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
860  	    !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
861  		unsigned long flags;
862  		struct tsq_tasklet *tsq;
863  
864  		/* queue this socket to tasklet queue */
865  		local_irq_save(flags);
866  		tsq = this_cpu_ptr(&tsq_tasklet);
867  		list_add(&tp->tsq_node, &tsq->head);
868  		tasklet_schedule(&tsq->tasklet);
869  		local_irq_restore(flags);
870  		return;
871  	}
872  out:
873  	sk_free(sk);
874  }
875  
876  /* This routine actually transmits TCP packets queued in by
877   * tcp_do_sendmsg().  This is used by both the initial
878   * transmission and possible later retransmissions.
879   * All SKB's seen here are completely headerless.  It is our
880   * job to build the TCP header, and pass the packet down to
881   * IP so it can do the same plus pass the packet off to the
882   * device.
883   *
884   * We are working here with either a clone of the original
885   * SKB, or a fresh unique copy made by the retransmit engine.
886   */
tcp_transmit_skb(struct sock * sk,struct sk_buff * skb,int clone_it,gfp_t gfp_mask)887  static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
888  			    gfp_t gfp_mask)
889  {
890  	const struct inet_connection_sock *icsk = inet_csk(sk);
891  	struct inet_sock *inet;
892  	struct tcp_sock *tp;
893  	struct tcp_skb_cb *tcb;
894  	struct tcp_out_options opts;
895  	unsigned int tcp_options_size, tcp_header_size;
896  	struct tcp_md5sig_key *md5;
897  	struct tcphdr *th;
898  	int err;
899  
900  	BUG_ON(!skb || !tcp_skb_pcount(skb));
901  
902  	if (clone_it) {
903  		skb_mstamp_get(&skb->skb_mstamp);
904  
905  		if (unlikely(skb_cloned(skb)))
906  			skb = pskb_copy(skb, gfp_mask);
907  		else
908  			skb = skb_clone(skb, gfp_mask);
909  		if (unlikely(!skb))
910  			return -ENOBUFS;
911  	}
912  
913  	inet = inet_sk(sk);
914  	tp = tcp_sk(sk);
915  	tcb = TCP_SKB_CB(skb);
916  	memset(&opts, 0, sizeof(opts));
917  
918  	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
919  		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
920  	else
921  		tcp_options_size = tcp_established_options(sk, skb, &opts,
922  							   &md5);
923  	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
924  
925  	if (tcp_packets_in_flight(tp) == 0)
926  		tcp_ca_event(sk, CA_EVENT_TX_START);
927  
928  	/* if no packet is in qdisc/device queue, then allow XPS to select
929  	 * another queue. We can be called from tcp_tsq_handler()
930  	 * which holds one reference to sk_wmem_alloc.
931  	 *
932  	 * TODO: Ideally, in-flight pure ACK packets should not matter here.
933  	 * One way to get this would be to set skb->truesize = 2 on them.
934  	 */
935  	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
936  
937  	skb_push(skb, tcp_header_size);
938  	skb_reset_transport_header(skb);
939  
940  	skb_orphan(skb);
941  	skb->sk = sk;
942  	skb->destructor = tcp_wfree;
943  	skb_set_hash_from_sk(skb, sk);
944  	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
945  
946  	/* Build TCP header and checksum it. */
947  	th = tcp_hdr(skb);
948  	th->source		= inet->inet_sport;
949  	th->dest		= inet->inet_dport;
950  	th->seq			= htonl(tcb->seq);
951  	th->ack_seq		= htonl(tp->rcv_nxt);
952  	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
953  					tcb->tcp_flags);
954  
955  	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
956  		/* RFC1323: The window in SYN & SYN/ACK segments
957  		 * is never scaled.
958  		 */
959  		th->window	= htons(min(tp->rcv_wnd, 65535U));
960  	} else {
961  		th->window	= htons(tcp_select_window(sk));
962  	}
963  	th->check		= 0;
964  	th->urg_ptr		= 0;
965  
966  	/* The urg_mode check is necessary during a below snd_una win probe */
967  	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
968  		if (before(tp->snd_up, tcb->seq + 0x10000)) {
969  			th->urg_ptr = htons(tp->snd_up - tcb->seq);
970  			th->urg = 1;
971  		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
972  			th->urg_ptr = htons(0xFFFF);
973  			th->urg = 1;
974  		}
975  	}
976  
977  	tcp_options_write((__be32 *)(th + 1), tp, &opts);
978  	if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
979  		tcp_ecn_send(sk, skb, tcp_header_size);
980  
981  #ifdef CONFIG_TCP_MD5SIG
982  	/* Calculate the MD5 hash, as we have all we need now */
983  	if (md5) {
984  		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
985  		tp->af_specific->calc_md5_hash(opts.hash_location,
986  					       md5, sk, NULL, skb);
987  	}
988  #endif
989  
990  	icsk->icsk_af_ops->send_check(sk, skb);
991  
992  	if (likely(tcb->tcp_flags & TCPHDR_ACK))
993  		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
994  
995  	if (skb->len != tcp_header_size)
996  		tcp_event_data_sent(tp, sk);
997  
998  	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
999  		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
1000  			      tcp_skb_pcount(skb));
1001  
1002  	/* OK, its time to fill skb_shinfo(skb)->gso_segs */
1003  	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
1004  
1005  	/* Our usage of tstamp should remain private */
1006  	skb->tstamp.tv64 = 0;
1007  
1008  	/* Cleanup our debris for IP stacks */
1009  	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
1010  			       sizeof(struct inet6_skb_parm)));
1011  
1012  	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
1013  
1014  	if (likely(err <= 0))
1015  		return err;
1016  
1017  	tcp_enter_cwr(sk);
1018  
1019  	return net_xmit_eval(err);
1020  }
1021  
1022  /* This routine just queues the buffer for sending.
1023   *
1024   * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
1025   * otherwise socket can stall.
1026   */
tcp_queue_skb(struct sock * sk,struct sk_buff * skb)1027  static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
1028  {
1029  	struct tcp_sock *tp = tcp_sk(sk);
1030  
1031  	/* Advance write_seq and place onto the write_queue. */
1032  	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
1033  	__skb_header_release(skb);
1034  	tcp_add_write_queue_tail(sk, skb);
1035  	sk->sk_wmem_queued += skb->truesize;
1036  	sk_mem_charge(sk, skb->truesize);
1037  }
1038  
1039  /* Initialize TSO segments for a packet. */
tcp_set_skb_tso_segs(const struct sock * sk,struct sk_buff * skb,unsigned int mss_now)1040  static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
1041  				 unsigned int mss_now)
1042  {
1043  	struct skb_shared_info *shinfo = skb_shinfo(skb);
1044  
1045  	/* Make sure we own this skb before messing gso_size/gso_segs */
1046  	WARN_ON_ONCE(skb_cloned(skb));
1047  
1048  	if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
1049  		/* Avoid the costly divide in the normal
1050  		 * non-TSO case.
1051  		 */
1052  		tcp_skb_pcount_set(skb, 1);
1053  		shinfo->gso_size = 0;
1054  		shinfo->gso_type = 0;
1055  	} else {
1056  		tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
1057  		shinfo->gso_size = mss_now;
1058  		shinfo->gso_type = sk->sk_gso_type;
1059  	}
1060  }
1061  
1062  /* When a modification to fackets out becomes necessary, we need to check
1063   * skb is counted to fackets_out or not.
1064   */
tcp_adjust_fackets_out(struct sock * sk,const struct sk_buff * skb,int decr)1065  static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
1066  				   int decr)
1067  {
1068  	struct tcp_sock *tp = tcp_sk(sk);
1069  
1070  	if (!tp->sacked_out || tcp_is_reno(tp))
1071  		return;
1072  
1073  	if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
1074  		tp->fackets_out -= decr;
1075  }
1076  
1077  /* Pcount in the middle of the write queue got changed, we need to do various
1078   * tweaks to fix counters
1079   */
tcp_adjust_pcount(struct sock * sk,const struct sk_buff * skb,int decr)1080  static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
1081  {
1082  	struct tcp_sock *tp = tcp_sk(sk);
1083  
1084  	tp->packets_out -= decr;
1085  
1086  	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1087  		tp->sacked_out -= decr;
1088  	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
1089  		tp->retrans_out -= decr;
1090  	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
1091  		tp->lost_out -= decr;
1092  
1093  	/* Reno case is special. Sigh... */
1094  	if (tcp_is_reno(tp) && decr > 0)
1095  		tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
1096  
1097  	tcp_adjust_fackets_out(sk, skb, decr);
1098  
1099  	if (tp->lost_skb_hint &&
1100  	    before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
1101  	    (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)))
1102  		tp->lost_cnt_hint -= decr;
1103  
1104  	tcp_verify_left_out(tp);
1105  }
1106  
tcp_fragment_tstamp(struct sk_buff * skb,struct sk_buff * skb2)1107  static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
1108  {
1109  	struct skb_shared_info *shinfo = skb_shinfo(skb);
1110  
1111  	if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) &&
1112  	    !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
1113  		struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
1114  		u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
1115  
1116  		shinfo->tx_flags &= ~tsflags;
1117  		shinfo2->tx_flags |= tsflags;
1118  		swap(shinfo->tskey, shinfo2->tskey);
1119  	}
1120  }
1121  
1122  /* Function to create two new TCP segments.  Shrinks the given segment
1123   * to the specified size and appends a new segment with the rest of the
1124   * packet to the list.  This won't be called frequently, I hope.
1125   * Remember, these are still headerless SKBs at this point.
1126   */
tcp_fragment(struct sock * sk,struct sk_buff * skb,u32 len,unsigned int mss_now,gfp_t gfp)1127  int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1128  		 unsigned int mss_now, gfp_t gfp)
1129  {
1130  	struct tcp_sock *tp = tcp_sk(sk);
1131  	struct sk_buff *buff;
1132  	int nsize, old_factor;
1133  	int nlen;
1134  	u8 flags;
1135  
1136  	if (WARN_ON(len > skb->len))
1137  		return -EINVAL;
1138  
1139  	nsize = skb_headlen(skb) - len;
1140  	if (nsize < 0)
1141  		nsize = 0;
1142  
1143  	if (skb_unclone(skb, gfp))
1144  		return -ENOMEM;
1145  
1146  	/* Get a new skb... force flag on. */
1147  	buff = sk_stream_alloc_skb(sk, nsize, gfp);
1148  	if (buff == NULL)
1149  		return -ENOMEM; /* We'll just try again later. */
1150  
1151  	sk->sk_wmem_queued += buff->truesize;
1152  	sk_mem_charge(sk, buff->truesize);
1153  	nlen = skb->len - len - nsize;
1154  	buff->truesize += nlen;
1155  	skb->truesize -= nlen;
1156  
1157  	/* Correct the sequence numbers. */
1158  	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1159  	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1160  	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1161  
1162  	/* PSH and FIN should only be set in the second packet. */
1163  	flags = TCP_SKB_CB(skb)->tcp_flags;
1164  	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1165  	TCP_SKB_CB(buff)->tcp_flags = flags;
1166  	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
1167  
1168  	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
1169  		/* Copy and checksum data tail into the new buffer. */
1170  		buff->csum = csum_partial_copy_nocheck(skb->data + len,
1171  						       skb_put(buff, nsize),
1172  						       nsize, 0);
1173  
1174  		skb_trim(skb, len);
1175  
1176  		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
1177  	} else {
1178  		skb->ip_summed = CHECKSUM_PARTIAL;
1179  		skb_split(skb, buff, len);
1180  	}
1181  
1182  	buff->ip_summed = skb->ip_summed;
1183  
1184  	buff->tstamp = skb->tstamp;
1185  	tcp_fragment_tstamp(skb, buff);
1186  
1187  	old_factor = tcp_skb_pcount(skb);
1188  
1189  	/* Fix up tso_factor for both original and new SKB.  */
1190  	tcp_set_skb_tso_segs(sk, skb, mss_now);
1191  	tcp_set_skb_tso_segs(sk, buff, mss_now);
1192  
1193  	/* If this packet has been sent out already, we must
1194  	 * adjust the various packet counters.
1195  	 */
1196  	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
1197  		int diff = old_factor - tcp_skb_pcount(skb) -
1198  			tcp_skb_pcount(buff);
1199  
1200  		if (diff)
1201  			tcp_adjust_pcount(sk, skb, diff);
1202  	}
1203  
1204  	/* Link BUFF into the send queue. */
1205  	__skb_header_release(buff);
1206  	tcp_insert_write_queue_after(skb, buff, sk);
1207  
1208  	return 0;
1209  }
1210  
1211  /* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
1212   * eventually). The difference is that pulled data not copied, but
1213   * immediately discarded.
1214   */
__pskb_trim_head(struct sk_buff * skb,int len)1215  static int __pskb_trim_head(struct sk_buff *skb, int len)
1216  {
1217  	struct skb_shared_info *shinfo;
1218  	int i, k, eat;
1219  
1220  	eat = min_t(int, len, skb_headlen(skb));
1221  	if (eat) {
1222  		__skb_pull(skb, eat);
1223  		len -= eat;
1224  		if (!len)
1225  			return 0;
1226  	}
1227  	eat = len;
1228  	k = 0;
1229  	shinfo = skb_shinfo(skb);
1230  	for (i = 0; i < shinfo->nr_frags; i++) {
1231  		int size = skb_frag_size(&shinfo->frags[i]);
1232  
1233  		if (size <= eat) {
1234  			skb_frag_unref(skb, i);
1235  			eat -= size;
1236  		} else {
1237  			shinfo->frags[k] = shinfo->frags[i];
1238  			if (eat) {
1239  				shinfo->frags[k].page_offset += eat;
1240  				skb_frag_size_sub(&shinfo->frags[k], eat);
1241  				eat = 0;
1242  			}
1243  			k++;
1244  		}
1245  	}
1246  	shinfo->nr_frags = k;
1247  
1248  	skb_reset_tail_pointer(skb);
1249  	skb->data_len -= len;
1250  	skb->len = skb->data_len;
1251  	return len;
1252  }
1253  
1254  /* Remove acked data from a packet in the transmit queue. */
tcp_trim_head(struct sock * sk,struct sk_buff * skb,u32 len)1255  int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1256  {
1257  	u32 delta_truesize;
1258  
1259  	if (skb_unclone(skb, GFP_ATOMIC))
1260  		return -ENOMEM;
1261  
1262  	delta_truesize = __pskb_trim_head(skb, len);
1263  
1264  	TCP_SKB_CB(skb)->seq += len;
1265  	skb->ip_summed = CHECKSUM_PARTIAL;
1266  
1267  	if (delta_truesize) {
1268  		skb->truesize	   -= delta_truesize;
1269  		sk->sk_wmem_queued -= delta_truesize;
1270  		sk_mem_uncharge(sk, delta_truesize);
1271  		sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
1272  	}
1273  
1274  	/* Any change of skb->len requires recalculation of tso factor. */
1275  	if (tcp_skb_pcount(skb) > 1)
1276  		tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
1277  
1278  	return 0;
1279  }
1280  
1281  /* Calculate MSS not accounting any TCP options.  */
__tcp_mtu_to_mss(struct sock * sk,int pmtu)1282  static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
1283  {
1284  	const struct tcp_sock *tp = tcp_sk(sk);
1285  	const struct inet_connection_sock *icsk = inet_csk(sk);
1286  	int mss_now;
1287  
1288  	/* Calculate base mss without TCP options:
1289  	   It is MMS_S - sizeof(tcphdr) of rfc1122
1290  	 */
1291  	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
1292  
1293  	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
1294  	if (icsk->icsk_af_ops->net_frag_header_len) {
1295  		const struct dst_entry *dst = __sk_dst_get(sk);
1296  
1297  		if (dst && dst_allfrag(dst))
1298  			mss_now -= icsk->icsk_af_ops->net_frag_header_len;
1299  	}
1300  
1301  	/* Clamp it (mss_clamp does not include tcp options) */
1302  	if (mss_now > tp->rx_opt.mss_clamp)
1303  		mss_now = tp->rx_opt.mss_clamp;
1304  
1305  	/* Now subtract optional transport overhead */
1306  	mss_now -= icsk->icsk_ext_hdr_len;
1307  
1308  	/* Then reserve room for full set of TCP options and 8 bytes of data */
1309  	if (mss_now < 48)
1310  		mss_now = 48;
1311  	return mss_now;
1312  }
1313  
1314  /* Calculate MSS. Not accounting for SACKs here.  */
tcp_mtu_to_mss(struct sock * sk,int pmtu)1315  int tcp_mtu_to_mss(struct sock *sk, int pmtu)
1316  {
1317  	/* Subtract TCP options size, not including SACKs */
1318  	return __tcp_mtu_to_mss(sk, pmtu) -
1319  	       (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
1320  }
1321  
1322  /* Inverse of above */
tcp_mss_to_mtu(struct sock * sk,int mss)1323  int tcp_mss_to_mtu(struct sock *sk, int mss)
1324  {
1325  	const struct tcp_sock *tp = tcp_sk(sk);
1326  	const struct inet_connection_sock *icsk = inet_csk(sk);
1327  	int mtu;
1328  
1329  	mtu = mss +
1330  	      tp->tcp_header_len +
1331  	      icsk->icsk_ext_hdr_len +
1332  	      icsk->icsk_af_ops->net_header_len;
1333  
1334  	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
1335  	if (icsk->icsk_af_ops->net_frag_header_len) {
1336  		const struct dst_entry *dst = __sk_dst_get(sk);
1337  
1338  		if (dst && dst_allfrag(dst))
1339  			mtu += icsk->icsk_af_ops->net_frag_header_len;
1340  	}
1341  	return mtu;
1342  }
1343  
1344  /* MTU probing init per socket */
tcp_mtup_init(struct sock * sk)1345  void tcp_mtup_init(struct sock *sk)
1346  {
1347  	struct tcp_sock *tp = tcp_sk(sk);
1348  	struct inet_connection_sock *icsk = inet_csk(sk);
1349  
1350  	icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
1351  	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
1352  			       icsk->icsk_af_ops->net_header_len;
1353  	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
1354  	icsk->icsk_mtup.probe_size = 0;
1355  }
1356  EXPORT_SYMBOL(tcp_mtup_init);
1357  
1358  /* This function synchronize snd mss to current pmtu/exthdr set.
1359  
1360     tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
1361     for TCP options, but includes only bare TCP header.
1362  
1363     tp->rx_opt.mss_clamp is mss negotiated at connection setup.
1364     It is minimum of user_mss and mss received with SYN.
1365     It also does not include TCP options.
1366  
1367     inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
1368  
1369     tp->mss_cache is current effective sending mss, including
1370     all tcp options except for SACKs. It is evaluated,
1371     taking into account current pmtu, but never exceeds
1372     tp->rx_opt.mss_clamp.
1373  
1374     NOTE1. rfc1122 clearly states that advertised MSS
1375     DOES NOT include either tcp or ip options.
1376  
1377     NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
1378     are READ ONLY outside this function.		--ANK (980731)
1379   */
tcp_sync_mss(struct sock * sk,u32 pmtu)1380  unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
1381  {
1382  	struct tcp_sock *tp = tcp_sk(sk);
1383  	struct inet_connection_sock *icsk = inet_csk(sk);
1384  	int mss_now;
1385  
1386  	if (icsk->icsk_mtup.search_high > pmtu)
1387  		icsk->icsk_mtup.search_high = pmtu;
1388  
1389  	mss_now = tcp_mtu_to_mss(sk, pmtu);
1390  	mss_now = tcp_bound_to_half_wnd(tp, mss_now);
1391  
1392  	/* And store cached results */
1393  	icsk->icsk_pmtu_cookie = pmtu;
1394  	if (icsk->icsk_mtup.enabled)
1395  		mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
1396  	tp->mss_cache = mss_now;
1397  
1398  	return mss_now;
1399  }
1400  EXPORT_SYMBOL(tcp_sync_mss);
1401  
1402  /* Compute the current effective MSS, taking SACKs and IP options,
1403   * and even PMTU discovery events into account.
1404   */
tcp_current_mss(struct sock * sk)1405  unsigned int tcp_current_mss(struct sock *sk)
1406  {
1407  	const struct tcp_sock *tp = tcp_sk(sk);
1408  	const struct dst_entry *dst = __sk_dst_get(sk);
1409  	u32 mss_now;
1410  	unsigned int header_len;
1411  	struct tcp_out_options opts;
1412  	struct tcp_md5sig_key *md5;
1413  
1414  	mss_now = tp->mss_cache;
1415  
1416  	if (dst) {
1417  		u32 mtu = dst_mtu(dst);
1418  		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
1419  			mss_now = tcp_sync_mss(sk, mtu);
1420  	}
1421  
1422  	header_len = tcp_established_options(sk, NULL, &opts, &md5) +
1423  		     sizeof(struct tcphdr);
1424  	/* The mss_cache is sized based on tp->tcp_header_len, which assumes
1425  	 * some common options. If this is an odd packet (because we have SACK
1426  	 * blocks etc) then our calculated header_len will be different, and
1427  	 * we have to adjust mss_now correspondingly */
1428  	if (header_len != tp->tcp_header_len) {
1429  		int delta = (int) header_len - tp->tcp_header_len;
1430  		mss_now -= delta;
1431  	}
1432  
1433  	return mss_now;
1434  }
1435  
1436  /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
1437   * As additional protections, we do not touch cwnd in retransmission phases,
1438   * and if application hit its sndbuf limit recently.
1439   */
tcp_cwnd_application_limited(struct sock * sk)1440  static void tcp_cwnd_application_limited(struct sock *sk)
1441  {
1442  	struct tcp_sock *tp = tcp_sk(sk);
1443  
1444  	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
1445  	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
1446  		/* Limited by application or receiver window. */
1447  		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
1448  		u32 win_used = max(tp->snd_cwnd_used, init_win);
1449  		if (win_used < tp->snd_cwnd) {
1450  			tp->snd_ssthresh = tcp_current_ssthresh(sk);
1451  			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
1452  		}
1453  		tp->snd_cwnd_used = 0;
1454  	}
1455  	tp->snd_cwnd_stamp = tcp_time_stamp;
1456  }
1457  
tcp_cwnd_validate(struct sock * sk,bool is_cwnd_limited)1458  static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
1459  {
1460  	struct tcp_sock *tp = tcp_sk(sk);
1461  
1462  	/* Track the maximum number of outstanding packets in each
1463  	 * window, and remember whether we were cwnd-limited then.
1464  	 */
1465  	if (!before(tp->snd_una, tp->max_packets_seq) ||
1466  	    tp->packets_out > tp->max_packets_out) {
1467  		tp->max_packets_out = tp->packets_out;
1468  		tp->max_packets_seq = tp->snd_nxt;
1469  		tp->is_cwnd_limited = is_cwnd_limited;
1470  	}
1471  
1472  	if (tcp_is_cwnd_limited(sk)) {
1473  		/* Network is feed fully. */
1474  		tp->snd_cwnd_used = 0;
1475  		tp->snd_cwnd_stamp = tcp_time_stamp;
1476  	} else {
1477  		/* Network starves. */
1478  		if (tp->packets_out > tp->snd_cwnd_used)
1479  			tp->snd_cwnd_used = tp->packets_out;
1480  
1481  		if (sysctl_tcp_slow_start_after_idle &&
1482  		    (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
1483  			tcp_cwnd_application_limited(sk);
1484  	}
1485  }
1486  
1487  /* Minshall's variant of the Nagle send check. */
tcp_minshall_check(const struct tcp_sock * tp)1488  static bool tcp_minshall_check(const struct tcp_sock *tp)
1489  {
1490  	return after(tp->snd_sml, tp->snd_una) &&
1491  		!after(tp->snd_sml, tp->snd_nxt);
1492  }
1493  
1494  /* Update snd_sml if this skb is under mss
1495   * Note that a TSO packet might end with a sub-mss segment
1496   * The test is really :
1497   * if ((skb->len % mss) != 0)
1498   *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
1499   * But we can avoid doing the divide again given we already have
1500   *  skb_pcount = skb->len / mss_now
1501   */
tcp_minshall_update(struct tcp_sock * tp,unsigned int mss_now,const struct sk_buff * skb)1502  static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
1503  				const struct sk_buff *skb)
1504  {
1505  	if (skb->len < tcp_skb_pcount(skb) * mss_now)
1506  		tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
1507  }
1508  
1509  /* Return false, if packet can be sent now without violation Nagle's rules:
1510   * 1. It is full sized. (provided by caller in %partial bool)
1511   * 2. Or it contains FIN. (already checked by caller)
1512   * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
1513   * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
1514   *    With Minshall's modification: all sent small packets are ACKed.
1515   */
tcp_nagle_check(bool partial,const struct tcp_sock * tp,int nonagle)1516  static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
1517  			    int nonagle)
1518  {
1519  	return partial &&
1520  		((nonagle & TCP_NAGLE_CORK) ||
1521  		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1522  }
1523  /* Returns the portion of skb which can be sent right away */
tcp_mss_split_point(const struct sock * sk,const struct sk_buff * skb,unsigned int mss_now,unsigned int max_segs,int nonagle)1524  static unsigned int tcp_mss_split_point(const struct sock *sk,
1525  					const struct sk_buff *skb,
1526  					unsigned int mss_now,
1527  					unsigned int max_segs,
1528  					int nonagle)
1529  {
1530  	const struct tcp_sock *tp = tcp_sk(sk);
1531  	u32 partial, needed, window, max_len;
1532  
1533  	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1534  	max_len = mss_now * max_segs;
1535  
1536  	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
1537  		return max_len;
1538  
1539  	needed = min(skb->len, window);
1540  
1541  	if (max_len <= needed)
1542  		return max_len;
1543  
1544  	partial = needed % mss_now;
1545  	/* If last segment is not a full MSS, check if Nagle rules allow us
1546  	 * to include this last segment in this skb.
1547  	 * Otherwise, we'll split the skb at last MSS boundary
1548  	 */
1549  	if (tcp_nagle_check(partial != 0, tp, nonagle))
1550  		return needed - partial;
1551  
1552  	return needed;
1553  }
1554  
1555  /* Can at least one segment of SKB be sent right now, according to the
1556   * congestion window rules?  If so, return how many segments are allowed.
1557   */
tcp_cwnd_test(const struct tcp_sock * tp,const struct sk_buff * skb)1558  static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
1559  					 const struct sk_buff *skb)
1560  {
1561  	u32 in_flight, cwnd;
1562  
1563  	/* Don't be strict about the congestion window for the final FIN.  */
1564  	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
1565  	    tcp_skb_pcount(skb) == 1)
1566  		return 1;
1567  
1568  	in_flight = tcp_packets_in_flight(tp);
1569  	cwnd = tp->snd_cwnd;
1570  	if (in_flight < cwnd)
1571  		return (cwnd - in_flight);
1572  
1573  	return 0;
1574  }
1575  
1576  /* Initialize TSO state of a skb.
1577   * This must be invoked the first time we consider transmitting
1578   * SKB onto the wire.
1579   */
tcp_init_tso_segs(const struct sock * sk,struct sk_buff * skb,unsigned int mss_now)1580  static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
1581  			     unsigned int mss_now)
1582  {
1583  	int tso_segs = tcp_skb_pcount(skb);
1584  
1585  	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
1586  		tcp_set_skb_tso_segs(sk, skb, mss_now);
1587  		tso_segs = tcp_skb_pcount(skb);
1588  	}
1589  	return tso_segs;
1590  }
1591  
1592  
1593  /* Return true if the Nagle test allows this packet to be
1594   * sent now.
1595   */
tcp_nagle_test(const struct tcp_sock * tp,const struct sk_buff * skb,unsigned int cur_mss,int nonagle)1596  static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
1597  				  unsigned int cur_mss, int nonagle)
1598  {
1599  	/* Nagle rule does not apply to frames, which sit in the middle of the
1600  	 * write_queue (they have no chances to get new data).
1601  	 *
1602  	 * This is implemented in the callers, where they modify the 'nonagle'
1603  	 * argument based upon the location of SKB in the send queue.
1604  	 */
1605  	if (nonagle & TCP_NAGLE_PUSH)
1606  		return true;
1607  
1608  	/* Don't use the nagle rule for urgent data (or for the final FIN). */
1609  	if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
1610  		return true;
1611  
1612  	if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
1613  		return true;
1614  
1615  	return false;
1616  }
1617  
1618  /* Does at least the first segment of SKB fit into the send window? */
tcp_snd_wnd_test(const struct tcp_sock * tp,const struct sk_buff * skb,unsigned int cur_mss)1619  static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
1620  			     const struct sk_buff *skb,
1621  			     unsigned int cur_mss)
1622  {
1623  	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
1624  
1625  	if (skb->len > cur_mss)
1626  		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
1627  
1628  	return !after(end_seq, tcp_wnd_end(tp));
1629  }
1630  
1631  /* This checks if the data bearing packet SKB (usually tcp_send_head(sk))
1632   * should be put on the wire right now.  If so, it returns the number of
1633   * packets allowed by the congestion window.
1634   */
tcp_snd_test(const struct sock * sk,struct sk_buff * skb,unsigned int cur_mss,int nonagle)1635  static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
1636  				 unsigned int cur_mss, int nonagle)
1637  {
1638  	const struct tcp_sock *tp = tcp_sk(sk);
1639  	unsigned int cwnd_quota;
1640  
1641  	tcp_init_tso_segs(sk, skb, cur_mss);
1642  
1643  	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
1644  		return 0;
1645  
1646  	cwnd_quota = tcp_cwnd_test(tp, skb);
1647  	if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
1648  		cwnd_quota = 0;
1649  
1650  	return cwnd_quota;
1651  }
1652  
1653  /* Test if sending is allowed right now. */
tcp_may_send_now(struct sock * sk)1654  bool tcp_may_send_now(struct sock *sk)
1655  {
1656  	const struct tcp_sock *tp = tcp_sk(sk);
1657  	struct sk_buff *skb = tcp_send_head(sk);
1658  
1659  	return skb &&
1660  		tcp_snd_test(sk, skb, tcp_current_mss(sk),
1661  			     (tcp_skb_is_last(sk, skb) ?
1662  			      tp->nonagle : TCP_NAGLE_PUSH));
1663  }
1664  
1665  /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
1666   * which is put after SKB on the list.  It is very much like
1667   * tcp_fragment() except that it may make several kinds of assumptions
1668   * in order to speed up the splitting operation.  In particular, we
1669   * know that all the data is in scatter-gather pages, and that the
1670   * packet has never been sent out before (and thus is not cloned).
1671   */
tso_fragment(struct sock * sk,struct sk_buff * skb,unsigned int len,unsigned int mss_now,gfp_t gfp)1672  static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1673  			unsigned int mss_now, gfp_t gfp)
1674  {
1675  	struct sk_buff *buff;
1676  	int nlen = skb->len - len;
1677  	u8 flags;
1678  
1679  	/* All of a TSO frame must be composed of paged data.  */
1680  	if (skb->len != skb->data_len)
1681  		return tcp_fragment(sk, skb, len, mss_now, gfp);
1682  
1683  	buff = sk_stream_alloc_skb(sk, 0, gfp);
1684  	if (unlikely(buff == NULL))
1685  		return -ENOMEM;
1686  
1687  	sk->sk_wmem_queued += buff->truesize;
1688  	sk_mem_charge(sk, buff->truesize);
1689  	buff->truesize += nlen;
1690  	skb->truesize -= nlen;
1691  
1692  	/* Correct the sequence numbers. */
1693  	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1694  	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1695  	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1696  
1697  	/* PSH and FIN should only be set in the second packet. */
1698  	flags = TCP_SKB_CB(skb)->tcp_flags;
1699  	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1700  	TCP_SKB_CB(buff)->tcp_flags = flags;
1701  
1702  	/* This packet was never sent out yet, so no SACK bits. */
1703  	TCP_SKB_CB(buff)->sacked = 0;
1704  
1705  	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
1706  	skb_split(skb, buff, len);
1707  	tcp_fragment_tstamp(skb, buff);
1708  
1709  	/* Fix up tso_factor for both original and new SKB.  */
1710  	tcp_set_skb_tso_segs(sk, skb, mss_now);
1711  	tcp_set_skb_tso_segs(sk, buff, mss_now);
1712  
1713  	/* Link BUFF into the send queue. */
1714  	__skb_header_release(buff);
1715  	tcp_insert_write_queue_after(skb, buff, sk);
1716  
1717  	return 0;
1718  }
1719  
1720  /* Try to defer sending, if possible, in order to minimize the amount
1721   * of TSO splitting we do.  View it as a kind of TSO Nagle test.
1722   *
1723   * This algorithm is from John Heffner.
1724   */
tcp_tso_should_defer(struct sock * sk,struct sk_buff * skb,bool * is_cwnd_limited)1725  static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
1726  				 bool *is_cwnd_limited)
1727  {
1728  	struct tcp_sock *tp = tcp_sk(sk);
1729  	const struct inet_connection_sock *icsk = inet_csk(sk);
1730  	u32 send_win, cong_win, limit, in_flight;
1731  	int win_divisor;
1732  
1733  	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1734  		goto send_now;
1735  
1736  	if (icsk->icsk_ca_state != TCP_CA_Open)
1737  		goto send_now;
1738  
1739  	/* Defer for less than two clock ticks. */
1740  	if (tp->tso_deferred &&
1741  	    (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
1742  		goto send_now;
1743  
1744  	in_flight = tcp_packets_in_flight(tp);
1745  
1746  	BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight));
1747  
1748  	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1749  
1750  	/* From in_flight test above, we know that cwnd > in_flight.  */
1751  	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
1752  
1753  	limit = min(send_win, cong_win);
1754  
1755  	/* If a full-sized TSO skb can be sent, do it. */
1756  	if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
1757  			   tp->xmit_size_goal_segs * tp->mss_cache))
1758  		goto send_now;
1759  
1760  	/* Middle in queue won't get any more data, full sendable already? */
1761  	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
1762  		goto send_now;
1763  
1764  	win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
1765  	if (win_divisor) {
1766  		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
1767  
1768  		/* If at least some fraction of a window is available,
1769  		 * just use it.
1770  		 */
1771  		chunk /= win_divisor;
1772  		if (limit >= chunk)
1773  			goto send_now;
1774  	} else {
1775  		/* Different approach, try not to defer past a single
1776  		 * ACK.  Receiver should ACK every other full sized
1777  		 * frame, so if we have space for more than 3 frames
1778  		 * then send now.
1779  		 */
1780  		if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
1781  			goto send_now;
1782  	}
1783  
1784  	/* Ok, it looks like it is advisable to defer.
1785  	 * Do not rearm the timer if already set to not break TCP ACK clocking.
1786  	 */
1787  	if (!tp->tso_deferred)
1788  		tp->tso_deferred = 1 | (jiffies << 1);
1789  
1790  	if (cong_win < send_win && cong_win < skb->len)
1791  		*is_cwnd_limited = true;
1792  
1793  	return true;
1794  
1795  send_now:
1796  	tp->tso_deferred = 0;
1797  	return false;
1798  }
1799  
1800  /* Create a new MTU probe if we are ready.
1801   * MTU probe is regularly attempting to increase the path MTU by
1802   * deliberately sending larger packets.  This discovers routing
1803   * changes resulting in larger path MTUs.
1804   *
1805   * Returns 0 if we should wait to probe (no cwnd available),
1806   *         1 if a probe was sent,
1807   *         -1 otherwise
1808   */
tcp_mtu_probe(struct sock * sk)1809  static int tcp_mtu_probe(struct sock *sk)
1810  {
1811  	struct tcp_sock *tp = tcp_sk(sk);
1812  	struct inet_connection_sock *icsk = inet_csk(sk);
1813  	struct sk_buff *skb, *nskb, *next;
1814  	int len;
1815  	int probe_size;
1816  	int size_needed;
1817  	int copy;
1818  	int mss_now;
1819  
1820  	/* Not currently probing/verifying,
1821  	 * not in recovery,
1822  	 * have enough cwnd, and
1823  	 * not SACKing (the variable headers throw things off) */
1824  	if (!icsk->icsk_mtup.enabled ||
1825  	    icsk->icsk_mtup.probe_size ||
1826  	    inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
1827  	    tp->snd_cwnd < 11 ||
1828  	    tp->rx_opt.num_sacks || tp->rx_opt.dsack)
1829  		return -1;
1830  
1831  	/* Very simple search strategy: just double the MSS. */
1832  	mss_now = tcp_current_mss(sk);
1833  	probe_size = 2 * tp->mss_cache;
1834  	size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
1835  	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
1836  		/* TODO: set timer for probe_converge_event */
1837  		return -1;
1838  	}
1839  
1840  	/* Have enough data in the send queue to probe? */
1841  	if (tp->write_seq - tp->snd_nxt < size_needed)
1842  		return -1;
1843  
1844  	if (tp->snd_wnd < size_needed)
1845  		return -1;
1846  	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
1847  		return 0;
1848  
1849  	/* Do we need to wait to drain cwnd? With none in flight, don't stall */
1850  	if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
1851  		if (!tcp_packets_in_flight(tp))
1852  			return -1;
1853  		else
1854  			return 0;
1855  	}
1856  
1857  	/* We're allowed to probe.  Build it now. */
1858  	if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
1859  		return -1;
1860  	sk->sk_wmem_queued += nskb->truesize;
1861  	sk_mem_charge(sk, nskb->truesize);
1862  
1863  	skb = tcp_send_head(sk);
1864  
1865  	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
1866  	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
1867  	TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
1868  	TCP_SKB_CB(nskb)->sacked = 0;
1869  	nskb->csum = 0;
1870  	nskb->ip_summed = skb->ip_summed;
1871  
1872  	tcp_insert_write_queue_before(nskb, skb, sk);
1873  	tcp_highest_sack_replace(sk, skb, nskb);
1874  
1875  	len = 0;
1876  	tcp_for_write_queue_from_safe(skb, next, sk) {
1877  		copy = min_t(int, skb->len, probe_size - len);
1878  		if (nskb->ip_summed) {
1879  			skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
1880  		} else {
1881  			__wsum csum = skb_copy_and_csum_bits(skb, 0,
1882  							     skb_put(nskb, copy),
1883  							     copy, 0);
1884  			nskb->csum = csum_block_add(nskb->csum, csum, len);
1885  		}
1886  
1887  		if (skb->len <= copy) {
1888  			/* We've eaten all the data from this skb.
1889  			 * Throw it away. */
1890  			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1891  			tcp_unlink_write_queue(skb, sk);
1892  			sk_wmem_free_skb(sk, skb);
1893  		} else {
1894  			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
1895  						   ~(TCPHDR_FIN|TCPHDR_PSH);
1896  			if (!skb_shinfo(skb)->nr_frags) {
1897  				skb_pull(skb, copy);
1898  				if (skb->ip_summed != CHECKSUM_PARTIAL)
1899  					skb->csum = csum_partial(skb->data,
1900  								 skb->len, 0);
1901  			} else {
1902  				__pskb_trim_head(skb, copy);
1903  				tcp_set_skb_tso_segs(sk, skb, mss_now);
1904  			}
1905  			TCP_SKB_CB(skb)->seq += copy;
1906  		}
1907  
1908  		len += copy;
1909  
1910  		if (len >= probe_size)
1911  			break;
1912  	}
1913  	tcp_init_tso_segs(sk, nskb, nskb->len);
1914  
1915  	/* We're ready to send.  If this fails, the probe will
1916  	 * be resegmented into mss-sized pieces by tcp_write_xmit().
1917  	 */
1918  	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
1919  		/* Decrement cwnd here because we are sending
1920  		 * effectively two packets. */
1921  		tp->snd_cwnd--;
1922  		tcp_event_new_data_sent(sk, nskb);
1923  
1924  		icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
1925  		tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
1926  		tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
1927  
1928  		return 1;
1929  	}
1930  
1931  	return -1;
1932  }
1933  
1934  /* This routine writes packets to the network.  It advances the
1935   * send_head.  This happens as incoming acks open up the remote
1936   * window for us.
1937   *
1938   * LARGESEND note: !tcp_urg_mode is overkill, only frames between
1939   * snd_up-64k-mss .. snd_up cannot be large. However, taking into
1940   * account rare use of URG, this is not a big flaw.
1941   *
1942   * Send at most one packet when push_one > 0. Temporarily ignore
1943   * cwnd limit to force at most one packet out when push_one == 2.
1944  
1945   * Returns true, if no segments are in flight and we have queued segments,
1946   * but cannot send anything now because of SWS or another problem.
1947   */
tcp_write_xmit(struct sock * sk,unsigned int mss_now,int nonagle,int push_one,gfp_t gfp)1948  static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1949  			   int push_one, gfp_t gfp)
1950  {
1951  	struct tcp_sock *tp = tcp_sk(sk);
1952  	struct sk_buff *skb;
1953  	unsigned int tso_segs, sent_pkts;
1954  	int cwnd_quota;
1955  	int result;
1956  	bool is_cwnd_limited = false;
1957  
1958  	sent_pkts = 0;
1959  
1960  	if (!push_one) {
1961  		/* Do MTU probing. */
1962  		result = tcp_mtu_probe(sk);
1963  		if (!result) {
1964  			return false;
1965  		} else if (result > 0) {
1966  			sent_pkts = 1;
1967  		}
1968  	}
1969  
1970  	while ((skb = tcp_send_head(sk))) {
1971  		unsigned int limit;
1972  
1973  		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1974  		BUG_ON(!tso_segs);
1975  
1976  		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
1977  			/* "skb_mstamp" is used as a start point for the retransmit timer */
1978  			skb_mstamp_get(&skb->skb_mstamp);
1979  			goto repair; /* Skip network transmission */
1980  		}
1981  
1982  		cwnd_quota = tcp_cwnd_test(tp, skb);
1983  		if (!cwnd_quota) {
1984  			is_cwnd_limited = true;
1985  			if (push_one == 2)
1986  				/* Force out a loss probe pkt. */
1987  				cwnd_quota = 1;
1988  			else
1989  				break;
1990  		}
1991  
1992  		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
1993  			break;
1994  
1995  		if (tso_segs == 1 || !sk->sk_gso_max_segs) {
1996  			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
1997  						     (tcp_skb_is_last(sk, skb) ?
1998  						      nonagle : TCP_NAGLE_PUSH))))
1999  				break;
2000  		} else {
2001  			if (!push_one &&
2002  			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
2003  				break;
2004  		}
2005  
2006  		/* TCP Small Queues :
2007  		 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
2008  		 * This allows for :
2009  		 *  - better RTT estimation and ACK scheduling
2010  		 *  - faster recovery
2011  		 *  - high rates
2012  		 * Alas, some drivers / subsystems require a fair amount
2013  		 * of queued bytes to ensure line rate.
2014  		 * One example is wifi aggregation (802.11 AMPDU)
2015  		 */
2016  		limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
2017  			      sk->sk_pacing_rate >> 10);
2018  
2019  		if (atomic_read(&sk->sk_wmem_alloc) > limit) {
2020  			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
2021  			/* It is possible TX completion already happened
2022  			 * before we set TSQ_THROTTLED, so we must
2023  			 * test again the condition.
2024  			 */
2025  			smp_mb__after_atomic();
2026  			if (atomic_read(&sk->sk_wmem_alloc) > limit)
2027  				break;
2028  		}
2029  
2030  		limit = mss_now;
2031  		if (tso_segs > 1 && sk->sk_gso_max_segs && !tcp_urg_mode(tp))
2032  			limit = tcp_mss_split_point(sk, skb, mss_now,
2033  						    min_t(unsigned int,
2034  							  cwnd_quota,
2035  							  sk->sk_gso_max_segs),
2036  						    nonagle);
2037  
2038  		if (skb->len > limit &&
2039  		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
2040  			break;
2041  
2042  		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
2043  			break;
2044  
2045  repair:
2046  		/* Advance the send_head.  This one is sent out.
2047  		 * This call will increment packets_out.
2048  		 */
2049  		tcp_event_new_data_sent(sk, skb);
2050  
2051  		tcp_minshall_update(tp, mss_now, skb);
2052  		sent_pkts += tcp_skb_pcount(skb);
2053  
2054  		if (push_one)
2055  			break;
2056  	}
2057  
2058  	if (likely(sent_pkts)) {
2059  		if (tcp_in_cwnd_reduction(sk))
2060  			tp->prr_out += sent_pkts;
2061  
2062  		/* Send one loss probe per tail loss episode. */
2063  		if (push_one != 2)
2064  			tcp_schedule_loss_probe(sk);
2065  		tcp_cwnd_validate(sk, is_cwnd_limited);
2066  		return false;
2067  	}
2068  	return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
2069  }
2070  
tcp_schedule_loss_probe(struct sock * sk)2071  bool tcp_schedule_loss_probe(struct sock *sk)
2072  {
2073  	struct inet_connection_sock *icsk = inet_csk(sk);
2074  	struct tcp_sock *tp = tcp_sk(sk);
2075  	u32 timeout, tlp_time_stamp, rto_time_stamp;
2076  	u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
2077  
2078  	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
2079  		return false;
2080  	/* No consecutive loss probes. */
2081  	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
2082  		tcp_rearm_rto(sk);
2083  		return false;
2084  	}
2085  	/* Don't do any loss probe on a Fast Open connection before 3WHS
2086  	 * finishes.
2087  	 */
2088  	if (sk->sk_state == TCP_SYN_RECV)
2089  		return false;
2090  
2091  	/* TLP is only scheduled when next timer event is RTO. */
2092  	if (icsk->icsk_pending != ICSK_TIME_RETRANS)
2093  		return false;
2094  
2095  	/* Schedule a loss probe in 2*RTT for SACK capable connections
2096  	 * in Open state, that are either limited by cwnd or application.
2097  	 */
2098  	if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out ||
2099  	    !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
2100  		return false;
2101  
2102  	if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
2103  	     tcp_send_head(sk))
2104  		return false;
2105  
2106  	/* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
2107  	 * for delayed ack when there's one outstanding packet.
2108  	 */
2109  	timeout = rtt << 1;
2110  	if (tp->packets_out == 1)
2111  		timeout = max_t(u32, timeout,
2112  				(rtt + (rtt >> 1) + TCP_DELACK_MAX));
2113  	timeout = max_t(u32, timeout, msecs_to_jiffies(10));
2114  
2115  	/* If RTO is shorter, just schedule TLP in its place. */
2116  	tlp_time_stamp = tcp_time_stamp + timeout;
2117  	rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
2118  	if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
2119  		s32 delta = rto_time_stamp - tcp_time_stamp;
2120  		if (delta > 0)
2121  			timeout = delta;
2122  	}
2123  
2124  	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
2125  				  TCP_RTO_MAX);
2126  	return true;
2127  }
2128  
2129  /* Thanks to skb fast clones, we can detect if a prior transmit of
2130   * a packet is still in a qdisc or driver queue.
2131   * In this case, there is very little point doing a retransmit !
2132   * Note: This is called from BH context only.
2133   */
skb_still_in_host_queue(const struct sock * sk,const struct sk_buff * skb)2134  static bool skb_still_in_host_queue(const struct sock *sk,
2135  				    const struct sk_buff *skb)
2136  {
2137  	if (unlikely(skb_fclone_busy(sk, skb))) {
2138  		NET_INC_STATS_BH(sock_net(sk),
2139  				 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
2140  		return true;
2141  	}
2142  	return false;
2143  }
2144  
2145  /* When probe timeout (PTO) fires, send a new segment if one exists, else
2146   * retransmit the last segment.
2147   */
tcp_send_loss_probe(struct sock * sk)2148  void tcp_send_loss_probe(struct sock *sk)
2149  {
2150  	struct tcp_sock *tp = tcp_sk(sk);
2151  	struct sk_buff *skb;
2152  	int pcount;
2153  	int mss = tcp_current_mss(sk);
2154  	int err = -1;
2155  
2156  	if (tcp_send_head(sk) != NULL) {
2157  		err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
2158  		goto rearm_timer;
2159  	}
2160  
2161  	/* At most one outstanding TLP retransmission. */
2162  	if (tp->tlp_high_seq)
2163  		goto rearm_timer;
2164  
2165  	/* Retransmit last segment. */
2166  	skb = tcp_write_queue_tail(sk);
2167  	if (WARN_ON(!skb))
2168  		goto rearm_timer;
2169  
2170  	if (skb_still_in_host_queue(sk, skb))
2171  		goto rearm_timer;
2172  
2173  	pcount = tcp_skb_pcount(skb);
2174  	if (WARN_ON(!pcount))
2175  		goto rearm_timer;
2176  
2177  	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
2178  		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
2179  					  GFP_ATOMIC)))
2180  			goto rearm_timer;
2181  		skb = tcp_write_queue_tail(sk);
2182  	}
2183  
2184  	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
2185  		goto rearm_timer;
2186  
2187  	err = __tcp_retransmit_skb(sk, skb);
2188  
2189  	/* Record snd_nxt for loss detection. */
2190  	if (likely(!err))
2191  		tp->tlp_high_seq = tp->snd_nxt;
2192  
2193  rearm_timer:
2194  	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2195  				  inet_csk(sk)->icsk_rto,
2196  				  TCP_RTO_MAX);
2197  
2198  	if (likely(!err))
2199  		NET_INC_STATS_BH(sock_net(sk),
2200  				 LINUX_MIB_TCPLOSSPROBES);
2201  }
2202  
2203  /* Push out any pending frames which were held back due to
2204   * TCP_CORK or attempt at coalescing tiny packets.
2205   * The socket must be locked by the caller.
2206   */
__tcp_push_pending_frames(struct sock * sk,unsigned int cur_mss,int nonagle)2207  void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
2208  			       int nonagle)
2209  {
2210  	/* If we are closed, the bytes will have to remain here.
2211  	 * In time closedown will finish, we empty the write queue and
2212  	 * all will be happy.
2213  	 */
2214  	if (unlikely(sk->sk_state == TCP_CLOSE))
2215  		return;
2216  
2217  	if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
2218  			   sk_gfp_atomic(sk, GFP_ATOMIC)))
2219  		tcp_check_probe_timer(sk);
2220  }
2221  
2222  /* Send _single_ skb sitting at the send head. This function requires
2223   * true push pending frames to setup probe timer etc.
2224   */
tcp_push_one(struct sock * sk,unsigned int mss_now)2225  void tcp_push_one(struct sock *sk, unsigned int mss_now)
2226  {
2227  	struct sk_buff *skb = tcp_send_head(sk);
2228  
2229  	BUG_ON(!skb || skb->len < mss_now);
2230  
2231  	tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
2232  }
2233  
2234  /* This function returns the amount that we can raise the
2235   * usable window based on the following constraints
2236   *
2237   * 1. The window can never be shrunk once it is offered (RFC 793)
2238   * 2. We limit memory per socket
2239   *
2240   * RFC 1122:
2241   * "the suggested [SWS] avoidance algorithm for the receiver is to keep
2242   *  RECV.NEXT + RCV.WIN fixed until:
2243   *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
2244   *
2245   * i.e. don't raise the right edge of the window until you can raise
2246   * it at least MSS bytes.
2247   *
2248   * Unfortunately, the recommended algorithm breaks header prediction,
2249   * since header prediction assumes th->window stays fixed.
2250   *
2251   * Strictly speaking, keeping th->window fixed violates the receiver
2252   * side SWS prevention criteria. The problem is that under this rule
2253   * a stream of single byte packets will cause the right side of the
2254   * window to always advance by a single byte.
2255   *
2256   * Of course, if the sender implements sender side SWS prevention
2257   * then this will not be a problem.
2258   *
2259   * BSD seems to make the following compromise:
2260   *
2261   *	If the free space is less than the 1/4 of the maximum
2262   *	space available and the free space is less than 1/2 mss,
2263   *	then set the window to 0.
2264   *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
2265   *	Otherwise, just prevent the window from shrinking
2266   *	and from being larger than the largest representable value.
2267   *
2268   * This prevents incremental opening of the window in the regime
2269   * where TCP is limited by the speed of the reader side taking
2270   * data out of the TCP receive queue. It does nothing about
2271   * those cases where the window is constrained on the sender side
2272   * because the pipeline is full.
2273   *
2274   * BSD also seems to "accidentally" limit itself to windows that are a
2275   * multiple of MSS, at least until the free space gets quite small.
2276   * This would appear to be a side effect of the mbuf implementation.
2277   * Combining these two algorithms results in the observed behavior
2278   * of having a fixed window size at almost all times.
2279   *
2280   * Below we obtain similar behavior by forcing the offered window to
2281   * a multiple of the mss when it is feasible to do so.
2282   *
2283   * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
2284   * Regular options like TIMESTAMP are taken into account.
2285   */
__tcp_select_window(struct sock * sk)2286  u32 __tcp_select_window(struct sock *sk)
2287  {
2288  	struct inet_connection_sock *icsk = inet_csk(sk);
2289  	struct tcp_sock *tp = tcp_sk(sk);
2290  	/* MSS for the peer's data.  Previous versions used mss_clamp
2291  	 * here.  I don't know if the value based on our guesses
2292  	 * of peer's MSS is better for the performance.  It's more correct
2293  	 * but may be worse for the performance because of rcv_mss
2294  	 * fluctuations.  --SAW  1998/11/1
2295  	 */
2296  	int mss = icsk->icsk_ack.rcv_mss;
2297  	int free_space = tcp_space(sk);
2298  	int allowed_space = tcp_full_space(sk);
2299  	int full_space = min_t(int, tp->window_clamp, allowed_space);
2300  	int window;
2301  
2302  	if (unlikely(mss > full_space)) {
2303  		mss = full_space;
2304  		if (mss <= 0)
2305  			return 0;
2306  	}
2307  	if (free_space < (full_space >> 1)) {
2308  		icsk->icsk_ack.quick = 0;
2309  
2310  		if (sk_under_memory_pressure(sk))
2311  			tp->rcv_ssthresh = min(tp->rcv_ssthresh,
2312  					       4U * tp->advmss);
2313  
2314  		/* free_space might become our new window, make sure we don't
2315  		 * increase it due to wscale.
2316  		 */
2317  		free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
2318  
2319  		/* if free space is less than mss estimate, or is below 1/16th
2320  		 * of the maximum allowed, try to move to zero-window, else
2321  		 * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
2322  		 * new incoming data is dropped due to memory limits.
2323  		 * With large window, mss test triggers way too late in order
2324  		 * to announce zero window in time before rmem limit kicks in.
2325  		 */
2326  		if (free_space < (allowed_space >> 4) || free_space < mss)
2327  			return 0;
2328  	}
2329  
2330  	if (free_space > tp->rcv_ssthresh)
2331  		free_space = tp->rcv_ssthresh;
2332  
2333  	/* Don't do rounding if we are using window scaling, since the
2334  	 * scaled window will not line up with the MSS boundary anyway.
2335  	 */
2336  	window = tp->rcv_wnd;
2337  	if (tp->rx_opt.rcv_wscale) {
2338  		window = free_space;
2339  
2340  		/* Advertise enough space so that it won't get scaled away.
2341  		 * Import case: prevent zero window announcement if
2342  		 * 1<<rcv_wscale > mss.
2343  		 */
2344  		if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
2345  			window = (((window >> tp->rx_opt.rcv_wscale) + 1)
2346  				  << tp->rx_opt.rcv_wscale);
2347  	} else {
2348  		/* Get the largest window that is a nice multiple of mss.
2349  		 * Window clamp already applied above.
2350  		 * If our current window offering is within 1 mss of the
2351  		 * free space we just keep it. This prevents the divide
2352  		 * and multiply from happening most of the time.
2353  		 * We also don't do any window rounding when the free space
2354  		 * is too small.
2355  		 */
2356  		if (window <= free_space - mss || window > free_space)
2357  			window = (free_space / mss) * mss;
2358  		else if (mss == full_space &&
2359  			 free_space > window + (full_space >> 1))
2360  			window = free_space;
2361  	}
2362  
2363  	return window;
2364  }
2365  
2366  /* Collapses two adjacent SKB's during retransmission. */
tcp_collapse_retrans(struct sock * sk,struct sk_buff * skb)2367  static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2368  {
2369  	struct tcp_sock *tp = tcp_sk(sk);
2370  	struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
2371  	int skb_size, next_skb_size;
2372  
2373  	skb_size = skb->len;
2374  	next_skb_size = next_skb->len;
2375  
2376  	BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
2377  
2378  	tcp_highest_sack_replace(sk, next_skb, skb);
2379  
2380  	tcp_unlink_write_queue(next_skb, sk);
2381  
2382  	skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
2383  				  next_skb_size);
2384  
2385  	if (next_skb->ip_summed == CHECKSUM_PARTIAL)
2386  		skb->ip_summed = CHECKSUM_PARTIAL;
2387  
2388  	if (skb->ip_summed != CHECKSUM_PARTIAL)
2389  		skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
2390  
2391  	/* Update sequence range on original skb. */
2392  	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
2393  
2394  	/* Merge over control information. This moves PSH/FIN etc. over */
2395  	TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
2396  
2397  	/* All done, get rid of second SKB and account for it so
2398  	 * packet counting does not break.
2399  	 */
2400  	TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
2401  
2402  	/* changed transmit queue under us so clear hints */
2403  	tcp_clear_retrans_hints_partial(tp);
2404  	if (next_skb == tp->retransmit_skb_hint)
2405  		tp->retransmit_skb_hint = skb;
2406  
2407  	tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
2408  
2409  	sk_wmem_free_skb(sk, next_skb);
2410  }
2411  
2412  /* Check if coalescing SKBs is legal. */
tcp_can_collapse(const struct sock * sk,const struct sk_buff * skb)2413  static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
2414  {
2415  	if (tcp_skb_pcount(skb) > 1)
2416  		return false;
2417  	/* TODO: SACK collapsing could be used to remove this condition */
2418  	if (skb_shinfo(skb)->nr_frags != 0)
2419  		return false;
2420  	if (skb_cloned(skb))
2421  		return false;
2422  	if (skb == tcp_send_head(sk))
2423  		return false;
2424  	/* Some heurestics for collapsing over SACK'd could be invented */
2425  	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
2426  		return false;
2427  
2428  	return true;
2429  }
2430  
2431  /* Collapse packets in the retransmit queue to make to create
2432   * less packets on the wire. This is only done on retransmission.
2433   */
tcp_retrans_try_collapse(struct sock * sk,struct sk_buff * to,int space)2434  static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2435  				     int space)
2436  {
2437  	struct tcp_sock *tp = tcp_sk(sk);
2438  	struct sk_buff *skb = to, *tmp;
2439  	bool first = true;
2440  
2441  	if (!sysctl_tcp_retrans_collapse)
2442  		return;
2443  	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2444  		return;
2445  
2446  	tcp_for_write_queue_from_safe(skb, tmp, sk) {
2447  		if (!tcp_can_collapse(sk, skb))
2448  			break;
2449  
2450  		space -= skb->len;
2451  
2452  		if (first) {
2453  			first = false;
2454  			continue;
2455  		}
2456  
2457  		if (space < 0)
2458  			break;
2459  		/* Punt if not enough space exists in the first SKB for
2460  		 * the data in the second
2461  		 */
2462  		if (skb->len > skb_availroom(to))
2463  			break;
2464  
2465  		if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
2466  			break;
2467  
2468  		tcp_collapse_retrans(sk, to);
2469  	}
2470  }
2471  
2472  /* This retransmits one SKB.  Policy decisions and retransmit queue
2473   * state updates are done by the caller.  Returns non-zero if an
2474   * error occurred which prevented the send.
2475   */
__tcp_retransmit_skb(struct sock * sk,struct sk_buff * skb)2476  int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2477  {
2478  	struct tcp_sock *tp = tcp_sk(sk);
2479  	struct inet_connection_sock *icsk = inet_csk(sk);
2480  	unsigned int cur_mss;
2481  	int err;
2482  
2483  	/* Inconslusive MTU probe */
2484  	if (icsk->icsk_mtup.probe_size) {
2485  		icsk->icsk_mtup.probe_size = 0;
2486  	}
2487  
2488  	/* Do not sent more than we queued. 1/4 is reserved for possible
2489  	 * copying overhead: fragmentation, tunneling, mangling etc.
2490  	 */
2491  	if (atomic_read(&sk->sk_wmem_alloc) >
2492  	    min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
2493  		  sk->sk_sndbuf))
2494  		return -EAGAIN;
2495  
2496  	if (skb_still_in_host_queue(sk, skb))
2497  		return -EBUSY;
2498  
2499  	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
2500  		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
2501  			BUG();
2502  		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
2503  			return -ENOMEM;
2504  	}
2505  
2506  	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
2507  		return -EHOSTUNREACH; /* Routing failure or similar. */
2508  
2509  	cur_mss = tcp_current_mss(sk);
2510  
2511  	/* If receiver has shrunk his window, and skb is out of
2512  	 * new window, do not retransmit it. The exception is the
2513  	 * case, when window is shrunk to zero. In this case
2514  	 * our retransmit serves as a zero window probe.
2515  	 */
2516  	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
2517  	    TCP_SKB_CB(skb)->seq != tp->snd_una)
2518  		return -EAGAIN;
2519  
2520  	if (skb->len > cur_mss) {
2521  		if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
2522  			return -ENOMEM; /* We'll try again later. */
2523  	} else {
2524  		int oldpcount = tcp_skb_pcount(skb);
2525  
2526  		if (unlikely(oldpcount > 1)) {
2527  			if (skb_unclone(skb, GFP_ATOMIC))
2528  				return -ENOMEM;
2529  			tcp_init_tso_segs(sk, skb, cur_mss);
2530  			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
2531  		}
2532  	}
2533  
2534  	tcp_retrans_try_collapse(sk, skb, cur_mss);
2535  
2536  	/* Make a copy, if the first transmission SKB clone we made
2537  	 * is still in somebody's hands, else make a clone.
2538  	 */
2539  
2540  	/* make sure skb->data is aligned on arches that require it
2541  	 * and check if ack-trimming & collapsing extended the headroom
2542  	 * beyond what csum_start can cover.
2543  	 */
2544  	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
2545  		     skb_headroom(skb) >= 0xFFFF)) {
2546  		struct sk_buff *nskb;
2547  
2548  		skb_mstamp_get(&skb->skb_mstamp);
2549  		nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
2550  		err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
2551  			     -ENOBUFS;
2552  	} else {
2553  		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2554  	}
2555  
2556  	if (likely(!err)) {
2557  		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
2558  		/* Update global TCP statistics. */
2559  		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
2560  		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2561  			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
2562  		tp->total_retrans++;
2563  	}
2564  	return err;
2565  }
2566  
tcp_retransmit_skb(struct sock * sk,struct sk_buff * skb)2567  int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2568  {
2569  	struct tcp_sock *tp = tcp_sk(sk);
2570  	int err = __tcp_retransmit_skb(sk, skb);
2571  
2572  	if (err == 0) {
2573  #if FASTRETRANS_DEBUG > 0
2574  		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
2575  			net_dbg_ratelimited("retrans_out leaked\n");
2576  		}
2577  #endif
2578  		if (!tp->retrans_out)
2579  			tp->lost_retrans_low = tp->snd_nxt;
2580  		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
2581  		tp->retrans_out += tcp_skb_pcount(skb);
2582  
2583  		/* Save stamp of the first retransmit. */
2584  		if (!tp->retrans_stamp)
2585  			tp->retrans_stamp = tcp_skb_timestamp(skb);
2586  
2587  		/* snd_nxt is stored to detect loss of retransmitted segment,
2588  		 * see tcp_input.c tcp_sacktag_write_queue().
2589  		 */
2590  		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
2591  	} else if (err != -EBUSY) {
2592  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
2593  	}
2594  
2595  	if (tp->undo_retrans < 0)
2596  		tp->undo_retrans = 0;
2597  	tp->undo_retrans += tcp_skb_pcount(skb);
2598  	return err;
2599  }
2600  
2601  /* Check if we forward retransmits are possible in the current
2602   * window/congestion state.
2603   */
tcp_can_forward_retransmit(struct sock * sk)2604  static bool tcp_can_forward_retransmit(struct sock *sk)
2605  {
2606  	const struct inet_connection_sock *icsk = inet_csk(sk);
2607  	const struct tcp_sock *tp = tcp_sk(sk);
2608  
2609  	/* Forward retransmissions are possible only during Recovery. */
2610  	if (icsk->icsk_ca_state != TCP_CA_Recovery)
2611  		return false;
2612  
2613  	/* No forward retransmissions in Reno are possible. */
2614  	if (tcp_is_reno(tp))
2615  		return false;
2616  
2617  	/* Yeah, we have to make difficult choice between forward transmission
2618  	 * and retransmission... Both ways have their merits...
2619  	 *
2620  	 * For now we do not retransmit anything, while we have some new
2621  	 * segments to send. In the other cases, follow rule 3 for
2622  	 * NextSeg() specified in RFC3517.
2623  	 */
2624  
2625  	if (tcp_may_send_now(sk))
2626  		return false;
2627  
2628  	return true;
2629  }
2630  
2631  /* This gets called after a retransmit timeout, and the initially
2632   * retransmitted data is acknowledged.  It tries to continue
2633   * resending the rest of the retransmit queue, until either
2634   * we've sent it all or the congestion window limit is reached.
2635   * If doing SACK, the first ACK which comes back for a timeout
2636   * based retransmit packet might feed us FACK information again.
2637   * If so, we use it to avoid unnecessarily retransmissions.
2638   */
tcp_xmit_retransmit_queue(struct sock * sk)2639  void tcp_xmit_retransmit_queue(struct sock *sk)
2640  {
2641  	const struct inet_connection_sock *icsk = inet_csk(sk);
2642  	struct tcp_sock *tp = tcp_sk(sk);
2643  	struct sk_buff *skb;
2644  	struct sk_buff *hole = NULL;
2645  	u32 last_lost;
2646  	int mib_idx;
2647  	int fwd_rexmitting = 0;
2648  
2649  	if (!tp->packets_out)
2650  		return;
2651  
2652  	if (!tp->lost_out)
2653  		tp->retransmit_high = tp->snd_una;
2654  
2655  	if (tp->retransmit_skb_hint) {
2656  		skb = tp->retransmit_skb_hint;
2657  		last_lost = TCP_SKB_CB(skb)->end_seq;
2658  		if (after(last_lost, tp->retransmit_high))
2659  			last_lost = tp->retransmit_high;
2660  	} else {
2661  		skb = tcp_write_queue_head(sk);
2662  		last_lost = tp->snd_una;
2663  	}
2664  
2665  	tcp_for_write_queue_from(skb, sk) {
2666  		__u8 sacked = TCP_SKB_CB(skb)->sacked;
2667  
2668  		if (skb == tcp_send_head(sk))
2669  			break;
2670  		/* we could do better than to assign each time */
2671  		if (hole == NULL)
2672  			tp->retransmit_skb_hint = skb;
2673  
2674  		/* Assume this retransmit will generate
2675  		 * only one packet for congestion window
2676  		 * calculation purposes.  This works because
2677  		 * tcp_retransmit_skb() will chop up the
2678  		 * packet to be MSS sized and all the
2679  		 * packet counting works out.
2680  		 */
2681  		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
2682  			return;
2683  
2684  		if (fwd_rexmitting) {
2685  begin_fwd:
2686  			if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
2687  				break;
2688  			mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
2689  
2690  		} else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
2691  			tp->retransmit_high = last_lost;
2692  			if (!tcp_can_forward_retransmit(sk))
2693  				break;
2694  			/* Backtrack if necessary to non-L'ed skb */
2695  			if (hole != NULL) {
2696  				skb = hole;
2697  				hole = NULL;
2698  			}
2699  			fwd_rexmitting = 1;
2700  			goto begin_fwd;
2701  
2702  		} else if (!(sacked & TCPCB_LOST)) {
2703  			if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
2704  				hole = skb;
2705  			continue;
2706  
2707  		} else {
2708  			last_lost = TCP_SKB_CB(skb)->end_seq;
2709  			if (icsk->icsk_ca_state != TCP_CA_Loss)
2710  				mib_idx = LINUX_MIB_TCPFASTRETRANS;
2711  			else
2712  				mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
2713  		}
2714  
2715  		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
2716  			continue;
2717  
2718  		if (tcp_retransmit_skb(sk, skb))
2719  			return;
2720  
2721  		NET_INC_STATS_BH(sock_net(sk), mib_idx);
2722  
2723  		if (tcp_in_cwnd_reduction(sk))
2724  			tp->prr_out += tcp_skb_pcount(skb);
2725  
2726  		if (skb == tcp_write_queue_head(sk))
2727  			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2728  						  inet_csk(sk)->icsk_rto,
2729  						  TCP_RTO_MAX);
2730  	}
2731  }
2732  
2733  /* We allow to exceed memory limits for FIN packets to expedite
2734   * connection tear down and (memory) recovery.
2735   * Otherwise tcp_send_fin() could be tempted to either delay FIN
2736   * or even be forced to close flow without any FIN.
2737   */
sk_forced_wmem_schedule(struct sock * sk,int size)2738  static void sk_forced_wmem_schedule(struct sock *sk, int size)
2739  {
2740  	int amt, status;
2741  
2742  	if (size <= sk->sk_forward_alloc)
2743  		return;
2744  	amt = sk_mem_pages(size);
2745  	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2746  	sk_memory_allocated_add(sk, amt, &status);
2747  }
2748  
2749  /* Send a FIN. The caller locks the socket for us.
2750   * We should try to send a FIN packet really hard, but eventually give up.
2751   */
tcp_send_fin(struct sock * sk)2752  void tcp_send_fin(struct sock *sk)
2753  {
2754  	struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
2755  	struct tcp_sock *tp = tcp_sk(sk);
2756  
2757  	/* Optimization, tack on the FIN if we have one skb in write queue and
2758  	 * this skb was not yet sent, or we are under memory pressure.
2759  	 * Note: in the latter case, FIN packet will be sent after a timeout,
2760  	 * as TCP stack thinks it has already been transmitted.
2761  	 */
2762  	if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
2763  coalesce:
2764  		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
2765  		TCP_SKB_CB(tskb)->end_seq++;
2766  		tp->write_seq++;
2767  		if (!tcp_send_head(sk)) {
2768  			/* This means tskb was already sent.
2769  			 * Pretend we included the FIN on previous transmit.
2770  			 * We need to set tp->snd_nxt to the value it would have
2771  			 * if FIN had been sent. This is because retransmit path
2772  			 * does not change tp->snd_nxt.
2773  			 */
2774  			tp->snd_nxt++;
2775  			return;
2776  		}
2777  	} else {
2778  		skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
2779  		if (unlikely(!skb)) {
2780  			if (tskb)
2781  				goto coalesce;
2782  			return;
2783  		}
2784  		skb_reserve(skb, MAX_TCP_HEADER);
2785  		sk_forced_wmem_schedule(sk, skb->truesize);
2786  		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
2787  		tcp_init_nondata_skb(skb, tp->write_seq,
2788  				     TCPHDR_ACK | TCPHDR_FIN);
2789  		tcp_queue_skb(sk, skb);
2790  	}
2791  	__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
2792  }
2793  
2794  /* We get here when a process closes a file descriptor (either due to
2795   * an explicit close() or as a byproduct of exit()'ing) and there
2796   * was unread data in the receive queue.  This behavior is recommended
2797   * by RFC 2525, section 2.17.  -DaveM
2798   */
tcp_send_active_reset(struct sock * sk,gfp_t priority)2799  void tcp_send_active_reset(struct sock *sk, gfp_t priority)
2800  {
2801  	struct sk_buff *skb;
2802  
2803  	/* NOTE: No TCP options attached and we never retransmit this. */
2804  	skb = alloc_skb(MAX_TCP_HEADER, priority);
2805  	if (!skb) {
2806  		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
2807  		return;
2808  	}
2809  
2810  	/* Reserve space for headers and prepare control bits. */
2811  	skb_reserve(skb, MAX_TCP_HEADER);
2812  	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
2813  			     TCPHDR_ACK | TCPHDR_RST);
2814  	skb_mstamp_get(&skb->skb_mstamp);
2815  	/* Send it off. */
2816  	if (tcp_transmit_skb(sk, skb, 0, priority))
2817  		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
2818  
2819  	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
2820  }
2821  
2822  /* Send a crossed SYN-ACK during socket establishment.
2823   * WARNING: This routine must only be called when we have already sent
2824   * a SYN packet that crossed the incoming SYN that caused this routine
2825   * to get called. If this assumption fails then the initial rcv_wnd
2826   * and rcv_wscale values will not be correct.
2827   */
tcp_send_synack(struct sock * sk)2828  int tcp_send_synack(struct sock *sk)
2829  {
2830  	struct sk_buff *skb;
2831  
2832  	skb = tcp_write_queue_head(sk);
2833  	if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
2834  		pr_debug("%s: wrong queue state\n", __func__);
2835  		return -EFAULT;
2836  	}
2837  	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
2838  		if (skb_cloned(skb)) {
2839  			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
2840  			if (nskb == NULL)
2841  				return -ENOMEM;
2842  			tcp_unlink_write_queue(skb, sk);
2843  			__skb_header_release(nskb);
2844  			__tcp_add_write_queue_head(sk, nskb);
2845  			sk_wmem_free_skb(sk, skb);
2846  			sk->sk_wmem_queued += nskb->truesize;
2847  			sk_mem_charge(sk, nskb->truesize);
2848  			skb = nskb;
2849  		}
2850  
2851  		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
2852  		tcp_ecn_send_synack(sk, skb);
2853  	}
2854  	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2855  }
2856  
2857  /**
2858   * tcp_make_synack - Prepare a SYN-ACK.
2859   * sk: listener socket
2860   * dst: dst entry attached to the SYNACK
2861   * req: request_sock pointer
2862   *
2863   * Allocate one skb and build a SYNACK packet.
2864   * @dst is consumed : Caller should not use it again.
2865   */
tcp_make_synack(struct sock * sk,struct dst_entry * dst,struct request_sock * req,struct tcp_fastopen_cookie * foc)2866  struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2867  				struct request_sock *req,
2868  				struct tcp_fastopen_cookie *foc)
2869  {
2870  	struct tcp_out_options opts;
2871  	struct inet_request_sock *ireq = inet_rsk(req);
2872  	struct tcp_sock *tp = tcp_sk(sk);
2873  	struct tcphdr *th;
2874  	struct sk_buff *skb;
2875  	struct tcp_md5sig_key *md5;
2876  	int tcp_header_size;
2877  	int mss;
2878  
2879  	skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
2880  	if (unlikely(!skb)) {
2881  		dst_release(dst);
2882  		return NULL;
2883  	}
2884  	/* Reserve space for headers. */
2885  	skb_reserve(skb, MAX_TCP_HEADER);
2886  
2887  	skb_dst_set(skb, dst);
2888  	security_skb_owned_by(skb, sk);
2889  
2890  	mss = dst_metric_advmss(dst);
2891  	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
2892  		mss = tp->rx_opt.user_mss;
2893  
2894  	memset(&opts, 0, sizeof(opts));
2895  #ifdef CONFIG_SYN_COOKIES
2896  	if (unlikely(req->cookie_ts))
2897  		skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req);
2898  	else
2899  #endif
2900  	skb_mstamp_get(&skb->skb_mstamp);
2901  	tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
2902  					     foc) + sizeof(*th);
2903  
2904  	skb_push(skb, tcp_header_size);
2905  	skb_reset_transport_header(skb);
2906  
2907  	th = tcp_hdr(skb);
2908  	memset(th, 0, sizeof(struct tcphdr));
2909  	th->syn = 1;
2910  	th->ack = 1;
2911  	tcp_ecn_make_synack(req, th, sk);
2912  	th->source = htons(ireq->ir_num);
2913  	th->dest = ireq->ir_rmt_port;
2914  	skb->ip_summed = CHECKSUM_PARTIAL;
2915  	th->seq = htonl(tcp_rsk(req)->snt_isn);
2916  	/* XXX data is queued and acked as is. No buffer/window check */
2917  	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
2918  
2919  	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2920  	th->window = htons(min(req->rcv_wnd, 65535U));
2921  	tcp_options_write((__be32 *)(th + 1), tp, &opts);
2922  	th->doff = (tcp_header_size >> 2);
2923  	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
2924  
2925  #ifdef CONFIG_TCP_MD5SIG
2926  	/* Okay, we have all we need - do the md5 hash if needed */
2927  	if (md5) {
2928  		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
2929  					       md5, NULL, req, skb);
2930  	}
2931  #endif
2932  
2933  	/* Do not fool tcpdump (if any), clean our debris */
2934  	skb->tstamp.tv64 = 0;
2935  	return skb;
2936  }
2937  EXPORT_SYMBOL(tcp_make_synack);
2938  
2939  /* Do all connect socket setups that can be done AF independent. */
tcp_connect_init(struct sock * sk)2940  static void tcp_connect_init(struct sock *sk)
2941  {
2942  	const struct dst_entry *dst = __sk_dst_get(sk);
2943  	struct tcp_sock *tp = tcp_sk(sk);
2944  	__u8 rcv_wscale;
2945  
2946  	/* We'll fix this up when we get a response from the other end.
2947  	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
2948  	 */
2949  	tp->tcp_header_len = sizeof(struct tcphdr) +
2950  		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
2951  
2952  #ifdef CONFIG_TCP_MD5SIG
2953  	if (tp->af_specific->md5_lookup(sk, sk) != NULL)
2954  		tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
2955  #endif
2956  
2957  	/* If user gave his TCP_MAXSEG, record it to clamp */
2958  	if (tp->rx_opt.user_mss)
2959  		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
2960  	tp->max_window = 0;
2961  	tcp_mtup_init(sk);
2962  	tcp_sync_mss(sk, dst_mtu(dst));
2963  
2964  	if (!tp->window_clamp)
2965  		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
2966  	tp->advmss = dst_metric_advmss(dst);
2967  	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
2968  		tp->advmss = tp->rx_opt.user_mss;
2969  
2970  	tcp_initialize_rcv_mss(sk);
2971  
2972  	/* limit the window selection if the user enforce a smaller rx buffer */
2973  	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2974  	    (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
2975  		tp->window_clamp = tcp_full_space(sk);
2976  
2977  	tcp_select_initial_window(tcp_full_space(sk),
2978  				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
2979  				  &tp->rcv_wnd,
2980  				  &tp->window_clamp,
2981  				  sysctl_tcp_window_scaling,
2982  				  &rcv_wscale,
2983  				  dst_metric(dst, RTAX_INITRWND));
2984  
2985  	tp->rx_opt.rcv_wscale = rcv_wscale;
2986  	tp->rcv_ssthresh = tp->rcv_wnd;
2987  
2988  	sk->sk_err = 0;
2989  	sock_reset_flag(sk, SOCK_DONE);
2990  	tp->snd_wnd = 0;
2991  	tcp_init_wl(tp, 0);
2992  	tp->snd_una = tp->write_seq;
2993  	tp->snd_sml = tp->write_seq;
2994  	tp->snd_up = tp->write_seq;
2995  	tp->snd_nxt = tp->write_seq;
2996  
2997  	if (likely(!tp->repair))
2998  		tp->rcv_nxt = 0;
2999  	else
3000  		tp->rcv_tstamp = tcp_time_stamp;
3001  	tp->rcv_wup = tp->rcv_nxt;
3002  	tp->copied_seq = tp->rcv_nxt;
3003  
3004  	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
3005  	inet_csk(sk)->icsk_retransmits = 0;
3006  	tcp_clear_retrans(tp);
3007  }
3008  
tcp_connect_queue_skb(struct sock * sk,struct sk_buff * skb)3009  static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
3010  {
3011  	struct tcp_sock *tp = tcp_sk(sk);
3012  	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
3013  
3014  	tcb->end_seq += skb->len;
3015  	__skb_header_release(skb);
3016  	__tcp_add_write_queue_tail(sk, skb);
3017  	sk->sk_wmem_queued += skb->truesize;
3018  	sk_mem_charge(sk, skb->truesize);
3019  	tp->write_seq = tcb->end_seq;
3020  	tp->packets_out += tcp_skb_pcount(skb);
3021  }
3022  
3023  /* Build and send a SYN with data and (cached) Fast Open cookie. However,
3024   * queue a data-only packet after the regular SYN, such that regular SYNs
3025   * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
3026   * only the SYN sequence, the data are retransmitted in the first ACK.
3027   * If cookie is not cached or other error occurs, falls back to send a
3028   * regular SYN with Fast Open cookie request option.
3029   */
tcp_send_syn_data(struct sock * sk,struct sk_buff * syn)3030  static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
3031  {
3032  	struct tcp_sock *tp = tcp_sk(sk);
3033  	struct tcp_fastopen_request *fo = tp->fastopen_req;
3034  	int syn_loss = 0, space, err = 0;
3035  	unsigned long last_syn_loss = 0;
3036  	struct sk_buff *syn_data;
3037  
3038  	tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
3039  	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
3040  			       &syn_loss, &last_syn_loss);
3041  	/* Recurring FO SYN losses: revert to regular handshake temporarily */
3042  	if (syn_loss > 1 &&
3043  	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
3044  		fo->cookie.len = -1;
3045  		goto fallback;
3046  	}
3047  
3048  	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
3049  		fo->cookie.len = -1;
3050  	else if (fo->cookie.len <= 0)
3051  		goto fallback;
3052  
3053  	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
3054  	 * user-MSS. Reserve maximum option space for middleboxes that add
3055  	 * private TCP options. The cost is reduced data space in SYN :(
3056  	 */
3057  	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
3058  		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
3059  	space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
3060  		MAX_TCP_OPTION_SPACE;
3061  
3062  	space = min_t(size_t, space, fo->size);
3063  
3064  	/* limit to order-0 allocations */
3065  	space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
3066  
3067  	syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation);
3068  	if (!syn_data)
3069  		goto fallback;
3070  	syn_data->ip_summed = CHECKSUM_PARTIAL;
3071  	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
3072  	if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space),
3073  					 fo->data->msg_iov, 0, space))) {
3074  		kfree_skb(syn_data);
3075  		goto fallback;
3076  	}
3077  
3078  	/* No more data pending in inet_wait_for_connect() */
3079  	if (space == fo->size)
3080  		fo->data = NULL;
3081  	fo->copied = space;
3082  
3083  	tcp_connect_queue_skb(sk, syn_data);
3084  
3085  	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
3086  
3087  	syn->skb_mstamp = syn_data->skb_mstamp;
3088  
3089  	/* Now full SYN+DATA was cloned and sent (or not),
3090  	 * remove the SYN from the original skb (syn_data)
3091  	 * we keep in write queue in case of a retransmit, as we
3092  	 * also have the SYN packet (with no data) in the same queue.
3093  	 */
3094  	TCP_SKB_CB(syn_data)->seq++;
3095  	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
3096  	if (!err) {
3097  		tp->syn_data = (fo->copied > 0);
3098  		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
3099  		goto done;
3100  	}
3101  
3102  fallback:
3103  	/* Send a regular SYN with Fast Open cookie request option */
3104  	if (fo->cookie.len > 0)
3105  		fo->cookie.len = 0;
3106  	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
3107  	if (err)
3108  		tp->syn_fastopen = 0;
3109  done:
3110  	fo->cookie.len = -1;  /* Exclude Fast Open option for SYN retries */
3111  	return err;
3112  }
3113  
3114  /* Build a SYN and send it off. */
tcp_connect(struct sock * sk)3115  int tcp_connect(struct sock *sk)
3116  {
3117  	struct tcp_sock *tp = tcp_sk(sk);
3118  	struct sk_buff *buff;
3119  	int err;
3120  
3121  	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
3122  		return -EHOSTUNREACH; /* Routing failure or similar. */
3123  
3124  	tcp_connect_init(sk);
3125  
3126  	if (unlikely(tp->repair)) {
3127  		tcp_finish_connect(sk, NULL);
3128  		return 0;
3129  	}
3130  
3131  	buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
3132  	if (unlikely(!buff))
3133  		return -ENOBUFS;
3134  
3135  	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
3136  	tp->retrans_stamp = tcp_time_stamp;
3137  	tcp_connect_queue_skb(sk, buff);
3138  	tcp_ecn_send_syn(sk, buff);
3139  
3140  	/* Send off SYN; include data in Fast Open. */
3141  	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
3142  	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
3143  	if (err == -ECONNREFUSED)
3144  		return err;
3145  
3146  	/* We change tp->snd_nxt after the tcp_transmit_skb() call
3147  	 * in order to make this packet get counted in tcpOutSegs.
3148  	 */
3149  	tp->snd_nxt = tp->write_seq;
3150  	tp->pushed_seq = tp->write_seq;
3151  	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
3152  
3153  	/* Timer for repeating the SYN until an answer. */
3154  	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3155  				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
3156  	return 0;
3157  }
3158  EXPORT_SYMBOL(tcp_connect);
3159  
3160  /* Send out a delayed ack, the caller does the policy checking
3161   * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
3162   * for details.
3163   */
tcp_send_delayed_ack(struct sock * sk)3164  void tcp_send_delayed_ack(struct sock *sk)
3165  {
3166  	struct inet_connection_sock *icsk = inet_csk(sk);
3167  	int ato = icsk->icsk_ack.ato;
3168  	unsigned long timeout;
3169  
3170  	tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
3171  
3172  	if (ato > TCP_DELACK_MIN) {
3173  		const struct tcp_sock *tp = tcp_sk(sk);
3174  		int max_ato = HZ / 2;
3175  
3176  		if (icsk->icsk_ack.pingpong ||
3177  		    (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
3178  			max_ato = TCP_DELACK_MAX;
3179  
3180  		/* Slow path, intersegment interval is "high". */
3181  
3182  		/* If some rtt estimate is known, use it to bound delayed ack.
3183  		 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
3184  		 * directly.
3185  		 */
3186  		if (tp->srtt_us) {
3187  			int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
3188  					TCP_DELACK_MIN);
3189  
3190  			if (rtt < max_ato)
3191  				max_ato = rtt;
3192  		}
3193  
3194  		ato = min(ato, max_ato);
3195  	}
3196  
3197  	/* Stay within the limit we were given */
3198  	timeout = jiffies + ato;
3199  
3200  	/* Use new timeout only if there wasn't a older one earlier. */
3201  	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
3202  		/* If delack timer was blocked or is about to expire,
3203  		 * send ACK now.
3204  		 */
3205  		if (icsk->icsk_ack.blocked ||
3206  		    time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
3207  			tcp_send_ack(sk);
3208  			return;
3209  		}
3210  
3211  		if (!time_before(timeout, icsk->icsk_ack.timeout))
3212  			timeout = icsk->icsk_ack.timeout;
3213  	}
3214  	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
3215  	icsk->icsk_ack.timeout = timeout;
3216  	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
3217  }
3218  
3219  /* This routine sends an ack and also updates the window. */
tcp_send_ack(struct sock * sk)3220  void tcp_send_ack(struct sock *sk)
3221  {
3222  	struct sk_buff *buff;
3223  
3224  	/* If we have been reset, we may not send again. */
3225  	if (sk->sk_state == TCP_CLOSE)
3226  		return;
3227  
3228  	tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
3229  
3230  	/* We are not putting this on the write queue, so
3231  	 * tcp_transmit_skb() will set the ownership to this
3232  	 * sock.
3233  	 */
3234  	buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
3235  	if (buff == NULL) {
3236  		inet_csk_schedule_ack(sk);
3237  		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
3238  		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
3239  					  TCP_DELACK_MAX, TCP_RTO_MAX);
3240  		return;
3241  	}
3242  
3243  	/* Reserve space for headers and prepare control bits. */
3244  	skb_reserve(buff, MAX_TCP_HEADER);
3245  	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
3246  
3247  	/* Send it off, this clears delayed acks for us. */
3248  	skb_mstamp_get(&buff->skb_mstamp);
3249  	tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
3250  }
3251  EXPORT_SYMBOL_GPL(tcp_send_ack);
3252  
3253  /* This routine sends a packet with an out of date sequence
3254   * number. It assumes the other end will try to ack it.
3255   *
3256   * Question: what should we make while urgent mode?
3257   * 4.4BSD forces sending single byte of data. We cannot send
3258   * out of window data, because we have SND.NXT==SND.MAX...
3259   *
3260   * Current solution: to send TWO zero-length segments in urgent mode:
3261   * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
3262   * out-of-date with SND.UNA-1 to probe window.
3263   */
tcp_xmit_probe_skb(struct sock * sk,int urgent)3264  static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
3265  {
3266  	struct tcp_sock *tp = tcp_sk(sk);
3267  	struct sk_buff *skb;
3268  
3269  	/* We don't queue it, tcp_transmit_skb() sets ownership. */
3270  	skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
3271  	if (skb == NULL)
3272  		return -1;
3273  
3274  	/* Reserve space for headers and set control bits. */
3275  	skb_reserve(skb, MAX_TCP_HEADER);
3276  	/* Use a previous sequence.  This should cause the other
3277  	 * end to send an ack.  Don't queue or clone SKB, just
3278  	 * send it.
3279  	 */
3280  	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
3281  	skb_mstamp_get(&skb->skb_mstamp);
3282  	return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
3283  }
3284  
tcp_send_window_probe(struct sock * sk)3285  void tcp_send_window_probe(struct sock *sk)
3286  {
3287  	if (sk->sk_state == TCP_ESTABLISHED) {
3288  		tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
3289  		tcp_xmit_probe_skb(sk, 0);
3290  	}
3291  }
3292  
3293  /* Initiate keepalive or window probe from timer. */
tcp_write_wakeup(struct sock * sk)3294  int tcp_write_wakeup(struct sock *sk)
3295  {
3296  	struct tcp_sock *tp = tcp_sk(sk);
3297  	struct sk_buff *skb;
3298  
3299  	if (sk->sk_state == TCP_CLOSE)
3300  		return -1;
3301  
3302  	if ((skb = tcp_send_head(sk)) != NULL &&
3303  	    before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
3304  		int err;
3305  		unsigned int mss = tcp_current_mss(sk);
3306  		unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
3307  
3308  		if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
3309  			tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
3310  
3311  		/* We are probing the opening of a window
3312  		 * but the window size is != 0
3313  		 * must have been a result SWS avoidance ( sender )
3314  		 */
3315  		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
3316  		    skb->len > mss) {
3317  			seg_size = min(seg_size, mss);
3318  			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3319  			if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
3320  				return -1;
3321  		} else if (!tcp_skb_pcount(skb))
3322  			tcp_set_skb_tso_segs(sk, skb, mss);
3323  
3324  		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3325  		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
3326  		if (!err)
3327  			tcp_event_new_data_sent(sk, skb);
3328  		return err;
3329  	} else {
3330  		if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
3331  			tcp_xmit_probe_skb(sk, 1);
3332  		return tcp_xmit_probe_skb(sk, 0);
3333  	}
3334  }
3335  
3336  /* A window probe timeout has occurred.  If window is not closed send
3337   * a partial packet else a zero probe.
3338   */
tcp_send_probe0(struct sock * sk)3339  void tcp_send_probe0(struct sock *sk)
3340  {
3341  	struct inet_connection_sock *icsk = inet_csk(sk);
3342  	struct tcp_sock *tp = tcp_sk(sk);
3343  	unsigned long probe_max;
3344  	int err;
3345  
3346  	err = tcp_write_wakeup(sk);
3347  
3348  	if (tp->packets_out || !tcp_send_head(sk)) {
3349  		/* Cancel probe timer, if it is not required. */
3350  		icsk->icsk_probes_out = 0;
3351  		icsk->icsk_backoff = 0;
3352  		return;
3353  	}
3354  
3355  	if (err <= 0) {
3356  		if (icsk->icsk_backoff < sysctl_tcp_retries2)
3357  			icsk->icsk_backoff++;
3358  		icsk->icsk_probes_out++;
3359  		probe_max = TCP_RTO_MAX;
3360  	} else {
3361  		/* If packet was not sent due to local congestion,
3362  		 * do not backoff and do not remember icsk_probes_out.
3363  		 * Let local senders to fight for local resources.
3364  		 *
3365  		 * Use accumulated backoff yet.
3366  		 */
3367  		if (!icsk->icsk_probes_out)
3368  			icsk->icsk_probes_out = 1;
3369  		probe_max = TCP_RESOURCE_PROBE_INTERVAL;
3370  	}
3371  	inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3372  				  inet_csk_rto_backoff(icsk, probe_max),
3373  				  TCP_RTO_MAX);
3374  }
3375  
tcp_rtx_synack(struct sock * sk,struct request_sock * req)3376  int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
3377  {
3378  	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
3379  	struct flowi fl;
3380  	int res;
3381  
3382  	res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
3383  	if (!res) {
3384  		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
3385  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
3386  	}
3387  	return res;
3388  }
3389  EXPORT_SYMBOL(tcp_rtx_synack);
3390