1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Based on net/ipv4/tcp_output.c
4  * Authors:	Ross Biro
5  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
6  *		Mark Evans, <evansmp@uhura.aston.ac.uk>
7  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
8  *		Florian La Roche, <flla@stud.uni-sb.de>
9  *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
10  *		Linus Torvalds, <torvalds@cs.helsinki.fi>
11  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
12  *		Matthew Dillon, <dillon@apollo.west.oic.com>
13  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
14  *		Jorge Cwik, <jorge@laser.satlink.net>
15  *
16  * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
17  *				:	Fragmentation on mtu decrease
18  *				:	Segment collapse on retransmit
19  *				:	AF independence
20  *
21  *		Linus Torvalds	:	send_delayed_ack
22  *		David S. Miller	:	Charge memory using the right skb
23  *					during syn/ack processing.
24  *		David S. Miller :	Output engine completely rewritten.
25  *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
26  *		Cacophonix Gaul :	draft-minshall-nagle-01
27  *		J Hadi Salim	:	ECN support
28  *
29  * Based on net/ipv4/tcp_minisocks.c
30  * Authors:	Ross Biro
31  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
32  *		Mark Evans, <evansmp@uhura.aston.ac.uk>
33  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
34  *		Florian La Roche, <flla@stud.uni-sb.de>
35  *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
36  *		Linus Torvalds, <torvalds@cs.helsinki.fi>
37  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
38  *		Matthew Dillon, <dillon@apollo.west.oic.com>
39  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
40  *		Jorge Cwik, <jorge@laser.satlink.net>
41  *
42  * NewIP INET
43  * An implementation of the TCP/IP protocol suite for the LINUX
44  * operating system. NewIP INET is implemented using the  BSD Socket
45  * interface as the means of communication with the user level.
46  *
47  * Implementation of the Transmission Control Protocol(TCP).
48  */
49 #define pr_fmt(fmt) KBUILD_MODNAME ": [%s:%d] " fmt, __func__, __LINE__
50 
51 #include <net/nip.h>
52 #include <net/tcp_nip.h>
53 #include <net/tcp.h>
54 #include <net/ninet_connection_sock.h>
55 #include <linux/compiler.h>
56 #include <linux/module.h>
57 #include <net/nip_udp.h>
58 #include "nip_hdr.h"
59 #include "nip_checksum.h"
60 #include "tcp_nip_parameter.h"
61 
62 #define OPTION_SACK_ADVERTISE   BIT(0)
63 #define OPTION_TS               BIT(1)
64 #define OPTION_MD5              BIT(2)
65 #define OPTION_WSCALE           BIT(3)
66 #define OPTION_FAST_OPEN_COOKIE BIT(8)
67 #define TCP_NIP_SND_NUM_MAX     (~0U)
68 
69 /* Store the options contained in TCP when sending TCP packets */
70 struct tcp_nip_out_options {
71 	u16 options;        /* bit field of OPTION_* */
72 	u16 mss;            /* If it is zero, the MSS option is disabled */
73 
74 	u8 ws;              /* window scale, 0 to disable */
75 	__u32 tsval, tsecr; /* need to include OPTION_TS */
76 };
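/* Usage sketch (illustrative, not part of the original source): how a caller
 * might fill this structure before handing it to tcp_nip_options_write().
 * The MSS value is hypothetical.
 *
 *	struct tcp_nip_out_options opts = { 0 };
 *
 *	opts.mss = 1400;                    // non-zero enables the MSS option
 *	opts.options |= OPTION_TS;          // request the timestamp option
 *	opts.tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
 *	opts.tsecr = tp->rx_opt.ts_recent;
 *
 * Note that tcp_nip_options_write() in this file currently emits only the MSS
 * option; the timestamp fields are computed but not written to the wire.
 */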
77 
78 static bool tcp_nip_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
79 			       int push_one, gfp_t gfp);
80 
81 /* Calculate MSS not accounting any TCP options.  */
82 static int __tcp_nip_mtu_to_mss(struct sock *sk, int pmtu)
83 {
84 	const struct tcp_sock *tp = tcp_sk(sk);
85 	const struct inet_connection_sock *icsk = inet_csk(sk);
86 	int mss_now;
87 	int nip_hdr_len = get_nip_hdr_len(NIP_HDR_COMM, &sk->SK_NIP_RCV_SADDR, &sk->SK_NIP_DADDR);
88 
89 	/* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */
90 	nip_hdr_len = nip_hdr_len == 0 ? NIP_HDR_MAX : nip_hdr_len;
91 	mss_now = pmtu - nip_hdr_len - sizeof(struct tcphdr);
92 
93 	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
94 	if (icsk->icsk_af_ops->net_frag_header_len) {
95 		const struct dst_entry *dst = __sk_dst_get(sk);
96 
97 		if (dst && dst_allfrag(dst))
98 			mss_now -= icsk->icsk_af_ops->net_frag_header_len;
99 	}
100 
101 	/* Clamp it (mss_clamp does not include tcp options) */
102 	if (mss_now > tp->rx_opt.mss_clamp)
103 		mss_now = tp->rx_opt.mss_clamp;
104 
105 	/* Now subtract optional transport overhead */
106 	mss_now -= icsk->icsk_ext_hdr_len;
107 
108 	/* Then reserve room for full set of TCP options and 8 bytes of data */
109 	mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
110 	return mss_now;
111 }
112 
113 /* Calculate MSS. Not accounting for SACKs here.  */
114 int tcp_nip_mtu_to_mss(struct sock *sk, int pmtu)
115 {
116 	/* Subtract TCP options size, not including SACKs */
117 	return __tcp_nip_mtu_to_mss(sk, pmtu) -
118 	       (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
119 }
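/* Worked example with hypothetical numbers (assuming no extension headers and
 * no clamping): for a 1500-byte path MTU, a 9-byte NewIP common header and the
 * 20-byte base TCP header, __tcp_nip_mtu_to_mss() gives 1500 - 9 - 20 = 1471.
 * If tcp_header_len carries 12 bytes of timestamp options, tcp_nip_mtu_to_mss()
 * then returns 1471 - 12 = 1459.
 */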
120 
121 static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb)
122 {
123 	if (tcp_skb_is_last(sk, skb))
124 		sk->sk_send_head = NULL;
125 	else
126 		sk->sk_send_head = skb_queue_next(&sk->sk_write_queue, skb);
127 }
128 
129 static void tcp_nip_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
130 {
131 	struct inet_connection_sock *icsk = inet_csk(sk);
132 	struct tcp_sock *tp = tcp_sk(sk);
133 	unsigned int prior_packets = tp->packets_out;
134 
135 	tcp_advance_send_head(sk, skb);
136 	WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
137 	tp->packets_out += tcp_skb_pcount(skb);
138 	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
139 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
140 		tcp_nip_rearm_rto(sk);
141 
142 	tcp_nip_check_space(sk);
143 }
144 
145 /* check probe0 timer */
146 static void tcp_nip_check_probe_timer(struct sock *sk)
147 {
148 	if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) {
149 		unsigned long when = tcp_probe0_base(sk);
150 
151 		nip_dbg("start probe0 timer, when=%lu, RTO MAX=%u", when, TCP_RTO_MAX);
152 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
153 	} else if (inet_csk(sk)->icsk_pending != ICSK_TIME_PROBE0) {
154 		nip_dbg("can`t start probe0 timer, packets_out=%u, icsk_pending=%u",
155 			tcp_sk(sk)->packets_out, inet_csk(sk)->icsk_pending);
156 	}
157 }
158 
159 void __tcp_nip_push_pending_frames(struct sock *sk, unsigned int cur_mss,
160 				   int nonagle)
161 {
162 	if (unlikely(sk->sk_state == TCP_CLOSE))
163 		return;
164 
165 	if (tcp_nip_write_xmit(sk, cur_mss, nonagle, 0, sk_gfp_mask(sk, GFP_ATOMIC)))
166 		tcp_nip_check_probe_timer(sk);
167 }
168 
169 u32 __nip_tcp_select_window(struct sock *sk)
170 {
171 	struct inet_connection_sock *icsk = inet_csk(sk);
172 	struct tcp_sock *tp = tcp_sk(sk);
173 	struct tcp_nip_common *ntp = &tcp_nip_sk(sk)->common;
174 	int mss = tcp_nip_current_mss(sk); /* TCP_BASE_MSS */
175 	int allowed_space = tcp_full_space(sk);
176 	int full_space = min_t(int, tp->window_clamp, allowed_space); /* Total receive cache */
177 	int free_space = tcp_space(sk); /* 3/4 remaining receive cache */
178 	int window;
179 
180 	if (unlikely(mss > full_space)) {
181 		mss = full_space;
182 		if (mss <= 0)
183 			return 0;
184 	}
185 
186 	/* Less than half of the receive buffer is free */
187 	if (free_space < (full_space >> 1)) {
188 		icsk->icsk_ack.quick = 0;
189 
190 		free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
191 		if (free_space < (allowed_space >> TCP_NUM_4) || free_space < mss) {
192 			nip_dbg("rcv_wnd is 0, [allowed|full|free]space=[%u, %u, %u], mss=%u",
193 				allowed_space, full_space, free_space, mss);
194 			return 0;
195 		}
196 	}
197 
198 	if (get_nip_tcp_rcv_win_enable()) {
199 		if (get_ssthresh_enable())
200 			free_space = free_space > ntp->nip_ssthresh ?
201 				     ntp->nip_ssthresh : free_space;
202 		else
203 			free_space = free_space > tp->rcv_ssthresh ? tp->rcv_ssthresh : free_space;
204 	} else {
205 		free_space = free_space > get_ssthresh_high() ? get_ssthresh_high() : free_space;
206 	}
207 
208 	/* Don't do rounding if we are using window scaling, since the
209 	 * scaled window will not line up with the MSS boundary anyway.
210 	 * tp->rx_opt.rcv_wscale is always true
211 	 */
212 	window = free_space;
213 
214 	/* Advertise enough space so that it won't get scaled away.
215 	 * Important case: prevent zero window announcement if
216 	 * 1<<rcv_wscale > mss.
217 	 */
218 	window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
219 	nip_dbg("wscale(%u) win change [%u to %u], [allowed|free]space=[%u, %u], mss=%u",
220 		tp->rx_opt.rcv_wscale, free_space, window, allowed_space, free_space, mss);
221 	return window;
222 }
223 
224 /* The basic algorithm of window size selection:
225  * 1. Calculate the remaining size of the current receive window, cur_win.
226  * 2. Calculate the new receive window size, new_win, which is 3/4 of the remaining
227  *    receive buffer and cannot exceed rcv_ssthresh.
228  * 3. Advertise the larger of cur_win and new_win, so the offered window never shrinks.
229  */
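/* Hypothetical walk-through of the above (numbers are illustrative only): with
 * a 64 KB receive buffer, tcp_space() reports roughly 48 KB free, so new_win is
 * at most 48 KB (further capped by the ssthresh logic in
 * __nip_tcp_select_window()). If the previously advertised window still has
 * 32 KB unused (cur_win), the larger value, 48 KB, is advertised; if new_win
 * were smaller than cur_win, cur_win would be kept instead.
 */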
230 static u16 nip_tcp_select_window(struct sock *sk)
231 {
232 	struct tcp_sock *tp = tcp_sk(sk);
233 	u32 old_win = tp->rcv_wnd;
234 	/* The remaining size of the currently advertised receive window */
235 	u32 cur_win = tcp_receive_window(tp);
236 	/* Calculate the size of the new receive window based on the remaining receive cache */
237 	u32 new_win = __nip_tcp_select_window(sk);
238 	u32 new_win_bak;
239 
240 	/* Never shrink the offered window */
241 	if (new_win < cur_win) {
242 		/* Danger Will Robinson!
243 		 * Don't update rcv_wup/rcv_wnd here or else
244 		 * we will not be able to advertise a zero
245 		 * window in time.  --DaveM
246 		 *
247 		 * Relax Will Robinson.
248 		 */
249 		if (new_win == 0)
250 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWANTZEROWINDOWADV);
251 		new_win_bak = new_win;
252 		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
253 		nip_dbg("when new_win(%u) < cur_win(%u), win change [%u to %u]",
254 			new_win_bak, cur_win, new_win_bak, new_win);
255 	}
256 	tp->rcv_wnd = new_win;
257 	tp->rcv_wup = tp->rcv_nxt;
258 
259 	/* Make sure we do not exceed the maximum possible
260 	 * scaled window.
261 	 */
262 	if (!tp->rx_opt.rcv_wscale && sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
263 		new_win = min(new_win, MAX_TCP_WINDOW);
264 	else
265 		new_win = min(new_win, (TCP_NIP_WINDOW_MAX << tp->rx_opt.rcv_wscale));
266 
267 	/* RFC1323 Scaling Applied.
268 	 * Scaling the receive window so that it can represent up to 30 bits
269 	 */
270 	new_win_bak = new_win;
271 	new_win >>= tp->rx_opt.rcv_wscale;
272 	nip_dbg("wscale(%u) win change [%u to %u]", tp->rx_opt.rcv_wscale, new_win_bak, new_win);
273 	if (new_win == 0) {
274 		tp->pred_flags = 0;
275 		if (old_win)
276 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTOZEROWINDOWADV);
277 	} else if (old_win == 0) {
278 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
279 	}
280 
281 	return new_win;
282 }
283 
284 /* Function
285  *    Initialize transport layer parameters.
286  * Parameter
287  *    sk: transmission control block.
288  */
289 static void tcp_nip_connect_init(struct sock *sk)
290 {
291 	const struct dst_entry *dst = __sk_dst_get(sk);
292 	struct tcp_sock *tp = tcp_sk(sk);
293 	__u8 rcv_wscale = 0;
294 
295 	/* Header structure length + timestamp length */
296 	tp->tcp_header_len = sizeof(struct tcphdr);
297 	if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
298 		tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
299 
300 	if (tp->rx_opt.user_mss)
301 		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
302 	tp->max_window = 0;
303 
304 	tcp_mtup_init(sk);
305 	tp->rx_opt.mss_clamp = tcp_nip_sync_mss(sk, dst_mtu(dst));
306 
307 	if (!tp->window_clamp)
308 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
309 	tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
310 
311 	tcp_initialize_rcv_mss(sk);
312 
313 	/* Initialization window */
314 	tcp_select_initial_window(sk, tcp_full_space(sk),
315 				  tp->advmss - (tp->rx_opt.ts_recent_stamp ?
316 				  tp->tcp_header_len - sizeof(struct tcphdr) : 0),
317 				  &tp->rcv_wnd,
318 				  &tp->window_clamp,
319 				  0,
320 				  &rcv_wscale,
321 				  0);
322 
323 	tp->rx_opt.rcv_wscale = get_wscale_enable() ? get_wscale() : rcv_wscale;
324 	tp->rcv_ssthresh = tp->rcv_wnd;
325 
326 	sk->sk_err = 0;
327 	sock_reset_flag(sk, SOCK_DONE);
328 	tp->snd_wnd = 0;
329 	tp->snd_wl1 = 0;
330 	tcp_write_queue_purge(sk);
331 
332 	tp->snd_una = tp->write_seq;
333 	tp->snd_sml = tp->write_seq;
334 	tp->snd_up = tp->write_seq;
335 	tp->snd_nxt = tp->write_seq;
336 
337 	tp->rcv_nxt = 0;
338 	tp->rcv_wup = tp->rcv_nxt;
339 	tp->copied_seq = tp->rcv_nxt;
340 	inet_csk(sk)->icsk_rto = get_nip_rto() == 0 ? TCP_TIMEOUT_INIT : (HZ / get_nip_rto());
341 	inet_csk(sk)->icsk_retransmits = 0;
342 	tcp_clear_retrans(tp);
343 }
344 
345 static void tcp_nip_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
346 {
347 	skb->ip_summed = CHECKSUM_PARTIAL;
348 	skb->csum = 0;
349 
350 	TCP_SKB_CB(skb)->tcp_flags = flags;
351 	TCP_SKB_CB(skb)->sacked = 0;
352 
353 	tcp_skb_pcount_set(skb, 1);
354 
355 	TCP_SKB_CB(skb)->seq = seq;
356 	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
357 		seq++;
358 	TCP_SKB_CB(skb)->end_seq = seq;
359 }
360 
361 #define OPTION_TS     BIT(1)
362 #define OPTION_WSCALE BIT(3)
363 
364 static void tcp_nip_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
365 {
366 	struct tcp_sock *tp = tcp_sk(sk);
367 	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
368 
369 	tcb->end_seq += skb->len;
370 	__skb_header_release(skb);
371 	__skb_queue_tail(&sk->sk_write_queue, skb);
372 	sk->sk_wmem_queued += skb->truesize;
373 	sk_mem_charge(sk, skb->truesize);
374 	WRITE_ONCE(tp->write_seq, tcb->end_seq);
375 	tp->packets_out += tcp_skb_pcount(skb);
376 }
377 
378 static __u16 tcp_nip_advertise_mss(struct sock *sk)
379 {
380 	struct tcp_sock *tp = tcp_sk(sk);
381 	const struct dst_entry *dst = __sk_dst_get(sk);
382 	int mss = tp->advmss;
383 	u32 mtu;
384 
385 	if (dst) {
386 		int nip_hdr_len;
387 		int nip_mss;
388 		unsigned int metric = dst_metric_advmss(dst);
389 
390 		if (metric < (unsigned int)mss) {
391 			mss = metric;
392 			tp->advmss = mss;
393 		}
394 
395 		mtu = dst_mtu(dst); /* NIP_MIN_MTU */
396 		nip_hdr_len = get_nip_hdr_len(NIP_HDR_COMM, &sk->SK_NIP_RCV_SADDR,
397 					      &sk->SK_NIP_DADDR);
398 		nip_hdr_len = nip_hdr_len == 0 ? NIP_HDR_MAX : nip_hdr_len;
399 		nip_mss = mtu - nip_hdr_len - sizeof(struct tcphdr);
400 		if (nip_mss > mss) {
401 			mss = nip_mss;
402 			tp->advmss = mss;
403 		}
404 	}
405 
406 	return (__u16)mss;
407 }
408 
409 /* Compute TCP options for SYN packets. This is not the final
410  * network wire format yet.
411  */
412 static unsigned int tcp_nip_syn_options(struct sock *sk, struct sk_buff *skb,
413 					struct tcp_nip_out_options *opts)
414 {
415 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
416 
417 	opts->mss = tcp_nip_advertise_mss(sk);
418 	nip_dbg("advertise mss %d", opts->mss);
419 	remaining -= TCPOLEN_MSS_ALIGNED;
420 
421 	return MAX_TCP_OPTION_SPACE - remaining;
422 }
423 
424 /* Compute TCP options for ESTABLISHED sockets. This is not the
425  * final wire format yet.
426  */
427 static unsigned int tcp_nip_established_options(struct sock *sk, struct sk_buff *skb,
428 						struct tcp_nip_out_options *opts)
429 {
430 	struct tcp_sock *tp = tcp_sk(sk);
431 	unsigned int size = 0;
432 
433 	opts->options = 0;
434 
435 	if (likely(tp->rx_opt.tstamp_ok)) {
436 		opts->options |= OPTION_TS;
437 		opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
438 		opts->tsecr = tp->rx_opt.ts_recent;
439 		size += TCPOLEN_TSTAMP_ALIGNED;
440 	}
441 	return size;
442 }
443 
444 /* Function
445  *    Put the parameters from the TCP option into SKB.
446  *    Write previously computed TCP options to the packet.
447  * Parameter
448  *    ptr: pointer to TCP options in SKB.
449  *    tp: transmission control block.
450  *    opts: structure to be sent to temporarily load TCP options.
451  */
452 static void tcp_nip_options_write(__be32 *ptr, struct tcp_sock *tp,
453 				  struct tcp_nip_out_options *opts)
454 {
455 	if (unlikely(opts->mss))
456 		*ptr++ = htonl((TCPOPT_MSS << TCP_OPT_MSS_PAYLOAD) |
457 			       (TCPOLEN_MSS << TCP_OLEN_MSS_PAYLOAD) |
458 			       opts->mss);
459 }
460 
461 static inline void tcp_nip_event_ack_sent(struct sock *sk, unsigned int pkts,
462 					  u32 rcv_nxt)
463 {
464 	struct tcp_sock *tp = tcp_sk(sk);
465 
466 	if (unlikely(rcv_nxt != tp->rcv_nxt))
467 		return;
468 	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
469 }
470 
471 unsigned short nip_get_output_checksum_tcp(struct sk_buff *skb, struct nip_addr src_addr,
472 					   struct nip_addr dst_addr)
473 {
474 	struct nip_pseudo_header nph = {0};
475 	u8 *tcp_hdr = skb_transport_header(skb);
476 
477 	nph.nexthdr = IPPROTO_TCP;
478 	nph.saddr = src_addr;
479 	nph.daddr = dst_addr;
480 
481 	nph.check_len = htons(skb->len);
482 	return nip_check_sum_build(tcp_hdr, skb->len, &nph);
483 }
484 
485 static int __tcp_nip_transmit_skb(struct sock *sk, struct sk_buff *skb,
486 				  int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
487 {
488 	const struct inet_connection_sock *icsk = inet_csk(sk);
489 	struct inet_sock *inet;
490 	struct tcp_sock *tp = tcp_sk(sk);
491 	struct tcp_skb_cb *tcb;
492 	struct tcp_nip_out_options opts;
493 	unsigned int tcp_options_size, tcp_header_size;
494 	struct sk_buff *oskb = NULL;
495 	struct tcphdr *th;
496 	int err = 0;
497 	__be16 len;
498 	unsigned short check = 0;
499 	bool ack;
500 
501 	if (skb->tstamp == 0)
502 		skb->tstamp = tcp_jiffies32;
503 
504 	if (clone_it) {
505 		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
506 			- tp->snd_una;
507 		oskb = skb;
508 
509 		tcp_skb_tsorted_save(oskb) {
510 			if (unlikely(skb_cloned(oskb)))
511 				skb = pskb_copy(oskb, gfp_mask);
512 			else
513 				skb = skb_clone(oskb, gfp_mask);
514 		} tcp_skb_tsorted_restore(oskb);
515 
516 		if (unlikely(!skb))
517 			return -ENOBUFS;
518 	}
519 
520 	inet = inet_sk(sk);
521 	tcb = TCP_SKB_CB(skb);
522 	memset(&opts, 0, sizeof(opts));
523 
524 	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
525 		tcp_options_size = tcp_nip_syn_options(sk, skb, &opts);
526 	else
527 		tcp_options_size = tcp_nip_established_options(sk, skb, &opts);
528 	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
529 
530 	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
531 	/* The data pointer moves up */
532 	skb_push(skb, tcp_header_size);
533 	skb_reset_transport_header(skb);
534 
535 	/* Disassociate the control block */
536 	skb_orphan(skb);
537 
538 	/* Establishes associations with control blocks */
539 	skb->sk = sk;
540 	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
541 	skb_set_hash_from_sk(skb, sk);
542 	/* Increase allocated memory */
543 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
544 
545 	/* Build TCP header and checksum it. */
546 	th          = (struct tcphdr *)skb->data;
547 	th->source  = inet->inet_sport;
548 	th->dest    = inet->inet_dport;
549 	th->seq     = htonl(tcb->seq);
550 	th->ack_seq = htonl(rcv_nxt);
551 	/* TCP's header length is expressed in 4-byte words, so shifting the byte
552 	 * count right by two divides it by 4. The 4-bit data-offset field occupies
553 	 * the most significant bits of the 16-bit word that also carries the flags,
554 	 * so the resulting value is shifted 12 bits to the left before being
555 	 * combined with the flag bits
556 	 */
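	/* Worked example (illustrative, assuming TCP_NIP_4BYTE_PAYLOAD is 2 and
	 * TCP_HDR_LEN_POS_PAYLOAD is 12, as described above): a bare 20-byte TCP
	 * header gives 20 >> 2 = 5 words; 5 << 12 = 0x5000, which occupies the
	 * top 4 bits of the 16-bit word, leaving the low bits for the flags.
	 */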
557 	len = htons(((tcp_header_size >> TCP_NIP_4BYTE_PAYLOAD) << TCP_HDR_LEN_POS_PAYLOAD) |
558 		    tcb->tcp_flags);
559 	*(((__be16 *)th) + TCP_HDR_LEN_OFFSET) = len;
560 
561 	th->check = 0;
562 	/* Newip Urg_ptr is disabled. Urg_ptr is used to carry the number of discarded packets */
563 	th->urg_ptr = htons(tp->snd_up);
564 
565 	/* Write TCP option */
566 	tcp_nip_options_write((__be32 *)(th + 1), tp, &opts);
567 
568 	/* Window Settings */
569 	if (likely(!(tcb->tcp_flags & TCPHDR_SYN)))
570 		th->window = htons(nip_tcp_select_window(sk));
571 	else
572 		th->window = htons(min(tp->rcv_wnd, TCP_NIP_WINDOW_MAX));
573 
574 	ack = tcb->tcp_flags & TCPHDR_ACK;
575 	nip_dbg("sport=%u, dport=%u, win=%u, rcvbuf=%d, sk_rmem_alloc=%d, ack=%u, skb->len=%u",
576 		ntohs(inet->inet_sport), ntohs(inet->inet_dport), ntohs(th->window),
577 		sk->sk_rcvbuf, atomic_read(&sk->sk_rmem_alloc), ack, skb->len);
578 
579 	/* Fill in checksum */
580 	check = nip_get_output_checksum_tcp(skb, sk->SK_NIP_RCV_SADDR, sk->SK_NIP_DADDR);
581 	th->check = htons(check);
582 
583 	if (likely(tcb->tcp_flags & TCPHDR_ACK))
584 		tcp_nip_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
585 
586 	 /* There's data to send */
587 	if (skb->len != tcp_header_size)
588 		tp->data_segs_out += tcp_skb_pcount(skb);
589 
590 	memset(skb->cb, 0, sizeof(struct ninet_skb_parm));
591 	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
592 	return err;
593 }
594 
595 /* Function
596  *    TCP's transport layer sends code that builds and initializes the TCP header
597  *    Construct the SK_buff call transport layer to network layer interface
598  * Parameter
599  *    sk: Transmission control block.
600  *    skb: Structure stores all information about network datagrams
601  */
602 int tcp_nip_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
603 			 gfp_t gfp_mask)
604 {
605 	return __tcp_nip_transmit_skb(sk, skb, clone_it, gfp_mask,
606 				  tcp_sk(sk)->rcv_nxt);
607 }
608 
609 static void tcp_nip_queue_skb(struct sock *sk, struct sk_buff *skb)
610 {
611 	struct tcp_sock *tp = tcp_sk(sk);
612 
613 	/* Advance write_seq and place onto the write_queue. */
614 	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
615 	tcp_nip_add_write_queue_tail(sk, skb);
616 	sk->sk_wmem_queued += skb->truesize;
617 	sk_mem_charge(sk, skb->truesize);
618 }
619 
620 /* Function
621  *    A function used by the client transport layer to connect requests.
622  * Parameter
623  *    sk: transmission control block.
624  */
625 int __tcp_nip_connect(struct sock *sk)
626 {
627 	struct tcp_sock *tp = tcp_sk(sk);
628 	struct sk_buff *buff;
629 	int err;
630 
631 	tcp_nip_connect_init(sk);
632 	buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
633 	if (unlikely(!buff))
634 		return -ENOBUFS;
635 
636 	/* Initializes the SYN flag bit */
637 	tcp_nip_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
638 	tcp_mstamp_refresh(tp);
639 	tp->retrans_stamp = tcp_time_stamp(tp);
640 	tcp_nip_init_xmit_timers(sk);
641 
642 	tcp_nip_connect_queue_skb(sk, buff);
643 
644 	/* Send off SYN */
645 	err =  tcp_nip_transmit_skb(sk, buff, 1, sk->sk_allocation);
646 	if (err == -ECONNREFUSED)
647 		return err;
648 
649 	tp->snd_nxt = tp->write_seq;
650 	tp->pushed_seq = tp->write_seq;
651 
652 	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
653 
654 	/* Timer for repeating the SYN until an answer. */
655 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
656 
657 	return 0;
658 }
659 
660 unsigned int tcp_nip_sync_mss(struct sock *sk, u32 pmtu)
661 {
662 	struct tcp_sock *tp = tcp_sk(sk);
663 	struct inet_connection_sock *icsk = inet_csk(sk);
664 	int mss_now;
665 
666 	if (icsk->icsk_mtup.search_high > pmtu)
667 		icsk->icsk_mtup.search_high = pmtu;
668 
669 	mss_now = tcp_nip_mtu_to_mss(sk, pmtu);
670 	nip_dbg("sync mtu_to_mss %d", mss_now);
671 	mss_now = tcp_bound_to_half_wnd(tp, mss_now);
672 	nip_dbg("sync bound to half wnd %d", mss_now);
673 
674 	/* And store cached results */
675 	icsk->icsk_pmtu_cookie = pmtu;
676 	if (icsk->icsk_mtup.enabled)
677 		mss_now = min(mss_now, tcp_nip_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
678 	tp->mss_cache = mss_now;
679 
680 	nip_dbg("sync final mss %d", mss_now);
681 
682 	return mss_now;
683 }
684 
685 unsigned int tcp_nip_current_mss(struct sock *sk)
686 {
687 	const struct tcp_sock *tp = tcp_sk(sk);
688 	const struct dst_entry *dst = __sk_dst_get(sk);
689 	u32 mss_now;
690 	unsigned int header_len;
691 	struct tcp_nip_out_options opts;
692 
693 	mss_now = tp->mss_cache;
694 
695 	if (dst) {
696 		u32 mtu = dst_mtu(dst); /* NIP_MIN_MTU */
697 
698 		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
699 			mss_now = tcp_nip_sync_mss(sk, mtu);
700 	}
701 
702 	header_len = tcp_nip_established_options(sk, NULL, &opts) + sizeof(struct tcphdr);
703 	if (header_len != tp->tcp_header_len) {
704 		int delta = (int)header_len - tp->tcp_header_len;
705 
706 		mss_now -= delta;
707 	}
708 
709 	return mss_now;
710 }
711 
712 /* Function:
713  *    Set up TCP options for SYN-ACKs.
714  *    Initializes the TCP option for the SYN-ACK segment. Returns the SIZE of the TCP header.
715  * Parameter
716  *    req: Request connection control block.
717  *    mss: maximum segment length.
718  *    skb: Transfer control block buffer.
719  *    opts: stores the options contained in TCP packets when they are sent.
720  *    foc: Fast Open option.
721  *    synack_type: type of SYN+ACK segment.
722  */
723 static unsigned int tcp_nip_synack_options(struct request_sock *req,
724 					   unsigned int mss, struct sk_buff *skb,
725 					   struct tcp_nip_out_options *opts,
726 					   const struct tcp_md5sig_key *md5,
727 					   struct tcp_fastopen_cookie *foc,
728 					   enum tcp_synack_type synack_type)
729 {
730 	struct inet_request_sock *ireq = inet_rsk(req);
731 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
732 
733 	/* We always send an MSS option. */
734 	opts->mss = mss;
735 	remaining -= TCPOLEN_MSS_ALIGNED;
736 
737 	if (likely(ireq->tstamp_ok)) {
738 		opts->options |= OPTION_TS;
739 		opts->tsval = tcp_skb_timestamp(skb);
740 		opts->tsecr = req->ts_recent;
741 		remaining -= TCPOLEN_TSTAMP_ALIGNED;
742 	}
743 	return MAX_TCP_OPTION_SPACE - remaining;
744 }
745 
746 static int get_nip_mss(const struct sock *sk, struct dst_entry *dst, struct request_sock *req)
747 {
748 	struct inet_request_sock *ireq = inet_rsk(req);
749 	struct tcp_sock *tp = tcp_sk(sk);
750 	u16 user_mss;
751 	int mss;
752 	int nip_hdr_len;
753 	int nip_mss;
754 	u32 mtu;
755 
756 	mss = dst_metric_advmss(dst);
757 	user_mss = READ_ONCE(tp->rx_opt.user_mss);
758 	if (user_mss && user_mss < mss)
759 		mss = user_mss;
760 
761 	mtu = dst_mtu(dst); /* NIP_MIN_MTU */
762 	nip_hdr_len = get_nip_hdr_len(NIP_HDR_COMM, &ireq->IR_NIP_LOC_ADDR, &ireq->IR_NIP_RMT_ADDR);
763 	nip_hdr_len = nip_hdr_len == 0 ? NIP_HDR_MAX : nip_hdr_len;
764 	nip_mss = mtu - nip_hdr_len - sizeof(struct tcphdr);
765 
766 	if (nip_mss > mss) {
767 		mss = nip_mss;
768 		tp->advmss = mss;
769 	}
770 
771 	return mss;
772 }
773 
774 /* Function
775  *    The SYN + ACK segment is constructed based on the current transport control block,
776  *    routing information, and request information.
777  * Parameter
778  *    sk: transmission control block.
779  *    dst: routing.
780  *    req: Request connection control block.
781  *    foc: Fast Open option.
782  *    synack_type: type of SYN+ACK segment.
783  */
784 struct sk_buff *tcp_nip_make_synack(const struct sock *sk, struct dst_entry *dst,
785 				    struct request_sock *req,
786 				    struct tcp_fastopen_cookie *foc,
787 				enum tcp_synack_type synack_type)
788 {
789 	struct inet_request_sock *ireq = inet_rsk(req);
790 	struct tcp_md5sig_key *md5 = NULL;
791 	struct tcp_nip_out_options opts;
792 	struct sk_buff *skb;
793 	int tcp_header_size;
794 	struct tcphdr *th;
795 	int mss;
796 	unsigned short check = 0;
797 
798 	skb = alloc_skb(MAX_TCP_HEADER, 0);
799 	if (unlikely(!skb)) {
800 		dst_release(dst);
801 		return NULL;
802 	}
803 
804 	/* Reserve space for headers. */
805 	skb_reserve(skb, MAX_TCP_HEADER);
806 
807 	switch (synack_type) {
808 	case TCP_SYNACK_NORMAL:
809 		/* Drop any previous owner and make the request socket the owner of this skb */
810 		skb_set_owner_w(skb, req_to_sk(req));
811 		break;
812 	default:
813 		break;
814 	}
815 	skb_dst_set(skb, dst);
816 	/* set skb priority from sk */
817 	skb->priority = sk->sk_priority;
818 
819 	mss = get_nip_mss(sk, dst, req);
820 
821 	/* Clear the options and set the associated timestamp */
822 	memset(&opts, 0, sizeof(opts));
823 	skb->skb_mstamp_ns = tcp_clock_us();
824 
825 	/* Get the TCP header size, then set the size and reset the transport layer header */
826 	skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
827 	tcp_header_size = tcp_nip_synack_options(req, mss, skb, &opts, md5,
828 						 foc, synack_type) + sizeof(*th);
829 	skb_push(skb, tcp_header_size);
830 	skb_reset_transport_header(skb);
831 
832 	/* Clear the TCP header and set the fields of the TCP header */
833 	th = (struct tcphdr *)skb->data;
834 	memset(th, 0, sizeof(struct tcphdr));
835 	th->syn = 1;
836 	th->ack = 1;
837 	if (inet_rsk(req)->ecn_ok)
838 		th->ece = 1;
839 	th->source = htons(ireq->ir_num);
840 	th->dest = ireq->ir_rmt_port;
841 	skb->ip_summed = CHECKSUM_PARTIAL;
842 	th->seq = htonl(tcp_rsk(req)->snt_isn);
843 	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
844 	th->check = 0;
845 
846 	th->window = htons(min(req->rsk_rcv_wnd, TCP_NIP_WINDOW_MAX));
847 
848 	tcp_nip_options_write((__be32 *)(th + 1), NULL, &opts);
849 	/* th->doff counts 32-bit words, so the header length in bytes is
850 	 * divided by 4 (shifted right by 2)
851 	 */
852 	th->doff = (tcp_header_size >> TCP_NUM_2);
853 	__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
854 
855 	/* Fill in checksum */
856 	check = nip_get_output_checksum_tcp(skb,  ireq->IR_NIP_LOC_ADDR,  ireq->IR_NIP_RMT_ADDR);
857 	th->check = htons(check);
858 
859 	/* Do not fool tcpdump (if any), clean our debris */
860 	skb->tstamp = 0;
861 	return skb;
862 }
863 
864 /* Function
865  *    Send SKB packets with SYN+ACK segments to the network layer.
866  * Parameter
867  *    req: Request connection control block.
868  *    skb: Transfer control block buffer.
869  */
870 int __nip_send_synack(struct request_sock *req, struct sk_buff *skb)
871 {
872 	struct inet_request_sock *ireq = inet_rsk(req); /* Connection request block */
873 	int err;
874 	int csummode = CHECKSUM_NONE;
875 	struct nip_addr *saddr, *daddr;
876 	struct nip_hdr_encap head = {0};
877 	unsigned char hdr_buf[NIP_HDR_MAX]; /* Cache the newIP header */
878 
879 	skb->protocol = htons(ETH_P_NEWIP);
880 	skb->ip_summed = csummode;
881 	skb->csum = 0;
882 	saddr = &ireq->IR_NIP_LOC_ADDR;
883 	daddr = &ireq->IR_NIP_RMT_ADDR;
884 
885 	head.saddr = *saddr;
886 	head.daddr = *daddr;
887 	head.ttl = NIP_DEFAULT_TTL;
888 	head.nexthdr = IPPROTO_TCP;
889 	head.hdr_buf = hdr_buf;
890 	nip_hdr_comm_encap(&head);
891 	head.total_len = head.hdr_buf_pos + skb->len;
892 	nip_update_total_len(&head, htons(head.total_len));
893 
894 	skb_push(skb, head.hdr_buf_pos);
895 	memcpy(skb->data, head.hdr_buf, head.hdr_buf_pos);
896 	skb_reset_network_header(skb);
897 	nipcb(skb)->srcaddr = *saddr;
898 	nipcb(skb)->dstaddr = *daddr;
899 	nipcb(skb)->nexthdr = head.nexthdr;
900 
901 	head.total_len = skb->len;
902 	err = nip_send_skb(skb);
903 	if (err)
904 		nip_dbg("failed to send skb, skb->len=%u", head.total_len);
905 	else
906 		nip_dbg("send skb ok, skb->len=%u", head.total_len);
907 
908 	return err;
909 }
910 
911 int nip_send_synack(struct request_sock *req, struct sk_buff *skb)
912 {
913 	return __nip_send_synack(req, skb);
914 }
915 
916 /* Function:
917  *    Processes the child transmission control block to complete the three-way handshake
918  * Parameter:
919  *    parent: indicates the parent transmission control block
920  *    child: indicates the child transmission control block
921  *    skb: Transfer control block buffer
922  */
923 int tcp_nip_child_process(struct sock *parent, struct sock *child,
924 			  struct sk_buff *skb)
925 {
926 	int ret = 0;
927 	int state = child->sk_state;
928 	/* Child is not occupied by the user process */
929 	if (!sock_owned_by_user(child)) {
930 		ret = tcp_nip_rcv_state_process(child, skb);
931 		/* At this point the state of the child has been migrated,
932 		 * waking up the process on the listening socket,
933 		 * which may be blocked due to Accept
934 		 */
935 		if (state == TCP_SYN_RECV && child->sk_state != state)
936 			parent->sk_data_ready(parent);
937 	} else {
938 		__sk_add_backlog(child, skb);
939 	}
940 	bh_unlock_sock(child);
941 	sock_put(child);
942 	return ret;
943 }
944 
945 static inline __u32 tcp_nip_acceptable_seq(const struct sock *sk)
946 {
947 	const struct tcp_sock *tp = tcp_sk(sk);
948 
949 	if (!before(tcp_wnd_end(tp), tp->snd_nxt))
950 		return tp->snd_nxt;
951 	else
952 		return tcp_wnd_end(tp);
953 }
954 
955 /* Function:
956  *    The client sends an ACK
957  * Parameter:
958  *    sk: transmission control block
959  *    rcv_nxt: the sequence number to acknowledge
960  */
961 void __tcp_nip_send_ack(struct sock *sk, u32 rcv_nxt)
962 {
963 	struct sk_buff *buff;
964 
965 	if (sk->sk_state == TCP_CLOSE)
966 		return;
967 
968 	buff = alloc_skb(MAX_TCP_HEADER,
969 			 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
970 
971 	/* Reserve space for the header. */
972 	skb_reserve(buff, MAX_TCP_HEADER);
973 	/* Initialize SKB without data */
974 	tcp_nip_init_nondata_skb(buff, tcp_nip_acceptable_seq(sk), TCPHDR_ACK);
975 
976 	/* Mark as a pure ACK; skb->truesize is set to 2 */
977 	skb_set_tcp_pure_ack(buff);
978 
979 	/* Record the timestamp and send the SKB. */
980 	__tcp_nip_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
981 }
982 
983 void tcp_nip_send_ack(struct sock *sk)
984 {
985 	__tcp_nip_send_ack(sk, tcp_sk(sk)->rcv_nxt);
986 }
987 
988 void tcp_nip_send_fin(struct sock *sk)
989 {
990 	struct sk_buff *skb;
991 	struct sk_buff *tskb = tcp_write_queue_tail(sk);
992 	struct tcp_sock *tp = tcp_sk(sk);
993 	u32 cur_mss;
994 
995 	nip_dbg("send fin");
996 	/* Set the fin position of the last packet to 1 */
997 	if (tskb && tcp_nip_send_head(sk)) {
998 coalesce:
999 		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
1000 		TCP_SKB_CB(tskb)->end_seq++;
1001 		tp->write_seq++;
1002 	} else {
1003 		skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
1004 		if (unlikely(!skb)) {
1005 			if (tskb)
1006 				goto coalesce;
1007 			return;
1008 		}
1009 		skb_reserve(skb, MAX_TCP_HEADER);
1010 
1011 		tcp_nip_init_nondata_skb(skb, tp->write_seq,
1012 					 TCPHDR_ACK | TCPHDR_FIN);
1013 		tcp_nip_queue_skb(sk, skb);
1014 	}
1015 
1016 	cur_mss = tcp_nip_current_mss(sk); // TCP_BASE_MSS
1017 	__tcp_nip_push_pending_frames(sk, cur_mss, TCP_NAGLE_OFF);
1018 }
1019 
1020 void tcp_nip_send_active_reset(struct sock *sk, gfp_t priority)
1021 {
1022 	struct sk_buff *skb;
1023 
1024 	nip_dbg("send rst");
1025 	/* NOTE: No TCP options attached and we never retransmit this. */
1026 	skb = alloc_skb(MAX_TCP_HEADER, priority);
1027 	if (!skb)
1028 		/* If you add log here, there will be an alarm:
1029 		 * WARNING: Possible unnecessary 'out of memory' message
1030 		 */
1031 		return;
1032 
1033 	/* Reserve space for headers and prepare control bits. */
1034 	skb_reserve(skb, MAX_TCP_HEADER);
1035 	tcp_nip_init_nondata_skb(skb, tcp_nip_acceptable_seq(sk),
1036 				 TCPHDR_ACK | TCPHDR_RST);
1037 	/* Send it off. */
1038 	tcp_nip_transmit_skb(sk, skb, 0, priority);
1039 }
1040 
1041 static bool tcp_nip_snd_wnd_test(const struct tcp_sock *tp,
1042 				 const struct sk_buff *skb,
1043 				 unsigned int cur_mss)
1044 {
1045 	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
1046 
1047 	if (skb->len > cur_mss)
1048 		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
1049 
1050 	return !after(end_seq, tcp_wnd_end(tp));
1051 }
1052 
1053 static void tcp_nip_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
1054 {
1055 	if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
1056 		/* Avoid the costly divide in the normal
1057 		 * non-TSO case.
1058 		 */
1059 		tcp_skb_pcount_set(skb, 1);
1060 		TCP_SKB_CB(skb)->tcp_gso_size = 0;
1061 	} else {
1062 		tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
1063 		TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
1064 	}
1065 }
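/* Illustrative example (hypothetical sizes): a 3000-byte skb with an MSS of
 * 1400 yields DIV_ROUND_UP(3000, 1400) = 3 segments and tcp_gso_size = 1400,
 * while a 1000-byte skb stays at a single segment with tcp_gso_size = 0.
 */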
1066 
1067 static int tcp_nip_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
1068 {
1069 	int tso_segs = tcp_skb_pcount(skb);
1070 
1071 	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
1072 		tcp_nip_set_skb_tso_segs(skb, mss_now);
1073 		tso_segs = tcp_skb_pcount(skb);
1074 	}
1075 	return tso_segs;
1076 }
1077 
1078 static bool tcp_nip_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1079 			       int push_one, gfp_t gfp)
1080 {
1081 	struct tcp_sock *tp = tcp_sk(sk);
1082 	struct tcp_nip_common *ntp = &tcp_nip_sk(sk)->common;
1083 	struct sk_buff *skb;
1084 	u32 snd_num;
1085 	u32 last_nip_ssthresh = ntp->nip_ssthresh;
1086 	static const char * const str[] = {"can`t send pkt because no window",
1087 					   "have window to send pkt"};
1088 
1089 	if (!mss_now) {
1090 		nip_dbg("invalid parameter, mss_now=%u", mss_now);
1091 		return false;
1092 	}
1093 	snd_num = get_nip_tcp_snd_win_enable() ? (ntp->nip_ssthresh / mss_now) :
1094 			  TCP_NIP_SND_NUM_MAX;
1095 
1096 	tcp_nip_keepalive_enable(sk);
1097 	ntp->idle_ka_probes_out = 0;
1098 
1099 	tcp_mstamp_refresh(tp);
1100 
1101 	if (tp->rcv_tstamp) {
1102 		u32 tstamp = tcp_jiffies32 - tp->rcv_tstamp;
1103 
1104 		if (tstamp >= get_ack_to_nxt_snd_tstamp()) {
1105 			ntp->nip_ssthresh = get_ssthresh_low_min();
1106 			snd_num = ntp->nip_ssthresh / mss_now;
1107 			ssthresh_dbg("new snd tstamp %u >= %u, ssthresh %u to %u, snd_num=%u",
1108 				     tstamp, get_ack_to_nxt_snd_tstamp(),
1109 				     last_nip_ssthresh, ntp->nip_ssthresh, snd_num);
1110 		}
1111 	}
1112 
1113 	while ((skb = tcp_nip_send_head(sk)) && (snd_num--)) {
1114 		bool snd_wnd_ready;
1115 
1116 		tcp_nip_init_tso_segs(skb, mss_now);
1117 		snd_wnd_ready = tcp_nip_snd_wnd_test(tp, skb, mss_now);
1118 		nip_dbg("%s, skb->len=%u", (snd_wnd_ready ? str[1] : str[0]), skb->len);
1119 		if (unlikely(!snd_wnd_ready))
1120 			break;
1121 
1122 		if (unlikely(tcp_nip_transmit_skb(sk, skb, 1, gfp)))
1123 			break;
1124 
1125 		tcp_nip_event_new_data_sent(sk, skb);
1126 
1127 		if (push_one)
1128 			break;
1129 	}
1130 	return !tp->packets_out && tcp_nip_send_head(sk);
1131 }
1132 
1133 int tcp_nip_rtx_synack(const struct sock *sk, struct request_sock *req)
1134 {
1135 	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
1136 	int res;
1137 	struct dst_entry *dst;
1138 
1139 	dst = af_ops->route_req(sk, NULL, req);
1140 	tcp_rsk(req)->txhash = net_tx_rndhash();
1141 
1142 	res = af_ops->send_synack(sk, dst, NULL, req, NULL, TCP_SYNACK_NORMAL, NULL);
1143 
1144 	return res;
1145 }
1146 
1147 static void tcp_nip_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
1148 {
1149 	struct tcp_sock *tp = tcp_sk(sk);
1150 
1151 	tp->packets_out -= decr;
1152 }
1153 
1154 int __tcp_nip_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
1155 {
1156 	struct tcp_sock *tp = tcp_sk(sk);
1157 	unsigned int cur_mss;
1158 	int len, err;
1159 
1160 	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
1161 		if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
1162 			WARN_ON_ONCE(1);
1163 			return -EINVAL;
1164 		}
1165 		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
1166 			return -ENOMEM;
1167 	}
1168 
1169 	cur_mss = tcp_nip_current_mss(sk);
1170 
1171 	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
1172 	    TCP_SKB_CB(skb)->seq != tp->snd_una)
1173 		return -EAGAIN;
1174 
1175 	len = cur_mss * segs;
1176 	if (skb->len > len) {
1177 		if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, skb, len, cur_mss, GFP_ATOMIC))
1178 			return -ENOMEM; /* We'll try again later. */
1179 	} else {
1180 		int diff = tcp_skb_pcount(skb);
1181 
1182 		tcp_nip_set_skb_tso_segs(skb, cur_mss);
1183 		diff -= tcp_skb_pcount(skb);
1184 		if (diff)
1185 			tcp_nip_adjust_pcount(sk, skb, diff);
1186 	}
1187 
1188 	err = tcp_nip_transmit_skb(sk, skb, 1, GFP_ATOMIC);
1189 	if (likely(!err)) {
1190 		segs = tcp_skb_pcount(skb);
1191 
1192 		tp->total_retrans += segs;
1193 	}
1194 	return err;
1195 }
1196 
1197 int tcp_nip_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
1198 {
1199 	struct tcp_sock *tp = tcp_sk(sk);
1200 	int err = __tcp_nip_retransmit_skb(sk, skb, segs);
1201 
1202 	if (err == 0) {
1203 		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
1204 		tp->retrans_out += tcp_skb_pcount(skb);
1205 
1206 		/* Save stamp of the first retransmit. */
1207 		if (!tp->retrans_stamp)
1208 			tp->retrans_stamp = tcp_skb_timestamp(skb);
1209 	} else if (err != -EBUSY) {
1210 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
1211 	}
1212 
1213 	return err;
1214 }
1215 
1216 #define TCP_NIP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED)  | \
1217 			  (1UL << TCP_NIP_WRITE_TIMER_DEFERRED)      | \
1218 			  (1UL << TCP_NIP_DELACK_TIMER_DEFERRED)     | \
1219 			  (1UL << TCP_MTU_REDUCED_DEFERRED))
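/* Note on the deferral mechanism (a sketch following the mainline
 * tcp_release_cb() pattern): when a timer or softirq fires while the socket is
 * owned by the user, the handler only sets one of these flags instead of doing
 * the work; tcp_nip_release_cb() below then runs the deferred handlers once the
 * socket lock is released.
 */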
1220 
1221 void tcp_nip_release_cb(struct sock *sk)
1222 {
1223 	unsigned long flags, nflags;
1224 
1225 	/* perform an atomic operation only if at least one flag is set */
1226 	do {
1227 		flags = sk->sk_tsq_flags;
1228 		if (!(flags & TCP_NIP_DEFERRED_ALL))
1229 			return;
1230 		nflags = flags & ~TCP_NIP_DEFERRED_ALL;
1231 	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
1232 
1233 	sock_release_ownership(sk);
1234 	if (flags & (1UL << TCP_NIP_WRITE_TIMER_DEFERRED)) {
1235 		tcp_nip_write_timer_handler(sk);
1236 		__sock_put(sk);
1237 	}
1238 	if (flags & (1UL << TCP_NIP_DELACK_TIMER_DEFERRED)) {
1239 		tcp_nip_delack_timer_handler(sk);
1240 		__sock_put(sk);
1241 	}
1242 	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
1243 		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
1244 		__sock_put(sk);
1245 	}
1246 }
1247 
1248 enum nip_probe_type {
1249 	NIP_PROBE0 = 0,
1250 	NIP_KEEPALIVE = 1,
1251 	NIP_UNKNOWN = 2,
1252 	NIP_PROBE_MAX,
1253 };
1254 
1255 static int tcp_nip_xmit_probe_skb(struct sock *sk, int urgent, int mib)
1256 {
1257 	struct tcp_sock *tp = tcp_sk(sk);
1258 	struct sk_buff *skb;
1259 	int ret;
1260 	int probe_type;
1261 	const char *str[NIP_PROBE_MAX] = {"probe0", "keepalive", "unknown"};
1262 
1263 	if (mib == LINUX_MIB_TCPWINPROBE)
1264 		probe_type = NIP_PROBE0;
1265 	else if (mib == LINUX_MIB_TCPKEEPALIVE)
1266 		probe_type = NIP_KEEPALIVE;
1267 	else
1268 		probe_type = NIP_UNKNOWN;
1269 
1270 	/* We don't queue it, tcp_transmit_skb() sets ownership. */
1271 	skb = alloc_skb(MAX_TCP_HEADER,
1272 			sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
1273 	if (!skb)
1274 		return -1;
1275 
1276 	/* Reserve space for headers and set control bits. */
1277 	skb_reserve(skb, MAX_TCP_HEADER);
1278 
1279 	tcp_nip_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
1280 
1281 	NET_INC_STATS(sock_net(sk), mib);
1282 	ret = tcp_nip_transmit_skb(sk, skb, 0, (__force gfp_t)0);
1283 	nip_dbg("send %s probe packet, ret=%d", str[probe_type], ret);
1284 	return ret;
1285 }
1286 
1287 int tcp_nip_write_wakeup(struct sock *sk, int mib)
1288 {
1289 	struct tcp_sock *tp = tcp_sk(sk);
1290 	struct sk_buff *skb;
1291 
1292 	if (sk->sk_state == TCP_CLOSE) {
1293 		nip_dbg("no probe0 when tcp close");
1294 		return -1;
1295 	}
1296 
1297 	skb = tcp_nip_send_head(sk);
1298 	/* If the sequence number of the next packet is within the send window */
1299 	if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
1300 		int err;
1301 		unsigned int mss = tcp_nip_current_mss(sk);
1302 		unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1303 
1304 		if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
1305 			tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
1306 		/* If the current window size is not enough to send a complete packet */
1307 		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
1308 			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
1309 			err = tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
1310 					   skb, seg_size, mss, GFP_ATOMIC);
1311 			if (err) {
1312 				nip_dbg("tcp_fragment return err=%d", err);
1313 				return -1;
1314 			}
1315 		}
1316 		err = tcp_nip_transmit_skb(sk, skb, 1, GFP_ATOMIC);
1317 		if (!err)
1318 			tcp_nip_event_new_data_sent(sk, skb);
1319 		nip_dbg("transmit skb %s", (!err ? "ok" : "fail"));
1320 		return err;
1321 	} else {
1322 		return tcp_nip_xmit_probe_skb(sk, 0, mib);
1323 	}
1324 }
1325 
1326 /* The 0 window probe packet is sent */
1327 void tcp_nip_send_probe0(struct sock *sk)
1328 {
1329 	struct inet_connection_sock *icsk = inet_csk(sk);
1330 	struct tcp_sock *tp = tcp_sk(sk);
1331 	struct net *net = sock_net(sk);
1332 	unsigned long when;
1333 	/* An ACK packet with seq snd_una - 1 and length 0 is sent as a zero-window probe */
1334 	int err = tcp_nip_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
1335 
1336 	/* If packets are still in flight, or there is nothing left in the
1337 	 * send queue, return directly
1338 	 */
1339 	if (tp->packets_out || !tcp_nip_send_head(sk)) {
1340 		/* Cancel probe timer, if it is not required. */
1341 		nip_dbg("packets_out(%u) not 0 or send_head is NULL, cancel probe0 timer",
1342 			tp->packets_out);
1343 		icsk->icsk_probes_out = 0;
1344 		icsk->icsk_backoff = 0;
1345 		return;
1346 	}
1347 
1348 	/* Err: 0 succeeded, -1 failed */
1349 	icsk->icsk_probes_out++; /* Number of probes +1 */
1350 	if (err <= 0) {
1351 		if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
1352 			icsk->icsk_backoff++;
1353 		when = tcp_probe0_when(sk, TCP_RTO_MAX);
1354 		nip_dbg("probe0 %s, probes_out=%u, probe0_base=%lu, icsk_backoff=%u, when=%lu",
1355 			(!err ? "send ok" : "send fail"), icsk->icsk_probes_out,
1356 			tcp_probe0_base(sk), icsk->icsk_backoff, when);
1357 	} else {
1358 		/* Makes the zero window probe timer time out faster */
1359 		when = TCP_RESOURCE_PROBE_INTERVAL;
1360 		nip_dbg("probe0 not sent due to local congestion, make timer out faster");
1361 	}
1362 
1363 	nip_dbg("restart probe0 timer, when=%lu, icsk_backoff=%u, probe_max=%u",
1364 		when, icsk->icsk_backoff, TCP_RTO_MAX);
1365 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
1366 }
1367