// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on net/ipv4/tcp_output.c
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *				:	AF independence
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller :	Output engine completely rewritten.
 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
 *		Cacophonix Gaul :	draft-minshall-nagle-01
 *		J Hadi Salim	:	ECN support
 *
 * Based on net/ipv4/tcp_minisocks.c
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * NewIP INET
 * An implementation of the TCP/IP protocol suite for the LINUX
 * operating system. NewIP INET is implemented using the BSD Socket
 * interface as the means of communication with the user level.
 *
 * Implementation of the Transmission Control Protocol(TCP).
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": [%s:%d] " fmt, __func__, __LINE__

#include <net/nip.h>
#include <net/tcp_nip.h>
#include <net/tcp.h>
#include <net/ninet_connection_sock.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <net/nip_udp.h>
#include "nip_hdr.h"
#include "nip_checksum.h"
#include "tcp_nip_parameter.h"

#define OPTION_SACK_ADVERTISE	BIT(0)
#define OPTION_TS		BIT(1)
#define OPTION_MD5		BIT(2)
#define OPTION_WSCALE		BIT(3)
#define OPTION_FAST_OPEN_COOKIE	BIT(8)
#define TCP_NIP_SND_NUM_MAX	(~0U)

/* Store the options contained in TCP when sending TCP packets */
struct tcp_nip_out_options {
	u16 options;		/* bit field of OPTION_* */
	u16 mss;		/* If it is zero, the MSS option is disabled */

	u8 ws;			/* window scale, 0 to disable */
	__u32 tsval, tsecr;	/* need to include OPTION_TS */
};

static bool tcp_nip_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
			       int push_one, gfp_t gfp);

/* Calculate MSS not accounting any TCP options. */
static int __tcp_nip_mtu_to_mss(struct sock *sk, int pmtu)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	int mss_now;
	int nip_hdr_len = get_nip_hdr_len(NIP_HDR_COMM, &sk->SK_NIP_RCV_SADDR, &sk->SK_NIP_DADDR);

	/* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */
	nip_hdr_len = nip_hdr_len == 0 ? NIP_HDR_MAX : nip_hdr_len;
	mss_now = pmtu - nip_hdr_len - sizeof(struct tcphdr);

	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
	if (icsk->icsk_af_ops->net_frag_header_len) {
		const struct dst_entry *dst = __sk_dst_get(sk);

		if (dst && dst_allfrag(dst))
			mss_now -= icsk->icsk_af_ops->net_frag_header_len;
	}

	/* Clamp it (mss_clamp does not include tcp options) */
	if (mss_now > tp->rx_opt.mss_clamp)
		mss_now = tp->rx_opt.mss_clamp;

	/* Now subtract optional transport overhead */
	mss_now -= icsk->icsk_ext_hdr_len;

	/* Then reserve room for full set of TCP options and 8 bytes of data */
	mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
	return mss_now;
}

/* Calculate MSS. Not accounting for SACKs here. */
int tcp_nip_mtu_to_mss(struct sock *sk, int pmtu)
{
	/* Subtract TCP options size, not including SACKs */
	return __tcp_nip_mtu_to_mss(sk, pmtu) -
	       (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
}

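/* Advance sk_send_head to the next skb on the write queue, or set it to
 * NULL when the last queued skb has just been sent.
 */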
static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb)
{
	if (tcp_skb_is_last(sk, skb))
		sk->sk_send_head = NULL;
	else
		sk->sk_send_head = skb_queue_next(&sk->sk_write_queue, skb);
}

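/* Account for newly sent data: advance the send head and snd_nxt, update
 * packets_out, and rearm the retransmit timer when this is the first
 * packet in flight or an early-retransmit/loss-probe timer was pending.
 */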
static void tcp_nip_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int prior_packets = tp->packets_out;

	tcp_advance_send_head(sk, skb);
	WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
	tp->packets_out += tcp_skb_pcount(skb);
	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
		tcp_nip_rearm_rto(sk);

	tcp_nip_check_space(sk);
}

/* check probe0 timer */
static void tcp_nip_check_probe_timer(struct sock *sk)
{
	if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) {
		unsigned long when = tcp_probe0_base(sk);

		nip_dbg("start probe0 timer, when=%lu, RTO MAX=%u", when, TCP_RTO_MAX);
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
	} else if (inet_csk(sk)->icsk_pending != ICSK_TIME_PROBE0) {
		nip_dbg("can`t start probe0 timer, packets_out=%u, icsk_pending=%u",
			tcp_sk(sk)->packets_out, inet_csk(sk)->icsk_pending);
	}
}

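/* Push frames pending on the write queue; if nothing could be put in
 * flight but data is still queued (for example the peer advertised a
 * zero window), arm the probe0 timer instead.
 */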
void __tcp_nip_push_pending_frames(struct sock *sk, unsigned int cur_mss,
				   int nonagle)
{
	if (unlikely(sk->sk_state == TCP_CLOSE))
		return;

	if (tcp_nip_write_xmit(sk, cur_mss, nonagle, 0, sk_gfp_mask(sk, GFP_ATOMIC)))
		tcp_nip_check_probe_timer(sk);
}

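/* Compute the raw receive window to advertise, based on the free receive
 * buffer space and bounded by the NewIP/TCP ssthresh limits; returns 0
 * when a zero window should be announced.
 */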
u32 __nip_tcp_select_window(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_nip_common *ntp = &tcp_nip_sk(sk)->common;
	int mss = tcp_nip_current_mss(sk); /* TCP_BASE_MSS */
	int allowed_space = tcp_full_space(sk);
	int full_space = min_t(int, tp->window_clamp, allowed_space); /* Total receive cache */
	int free_space = tcp_space(sk); /* 3/4 remaining receive cache */
	int window;

	if (unlikely(mss > full_space)) {
		mss = full_space;
		if (mss <= 0)
			return 0;
	}

	/* receive buffer is half full */
	if (free_space < (full_space >> 1)) {
		icsk->icsk_ack.quick = 0;

		free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
		if (free_space < (allowed_space >> TCP_NUM_4) || free_space < mss) {
			nip_dbg("rcv_wnd is 0, [allowed|full|free]space=[%u, %u, %u], mss=%u",
				allowed_space, full_space, free_space, mss);
			return 0;
		}
	}

	if (get_nip_tcp_rcv_win_enable()) {
		if (get_ssthresh_enable())
			free_space = free_space > ntp->nip_ssthresh ?
				     ntp->nip_ssthresh : free_space;
		else
			free_space = free_space > tp->rcv_ssthresh ? tp->rcv_ssthresh : free_space;
	} else {
		free_space = free_space > get_ssthresh_high() ? get_ssthresh_high() : free_space;
	}

	/* Don't do rounding if we are using window scaling, since the
	 * scaled window will not line up with the MSS boundary anyway.
	 * tp->rx_opt.rcv_wscale is always true
	 */
	window = free_space;

	/* Advertise enough space so that it won't get scaled away.
	 * Important case: prevent zero window announcement if
	 * 1<<rcv_wscale > mss.
	 */
	window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
	nip_dbg("wscale(%u) win change [%u to %u], [allowed|free]space=[%u, %u], mss=%u",
		tp->rx_opt.rcv_wscale, free_space, window, allowed_space, free_space, mss);
	return window;
}

/* The basic algorithm of window size selection:
 * 1. Calculate the remaining size of the current receive window, cur_win.
 * 2. Calculate the new receive window size, new_win, which is 3/4 of the
 *    remaining receive buffer and cannot exceed rcv_ssthresh.
 * 3. Select the larger of cur_win and new_win as the receive window size
 *    (the offered window is never shrunk).
 */
static u16 nip_tcp_select_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 old_win = tp->rcv_wnd;
	/* The remaining size of the front receive window */
	u32 cur_win = tcp_receive_window(tp);
	/* Calculate the size of the new receive window based on the remaining receive cache */
	u32 new_win = __nip_tcp_select_window(sk);
	u32 new_win_bak;

	/* Never shrink the offered window */
	if (new_win < cur_win) {
		/* Danger Will Robinson!
		 * Don't update rcv_wup/rcv_wnd here or else
		 * we will not be able to advertise a zero
		 * window in time.  --DaveM
		 *
		 * Relax Will Robinson.
		 */
		if (new_win == 0)
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWANTZEROWINDOWADV);
		new_win_bak = new_win;
		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
		nip_dbg("when new_win(%u) < cur_win(%u), win change [%u to %u]",
			new_win_bak, cur_win, new_win_bak, new_win);
	}
	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* Make sure we do not exceed the maximum possible
	 * scaled window.
	 */
	if (!tp->rx_opt.rcv_wscale && sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
		new_win = min(new_win, MAX_TCP_WINDOW);
	else
		new_win = min(new_win, (TCP_NIP_WINDOW_MAX << tp->rx_opt.rcv_wscale));

	/* RFC1323 Scaling Applied.
	 * Scaling the receive window so that it can represent up to 30 bits
	 */
	new_win_bak = new_win;
	new_win >>= tp->rx_opt.rcv_wscale;
	nip_dbg("wscale(%u) win change [%u to %u]", tp->rx_opt.rcv_wscale, new_win_bak, new_win);
	if (new_win == 0) {
		tp->pred_flags = 0;
		if (old_win)
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTOZEROWINDOWADV);
	} else if (old_win == 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
	}

	return new_win;
}

/* Function
 *	Initialize transport layer parameters.
 * Parameter
 *	sk: transmission control block.
 */
static void tcp_nip_connect_init(struct sock *sk)
{
	const struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__u8 rcv_wscale = 0;

	/* Header structure length + timestamp length */
	tp->tcp_header_len = sizeof(struct tcphdr);
	if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
		tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;

	if (tp->rx_opt.user_mss)
		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
	tp->max_window = 0;

	tcp_mtup_init(sk);
	tp->rx_opt.mss_clamp = tcp_nip_sync_mss(sk, dst_mtu(dst));

	if (!tp->window_clamp)
		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
	tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(sk);

	/* Initialization window */
	tcp_select_initial_window(sk, tcp_full_space(sk),
				  tp->advmss - (tp->rx_opt.ts_recent_stamp ?
				  tp->tcp_header_len - sizeof(struct tcphdr) : 0),
				  &tp->rcv_wnd,
				  &tp->window_clamp,
				  0,
				  &rcv_wscale,
				  0);

	tp->rx_opt.rcv_wscale = get_wscale_enable() ? get_wscale() : rcv_wscale;
	tp->rcv_ssthresh = tp->rcv_wnd;

	sk->sk_err = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->snd_wnd = 0;
	tp->snd_wl1 = 0;
	tcp_write_queue_purge(sk);

	tp->snd_una = tp->write_seq;
	tp->snd_sml = tp->write_seq;
	tp->snd_up = tp->write_seq;
	tp->snd_nxt = tp->write_seq;

	tp->rcv_nxt = 0;
	tp->rcv_wup = tp->rcv_nxt;
	tp->copied_seq = tp->rcv_nxt;
	inet_csk(sk)->icsk_rto = get_nip_rto() == 0 ? TCP_TIMEOUT_INIT : (HZ / get_nip_rto());
	inet_csk(sk)->icsk_retransmits = 0;
	tcp_clear_retrans(tp);
}

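/* Initialize an skb that carries no payload (pure SYN/ACK/FIN/RST): set
 * the TCP flags and sequence numbers; SYN and FIN each consume one
 * sequence number.
 */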
static void tcp_nip_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
	skb->ip_summed = CHECKSUM_PARTIAL;
	skb->csum = 0;

	TCP_SKB_CB(skb)->tcp_flags = flags;
	TCP_SKB_CB(skb)->sacked = 0;

	tcp_skb_pcount_set(skb, 1);

	TCP_SKB_CB(skb)->seq = seq;
	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
		seq++;
	TCP_SKB_CB(skb)->end_seq = seq;
}

#define OPTION_TS	BIT(1)
#define OPTION_WSCALE	BIT(3)

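/* Queue the SYN skb on the write queue and charge its memory to the
 * socket before the initial transmission.
 */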
static void tcp_nip_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	tcb->end_seq += skb->len;
	__skb_header_release(skb);
	__skb_queue_tail(&sk->sk_write_queue, skb);
	sk->sk_wmem_queued += skb->truesize;
	sk_mem_charge(sk, skb->truesize);
	WRITE_ONCE(tp->write_seq, tcb->end_seq);
	tp->packets_out += tcp_skb_pcount(skb);
}

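/* Choose the MSS advertised in the SYN: start from tp->advmss, bound it
 * by the route's advertised MSS metric, and raise it when the path MTU
 * minus the NewIP and TCP header lengths allows a larger segment.
 */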
static __u16 tcp_nip_advertise_mss(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const struct dst_entry *dst = __sk_dst_get(sk);
	int mss = tp->advmss;
	u32 mtu;

	if (dst) {
		int nip_hdr_len;
		int nip_mss;
		unsigned int metric = dst_metric_advmss(dst);

		if (metric < (unsigned int)mss) {
			mss = metric;
			tp->advmss = mss;
		}

		mtu = dst_mtu(dst); /* NIP_MIN_MTU */
		nip_hdr_len = get_nip_hdr_len(NIP_HDR_COMM, &sk->SK_NIP_RCV_SADDR,
					      &sk->SK_NIP_DADDR);
		nip_hdr_len = nip_hdr_len == 0 ? NIP_HDR_MAX : nip_hdr_len;
		nip_mss = mtu - nip_hdr_len - sizeof(struct tcphdr);
		if (nip_mss > mss) {
			mss = nip_mss;
			tp->advmss = mss;
		}
	}

	return (__u16)mss;
}

/* Compute TCP options for SYN packets. This is not the final
 * network wire format yet.
 */
static unsigned int tcp_nip_syn_options(struct sock *sk, struct sk_buff *skb,
					struct tcp_nip_out_options *opts)
{
	unsigned int remaining = MAX_TCP_OPTION_SPACE;

	opts->mss = tcp_nip_advertise_mss(sk);
	nip_dbg("advertise mss %d", opts->mss);
	remaining -= TCPOLEN_MSS_ALIGNED;

	return MAX_TCP_OPTION_SPACE - remaining;
}

/* Compute TCP options for ESTABLISHED sockets. This is not the
 * final wire format yet.
 */
static unsigned int tcp_nip_established_options(struct sock *sk, struct sk_buff *skb,
						struct tcp_nip_out_options *opts)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int size = 0;

	opts->options = 0;

	if (likely(tp->rx_opt.tstamp_ok)) {
		opts->options |= OPTION_TS;
		opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
		opts->tsecr = tp->rx_opt.ts_recent;
		size += TCPOLEN_TSTAMP_ALIGNED;
	}
	return size;
}

/* Function
 *	Put the parameters from the TCP option into SKB.
 *	Write previously computed TCP options to the packet.
 * Parameter
 *	ptr: pointer to TCP options in SKB.
 *	tp: transmission control block.
 *	opts: structure to be sent to temporarily load TCP options.
 */
static void tcp_nip_options_write(__be32 *ptr, struct tcp_sock *tp,
				  struct tcp_nip_out_options *opts)
{
	if (unlikely(opts->mss))
		*ptr++ = htonl((TCPOPT_MSS << TCP_OPT_MSS_PAYLOAD) |
			       (TCPOLEN_MSS << TCP_OLEN_MSS_PAYLOAD) |
			       opts->mss);
}

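/* An ACK that acknowledges up to rcv_nxt has just been sent, so the
 * pending delayed-ACK timer can be cancelled.
 */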
static inline void tcp_nip_event_ack_sent(struct sock *sk, unsigned int pkts,
					  u32 rcv_nxt)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (unlikely(rcv_nxt != tp->rcv_nxt))
		return;
	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}

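/* Compute the TCP checksum over the NewIP pseudo-header plus the whole
 * TCP segment (header and payload).
 */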
unsigned short nip_get_output_checksum_tcp(struct sk_buff *skb, struct nip_addr src_addr,
					   struct nip_addr dst_addr)
{
	struct nip_pseudo_header nph = {0};
	u8 *tcp_hdr = skb_transport_header(skb);

	nph.nexthdr = IPPROTO_TCP;
	nph.saddr = src_addr;
	nph.daddr = dst_addr;

	nph.check_len = htons(skb->len);
	return nip_check_sum_build(tcp_hdr, skb->len, &nph);
}

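/* Core transmit routine: optionally clone the skb, build the TCP header
 * and options, fill in the advertised window and checksum, then hand the
 * segment to the NewIP layer through icsk_af_ops->queue_xmit().
 */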
static int __tcp_nip_transmit_skb(struct sock *sk, struct sk_buff *skb,
				  int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb;
	struct tcp_nip_out_options opts;
	unsigned int tcp_options_size, tcp_header_size;
	struct sk_buff *oskb = NULL;
	struct tcphdr *th;
	int err = 0;
	__be16 len;
	unsigned short check = 0;
	bool ack;

	if (skb->tstamp == 0)
		skb->tstamp = tcp_jiffies32;

	if (clone_it) {
		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
			- tp->snd_una;
		oskb = skb;

		tcp_skb_tsorted_save(oskb) {
			if (unlikely(skb_cloned(oskb)))
				skb = pskb_copy(oskb, gfp_mask);
			else
				skb = skb_clone(oskb, gfp_mask);
		} tcp_skb_tsorted_restore(oskb);

		if (unlikely(!skb))
			return -ENOBUFS;
	}

	inet = inet_sk(sk);
	tcb = TCP_SKB_CB(skb);
	memset(&opts, 0, sizeof(opts));

	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
		tcp_options_size = tcp_nip_syn_options(sk, skb, &opts);
	else
		tcp_options_size = tcp_nip_established_options(sk, skb, &opts);
	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);

	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
	/* The data pointer moves up */
	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);

	/* Disassociate the control block */
	skb_orphan(skb);

	/* Establishes associations with control blocks */
	skb->sk = sk;
	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
	skb_set_hash_from_sk(skb, sk);
	/* Increase allocated memory */
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);

	/* Build TCP header and checksum it. */
	th = (struct tcphdr *)skb->data;
	th->source = inet->inet_sport;
	th->dest = inet->inet_dport;
	th->seq = htonl(tcb->seq);
	th->ack_seq = htonl(rcv_nxt);
	/* TCP's header offset is measured in 4 bytes, so moving two to the right
	 * means dividing by 4. In addition, according to the position of the offset
	 * field in the packet, the offset field is at the beginning of a short type,
	 * accounting for 4 bits. Therefore, the offset field should be shifted 12 bits
	 * to the left
	 */
	len = htons(((tcp_header_size >> TCP_NIP_4BYTE_PAYLOAD) << TCP_HDR_LEN_POS_PAYLOAD) |
		    tcb->tcp_flags);
	*(((__be16 *)th) + TCP_HDR_LEN_OFFSET) = len;

	th->check = 0;
	/* Newip Urg_ptr is disabled. Urg_ptr is used to carry the number of discarded packets */
	th->urg_ptr = htons(tp->snd_up);

	/* Write TCP option */
	tcp_nip_options_write((__be32 *)(th + 1), tp, &opts);

	/* Window Settings */
	if (likely(!(tcb->tcp_flags & TCPHDR_SYN)))
		th->window = htons(nip_tcp_select_window(sk));
	else
		th->window = htons(min(tp->rcv_wnd, TCP_NIP_WINDOW_MAX));

	ack = tcb->tcp_flags & TCPHDR_ACK;
	nip_dbg("sport=%u, dport=%u, win=%u, rcvbuf=%d, sk_rmem_alloc=%d, ack=%u, skb->len=%u",
		ntohs(inet->inet_sport), ntohs(inet->inet_dport), ntohs(th->window),
		sk->sk_rcvbuf, atomic_read(&sk->sk_rmem_alloc), ack, skb->len);

	/* Fill in checksum */
	check = nip_get_output_checksum_tcp(skb, sk->SK_NIP_RCV_SADDR, sk->SK_NIP_DADDR);
	th->check = htons(check);

	if (likely(tcb->tcp_flags & TCPHDR_ACK))
		tcp_nip_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);

	/* There's data to send */
	if (skb->len != tcp_header_size)
		tp->data_segs_out += tcp_skb_pcount(skb);

	memset(skb->cb, 0, sizeof(struct ninet_skb_parm));
	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
	return err;
}

/* Function
 *	Transport-layer send path: build and initialize the TCP header, then
 *	pass the constructed sk_buff from the transport layer to the network layer.
 * Parameter
 *	sk: Transmission control block.
 *	skb: Structure stores all information about network datagrams
 */
int tcp_nip_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			 gfp_t gfp_mask)
{
	return __tcp_nip_transmit_skb(sk, skb, clone_it, gfp_mask,
				      tcp_sk(sk)->rcv_nxt);
}

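/* Append the skb to the write queue, advance write_seq and charge the
 * skb's memory to the socket.
 */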
static void tcp_nip_queue_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Advance write_seq and place onto the write_queue. */
	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
	tcp_nip_add_write_queue_tail(sk, skb);
	sk->sk_wmem_queued += skb->truesize;
	sk_mem_charge(sk, skb->truesize);
}

/* Function
 *	Used by the client transport layer to initiate a connection request (send the SYN).
 * Parameter
 *	sk: transmission control block.
 */
int __tcp_nip_connect(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int err;

	tcp_nip_connect_init(sk);
	buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
	if (unlikely(!buff))
		return -ENOBUFS;

	/* Initializes the SYN flag bit */
	tcp_nip_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
	tcp_mstamp_refresh(tp);
	tp->retrans_stamp = tcp_time_stamp(tp);
	tcp_nip_init_xmit_timers(sk);

	tcp_nip_connect_queue_skb(sk, buff);

	/* Send off SYN */
	err = tcp_nip_transmit_skb(sk, buff, 1, sk->sk_allocation);
	if (err == -ECONNREFUSED)
		return err;

	tp->snd_nxt = tp->write_seq;
	tp->pushed_seq = tp->write_seq;

	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

	/* Timer for repeating the SYN until an answer. */
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);

	return 0;
}

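/* Recompute the cached sending MSS from a new path MTU and store the MTU
 * in icsk_pmtu_cookie (the NewIP counterpart of tcp_sync_mss()).
 */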
unsigned int tcp_nip_sync_mss(struct sock *sk, u32 pmtu)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int mss_now;

	if (icsk->icsk_mtup.search_high > pmtu)
		icsk->icsk_mtup.search_high = pmtu;

	mss_now = tcp_nip_mtu_to_mss(sk, pmtu);
	nip_dbg("sync mtu_to_mss %d", mss_now);
	mss_now = tcp_bound_to_half_wnd(tp, mss_now);
	nip_dbg("sync bound to half wnd %d", mss_now);

	/* And store cached results */
	icsk->icsk_pmtu_cookie = pmtu;
	if (icsk->icsk_mtup.enabled)
		mss_now = min(mss_now, tcp_nip_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
	tp->mss_cache = mss_now;

	nip_dbg("sync final mss %d", mss_now);

	return mss_now;
}

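/* Return the MSS usable for sending right now: re-sync with the route MTU
 * when it has changed and subtract any extra option overhead computed for
 * the established state.
 */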
unsigned int tcp_nip_current_mss(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct dst_entry *dst = __sk_dst_get(sk);
	u32 mss_now;
	unsigned int header_len;
	struct tcp_nip_out_options opts;

	mss_now = tp->mss_cache;

	if (dst) {
		u32 mtu = dst_mtu(dst); /* NIP_MIN_MTU */

		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
			mss_now = tcp_nip_sync_mss(sk, mtu);
	}

	header_len = tcp_nip_established_options(sk, NULL, &opts) + sizeof(struct tcphdr);
	if (header_len != tp->tcp_header_len) {
		int delta = (int)header_len - tp->tcp_header_len;

		mss_now -= delta;
	}

	return mss_now;
}

/* Function:
 *	Set up TCP options for SYN-ACKs.
 *	Initializes the TCP options for the SYN-ACK segment. Returns the size of the TCP options.
 * Parameter
 *	req: Request connection control block.
 *	mss: maximum segment length.
 *	skb: Transfer control block buffer.
 *	opts: stores the options contained in TCP packets when they are sent.
 *	foc: Fast Open option.
 *	synack_type: type of SYN+ACK segment.
 */
static unsigned int tcp_nip_synack_options(struct request_sock *req,
					   unsigned int mss, struct sk_buff *skb,
					   struct tcp_nip_out_options *opts,
					   const struct tcp_md5sig_key *md5,
					   struct tcp_fastopen_cookie *foc,
					   enum tcp_synack_type synack_type)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	unsigned int remaining = MAX_TCP_OPTION_SPACE;

	/* We always send an MSS option. */
	opts->mss = mss;
	remaining -= TCPOLEN_MSS_ALIGNED;

	if (likely(ireq->tstamp_ok)) {
		opts->options |= OPTION_TS;
		opts->tsval = tcp_skb_timestamp(skb);
		opts->tsecr = req->ts_recent;
		remaining -= TCPOLEN_TSTAMP_ALIGNED;
	}
	return MAX_TCP_OPTION_SPACE - remaining;
}

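/* Derive the MSS advertised in a SYN-ACK: start from the route metric,
 * clamp it by the user-configured MSS, and raise it when the path MTU
 * minus the NewIP and TCP header lengths allows a larger segment.
 */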
static int get_nip_mss(const struct sock *sk, struct dst_entry *dst, struct request_sock *req)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct tcp_sock *tp = tcp_sk(sk);
	u16 user_mss;
	int mss;
	int nip_hdr_len;
	int nip_mss;
	u32 mtu;

	mss = dst_metric_advmss(dst);
	user_mss = READ_ONCE(tp->rx_opt.user_mss);
	if (user_mss && user_mss < mss)
		mss = user_mss;

	mtu = dst_mtu(dst); /* NIP_MIN_MTU */
	nip_hdr_len = get_nip_hdr_len(NIP_HDR_COMM, &ireq->IR_NIP_LOC_ADDR, &ireq->IR_NIP_RMT_ADDR);
	nip_hdr_len = nip_hdr_len == 0 ? NIP_HDR_MAX : nip_hdr_len;
	nip_mss = mtu - nip_hdr_len - sizeof(struct tcphdr);

	if (nip_mss > mss) {
		mss = nip_mss;
		tp->advmss = mss;
	}

	return mss;
}

/* Function
 *	The SYN + ACK segment is constructed based on the current transport control block,
 *	routing information, and request information.
 * Parameter
 *	sk: transmission control block.
 *	dst: routing.
 *	req: Request connection control block.
 *	foc: Fast Open option.
 *	synack_type: type of SYN+ACK segment.
 */
struct sk_buff *tcp_nip_make_synack(const struct sock *sk, struct dst_entry *dst,
				    struct request_sock *req,
				    struct tcp_fastopen_cookie *foc,
				    enum tcp_synack_type synack_type)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct tcp_md5sig_key *md5 = NULL;
	struct tcp_nip_out_options opts;
	struct sk_buff *skb;
	int tcp_header_size;
	struct tcphdr *th;
	int mss;
	unsigned short check = 0;

	skb = alloc_skb(MAX_TCP_HEADER, 0);
	if (unlikely(!skb)) {
		dst_release(dst);
		return NULL;
	}

	/* Reserve space for headers. */
	skb_reserve(skb, MAX_TCP_HEADER);

	switch (synack_type) {
	case TCP_SYNACK_NORMAL:
		/* Release the original SKB and treat itself as the SKB of the current SK */
		skb_set_owner_w(skb, req_to_sk(req));
		break;
	default:
		break;
	}
	skb_dst_set(skb, dst);
	/* set skb priority from sk */
	skb->priority = sk->sk_priority;

	mss = get_nip_mss(sk, dst, req);

	/* Clear the options and set the associated timestamp */
	memset(&opts, 0, sizeof(opts));
	skb->skb_mstamp_ns = tcp_clock_us();

	/* Get the TCP header size, then set the size and reset the transport layer header */
	skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
	tcp_header_size = tcp_nip_synack_options(req, mss, skb, &opts, md5,
						 foc, synack_type) + sizeof(*th);
	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);

	/* Clear the TCP header and set the fields of the TCP header */
	th = (struct tcphdr *)skb->data;
	memset(th, 0, sizeof(struct tcphdr));
	th->syn = 1;
	th->ack = 1;
	if (inet_rsk(req)->ecn_ok)
		th->ece = 1;
	th->source = htons(ireq->ir_num);
	th->dest = ireq->ir_rmt_port;
	skb->ip_summed = CHECKSUM_PARTIAL;
	th->seq = htonl(tcp_rsk(req)->snt_isn);
	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
	th->check = 0;

	th->window = htons(min(req->rsk_rcv_wnd, TCP_NIP_WINDOW_MAX));

	tcp_nip_options_write((__be32 *)(th + 1), NULL, &opts);
	/* TCP data offset, divided by 4 because doff is a 32-bit word
	 * That is, words four bytes long are counted in units
	 */
	th->doff = (tcp_header_size >> TCP_NUM_2);
	__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);

	/* Fill in checksum */
	check = nip_get_output_checksum_tcp(skb, ireq->IR_NIP_LOC_ADDR, ireq->IR_NIP_RMT_ADDR);
	th->check = htons(check);

	/* Do not fool tcpdump (if any), clean our debris */
	skb->tstamp = 0;
	return skb;
}

/* Function
 *	Send SKB packets with SYN+ACK segments to the network layer.
 * Parameter
 *	req: Request connection control block.
 *	skb: Transfer control block buffer.
 */
int __nip_send_synack(struct request_sock *req, struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req); /* connection request block */
	int err;
	int csummode = CHECKSUM_NONE;
	struct nip_addr *saddr, *daddr;
	struct nip_hdr_encap head = {0};
	unsigned char hdr_buf[NIP_HDR_MAX]; /* Cache the newIP header */

	skb->protocol = htons(ETH_P_NEWIP);
	skb->ip_summed = csummode;
	skb->csum = 0;
	saddr = &ireq->IR_NIP_LOC_ADDR;
	daddr = &ireq->IR_NIP_RMT_ADDR;

	head.saddr = *saddr;
	head.daddr = *daddr;
	head.ttl = NIP_DEFAULT_TTL;
	head.nexthdr = IPPROTO_TCP;
	head.hdr_buf = hdr_buf;
	nip_hdr_comm_encap(&head);
	head.total_len = head.hdr_buf_pos + skb->len;
	nip_update_total_len(&head, htons(head.total_len));

	skb_push(skb, head.hdr_buf_pos);
	memcpy(skb->data, head.hdr_buf, head.hdr_buf_pos);
	skb_reset_network_header(skb);
	nipcb(skb)->srcaddr = *saddr;
	nipcb(skb)->dstaddr = *daddr;
	nipcb(skb)->nexthdr = head.nexthdr;

	head.total_len = skb->len;
	err = nip_send_skb(skb);
	if (err)
		nip_dbg("failed to send skb, skb->len=%u", head.total_len);
	else
		nip_dbg("send skb ok, skb->len=%u", head.total_len);

	return err;
}

int nip_send_synack(struct request_sock *req, struct sk_buff *skb)
{
	return __nip_send_synack(req, skb);
}

/* Function:
 *	Creates a subtransport block to complete the establishment of the three-way handshake
 * Parameter:
 *	parent: indicates the parent transmission control block
 *	child: indicates the child transmission control block
 *	skb: Transfer control block buffer
 */
int tcp_nip_child_process(struct sock *parent, struct sock *child,
			  struct sk_buff *skb)
{
	int ret = 0;
	int state = child->sk_state;

	/* Child is not occupied by the user process */
	if (!sock_owned_by_user(child)) {
		ret = tcp_nip_rcv_state_process(child, skb);
		/* At this point the state of the child has been migrated,
		 * waking up the process on the listening socket,
		 * which may be blocked due to Accept
		 */
		if (state == TCP_SYN_RECV && child->sk_state != state)
			parent->sk_data_ready(parent);
	} else {
		__sk_add_backlog(child, skb);
	}
	bh_unlock_sock(child);
	sock_put(child);
	return ret;
}

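/* Sequence number to place in a bare ACK or RST: snd_nxt while it is
 * still inside the send window, otherwise the right edge of the window.
 */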
static inline __u32 tcp_nip_acceptable_seq(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	if (!before(tcp_wnd_end(tp), tp->snd_nxt))
		return tp->snd_nxt;
	else
		return tcp_wnd_end(tp);
}

/* Function:
 *	The client sends an ACK
 * Parameter:
 *	sk: transmission control block
 *	rcv_nxt: serial number to be accepted
 */
void __tcp_nip_send_ack(struct sock *sk, u32 rcv_nxt)
{
	struct sk_buff *buff;

	if (sk->sk_state == TCP_CLOSE)
		return;

	buff = alloc_skb(MAX_TCP_HEADER,
			 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
	if (unlikely(!buff))
		return;

	/* Reserve space for the header. */
	skb_reserve(buff, MAX_TCP_HEADER);
	/* Initialize SKB without data */
	tcp_nip_init_nondata_skb(buff, tcp_nip_acceptable_seq(sk), TCPHDR_ACK);

	/* Mark pure ack, skb->truesize set to 2 */
	skb_set_tcp_pure_ack(buff);

	/* Record the timestamp and send the SKB. */
	__tcp_nip_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
}

void tcp_nip_send_ack(struct sock *sk)
{
	__tcp_nip_send_ack(sk, tcp_sk(sk)->rcv_nxt);
}

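/* Send a FIN: piggyback the FIN flag on the last queued skb when there is
 * still unsent data, otherwise allocate a new empty skb, then push the
 * pending frames.
 */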
void tcp_nip_send_fin(struct sock *sk)
{
	struct sk_buff *skb;
	struct sk_buff *tskb = tcp_write_queue_tail(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	u32 cur_mss;

	nip_dbg("send fin");
	/* Set the fin position of the last packet to 1 */
	if (tskb && tcp_nip_send_head(sk)) {
coalesce:
		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
		TCP_SKB_CB(tskb)->end_seq++;
		tp->write_seq++;
	} else {
		skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
		if (unlikely(!skb)) {
			if (tskb)
				goto coalesce;
			return;
		}
		skb_reserve(skb, MAX_TCP_HEADER);

		tcp_nip_init_nondata_skb(skb, tp->write_seq,
					 TCPHDR_ACK | TCPHDR_FIN);
		tcp_nip_queue_skb(sk, skb);
	}

	cur_mss = tcp_nip_current_mss(sk); /* TCP_BASE_MSS */
	__tcp_nip_push_pending_frames(sk, cur_mss, TCP_NAGLE_OFF);
}

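/* Abort the connection by sending an RST|ACK; the segment carries no
 * options and is never retransmitted.
 */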
void tcp_nip_send_active_reset(struct sock *sk, gfp_t priority)
{
	struct sk_buff *skb;

	nip_dbg("send rst");
	/* NOTE: No TCP options attached and we never retransmit this. */
	skb = alloc_skb(MAX_TCP_HEADER, priority);
	if (!skb)
		/* If you add log here, there will be an alarm:
		 * WARNING: Possible unnecessary 'out of memory' message
		 */
		return;

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	tcp_nip_init_nondata_skb(skb, tcp_nip_acceptable_seq(sk),
				 TCPHDR_ACK | TCPHDR_RST);
	/* Send it off. */
	tcp_nip_transmit_skb(sk, skb, 0, priority);
}

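/* Return true if the skb (clamped to one MSS when larger) still fits
 * entirely inside the current send window.
 */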
static bool tcp_nip_snd_wnd_test(const struct tcp_sock *tp,
				 const struct sk_buff *skb,
				 unsigned int cur_mss)
{
	u32 end_seq = TCP_SKB_CB(skb)->end_seq;

	if (skb->len > cur_mss)
		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;

	return !after(end_seq, tcp_wnd_end(tp));
}

static void tcp_nip_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
	if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
		/* Avoid the costly divide in the normal
		 * non-TSO case.
		 */
		tcp_skb_pcount_set(skb, 1);
		TCP_SKB_CB(skb)->tcp_gso_size = 0;
	} else {
		tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
		TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
	}
}

static int tcp_nip_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
	int tso_segs = tcp_skb_pcount(skb);

	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
		tcp_nip_set_skb_tso_segs(skb, mss_now);
		tso_segs = tcp_skb_pcount(skb);
	}
	return tso_segs;
}

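/* Main transmit loop: send skbs from the write queue while the send
 * window allows, up to snd_num segments (derived from the NewIP ssthresh
 * when the send-window limit is enabled). Returns true when data is still
 * queued but nothing is left in flight, so the caller can arm the probe0
 * timer.
 */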
static bool tcp_nip_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
			       int push_one, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_nip_common *ntp = &tcp_nip_sk(sk)->common;
	struct sk_buff *skb;
	u32 snd_num;
	u32 last_nip_ssthresh = ntp->nip_ssthresh;
	static const char * const str[] = {"can`t send pkt because no window",
					   "have window to send pkt"};

	if (!mss_now) {
		nip_dbg("invalid parameter, mss_now=%u", mss_now);
		return false;
	}
	snd_num = get_nip_tcp_snd_win_enable() ? (ntp->nip_ssthresh / mss_now) :
		  TCP_NIP_SND_NUM_MAX;

	tcp_nip_keepalive_enable(sk);
	ntp->idle_ka_probes_out = 0;

	tcp_mstamp_refresh(tp);

	if (tp->rcv_tstamp) {
		u32 tstamp = tcp_jiffies32 - tp->rcv_tstamp;

		if (tstamp >= get_ack_to_nxt_snd_tstamp()) {
			ntp->nip_ssthresh = get_ssthresh_low_min();
			snd_num = ntp->nip_ssthresh / mss_now;
			ssthresh_dbg("new snd tstamp %u >= %u, ssthresh %u to %u, snd_num=%u",
				     tstamp, get_ack_to_nxt_snd_tstamp(),
				     last_nip_ssthresh, ntp->nip_ssthresh, snd_num);
		}
	}

	while ((skb = tcp_nip_send_head(sk)) && (snd_num--)) {
		bool snd_wnd_ready;

		tcp_nip_init_tso_segs(skb, mss_now);
		snd_wnd_ready = tcp_nip_snd_wnd_test(tp, skb, mss_now);
		nip_dbg("%s, skb->len=%u", (snd_wnd_ready ? str[1] : str[0]), skb->len);
		if (unlikely(!snd_wnd_ready))
			break;

		if (unlikely(tcp_nip_transmit_skb(sk, skb, 1, gfp)))
			break;

		tcp_nip_event_new_data_sent(sk, skb);

		if (push_one)
			break;
	}
	return !tp->packets_out && tcp_nip_send_head(sk);
}

int tcp_nip_rtx_synack(const struct sock *sk, struct request_sock *req)
{
	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
	int res;
	struct dst_entry *dst;

	dst = af_ops->route_req(sk, NULL, req);
	tcp_rsk(req)->txhash = net_tx_rndhash();

	res = af_ops->send_synack(sk, dst, NULL, req, NULL, TCP_SYNACK_NORMAL, NULL);

	return res;
}

static void tcp_nip_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tp->packets_out -= decr;
}

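/* Retransmit one skb: trim data that is already acknowledged, refuse
 * segments that now lie entirely outside the send window, re-fragment or
 * re-count segments against the current MSS, then transmit again.
 */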
int __tcp_nip_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int cur_mss;
	int len, err;

	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
		if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
			WARN_ON_ONCE(1);
			return -EINVAL;
		}
		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
			return -ENOMEM;
	}

	cur_mss = tcp_nip_current_mss(sk);

	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
	    TCP_SKB_CB(skb)->seq != tp->snd_una)
		return -EAGAIN;

	len = cur_mss * segs;
	if (skb->len > len) {
		if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, skb, len, cur_mss, GFP_ATOMIC))
			return -ENOMEM; /* We'll try again later. */
	} else {
		int diff = tcp_skb_pcount(skb);

		tcp_nip_set_skb_tso_segs(skb, cur_mss);
		diff -= tcp_skb_pcount(skb);
		if (diff)
			tcp_nip_adjust_pcount(sk, skb, diff);
	}

	err = tcp_nip_transmit_skb(sk, skb, 1, GFP_ATOMIC);
	if (likely(!err)) {
		segs = tcp_skb_pcount(skb);

		tp->total_retrans += segs;
	}
	return err;
}

int tcp_nip_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int err = __tcp_nip_retransmit_skb(sk, skb, segs);

	if (err == 0) {
		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
		tp->retrans_out += tcp_skb_pcount(skb);

		/* Save stamp of the first retransmit. */
		if (!tp->retrans_stamp)
			tp->retrans_stamp = tcp_skb_timestamp(skb);
	} else if (err != -EBUSY) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
	}

	return err;
}

#define TCP_NIP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
			      (1UL << TCP_NIP_WRITE_TIMER_DEFERRED) | \
			      (1UL << TCP_NIP_DELACK_TIMER_DEFERRED) | \
			      (1UL << TCP_MTU_REDUCED_DEFERRED))

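/* Run the deferred work recorded in sk_tsq_flags while the socket was
 * owned by the user: write timer, delayed-ACK timer and MTU-reduced
 * handling.
 */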
void tcp_nip_release_cb(struct sock *sk)
{
	unsigned long flags, nflags;

	/* perform an atomic operation only if at least one flag is set */
	do {
		flags = sk->sk_tsq_flags;
		if (!(flags & TCP_NIP_DEFERRED_ALL))
			return;
		nflags = flags & ~TCP_NIP_DEFERRED_ALL;
	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);

	sock_release_ownership(sk);
	if (flags & (1UL << TCP_NIP_WRITE_TIMER_DEFERRED)) {
		tcp_nip_write_timer_handler(sk);
		__sock_put(sk);
	}
	if (flags & (1UL << TCP_NIP_DELACK_TIMER_DEFERRED)) {
		tcp_nip_delack_timer_handler(sk);
		__sock_put(sk);
	}
	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
		__sock_put(sk);
	}
}

enum nip_probe_type {
	NIP_PROBE0 = 0,
	NIP_KEEPALIVE = 1,
	NIP_UNKNOWN = 2,
	NIP_PROBE_MAX,
};

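/* Send a zero-length ACK probe (window probe or keepalive): its sequence
 * number is snd_una - 1 when not urgent, so the peer must respond without
 * being able to accept it as new data.
 */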
static int tcp_nip_xmit_probe_skb(struct sock *sk, int urgent, int mib)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int ret;
	int probe_type;
	const char *str[NIP_PROBE_MAX] = {"probe0", "keepalive", "unknown"};

	if (mib == LINUX_MIB_TCPWINPROBE)
		probe_type = NIP_PROBE0;
	else if (mib == LINUX_MIB_TCPKEEPALIVE)
		probe_type = NIP_KEEPALIVE;
	else
		probe_type = NIP_UNKNOWN;

	/* We don't queue it, tcp_transmit_skb() sets ownership. */
	skb = alloc_skb(MAX_TCP_HEADER,
			sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
	if (!skb)
		return -1;

	/* Reserve space for headers and set control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);

	tcp_nip_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);

	NET_INC_STATS(sock_net(sk), mib);
	ret = tcp_nip_transmit_skb(sk, skb, 0, (__force gfp_t)0);
	nip_dbg("send %s probe packet, ret=%d", str[probe_type], ret);
	return ret;
}

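/* Window probe: if the next skb on the write queue still (partially) fits
 * in the send window, fragment it to the window if needed and send it;
 * otherwise fall back to a pure probe segment.
 */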
int tcp_nip_write_wakeup(struct sock *sk, int mib)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if (sk->sk_state == TCP_CLOSE) {
		nip_dbg("no probe0 when tcp close");
		return -1;
	}

	skb = tcp_nip_send_head(sk);
	/* If the serial number of the next packet is in the sending window */
	if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
		int err;
		unsigned int mss = tcp_nip_current_mss(sk);
		unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;

		if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
			tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
		/* If the current window size is not enough to send a complete packet */
		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
			err = tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
					   skb, seg_size, mss, GFP_ATOMIC);
			if (err) {
				nip_dbg("tcp_fragment return err=%d", err);
				return -1;
			}
		}
		err = tcp_nip_transmit_skb(sk, skb, 1, GFP_ATOMIC);
		if (!err)
			tcp_nip_event_new_data_sent(sk, skb);
		nip_dbg("transmit skb %s", (!err ? "ok" : "fail"));
		return err;
	} else {
		return tcp_nip_xmit_probe_skb(sk, 0, mib);
	}
}

/* The 0 window probe packet is sent */
void tcp_nip_send_probe0(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	unsigned long when;
	/* An ACK packet with seq snd_una - 1 and length 0 is sent as a zero-window detection packet */
	int err = tcp_nip_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);

	/* If packets are already in flight, or there is nothing left in the
	 * send queue, cancel the probe timer and return directly
	 */
	if (tp->packets_out || !tcp_nip_send_head(sk)) {
		/* Cancel probe timer, if it is not required. */
		nip_dbg("packets_out(%u) not 0 or send_head is NULL, cancel probe0 timer",
			tp->packets_out);
		icsk->icsk_probes_out = 0;
		icsk->icsk_backoff = 0;
		return;
	}

	/* Err: 0 succeeded, -1 failed */
	icsk->icsk_probes_out++; /* Number of probes +1 */
	if (err <= 0) {
		if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
			icsk->icsk_backoff++;
		when = tcp_probe0_when(sk, TCP_RTO_MAX);
		nip_dbg("probe0 %s, probes_out=%u, probe0_base=%lu, icsk_backoff=%u, when=%lu",
			(!err ? "send ok" : "send fail"), icsk->icsk_probes_out,
			tcp_probe0_base(sk), icsk->icsk_backoff, when);
	} else {
		/* Makes the zero window probe timer time out faster */
		when = TCP_RESOURCE_PROBE_INTERVAL;
		nip_dbg("probe0 not sent due to local congestion, make timer out faster");
	}

	nip_dbg("restart probe0 timer, when=%lu, icsk_backoff=%u, probe_max=%u",
		when, icsk->icsk_backoff, TCP_RTO_MAX);
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
}