From abeef0770f76cd0eff8e5c6e50de0b280079d7f0 Mon Sep 17 00:00:00 2001
From: jiangheng <jiangheng14@huawei.com>
Date: Mon, 13 Mar 2023 19:25:42 +0800
Subject: [PATCH] fix tso small packet drop in kernel server

Rework the TSO branch of tcp_output() in src/core/tcp_out.c:

- Drop the helpers tcp_output_over() and tcp_output_seg(); the ACK flag
  handling, snd_nxt update and unacked-queue insertion are now written
  inline in the TSO loop.
- Add the TCP_INIT_SEGMENT() macro (and the global start_seqno that it
  writes) and use it in place of tcp_init_segment() to build the
  temporary segment that carries the chained pbufs.
- Chain consecutive unsent segments while they fit in the window, the
  chain holds fewer than GAZELLE_TCP_MAX_PBUF_CHAIN_LEN pbufs and their
  sequence numbers are contiguous. Chaining stops at a segment shorter
  than GAZELLE_TCP_MIN_TSO_SEG_LEN when it and the previous segment
  together are below 1460 bytes.
- Bound both the TSO and the non-TSO send loop with a send_pkt counter
  (limit 10) instead of the former "send_len < 0xffff" byte limit.
- Locate the tail of the unacked queue by walking pcb->unacked instead
  of reading pcb->last_unacked.

Add GAZELLE_TCP_MIN_TSO_SEG_LEN (256) to src/include/lwipopts.h.

---
NOTE(review): this copy was rebuilt from a dump in which line breaks and
indentation had been lost. Leading whitespace inside the hunks is a
reconstruction; apply with "git apply --ignore-whitespace" (or
"patch -l") and confirm the result against the target tree.

NOTE(review): points to confirm before merging, all visible in the hunks
below:
- When tcp_output_segment() fails, the TSO path returns while the pbufs
  of the collected segments are still chained together and
  pcb->need_tso_send is still set; verify the unsent queue is left in a
  usable state.
- The TSO path no longer assigns pcb->last_unacked; check for remaining
  readers of that field.
- start_seqno is a non-static global that this patch only writes.
- TCP_INIT_SEGMENT() carries two commented-out assignments that look
  like leftovers.
- 1460 and 10 are unnamed literals.

 src/core/tcp_out.c     | 254 +++++++++++++++++++++--------------------
 src/include/lwipopts.h |   2 +
 2 files changed, 130 insertions(+), 126 deletions(-)

diff --git a/src/core/tcp_out.c b/src/core/tcp_out.c
index 8a0d653..b1c317d 100644
--- a/src/core/tcp_out.c
+++ b/src/core/tcp_out.c
@@ -1312,60 +1312,33 @@ tcp_build_wnd_scale_option(u32_t *opts)
 #endif
 
 #if GAZELLE_ENABLE
-static struct tcp_seg *tcp_output_over(struct tcp_pcb *pcb, struct tcp_seg *seg, struct tcp_seg *useg)
-{
-  if (TCP_TCPLEN(seg) > 0) {
-    seg->next = NULL;
-    if (useg == NULL) {
-      pcb->unacked = seg;
-      pcb->last_unacked = seg;
-      useg = seg;
-    } else {
-      if (TCP_SEQ_LT(lwip_ntohl(seg->tcphdr->seqno), lwip_ntohl(useg->tcphdr->seqno))) {
-        /* add segment to before tail of unacked list, keeping the list sorted */
-        struct tcp_seg **cur_seg = &(pcb->unacked);
-        while (*cur_seg &&
-               TCP_SEQ_LT(lwip_ntohl((*cur_seg)->tcphdr->seqno), lwip_ntohl(seg->tcphdr->seqno))) {
-          cur_seg = &((*cur_seg)->next );
-        }
-        seg->next = (*cur_seg);
-        (*cur_seg) = seg;
-      } else {
-        /* add segment to tail of unacked list */
-        useg->next = seg;
-        useg = seg;
-        pcb->last_unacked = seg;
-      }
-    }
-  } else {
-    tcp_seg_free(seg);
-  }
-
-  return useg;
-}
-static err_t tcp_output_seg(struct tcp_pcb *pcb, struct tcp_seg *seg, struct netif *netif, u32_t snd_nxt)
-{
-  if (pcb->state != SYN_SENT) {
-    TCPH_SET_FLAG(seg->tcphdr, TCP_ACK);
-  }
-
-  err_t err = tcp_output_segment(seg, pcb, netif);
-  if (err != ERR_OK) {
-    /* segment could not be sent, for whatever reason */
-    tcp_set_flags(pcb, TF_NAGLEMEMERR);
-    return err;
-  }
-
-  if (pcb->state != SYN_SENT) {
-    tcp_clear_flags(pcb, TF_ACK_DELAY | TF_ACK_NOW);
-  }
-
-  if (TCP_SEQ_LT(pcb->snd_nxt, snd_nxt)) {
-    pcb->snd_nxt = snd_nxt;
-  }
-
-  return ERR_OK;
-}
+u32_t start_seqno = 0;
+#define TCP_INIT_SEGMENT(tem_seg, _pcb, _p, _hdrflags, _seqno, _optflags) \
+do { \
+    struct tcp_seg *_seg = tem_seg; \
+    u8_t _optlen; \
+    rte_prefetch2(_p); \
+    \
+    _optlen = LWIP_TCP_OPT_LENGTH_SEGMENT(_optflags, _pcb); \
+    _seg->flags = _optflags; \
+    _seg->next = NULL; \
+    _seg->p = _p; \
+    _seg->len = _p->tot_len - _optlen; \
+    /* build TCP header */ \
+    pbuf_add_header(_p, TCP_HLEN); \
+    _seg->tcphdr = (struct tcp_hdr *)_seg->p->payload; \
+    _seg->tcphdr->src = lwip_htons(_pcb->local_port); \
+    _seg->tcphdr->dest = lwip_htons(_pcb->remote_port); \
+    /* _seg->tcphdr->src = lwip_htons(_pcb->local_port); \ */ \
+    /* _seg->tcphdr->dest = lwip_htons(_pcb->remote_port); \ */ \
+    _seg->tcphdr->seqno = lwip_htonl(_seqno); \
+    \
+    if (start_seqno == 0) {\
+        start_seqno = _seqno; \
+    } \
+    TCPH_HDRLEN_FLAGS_SET(_seg->tcphdr, (5 + _optlen / 4), _hdrflags); \
+    _seg->tcphdr->urgp = 0; \
+} while(0)
 #endif
 /**
  * @ingroup tcp_raw
@@ -1471,97 +1444,127 @@ tcp_output(struct tcp_pcb *pcb)
   pcb->persist_backoff = 0;
 
   /* useg should point to last segment on unacked queue */
-  useg = pcb->last_unacked;
+  useg = pcb->unacked;
+  if (useg != NULL) {
+    for (; useg->next != NULL; useg = useg->next);
+  }
 
   /* data available and window allows it to be sent? */
-
-  u32_t send_len = 0;
 #if GAZELLE_ENABLE
   if ((get_eth_params_tx_ol() & DEV_TX_OFFLOAD_TCP_TSO) && pcb->need_tso_send) {
-    while(seg && send_len < 0xffff) {
-      /**
-       * 1) walk unsent queue, find all seg witch wait to send. chain buf in these segs.
-       * 2) create new segment, send and free new segment.
-       * 3) update snd_nxt, unacked queue, and unsent queue
-       */
-      struct tcp_seg *start_seg = seg;
-      struct pbuf *first_pbuf = NULL;
-      struct pbuf *pre_pbuf = NULL;
-      u8_t pbuf_chain_len = 0;
-      u32_t next_seqno = lwip_ntohl(seg->tcphdr->seqno);
-      while (seg != NULL && pbuf_chain_len < GAZELLE_TCP_MAX_PBUF_CHAIN_LEN) {
+    uint16_t send_pkt = 0;
+
+    do {
+        struct tcp_seg * start_seg = seg;
+        struct pbuf *new_pbuf = NULL;
+
+        struct pbuf *tmp_pbuf = NULL;
         u32_t seg_seqno = lwip_ntohl(seg->tcphdr->seqno);
-        if (seg_seqno - pcb->lastack + seg->len > wnd) {
-          if (first_pbuf)
-            break;
-          else
-            goto output_done;
+        u32_t last_seg_seqno = seg_seqno;
+
+        struct tcp_seg *last_seg = NULL;
+        u16_t last_seg_len = 0;
+        u8_t pbuf_chain_len = 0;
+        while (seg != NULL && seg_seqno - pcb->lastack + seg->len <= wnd && pbuf_chain_len < GAZELLE_TCP_MAX_PBUF_CHAIN_LEN) {
+            if (last_seg_len != 0 && (last_seg_len + seg->len < 1460) && seg->len < GAZELLE_TCP_MIN_TSO_SEG_LEN) {
+                break;
+            }
+
+            if ((tcp_do_output_nagle(pcb) == 0) &&
+                ((pcb->flags & (TF_NAGLEMEMERR | TF_FIN)) == 0)) {
+                break;
+            }
+            if (last_seg_seqno + last_seg_len == seg_seqno) {
+                pbuf_remove_header(seg->p, seg->p->tot_len - seg->len);
+                if (new_pbuf == NULL) {
+                    new_pbuf = seg->p;
+                    tmp_pbuf = new_pbuf;
+                } else {
+                    new_pbuf->tot_len += seg->p->len;
+                    tmp_pbuf->next = seg->p;
+                    tmp_pbuf = tmp_pbuf->next;
+                }
+            } else {
+                break;
+            }
+
+            last_seg = seg;
+            last_seg_len = seg->len;
+            last_seg_seqno = seg_seqno;
+            seg = seg->next;
+            seg_seqno = (seg != NULL) ? lwip_ntohl(seg->tcphdr->seqno) : seg_seqno;
+            pbuf_chain_len++;
         }
 
-        if ((tcp_do_output_nagle(pcb) == 0) && ((pcb->flags & (TF_NAGLEMEMERR | TF_FIN)) == 0)) {
-          if (first_pbuf)
-            break;
-          else
-            goto output_done;
+        /* nothing was chained: the first segment is outside the window or held back by Nagle */
+        if (new_pbuf == NULL) {
+            goto end_loop;
         }
 
-        if (seg->len < TCP_MSS || next_seqno != seg_seqno || pbuf_chain_len >= GAZELLE_TCP_MAX_PBUF_CHAIN_LEN) {
-          break;
-        }
-        if (first_pbuf == NULL && (seg->next == NULL || seg->next->len < TCP_MSS)) {
-          break;
-        }
+        struct tcp_seg new_seg;
+        TCP_INIT_SEGMENT(&new_seg, pcb, new_pbuf, 0, lwip_ntohl(start_seg->tcphdr->seqno), 0);
 
-        pbuf_remove_header(seg->p, seg->p->tot_len - seg->len);
-        if (first_pbuf == NULL) {
-          first_pbuf = seg->p;
-        } else {
-          first_pbuf->tot_len += seg->p->len;
-          pre_pbuf->next = seg->p;
+        if (pcb->state != SYN_SENT) {
+            TCPH_SET_FLAG(new_seg.tcphdr, TCP_ACK);
         }
 
-        send_len += seg->len;
-        pre_pbuf = seg->p;
-        next_seqno = seg_seqno + TCP_TCPLEN(seg);
-        seg = seg->next;
-        pcb->unsent = seg;
-        pbuf_chain_len++;
-      }
-
-      if (first_pbuf == NULL) {
-        err = tcp_output_seg(pcb, seg, netif, next_seqno + seg->len);
+        err = tcp_output_segment(&new_seg, pcb, netif);
         if (err != ERR_OK) {
-          if (pcb->unsent == NULL)
-            pcb->last_unsent = NULL;
-          pcb->need_tso_send = 0;
-          return err;
+            /* segment could not be sent, for whatever reason */
+            tcp_set_flags(pcb, TF_NAGLEMEMERR);
+            return err;
         }
-        pcb->unsent = seg->next;
-        useg = tcp_output_over(pcb, seg, useg);
-        seg = pcb->unsent;
-        continue;
-      }
-
-      struct tcp_seg new_seg;
-      tcp_init_segment(&new_seg, pcb, first_pbuf, 0, lwip_ntohl(start_seg->tcphdr->seqno), 0);
 
-      err = tcp_output_seg(pcb, &new_seg, netif, next_seqno);
+        pcb->unsent = last_seg->next;
+        if (pcb->state != SYN_SENT) {
+            tcp_clear_flags(pcb, TF_ACK_DELAY | TF_ACK_NOW);
+        }
 
-      for (u32_t i = 0; i < pbuf_chain_len; i++) {
-        struct tcp_seg *next_seg = start_seg->next;
-        start_seg->p->next = NULL;
-        useg = tcp_output_over(pcb, start_seg, useg);
-        start_seg = next_seg;
-      }
+        snd_nxt = last_seg_seqno + TCP_TCPLEN(last_seg);
+        if (TCP_SEQ_LT(pcb->snd_nxt, snd_nxt)) {
+            pcb->snd_nxt = snd_nxt;
+        }
 
-      pbuf_remove_header(new_seg.p, new_seg.p->tot_len - new_seg.len - TCPH_HDRLEN_BYTES(new_seg.tcphdr));
-      new_seg.p->tot_len = new_seg.p->len;
-    }
-    pcb->need_tso_send = 0;
+        pbuf_remove_header(new_seg.p, new_seg.p->tot_len - new_seg.len - TCP_HLEN);
+        new_seg.p->tot_len = new_seg.p->len;
+
+        for (int start = pbuf_chain_len; start > 0; start--) {
+            struct tcp_seg *tmp_seg = start_seg;
+            start_seg = start_seg->next;
+            tmp_seg->p->next = NULL;
+            if (TCP_TCPLEN(tmp_seg) > 0) {
+                tmp_seg->next = NULL;
+                if (pcb->unacked == NULL) {
+                    pcb->unacked = tmp_seg;
+                    useg = tmp_seg;
+                } else {
+                    if (TCP_SEQ_LT(lwip_ntohl(tmp_seg->tcphdr->seqno), lwip_ntohl(useg->tcphdr->seqno))) {
+                        /* add segment to before tail of unacked list, keeping the list sorted */
+                        struct tcp_seg **cur_seg = &(pcb->unacked);
+                        while (*cur_seg &&
+                               TCP_SEQ_LT(lwip_ntohl((*cur_seg)->tcphdr->seqno), lwip_ntohl(tmp_seg->tcphdr->seqno))) {
+                            cur_seg = &((*cur_seg)->next );
+                        }
+                        tmp_seg->next = (*cur_seg);
+                        (*cur_seg) = tmp_seg;
+                    } else {
+                        /* add segment to tail of unacked list */
+                        useg->next = tmp_seg;
+                        useg = useg->next;
+                    }
+                }
+            } else {
+                tcp_seg_free(tmp_seg);
+            }
+        }
+    } while(seg != NULL && lwip_ntohl(seg->tcphdr->seqno) - pcb->lastack + seg->len <= wnd && send_pkt++ < 10);
+end_loop:
+    pcb->need_tso_send = 0;
   } else
 #endif
   {
-    while (seg != NULL && send_len < 0xffff &&
+    uint16_t send_pkt = 0;
+    while (seg != NULL && send_pkt++ < 10 &&
            lwip_ntohl(seg->tcphdr->seqno) - pcb->lastack + seg->len <= wnd) {
       LWIP_ASSERT("RST not expected here!",
                   (TCPH_FLAGS(seg->tcphdr) & TCP_RST) == 0);
@@ -1576,7 +1579,6 @@ tcp_output(struct tcp_pcb *pcb)
           ((pcb->flags & (TF_NAGLEMEMERR | TF_FIN)) == 0)) {
         break;
       }
-      send_len += seg->len;
 #if TCP_CWND_DEBUG
       LWIP_DEBUGF(TCP_CWND_DEBUG, ("tcp_output: snd_wnd %"TCPWNDSIZE_F", cwnd %"TCPWNDSIZE_F", wnd %"U32_F", effwnd %"U32_F", seq %"U32_F", ack %"U32_F", i %"S16_F"\n",
                                    pcb->snd_wnd, pcb->cwnd, wnd,
diff --git a/src/include/lwipopts.h b/src/include/lwipopts.h
index 742b4a9..0d2a6d9 100644
--- a/src/include/lwipopts.h
+++ b/src/include/lwipopts.h
@@ -55,6 +55,8 @@
 
 #define GAZELLE_TCP_MAX_PBUF_CHAIN_LEN 40
 
+#define GAZELLE_TCP_MIN_TSO_SEG_LEN 256
+
 /*
    ----------------------------------
    ---------- NIC offloads ----------
-- 
2.33.0
