/*
 *
 * Copyright 2015 gRPC authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

#include <grpc/support/port_platform.h>

#include "src/core/lib/iomgr/port.h"

#ifdef GRPC_POSIX_SOCKET_TCP

#include "src/core/lib/iomgr/network_status_tracker.h"
#include "src/core/lib/iomgr/tcp_posix.h"

#include <errno.h>
#include <limits.h>
#include <netinet/in.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>

#include <grpc/slice.h>
#include <grpc/support/alloc.h>
#include <grpc/support/log.h>
#include <grpc/support/string_util.h>
#include <grpc/support/sync.h>
#include <grpc/support/time.h>

#include "src/core/lib/channel/channel_args.h"
#include "src/core/lib/debug/stats.h"
#include "src/core/lib/debug/trace.h"
#include "src/core/lib/gpr/string.h"
#include "src/core/lib/gpr/useful.h"
#include "src/core/lib/iomgr/buffer_list.h"
#include "src/core/lib/iomgr/ev_posix.h"
#include "src/core/lib/iomgr/executor.h"
#include "src/core/lib/profiling/timers.h"
#include "src/core/lib/slice/slice_internal.h"
#include "src/core/lib/slice/slice_string_helpers.h"

#ifdef GRPC_HAVE_MSG_NOSIGNAL
#define SENDMSG_FLAGS MSG_NOSIGNAL
#else
#define SENDMSG_FLAGS 0
#endif

#ifdef GRPC_MSG_IOVLEN_TYPE
typedef GRPC_MSG_IOVLEN_TYPE msg_iovlen_type;
#else
typedef size_t msg_iovlen_type;
#endif

extern grpc_core::TraceFlag grpc_tcp_trace;

namespace {
struct grpc_tcp {
  grpc_endpoint base;
  grpc_fd* em_fd;
  int fd;
  /* Used by the endpoint read function to distinguish the very first read call
   * from the rest */
  bool is_first_read;
  double target_length;
  double bytes_read_this_round;
  gpr_refcount refcount;
  gpr_atm shutdown_count;

  int min_read_chunk_size;
  int max_read_chunk_size;

  /* garbage after the last read */
  grpc_slice_buffer last_read_buffer;

  grpc_slice_buffer* incoming_buffer;
  grpc_slice_buffer* outgoing_buffer;
  /** byte within outgoing_buffer->slices[0] to write next */
  size_t outgoing_byte_idx;

  grpc_closure* read_cb;
  grpc_closure* write_cb;
  grpc_closure* release_fd_cb;
  int* release_fd;

  grpc_closure read_done_closure;
  grpc_closure write_done_closure;
  grpc_closure error_closure;

  char* peer_string;

  grpc_resource_user* resource_user;
  grpc_resource_user_slice_allocator slice_allocator;

  grpc_core::TracedBuffer* tb_head; /* List of traced buffers */
  gpr_mu tb_mu; /* Lock for access to list of traced buffers */

  /* grpc_endpoint_write takes an argument which if non-null means that the
   * transport layer wants the TCP layer to collect timestamps for this write.
   * This arg is forwarded to the timestamps callback function when the ACK
   * timestamp is received from the kernel. This arg is a (void *) which allows
   * users of this API to pass in a pointer to any kind of structure. This
   * structure could actually be a tag or any book-keeping object that the user
   * can use to distinguish between different traced writes. The only
   * requirement from the TCP endpoint layer is that this arg should be non-null
   * if the user wants timestamps for the write. */
  void* outgoing_buffer_arg;
  /* A counter which starts at 0. It is initialized the first time the socket
   * options for collecting timestamps are set, and is incremented with each
   * byte sent. */
  int bytes_counter;
  bool socket_ts_enabled; /* True if timestamping options are set on the socket
                           */
  gpr_atm
      stop_error_notification; /* Set to 1 if we do not want to be notified on
                                  errors anymore */
};

struct backup_poller {
  gpr_mu* pollset_mu;
  grpc_closure run_poller;
};

}  // namespace

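/* The backup poller's pollset lives in the same allocation, immediately after
 * the backup_poller struct (see the gpr_zalloc in cover_self below); this
 * macro recovers it. */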
#define BACKUP_POLLER_POLLSET(b) ((grpc_pollset*)((b) + 1))

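/* Number of uncovered write notifications outstanding; the last remaining
 * count doubles as the reference that keeps the backup poller polling (see
 * the cas in run_poller). */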
static gpr_atm g_uncovered_notifications_pending;
static gpr_atm g_backup_poller; /* backup_poller* */

static void tcp_handle_read(void* arg /* grpc_tcp */, grpc_error* error);
static void tcp_handle_write(void* arg /* grpc_tcp */, grpc_error* error);
static void tcp_drop_uncovered_then_handle_write(void* arg /* grpc_tcp */,
                                                 grpc_error* error);

static void done_poller(void* bp, grpc_error* error_ignored) {
  backup_poller* p = static_cast<backup_poller*>(bp);
  if (grpc_tcp_trace.enabled()) {
    gpr_log(GPR_INFO, "BACKUP_POLLER:%p destroy", p);
  }
  grpc_pollset_destroy(BACKUP_POLLER_POLLSET(p));
  gpr_free(p);
}

static void run_poller(void* bp, grpc_error* error_ignored) {
  backup_poller* p = static_cast<backup_poller*>(bp);
  if (grpc_tcp_trace.enabled()) {
    gpr_log(GPR_INFO, "BACKUP_POLLER:%p run", p);
  }
  gpr_mu_lock(p->pollset_mu);
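  /* Poll for up to 10 seconds before yielding and rescheduling ourselves. */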
  grpc_millis deadline = grpc_core::ExecCtx::Get()->Now() + 10 * GPR_MS_PER_SEC;
  GRPC_STATS_INC_TCP_BACKUP_POLLER_POLLS();
  GRPC_LOG_IF_ERROR(
      "backup_poller:pollset_work",
      grpc_pollset_work(BACKUP_POLLER_POLLSET(p), nullptr, deadline));
  gpr_mu_unlock(p->pollset_mu);
  /* last "uncovered" notification is the ref that keeps us polling, if we get
   * there try a cas to release it */
  if (gpr_atm_no_barrier_load(&g_uncovered_notifications_pending) == 1 &&
      gpr_atm_full_cas(&g_uncovered_notifications_pending, 1, 0)) {
    gpr_mu_lock(p->pollset_mu);
    bool cas_ok = gpr_atm_full_cas(&g_backup_poller, (gpr_atm)p, 0);
    if (grpc_tcp_trace.enabled()) {
      gpr_log(GPR_INFO, "BACKUP_POLLER:%p done cas_ok=%d", p, cas_ok);
    }
    gpr_mu_unlock(p->pollset_mu);
    if (grpc_tcp_trace.enabled()) {
      gpr_log(GPR_INFO, "BACKUP_POLLER:%p shutdown", p);
    }
    grpc_pollset_shutdown(BACKUP_POLLER_POLLSET(p),
                          GRPC_CLOSURE_INIT(&p->run_poller, done_poller, p,
                                            grpc_schedule_on_exec_ctx));
  } else {
    if (grpc_tcp_trace.enabled()) {
      gpr_log(GPR_INFO, "BACKUP_POLLER:%p reschedule", p);
    }
    GRPC_CLOSURE_SCHED(&p->run_poller, GRPC_ERROR_NONE);
  }
}

static void drop_uncovered(grpc_tcp* tcp) {
  backup_poller* p = (backup_poller*)gpr_atm_acq_load(&g_backup_poller);
  gpr_atm old_count =
      gpr_atm_no_barrier_fetch_add(&g_uncovered_notifications_pending, -1);
  if (grpc_tcp_trace.enabled()) {
    gpr_log(GPR_INFO, "BACKUP_POLLER:%p uncover cnt %d->%d", p,
            static_cast<int>(old_count), static_cast<int>(old_count) - 1);
  }
  GPR_ASSERT(old_count != 1);
}

static void cover_self(grpc_tcp* tcp) {
  backup_poller* p;
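  /* Add 2: one count covers the write notification being handed out; the
   * other either becomes the reference that keeps the backup poller polling
   * (when this call creates the poller) or is dropped again at the bottom of
   * this function once our fd has been added. */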
  gpr_atm old_count =
      gpr_atm_no_barrier_fetch_add(&g_uncovered_notifications_pending, 2);
  if (grpc_tcp_trace.enabled()) {
    gpr_log(GPR_INFO, "BACKUP_POLLER: cover cnt %d->%d",
            static_cast<int>(old_count), 2 + static_cast<int>(old_count));
  }
  if (old_count == 0) {
    GRPC_STATS_INC_TCP_BACKUP_POLLERS_CREATED();
    p = static_cast<backup_poller*>(
        gpr_zalloc(sizeof(*p) + grpc_pollset_size()));
    if (grpc_tcp_trace.enabled()) {
      gpr_log(GPR_INFO, "BACKUP_POLLER:%p create", p);
    }
    grpc_pollset_init(BACKUP_POLLER_POLLSET(p), &p->pollset_mu);
    gpr_atm_rel_store(&g_backup_poller, (gpr_atm)p);
    GRPC_CLOSURE_SCHED(
        GRPC_CLOSURE_INIT(&p->run_poller, run_poller, p,
                          grpc_executor_scheduler(GRPC_EXECUTOR_LONG)),
        GRPC_ERROR_NONE);
  } else {
    while ((p = (backup_poller*)gpr_atm_acq_load(&g_backup_poller)) ==
           nullptr) {
      // spin waiting for backup poller
    }
  }
  if (grpc_tcp_trace.enabled()) {
    gpr_log(GPR_INFO, "BACKUP_POLLER:%p add %p", p, tcp);
  }
  grpc_pollset_add_fd(BACKUP_POLLER_POLLSET(p), tcp->em_fd);
  if (old_count != 0) {
    drop_uncovered(tcp);
  }
}

static void notify_on_read(grpc_tcp* tcp) {
  if (grpc_tcp_trace.enabled()) {
    gpr_log(GPR_INFO, "TCP:%p notify_on_read", tcp);
  }
  GRPC_CLOSURE_INIT(&tcp->read_done_closure, tcp_handle_read, tcp,
                    grpc_schedule_on_exec_ctx);
  grpc_fd_notify_on_read(tcp->em_fd, &tcp->read_done_closure);
}

static void notify_on_write(grpc_tcp* tcp) {
  if (grpc_tcp_trace.enabled()) {
    gpr_log(GPR_INFO, "TCP:%p notify_on_write", tcp);
  }
  cover_self(tcp);
  GRPC_CLOSURE_INIT(&tcp->write_done_closure,
                    tcp_drop_uncovered_then_handle_write, tcp,
                    grpc_schedule_on_exec_ctx);
  grpc_fd_notify_on_write(tcp->em_fd, &tcp->write_done_closure);
}

static void tcp_drop_uncovered_then_handle_write(void* arg, grpc_error* error) {
  if (grpc_tcp_trace.enabled()) {
    gpr_log(GPR_INFO, "TCP:%p got_write: %s", arg, grpc_error_string(error));
  }
  drop_uncovered(static_cast<grpc_tcp*>(arg));
  tcp_handle_write(arg, error);
}

static void add_to_estimate(grpc_tcp* tcp, size_t bytes) {
  tcp->bytes_read_this_round += static_cast<double>(bytes);
}

static void finish_estimate(grpc_tcp* tcp) {
  /* If we read >80% of the target buffer in one read loop, increase the size
     of the target buffer to either the amount read, or twice its previous
     value */
  if (tcp->bytes_read_this_round > tcp->target_length * 0.8) {
    tcp->target_length =
        GPR_MAX(2 * tcp->target_length, tcp->bytes_read_this_round);
  } else {
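    /* Otherwise decay the target towards the bytes actually read, using an
     * exponential moving average. */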
    tcp->target_length =
        0.99 * tcp->target_length + 0.01 * tcp->bytes_read_this_round;
  }
  tcp->bytes_read_this_round = 0;
}

static size_t get_target_read_size(grpc_tcp* tcp) {
  grpc_resource_quota* rq = grpc_resource_user_quota(tcp->resource_user);
  double pressure = grpc_resource_quota_get_memory_pressure(rq);
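  /* Scale the target down linearly once memory pressure exceeds 80%, then
   * clamp it to [min_read_chunk_size, max_read_chunk_size] and round up to a
   * 256-byte boundary. */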
  double target =
      tcp->target_length * (pressure > 0.8 ? (1.0 - pressure) / 0.2 : 1.0);
  size_t sz = ((static_cast<size_t> GPR_CLAMP(target, tcp->min_read_chunk_size,
                                              tcp->max_read_chunk_size)) +
               255) &
              ~static_cast<size_t>(255);
  /* don't use more than 1/16th of the overall resource quota for a single read
   * alloc */
  size_t rqmax = grpc_resource_quota_peek_size(rq);
  if (sz > rqmax / 16 && rqmax > 1024) {
    sz = rqmax / 16;
  }
  return sz;
}

static grpc_error* tcp_annotate_error(grpc_error* src_error, grpc_tcp* tcp) {
  return grpc_error_set_str(
      grpc_error_set_int(
          grpc_error_set_int(src_error, GRPC_ERROR_INT_FD, tcp->fd),
          /* All tcp errors are marked with UNAVAILABLE so that application may
           * choose to retry. */
          GRPC_ERROR_INT_GRPC_STATUS, GRPC_STATUS_UNAVAILABLE),
      GRPC_ERROR_STR_TARGET_ADDRESS,
      grpc_slice_from_copied_string(tcp->peer_string));
}

static void tcp_handle_read(void* arg /* grpc_tcp */, grpc_error* error);
static void tcp_handle_write(void* arg /* grpc_tcp */, grpc_error* error);

static void tcp_shutdown(grpc_endpoint* ep, grpc_error* why) {
  grpc_tcp* tcp = reinterpret_cast<grpc_tcp*>(ep);
  grpc_fd_shutdown(tcp->em_fd, why);
  grpc_resource_user_shutdown(tcp->resource_user);
}

static void tcp_free(grpc_tcp* tcp) {
  grpc_fd_orphan(tcp->em_fd, tcp->release_fd_cb, tcp->release_fd,
                 "tcp_unref_orphan");
  grpc_slice_buffer_destroy_internal(&tcp->last_read_buffer);
  grpc_resource_user_unref(tcp->resource_user);
  gpr_free(tcp->peer_string);
  gpr_mu_destroy(&tcp->tb_mu);
  gpr_free(tcp);
}

#ifndef NDEBUG
#define TCP_UNREF(tcp, reason) tcp_unref((tcp), (reason), __FILE__, __LINE__)
#define TCP_REF(tcp, reason) tcp_ref((tcp), (reason), __FILE__, __LINE__)
static void tcp_unref(grpc_tcp* tcp, const char* reason, const char* file,
                      int line) {
  if (grpc_tcp_trace.enabled()) {
    gpr_atm val = gpr_atm_no_barrier_load(&tcp->refcount.count);
    gpr_log(file, line, GPR_LOG_SEVERITY_DEBUG,
            "TCP unref %p : %s %" PRIdPTR " -> %" PRIdPTR, tcp, reason, val,
            val - 1);
  }
  if (gpr_unref(&tcp->refcount)) {
    tcp_free(tcp);
  }
}

static void tcp_ref(grpc_tcp* tcp, const char* reason, const char* file,
                    int line) {
  if (grpc_tcp_trace.enabled()) {
    gpr_atm val = gpr_atm_no_barrier_load(&tcp->refcount.count);
    gpr_log(file, line, GPR_LOG_SEVERITY_DEBUG,
            "TCP   ref %p : %s %" PRIdPTR " -> %" PRIdPTR, tcp, reason, val,
            val + 1);
  }
  gpr_ref(&tcp->refcount);
}
#else
#define TCP_UNREF(tcp, reason) tcp_unref((tcp))
#define TCP_REF(tcp, reason) tcp_ref((tcp))
static void tcp_unref(grpc_tcp* tcp) {
  if (gpr_unref(&tcp->refcount)) {
    tcp_free(tcp);
  }
}

static void tcp_ref(grpc_tcp* tcp) { gpr_ref(&tcp->refcount); }
#endif

static void tcp_destroy(grpc_endpoint* ep) {
  grpc_network_status_unregister_endpoint(ep);
  grpc_tcp* tcp = reinterpret_cast<grpc_tcp*>(ep);
  grpc_slice_buffer_reset_and_unref_internal(&tcp->last_read_buffer);
  if (grpc_event_engine_can_track_errors()) {
    gpr_atm_no_barrier_store(&tcp->stop_error_notification, true);
    grpc_fd_set_error(tcp->em_fd);
  }
  TCP_UNREF(tcp, "destroy");
}

static void call_read_cb(grpc_tcp* tcp, grpc_error* error) {
  grpc_closure* cb = tcp->read_cb;

  if (grpc_tcp_trace.enabled()) {
    gpr_log(GPR_INFO, "TCP:%p call_cb %p %p:%p", tcp, cb, cb->cb, cb->cb_arg);
    size_t i;
    const char* str = grpc_error_string(error);
    gpr_log(GPR_INFO, "read: error=%s", str);

    for (i = 0; i < tcp->incoming_buffer->count; i++) {
      char* dump = grpc_dump_slice(tcp->incoming_buffer->slices[i],
                                   GPR_DUMP_HEX | GPR_DUMP_ASCII);
      gpr_log(GPR_INFO, "READ %p (peer=%s): %s", tcp, tcp->peer_string, dump);
      gpr_free(dump);
    }
  }

  tcp->read_cb = nullptr;
  tcp->incoming_buffer = nullptr;
  GRPC_CLOSURE_SCHED(cb, error);
}

#define MAX_READ_IOVEC 4
static void tcp_do_read(grpc_tcp* tcp) {
  GPR_TIMER_SCOPE("tcp_do_read", 0);
  struct msghdr msg;
  struct iovec iov[MAX_READ_IOVEC];
  ssize_t read_bytes;
  size_t i;

  GPR_ASSERT(tcp->incoming_buffer->count <= MAX_READ_IOVEC);

  for (i = 0; i < tcp->incoming_buffer->count; i++) {
    iov[i].iov_base = GRPC_SLICE_START_PTR(tcp->incoming_buffer->slices[i]);
    iov[i].iov_len = GRPC_SLICE_LENGTH(tcp->incoming_buffer->slices[i]);
  }

  msg.msg_name = nullptr;
  msg.msg_namelen = 0;
  msg.msg_iov = iov;
  msg.msg_iovlen = static_cast<msg_iovlen_type>(tcp->incoming_buffer->count);
  msg.msg_control = nullptr;
  msg.msg_controllen = 0;
  msg.msg_flags = 0;

  GRPC_STATS_INC_TCP_READ_OFFER(tcp->incoming_buffer->length);
  GRPC_STATS_INC_TCP_READ_OFFER_IOV_SIZE(tcp->incoming_buffer->count);

  do {
    GPR_TIMER_SCOPE("recvmsg", 0);
    GRPC_STATS_INC_SYSCALL_READ();
    read_bytes = recvmsg(tcp->fd, &msg, 0);
  } while (read_bytes < 0 && errno == EINTR);

  if (read_bytes < 0) {
    /* NB: After calling call_read_cb a parallel call of the read handler may
     * be running. */
    if (errno == EAGAIN) {
      finish_estimate(tcp);
      /* We've consumed the edge, request a new one */
      notify_on_read(tcp);
    } else {
      grpc_slice_buffer_reset_and_unref_internal(tcp->incoming_buffer);
      call_read_cb(tcp,
                   tcp_annotate_error(GRPC_OS_ERROR(errno, "recvmsg"), tcp));
      TCP_UNREF(tcp, "read");
    }
  } else if (read_bytes == 0) {
    /* 0 read size ==> end of stream */
    grpc_slice_buffer_reset_and_unref_internal(tcp->incoming_buffer);
    call_read_cb(
        tcp, tcp_annotate_error(
                 GRPC_ERROR_CREATE_FROM_STATIC_STRING("Socket closed"), tcp));
    TCP_UNREF(tcp, "read");
  } else {
    GRPC_STATS_INC_TCP_READ_SIZE(read_bytes);
    add_to_estimate(tcp, static_cast<size_t>(read_bytes));
    GPR_ASSERT((size_t)read_bytes <= tcp->incoming_buffer->length);
    if (static_cast<size_t>(read_bytes) < tcp->incoming_buffer->length) {
      grpc_slice_buffer_trim_end(
          tcp->incoming_buffer,
          tcp->incoming_buffer->length - static_cast<size_t>(read_bytes),
          &tcp->last_read_buffer);
    }
    GPR_ASSERT((size_t)read_bytes == tcp->incoming_buffer->length);
    call_read_cb(tcp, GRPC_ERROR_NONE);
    TCP_UNREF(tcp, "read");
  }
}

static void tcp_read_allocation_done(void* tcpp, grpc_error* error) {
  grpc_tcp* tcp = static_cast<grpc_tcp*>(tcpp);
  if (grpc_tcp_trace.enabled()) {
    gpr_log(GPR_INFO, "TCP:%p read_allocation_done: %s", tcp,
            grpc_error_string(error));
  }
  if (error != GRPC_ERROR_NONE) {
    grpc_slice_buffer_reset_and_unref_internal(tcp->incoming_buffer);
    grpc_slice_buffer_reset_and_unref_internal(&tcp->last_read_buffer);
    call_read_cb(tcp, GRPC_ERROR_REF(error));
    TCP_UNREF(tcp, "read");
  } else {
    tcp_do_read(tcp);
  }
}

static void tcp_continue_read(grpc_tcp* tcp) {
  size_t target_read_size = get_target_read_size(tcp);
  if (tcp->incoming_buffer->length < target_read_size &&
      tcp->incoming_buffer->count < MAX_READ_IOVEC) {
    if (grpc_tcp_trace.enabled()) {
      gpr_log(GPR_INFO, "TCP:%p alloc_slices", tcp);
    }
    grpc_resource_user_alloc_slices(&tcp->slice_allocator, target_read_size, 1,
                                    tcp->incoming_buffer);
  } else {
    if (grpc_tcp_trace.enabled()) {
      gpr_log(GPR_INFO, "TCP:%p do_read", tcp);
    }
    tcp_do_read(tcp);
  }
}

static void tcp_handle_read(void* arg /* grpc_tcp */, grpc_error* error) {
  grpc_tcp* tcp = static_cast<grpc_tcp*>(arg);
  if (grpc_tcp_trace.enabled()) {
    gpr_log(GPR_INFO, "TCP:%p got_read: %s", tcp, grpc_error_string(error));
  }

  if (error != GRPC_ERROR_NONE) {
    grpc_slice_buffer_reset_and_unref_internal(tcp->incoming_buffer);
    grpc_slice_buffer_reset_and_unref_internal(&tcp->last_read_buffer);
    call_read_cb(tcp, GRPC_ERROR_REF(error));
    TCP_UNREF(tcp, "read");
  } else {
    tcp_continue_read(tcp);
  }
}

static void tcp_read(grpc_endpoint* ep, grpc_slice_buffer* incoming_buffer,
                     grpc_closure* cb) {
  grpc_tcp* tcp = reinterpret_cast<grpc_tcp*>(ep);
  GPR_ASSERT(tcp->read_cb == nullptr);
  tcp->read_cb = cb;
  tcp->incoming_buffer = incoming_buffer;
  grpc_slice_buffer_reset_and_unref_internal(incoming_buffer);
  grpc_slice_buffer_swap(incoming_buffer, &tcp->last_read_buffer);
  TCP_REF(tcp, "read");
  if (tcp->is_first_read) {
    /* Endpoint read called for the very first time. Register read callback with
     * the polling engine */
    tcp->is_first_read = false;
    notify_on_read(tcp);
  } else {
    /* Not the first time. We may or may not have more bytes available. In any
     * case call tcp->read_done_closure (i.e tcp_handle_read()) which does the
     * right thing (i.e calls tcp_do_read() which either reads the available
     * bytes or calls notify_on_read() to be notified when new bytes become
     * available */
    GRPC_CLOSURE_SCHED(&tcp->read_done_closure, GRPC_ERROR_NONE);
  }
}

/* A wrapper around sendmsg. It sends \a msg over \a fd and returns the number
 * of bytes sent. */
ssize_t tcp_send(int fd, const struct msghdr* msg) {
  GPR_TIMER_SCOPE("sendmsg", 1);
  ssize_t sent_length;
  do {
    /* TODO(klempner): Cork if this is a partial write */
    GRPC_STATS_INC_SYSCALL_WRITE();
    sent_length = sendmsg(fd, msg, SENDMSG_FLAGS);
  } while (sent_length < 0 && errno == EINTR);
  return sent_length;
}

/** This is to be called if outgoing_buffer_arg is not null. On linux platforms,
 * this will call sendmsg with socket options set to collect timestamps inside
 * the kernel. On return, sent_length is set to the return value of the sendmsg
 * call. Returns false if setting the socket options failed. This is not
 * implemented for non-linux platforms currently, and crashes out.
 */
static bool tcp_write_with_timestamps(grpc_tcp* tcp, struct msghdr* msg,
                                      size_t sending_length,
                                      ssize_t* sent_length, grpc_error** error);

/** The callback function to be invoked when we get an error on the socket. */
static void tcp_handle_error(void* arg /* grpc_tcp */, grpc_error* error);

#ifdef GRPC_LINUX_ERRQUEUE
static bool tcp_write_with_timestamps(grpc_tcp* tcp, struct msghdr* msg,
                                      size_t sending_length,
                                      ssize_t* sent_length,
                                      grpc_error** error) {
  if (!tcp->socket_ts_enabled) {
    uint32_t opt = grpc_core::kTimestampingSocketOptions;
    if (setsockopt(tcp->fd, SOL_SOCKET, SO_TIMESTAMPING,
                   static_cast<void*>(&opt), sizeof(opt)) != 0) {
      *error = tcp_annotate_error(GRPC_OS_ERROR(errno, "setsockopt"), tcp);
      grpc_slice_buffer_reset_and_unref_internal(tcp->outgoing_buffer);
      if (grpc_tcp_trace.enabled()) {
        gpr_log(GPR_ERROR, "Failed to set timestamping options on the socket.");
      }
      return false;
    }
    tcp->bytes_counter = -1;
    tcp->socket_ts_enabled = true;
  }
  /* Set control message to indicate that you want timestamps. */
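  /* The union guarantees that cmsg_buf is suitably aligned for cmsghdr
   * access. */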
  union {
    char cmsg_buf[CMSG_SPACE(sizeof(uint32_t))];
    struct cmsghdr align;
  } u;
  cmsghdr* cmsg = reinterpret_cast<cmsghdr*>(u.cmsg_buf);
  cmsg->cmsg_level = SOL_SOCKET;
  cmsg->cmsg_type = SO_TIMESTAMPING;
  cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t));
  *reinterpret_cast<int*>(CMSG_DATA(cmsg)) =
      grpc_core::kTimestampingRecordingOptions;
  msg->msg_control = u.cmsg_buf;
  msg->msg_controllen = CMSG_SPACE(sizeof(uint32_t));

  /* If there was an error on sendmsg the logic in tcp_flush will handle it. */
  ssize_t length = tcp_send(tcp->fd, msg);
  *sent_length = length;
  /* Only save timestamps if all the bytes were taken by sendmsg. */
  if (sending_length == static_cast<size_t>(length)) {
    gpr_mu_lock(&tcp->tb_mu);
    grpc_core::TracedBuffer::AddNewEntry(
        &tcp->tb_head, static_cast<int>(tcp->bytes_counter + length),
        tcp->outgoing_buffer_arg);
    gpr_mu_unlock(&tcp->tb_mu);
    tcp->outgoing_buffer_arg = nullptr;
  }
  return true;
}

/** Reads \a cmsg to derive timestamps from the control messages. If a valid
 * timestamp is found, the traced buffer list is updated with this timestamp.
 * The caller of this function should be looping on the control messages found
 * in \a msg. \a cmsg should point to the control message that the caller wants
 * processed.
 * On return, a pointer to a control message is returned. On the next iteration,
 * CMSG_NXTHDR(msg, ret_val) should be passed as \a cmsg. */
struct cmsghdr* process_timestamp(grpc_tcp* tcp, msghdr* msg,
                                  struct cmsghdr* cmsg) {
  auto next_cmsg = CMSG_NXTHDR(msg, cmsg);
  if (next_cmsg == nullptr) {
    if (grpc_tcp_trace.enabled()) {
      gpr_log(GPR_ERROR, "Received timestamp without extended error");
    }
    return cmsg;
  }

  if (!(next_cmsg->cmsg_level == SOL_IP || next_cmsg->cmsg_level == SOL_IPV6) ||
      !(next_cmsg->cmsg_type == IP_RECVERR ||
        next_cmsg->cmsg_type == IPV6_RECVERR)) {
    if (grpc_tcp_trace.enabled()) {
      gpr_log(GPR_ERROR, "Unexpected control message");
    }
    return cmsg;
  }

  auto tss =
      reinterpret_cast<struct grpc_core::scm_timestamping*>(CMSG_DATA(cmsg));
  auto serr = reinterpret_cast<struct sock_extended_err*>(CMSG_DATA(next_cmsg));
  if (serr->ee_errno != ENOMSG ||
      serr->ee_origin != SO_EE_ORIGIN_TIMESTAMPING) {
    gpr_log(GPR_ERROR, "Unexpected control message");
    return cmsg;
  }
  /* The error handling can potentially be done on another thread so we need
   * to protect the traced buffer list. A lock free list might be better. Using
   * a simple mutex for now. */
  gpr_mu_lock(&tcp->tb_mu);
  grpc_core::TracedBuffer::ProcessTimestamp(&tcp->tb_head, serr, tss);
  gpr_mu_unlock(&tcp->tb_mu);
  return next_cmsg;
}

/** For linux platforms, reads the socket's error queue and processes error
 * messages from the queue. Returns true if all the errors processed were
 * timestamps. Returns false if any of the errors were not timestamps. For
 * non-linux platforms, error processing is not used/enabled currently.
 */
static bool process_errors(grpc_tcp* tcp) {
  while (true) {
    struct iovec iov;
    iov.iov_base = nullptr;
    iov.iov_len = 0;
    struct msghdr msg;
    msg.msg_name = nullptr;
    msg.msg_namelen = 0;
    msg.msg_iov = &iov;
    msg.msg_iovlen = 0;
    msg.msg_flags = 0;

    union {
      char rbuf[1024 /*CMSG_SPACE(sizeof(scm_timestamping)) +
                CMSG_SPACE(sizeof(sock_extended_err) + sizeof(sockaddr_in))*/];
      struct cmsghdr align;
    } aligned_buf;
    memset(&aligned_buf, 0, sizeof(aligned_buf));

    msg.msg_control = aligned_buf.rbuf;
    msg.msg_controllen = sizeof(aligned_buf.rbuf);

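    /* Drain the socket's error queue; reading with MSG_ERRQUEUE does not
     * consume regular stream data. */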
    int r, saved_errno;
    do {
      r = recvmsg(tcp->fd, &msg, MSG_ERRQUEUE);
      saved_errno = errno;
    } while (r < 0 && saved_errno == EINTR);

    if (r == -1 && saved_errno == EAGAIN) {
      return true; /* No more errors to process */
    }
    if (r == -1) {
      return false;
    }
    if (grpc_tcp_trace.enabled()) {
      if ((msg.msg_flags & MSG_CTRUNC) != 0) {
        gpr_log(GPR_INFO, "Error message was truncated.");
      }
    }

    if (msg.msg_controllen == 0) {
      /* There was no control message found. It was probably spurious. */
      return true;
    }
    for (auto cmsg = CMSG_FIRSTHDR(&msg); cmsg && cmsg->cmsg_len;
         cmsg = CMSG_NXTHDR(&msg, cmsg)) {
      if (cmsg->cmsg_level != SOL_SOCKET ||
          cmsg->cmsg_type != SCM_TIMESTAMPING) {
        /* Got a control message that is not a timestamp. Don't know how to
         * handle this. */
        if (grpc_tcp_trace.enabled()) {
          gpr_log(GPR_INFO,
                  "unknown control message cmsg_level:%d cmsg_type:%d",
                  cmsg->cmsg_level, cmsg->cmsg_type);
        }
        return false;
      }
      process_timestamp(tcp, &msg, cmsg);
    }
  }
}

static void tcp_handle_error(void* arg /* grpc_tcp */, grpc_error* error) {
  grpc_tcp* tcp = static_cast<grpc_tcp*>(arg);
  if (grpc_tcp_trace.enabled()) {
    gpr_log(GPR_INFO, "TCP:%p got_error: %s", tcp, grpc_error_string(error));
  }

  if (error != GRPC_ERROR_NONE ||
      static_cast<bool>(gpr_atm_acq_load(&tcp->stop_error_notification))) {
    /* We aren't going to register to hear on error anymore, so it is safe to
     * unref. */
    grpc_core::TracedBuffer::Shutdown(&tcp->tb_head, GRPC_ERROR_REF(error));
    TCP_UNREF(tcp, "error-tracking");
    return;
  }

  /* We are still interested in collecting timestamps, so let's try reading
   * them. */
  if (!process_errors(tcp)) {
    /* This was not a timestamps error. This was an actual error. Set the
     * read and write closures to be ready.
     */
    grpc_fd_set_readable(tcp->em_fd);
    grpc_fd_set_writable(tcp->em_fd);
  }
  GRPC_CLOSURE_INIT(&tcp->error_closure, tcp_handle_error, tcp,
                    grpc_schedule_on_exec_ctx);
  grpc_fd_notify_on_error(tcp->em_fd, &tcp->error_closure);
}

#else  /* GRPC_LINUX_ERRQUEUE */
static bool tcp_write_with_timestamps(grpc_tcp* tcp, struct msghdr* msg,
                                      size_t sending_length,
                                      ssize_t* sent_length,
                                      grpc_error** error) {
  gpr_log(GPR_ERROR, "Write with timestamps not supported for this platform");
  GPR_ASSERT(0);
  return false;
}

static void tcp_handle_error(void* arg /* grpc_tcp */, grpc_error* error) {
  gpr_log(GPR_ERROR, "Error handling is not supported for this platform");
  GPR_ASSERT(0);
}
#endif /* GRPC_LINUX_ERRQUEUE */

/* returns true if done, false if pending; if returning true, *error is set */
#if defined(IOV_MAX) && IOV_MAX < 1000
#define MAX_WRITE_IOVEC IOV_MAX
#else
#define MAX_WRITE_IOVEC 1000
#endif
static bool tcp_flush(grpc_tcp* tcp, grpc_error** error) {
  struct msghdr msg;
  struct iovec iov[MAX_WRITE_IOVEC];
  msg_iovlen_type iov_size;
  ssize_t sent_length;
  size_t sending_length;
  size_t trailing;
  size_t unwind_slice_idx;
  size_t unwind_byte_idx;

  // We always start at zero, because we eagerly unref and trim the slice
  // buffer as we write
  size_t outgoing_slice_idx = 0;

  for (;;) {
    sending_length = 0;
    unwind_slice_idx = outgoing_slice_idx;
    unwind_byte_idx = tcp->outgoing_byte_idx;
    for (iov_size = 0; outgoing_slice_idx != tcp->outgoing_buffer->count &&
                       iov_size != MAX_WRITE_IOVEC;
         iov_size++) {
      iov[iov_size].iov_base =
          GRPC_SLICE_START_PTR(
              tcp->outgoing_buffer->slices[outgoing_slice_idx]) +
          tcp->outgoing_byte_idx;
      iov[iov_size].iov_len =
          GRPC_SLICE_LENGTH(tcp->outgoing_buffer->slices[outgoing_slice_idx]) -
          tcp->outgoing_byte_idx;
      sending_length += iov[iov_size].iov_len;
      outgoing_slice_idx++;
      tcp->outgoing_byte_idx = 0;
    }
    GPR_ASSERT(iov_size > 0);

    msg.msg_name = nullptr;
    msg.msg_namelen = 0;
    msg.msg_iov = iov;
    msg.msg_iovlen = iov_size;
    msg.msg_flags = 0;
    if (tcp->outgoing_buffer_arg != nullptr) {
      if (!tcp_write_with_timestamps(tcp, &msg, sending_length, &sent_length,
                                     error))
        return true; /* something went wrong with timestamps */
    } else {
      msg.msg_control = nullptr;
      msg.msg_controllen = 0;

      GRPC_STATS_INC_TCP_WRITE_SIZE(sending_length);
      GRPC_STATS_INC_TCP_WRITE_IOV_SIZE(iov_size);

      sent_length = tcp_send(tcp->fd, &msg);
    }

    if (sent_length < 0) {
      if (errno == EAGAIN) {
        tcp->outgoing_byte_idx = unwind_byte_idx;
        // unref all and forget about all slices that have been written to this
        // point
        for (size_t idx = 0; idx < unwind_slice_idx; ++idx) {
          grpc_slice_unref_internal(
              grpc_slice_buffer_take_first(tcp->outgoing_buffer));
        }
        return false;
      } else if (errno == EPIPE) {
        *error = tcp_annotate_error(GRPC_OS_ERROR(errno, "sendmsg"), tcp);
        grpc_slice_buffer_reset_and_unref_internal(tcp->outgoing_buffer);
        return true;
      } else {
        *error = tcp_annotate_error(GRPC_OS_ERROR(errno, "sendmsg"), tcp);
        grpc_slice_buffer_reset_and_unref_internal(tcp->outgoing_buffer);
        return true;
      }
    }

    GPR_ASSERT(tcp->outgoing_byte_idx == 0);
    tcp->bytes_counter += sent_length;
    trailing = sending_length - static_cast<size_t>(sent_length);
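    /* A partial write: walk backwards over the slices just attempted to find
     * where the next flush should resume. */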
    while (trailing > 0) {
      size_t slice_length;

      outgoing_slice_idx--;
      slice_length =
          GRPC_SLICE_LENGTH(tcp->outgoing_buffer->slices[outgoing_slice_idx]);
      if (slice_length > trailing) {
        tcp->outgoing_byte_idx = slice_length - trailing;
        break;
      } else {
        trailing -= slice_length;
      }
    }
    if (outgoing_slice_idx == tcp->outgoing_buffer->count) {
      *error = GRPC_ERROR_NONE;
      grpc_slice_buffer_reset_and_unref_internal(tcp->outgoing_buffer);
      return true;
    }
  }
}

static void tcp_handle_write(void* arg /* grpc_tcp */, grpc_error* error) {
  grpc_tcp* tcp = static_cast<grpc_tcp*>(arg);
  grpc_closure* cb;

  if (error != GRPC_ERROR_NONE) {
    cb = tcp->write_cb;
    tcp->write_cb = nullptr;
    cb->cb(cb->cb_arg, error);
    TCP_UNREF(tcp, "write");
    return;
  }

  if (!tcp_flush(tcp, &error)) {
    if (grpc_tcp_trace.enabled()) {
      gpr_log(GPR_INFO, "write: delayed");
    }
    notify_on_write(tcp);
  } else {
    cb = tcp->write_cb;
    tcp->write_cb = nullptr;
    if (grpc_tcp_trace.enabled()) {
      const char* str = grpc_error_string(error);
      gpr_log(GPR_INFO, "write: %s", str);
    }
    GRPC_CLOSURE_SCHED(cb, error);
    TCP_UNREF(tcp, "write");
  }
}

static void tcp_write(grpc_endpoint* ep, grpc_slice_buffer* buf,
                      grpc_closure* cb, void* arg) {
  GPR_TIMER_SCOPE("tcp_write", 0);
  grpc_tcp* tcp = reinterpret_cast<grpc_tcp*>(ep);
  grpc_error* error = GRPC_ERROR_NONE;

  if (grpc_tcp_trace.enabled()) {
    size_t i;

    for (i = 0; i < buf->count; i++) {
      char* data =
          grpc_dump_slice(buf->slices[i], GPR_DUMP_HEX | GPR_DUMP_ASCII);
      gpr_log(GPR_INFO, "WRITE %p (peer=%s): %s", tcp, tcp->peer_string, data);
      gpr_free(data);
    }
  }

  GPR_ASSERT(tcp->write_cb == nullptr);

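  /* A zero-length write completes immediately; if the fd is already shut
   * down, surface an EOF error instead. */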
  if (buf->length == 0) {
    GRPC_CLOSURE_SCHED(
        cb, grpc_fd_is_shutdown(tcp->em_fd)
                ? tcp_annotate_error(
                      GRPC_ERROR_CREATE_FROM_STATIC_STRING("EOF"), tcp)
                : GRPC_ERROR_NONE);
    return;
  }
  tcp->outgoing_buffer = buf;
  tcp->outgoing_byte_idx = 0;
  tcp->outgoing_buffer_arg = arg;
  if (arg) {
    GPR_ASSERT(grpc_event_engine_can_track_errors());
  }

  if (!tcp_flush(tcp, &error)) {
    TCP_REF(tcp, "write");
    tcp->write_cb = cb;
    if (grpc_tcp_trace.enabled()) {
      gpr_log(GPR_INFO, "write: delayed");
    }
    notify_on_write(tcp);
  } else {
    if (grpc_tcp_trace.enabled()) {
      const char* str = grpc_error_string(error);
      gpr_log(GPR_INFO, "write: %s", str);
    }
    GRPC_CLOSURE_SCHED(cb, error);
  }
}

static void tcp_add_to_pollset(grpc_endpoint* ep, grpc_pollset* pollset) {
  grpc_tcp* tcp = reinterpret_cast<grpc_tcp*>(ep);
  grpc_pollset_add_fd(pollset, tcp->em_fd);
}

static void tcp_add_to_pollset_set(grpc_endpoint* ep,
                                   grpc_pollset_set* pollset_set) {
  grpc_tcp* tcp = reinterpret_cast<grpc_tcp*>(ep);
  grpc_pollset_set_add_fd(pollset_set, tcp->em_fd);
}

static void tcp_delete_from_pollset_set(grpc_endpoint* ep,
                                        grpc_pollset_set* pollset_set) {
  grpc_tcp* tcp = reinterpret_cast<grpc_tcp*>(ep);
  grpc_pollset_set_del_fd(pollset_set, tcp->em_fd);
}

static char* tcp_get_peer(grpc_endpoint* ep) {
  grpc_tcp* tcp = reinterpret_cast<grpc_tcp*>(ep);
  return gpr_strdup(tcp->peer_string);
}

static int tcp_get_fd(grpc_endpoint* ep) {
  grpc_tcp* tcp = reinterpret_cast<grpc_tcp*>(ep);
  return tcp->fd;
}

static grpc_resource_user* tcp_get_resource_user(grpc_endpoint* ep) {
  grpc_tcp* tcp = reinterpret_cast<grpc_tcp*>(ep);
  return tcp->resource_user;
}

static const grpc_endpoint_vtable vtable = {tcp_read,
                                            tcp_write,
                                            tcp_add_to_pollset,
                                            tcp_add_to_pollset_set,
                                            tcp_delete_from_pollset_set,
                                            tcp_shutdown,
                                            tcp_destroy,
                                            tcp_get_resource_user,
                                            tcp_get_peer,
                                            tcp_get_fd};

#define MAX_CHUNK_SIZE (32 * 1024 * 1024)

grpc_endpoint* grpc_tcp_create(grpc_fd* em_fd,
                               const grpc_channel_args* channel_args,
                               const char* peer_string) {
  int tcp_read_chunk_size = GRPC_TCP_DEFAULT_READ_SLICE_SIZE;
  int tcp_max_read_chunk_size = 4 * 1024 * 1024;
  int tcp_min_read_chunk_size = 256;
  grpc_resource_quota* resource_quota = grpc_resource_quota_create(nullptr);
  if (channel_args != nullptr) {
    for (size_t i = 0; i < channel_args->num_args; i++) {
      if (0 ==
          strcmp(channel_args->args[i].key, GRPC_ARG_TCP_READ_CHUNK_SIZE)) {
        grpc_integer_options options = {tcp_read_chunk_size, 1, MAX_CHUNK_SIZE};
        tcp_read_chunk_size =
            grpc_channel_arg_get_integer(&channel_args->args[i], options);
      } else if (0 == strcmp(channel_args->args[i].key,
                             GRPC_ARG_TCP_MIN_READ_CHUNK_SIZE)) {
        grpc_integer_options options = {tcp_read_chunk_size, 1, MAX_CHUNK_SIZE};
        tcp_min_read_chunk_size =
            grpc_channel_arg_get_integer(&channel_args->args[i], options);
      } else if (0 == strcmp(channel_args->args[i].key,
                             GRPC_ARG_TCP_MAX_READ_CHUNK_SIZE)) {
        grpc_integer_options options = {tcp_read_chunk_size, 1, MAX_CHUNK_SIZE};
        tcp_max_read_chunk_size =
            grpc_channel_arg_get_integer(&channel_args->args[i], options);
      } else if (0 ==
                 strcmp(channel_args->args[i].key, GRPC_ARG_RESOURCE_QUOTA)) {
        grpc_resource_quota_unref_internal(resource_quota);
        resource_quota =
            grpc_resource_quota_ref_internal(static_cast<grpc_resource_quota*>(
                channel_args->args[i].value.pointer.p));
      }
    }
  }

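  /* Sanitize the configured chunk sizes: min may not exceed max, and the
   * initial read target must lie between them. */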
  if (tcp_min_read_chunk_size > tcp_max_read_chunk_size) {
    tcp_min_read_chunk_size = tcp_max_read_chunk_size;
  }
  tcp_read_chunk_size = GPR_CLAMP(tcp_read_chunk_size, tcp_min_read_chunk_size,
                                  tcp_max_read_chunk_size);

  grpc_tcp* tcp = static_cast<grpc_tcp*>(gpr_malloc(sizeof(grpc_tcp)));
  tcp->base.vtable = &vtable;
  tcp->peer_string = gpr_strdup(peer_string);
  tcp->fd = grpc_fd_wrapped_fd(em_fd);
  tcp->read_cb = nullptr;
  tcp->write_cb = nullptr;
  tcp->release_fd_cb = nullptr;
  tcp->release_fd = nullptr;
  tcp->incoming_buffer = nullptr;
  tcp->target_length = static_cast<double>(tcp_read_chunk_size);
  tcp->min_read_chunk_size = tcp_min_read_chunk_size;
  tcp->max_read_chunk_size = tcp_max_read_chunk_size;
  tcp->bytes_read_this_round = 0;
  /* Will be set to false by the very first endpoint read function */
  tcp->is_first_read = true;
  tcp->bytes_counter = -1;
  tcp->socket_ts_enabled = false;
  /* paired with unref in grpc_tcp_destroy */
  gpr_ref_init(&tcp->refcount, 1);
  gpr_atm_no_barrier_store(&tcp->shutdown_count, 0);
  tcp->em_fd = em_fd;
  grpc_slice_buffer_init(&tcp->last_read_buffer);
  tcp->resource_user = grpc_resource_user_create(resource_quota, peer_string);
  grpc_resource_user_slice_allocator_init(
      &tcp->slice_allocator, tcp->resource_user, tcp_read_allocation_done, tcp);
  /* Tell network status tracker about new endpoint */
  grpc_network_status_register_endpoint(&tcp->base);
  grpc_resource_quota_unref_internal(resource_quota);
  gpr_mu_init(&tcp->tb_mu);
  tcp->tb_head = nullptr;
  /* Start being notified on errors if event engine can track errors. */
  if (grpc_event_engine_can_track_errors()) {
    /* Grab a ref to tcp so that we can safely access the tcp struct when
     * processing errors. We unref when we no longer want to track errors
     * separately. */
    TCP_REF(tcp, "error-tracking");
    gpr_atm_rel_store(&tcp->stop_error_notification, 0);
    GRPC_CLOSURE_INIT(&tcp->error_closure, tcp_handle_error, tcp,
                      grpc_schedule_on_exec_ctx);
    grpc_fd_notify_on_error(tcp->em_fd, &tcp->error_closure);
  }

  return &tcp->base;
}

int grpc_tcp_fd(grpc_endpoint* ep) {
  grpc_tcp* tcp = reinterpret_cast<grpc_tcp*>(ep);
  GPR_ASSERT(ep->vtable == &vtable);
  return grpc_fd_wrapped_fd(tcp->em_fd);
}

void grpc_tcp_destroy_and_release_fd(grpc_endpoint* ep, int* fd,
                                     grpc_closure* done) {
  grpc_network_status_unregister_endpoint(ep);
  grpc_tcp* tcp = reinterpret_cast<grpc_tcp*>(ep);
  GPR_ASSERT(ep->vtable == &vtable);
  tcp->release_fd = fd;
  tcp->release_fd_cb = done;
  grpc_slice_buffer_reset_and_unref_internal(&tcp->last_read_buffer);
  if (grpc_event_engine_can_track_errors()) {
    /* Stop errors notification. */
    gpr_atm_no_barrier_store(&tcp->stop_error_notification, true);
    grpc_fd_set_error(tcp->em_fd);
  }
  TCP_UNREF(tcp, "destroy");
}

#endif /* GRPC_POSIX_SOCKET_TCP */