• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "net/socket/tcp_socket_win.h"
6 
7 #include <mstcpip.h>
8 
9 #include "base/callback_helpers.h"
10 #include "base/logging.h"
11 #include "base/metrics/stats_counters.h"
12 #include "base/win/windows_version.h"
13 #include "net/base/address_list.h"
14 #include "net/base/connection_type_histograms.h"
15 #include "net/base/io_buffer.h"
16 #include "net/base/ip_endpoint.h"
17 #include "net/base/net_errors.h"
18 #include "net/base/net_util.h"
19 #include "net/base/network_change_notifier.h"
20 #include "net/base/winsock_init.h"
21 #include "net/base/winsock_util.h"
22 #include "net/socket/socket_descriptor.h"
23 #include "net/socket/socket_net_log_params.h"
24 
25 namespace net {
26 
27 namespace {
28 
29 const int kTCPKeepAliveSeconds = 45;
30 
SetSocketReceiveBufferSize(SOCKET socket,int32 size)31 int SetSocketReceiveBufferSize(SOCKET socket, int32 size) {
32   int rv = setsockopt(socket, SOL_SOCKET, SO_RCVBUF,
33                       reinterpret_cast<const char*>(&size), sizeof(size));
34   int net_error = (rv == 0) ? OK : MapSystemError(WSAGetLastError());
35   DCHECK(!rv) << "Could not set socket receive buffer size: " << net_error;
36   return net_error;
37 }
38 
SetSocketSendBufferSize(SOCKET socket,int32 size)39 int SetSocketSendBufferSize(SOCKET socket, int32 size) {
40   int rv = setsockopt(socket, SOL_SOCKET, SO_SNDBUF,
41                       reinterpret_cast<const char*>(&size), sizeof(size));
42   int net_error = (rv == 0) ? OK : MapSystemError(WSAGetLastError());
43   DCHECK(!rv) << "Could not set socket send buffer size: " << net_error;
44   return net_error;
45 }
46 
47 // Disable Nagle.
48 // The Nagle implementation on windows is governed by RFC 896.  The idea
49 // behind Nagle is to reduce small packets on the network.  When Nagle is
50 // enabled, if a partial packet has been sent, the TCP stack will disallow
51 // further *partial* packets until an ACK has been received from the other
52 // side.  Good applications should always strive to send as much data as
53 // possible and avoid partial-packet sends.  However, in most real world
54 // applications, there are edge cases where this does not happen, and two
55 // partial packets may be sent back to back.  For a browser, it is NEVER
56 // a benefit to delay for an RTT before the second packet is sent.
57 //
58 // As a practical example in Chromium today, consider the case of a small
59 // POST.  I have verified this:
60 //     Client writes 649 bytes of header  (partial packet #1)
61 //     Client writes 50 bytes of POST data (partial packet #2)
62 // In the above example, with Nagle, a RTT delay is inserted between these
63 // two sends due to nagle.  RTTs can easily be 100ms or more.  The best
64 // fix is to make sure that for POSTing data, we write as much data as
65 // possible and minimize partial packets.  We will fix that.  But disabling
66 // Nagle also ensures we don't run into this delay in other edge cases.
67 // See also:
68 //    http://technet.microsoft.com/en-us/library/bb726981.aspx
DisableNagle(SOCKET socket,bool disable)69 bool DisableNagle(SOCKET socket, bool disable) {
70   BOOL val = disable ? TRUE : FALSE;
71   int rv = setsockopt(socket, IPPROTO_TCP, TCP_NODELAY,
72                       reinterpret_cast<const char*>(&val),
73                       sizeof(val));
74   DCHECK(!rv) << "Could not disable nagle";
75   return rv == 0;
76 }
77 
78 // Enable TCP Keep-Alive to prevent NAT routers from timing out TCP
79 // connections. See http://crbug.com/27400 for details.
SetTCPKeepAlive(SOCKET socket,BOOL enable,int delay_secs)80 bool SetTCPKeepAlive(SOCKET socket, BOOL enable, int delay_secs) {
81   int delay = delay_secs * 1000;
82   struct tcp_keepalive keepalive_vals = {
83     enable ? 1 : 0,  // TCP keep-alive on.
84     delay,  // Delay seconds before sending first TCP keep-alive packet.
85     delay,  // Delay seconds between sending TCP keep-alive packets.
86   };
87   DWORD bytes_returned = 0xABAB;
88   int rv = WSAIoctl(socket, SIO_KEEPALIVE_VALS, &keepalive_vals,
89                     sizeof(keepalive_vals), NULL, 0,
90                     &bytes_returned, NULL, NULL);
91   DCHECK(!rv) << "Could not enable TCP Keep-Alive for socket: " << socket
92               << " [error: " << WSAGetLastError() << "].";
93 
94   // Disregard any failure in disabling nagle or enabling TCP Keep-Alive.
95   return rv == 0;
96 }
97 
MapConnectError(int os_error)98 int MapConnectError(int os_error) {
99   switch (os_error) {
100     // connect fails with WSAEACCES when Windows Firewall blocks the
101     // connection.
102     case WSAEACCES:
103       return ERR_NETWORK_ACCESS_DENIED;
104     case WSAETIMEDOUT:
105       return ERR_CONNECTION_TIMED_OUT;
106     default: {
107       int net_error = MapSystemError(os_error);
108       if (net_error == ERR_FAILED)
109         return ERR_CONNECTION_FAILED;  // More specific than ERR_FAILED.
110 
111       // Give a more specific error when the user is offline.
112       if (net_error == ERR_ADDRESS_UNREACHABLE &&
113           NetworkChangeNotifier::IsOffline()) {
114         return ERR_INTERNET_DISCONNECTED;
115       }
116 
117       return net_error;
118     }
119   }
120 }
121 
122 }  // namespace
123 
124 //-----------------------------------------------------------------------------
125 
126 // This class encapsulates all the state that has to be preserved as long as
127 // there is a network IO operation in progress. If the owner TCPSocketWin is
128 // destroyed while an operation is in progress, the Core is detached and it
129 // lives until the operation completes and the OS doesn't reference any resource
130 // declared on this class anymore.
131 class TCPSocketWin::Core : public base::RefCounted<Core> {
132  public:
133   explicit Core(TCPSocketWin* socket);
134 
135   // Start watching for the end of a read or write operation.
136   void WatchForRead();
137   void WatchForWrite();
138 
139   // The TCPSocketWin is going away.
Detach()140   void Detach() { socket_ = NULL; }
141 
142   // The separate OVERLAPPED variables for asynchronous operation.
143   // |read_overlapped_| is used for both Connect() and Read().
144   // |write_overlapped_| is only used for Write();
145   OVERLAPPED read_overlapped_;
146   OVERLAPPED write_overlapped_;
147 
148   // The buffers used in Read() and Write().
149   scoped_refptr<IOBuffer> read_iobuffer_;
150   scoped_refptr<IOBuffer> write_iobuffer_;
151   int read_buffer_length_;
152   int write_buffer_length_;
153 
154   bool non_blocking_reads_initialized_;
155 
156  private:
157   friend class base::RefCounted<Core>;
158 
159   class ReadDelegate : public base::win::ObjectWatcher::Delegate {
160    public:
ReadDelegate(Core * core)161     explicit ReadDelegate(Core* core) : core_(core) {}
~ReadDelegate()162     virtual ~ReadDelegate() {}
163 
164     // base::ObjectWatcher::Delegate methods:
165     virtual void OnObjectSignaled(HANDLE object);
166 
167    private:
168     Core* const core_;
169   };
170 
171   class WriteDelegate : public base::win::ObjectWatcher::Delegate {
172    public:
WriteDelegate(Core * core)173     explicit WriteDelegate(Core* core) : core_(core) {}
~WriteDelegate()174     virtual ~WriteDelegate() {}
175 
176     // base::ObjectWatcher::Delegate methods:
177     virtual void OnObjectSignaled(HANDLE object);
178 
179    private:
180     Core* const core_;
181   };
182 
183   ~Core();
184 
185   // The socket that created this object.
186   TCPSocketWin* socket_;
187 
188   // |reader_| handles the signals from |read_watcher_|.
189   ReadDelegate reader_;
190   // |writer_| handles the signals from |write_watcher_|.
191   WriteDelegate writer_;
192 
193   // |read_watcher_| watches for events from Connect() and Read().
194   base::win::ObjectWatcher read_watcher_;
195   // |write_watcher_| watches for events from Write();
196   base::win::ObjectWatcher write_watcher_;
197 
198   DISALLOW_COPY_AND_ASSIGN(Core);
199 };
200 
Core(TCPSocketWin * socket)201 TCPSocketWin::Core::Core(TCPSocketWin* socket)
202     : read_buffer_length_(0),
203       write_buffer_length_(0),
204       non_blocking_reads_initialized_(false),
205       socket_(socket),
206       reader_(this),
207       writer_(this) {
208   memset(&read_overlapped_, 0, sizeof(read_overlapped_));
209   memset(&write_overlapped_, 0, sizeof(write_overlapped_));
210 
211   read_overlapped_.hEvent = WSACreateEvent();
212   write_overlapped_.hEvent = WSACreateEvent();
213 }
214 
~Core()215 TCPSocketWin::Core::~Core() {
216   // Make sure the message loop is not watching this object anymore.
217   read_watcher_.StopWatching();
218   write_watcher_.StopWatching();
219 
220   WSACloseEvent(read_overlapped_.hEvent);
221   memset(&read_overlapped_, 0xaf, sizeof(read_overlapped_));
222   WSACloseEvent(write_overlapped_.hEvent);
223   memset(&write_overlapped_, 0xaf, sizeof(write_overlapped_));
224 }
225 
WatchForRead()226 void TCPSocketWin::Core::WatchForRead() {
227   // We grab an extra reference because there is an IO operation in progress.
228   // Balanced in ReadDelegate::OnObjectSignaled().
229   AddRef();
230   read_watcher_.StartWatching(read_overlapped_.hEvent, &reader_);
231 }
232 
WatchForWrite()233 void TCPSocketWin::Core::WatchForWrite() {
234   // We grab an extra reference because there is an IO operation in progress.
235   // Balanced in WriteDelegate::OnObjectSignaled().
236   AddRef();
237   write_watcher_.StartWatching(write_overlapped_.hEvent, &writer_);
238 }
239 
OnObjectSignaled(HANDLE object)240 void TCPSocketWin::Core::ReadDelegate::OnObjectSignaled(HANDLE object) {
241   DCHECK_EQ(object, core_->read_overlapped_.hEvent);
242   if (core_->socket_) {
243     if (core_->socket_->waiting_connect_)
244       core_->socket_->DidCompleteConnect();
245     else
246       core_->socket_->DidSignalRead();
247   }
248 
249   core_->Release();
250 }
251 
OnObjectSignaled(HANDLE object)252 void TCPSocketWin::Core::WriteDelegate::OnObjectSignaled(
253     HANDLE object) {
254   DCHECK_EQ(object, core_->write_overlapped_.hEvent);
255   if (core_->socket_)
256     core_->socket_->DidCompleteWrite();
257 
258   core_->Release();
259 }
260 
261 //-----------------------------------------------------------------------------
262 
TCPSocketWin(net::NetLog * net_log,const net::NetLog::Source & source)263 TCPSocketWin::TCPSocketWin(net::NetLog* net_log,
264                            const net::NetLog::Source& source)
265     : socket_(INVALID_SOCKET),
266       accept_event_(WSA_INVALID_EVENT),
267       accept_socket_(NULL),
268       accept_address_(NULL),
269       waiting_connect_(false),
270       waiting_read_(false),
271       waiting_write_(false),
272       connect_os_error_(0),
273       logging_multiple_connect_attempts_(false),
274       net_log_(BoundNetLog::Make(net_log, NetLog::SOURCE_SOCKET)) {
275   net_log_.BeginEvent(NetLog::TYPE_SOCKET_ALIVE,
276                       source.ToEventParametersCallback());
277   EnsureWinsockInit();
278 }
279 
~TCPSocketWin()280 TCPSocketWin::~TCPSocketWin() {
281   Close();
282   net_log_.EndEvent(NetLog::TYPE_SOCKET_ALIVE);
283 }
284 
Open(AddressFamily family)285 int TCPSocketWin::Open(AddressFamily family) {
286   DCHECK(CalledOnValidThread());
287   DCHECK_EQ(socket_, INVALID_SOCKET);
288 
289   socket_ = CreatePlatformSocket(ConvertAddressFamily(family), SOCK_STREAM,
290                                  IPPROTO_TCP);
291   if (socket_ == INVALID_SOCKET) {
292     PLOG(ERROR) << "CreatePlatformSocket() returned an error";
293     return MapSystemError(WSAGetLastError());
294   }
295 
296   if (SetNonBlocking(socket_)) {
297     int result = MapSystemError(WSAGetLastError());
298     Close();
299     return result;
300   }
301 
302   return OK;
303 }
304 
AdoptConnectedSocket(SOCKET socket,const IPEndPoint & peer_address)305 int TCPSocketWin::AdoptConnectedSocket(SOCKET socket,
306                                        const IPEndPoint& peer_address) {
307   DCHECK(CalledOnValidThread());
308   DCHECK_EQ(socket_, INVALID_SOCKET);
309   DCHECK(!core_);
310 
311   socket_ = socket;
312 
313   if (SetNonBlocking(socket_)) {
314     int result = MapSystemError(WSAGetLastError());
315     Close();
316     return result;
317   }
318 
319   core_ = new Core(this);
320   peer_address_.reset(new IPEndPoint(peer_address));
321 
322   return OK;
323 }
324 
AdoptListenSocket(SOCKET socket)325 int TCPSocketWin::AdoptListenSocket(SOCKET socket) {
326   DCHECK(CalledOnValidThread());
327   DCHECK_EQ(socket_, INVALID_SOCKET);
328 
329   socket_ = socket;
330 
331   if (SetNonBlocking(socket_)) {
332     int result = MapSystemError(WSAGetLastError());
333     Close();
334     return result;
335   }
336 
337   // |core_| is not needed for sockets that are used to accept connections.
338   // The operation here is more like Open but with an existing socket.
339 
340   return OK;
341 }
342 
Bind(const IPEndPoint & address)343 int TCPSocketWin::Bind(const IPEndPoint& address) {
344   DCHECK(CalledOnValidThread());
345   DCHECK_NE(socket_, INVALID_SOCKET);
346 
347   SockaddrStorage storage;
348   if (!address.ToSockAddr(storage.addr, &storage.addr_len))
349     return ERR_ADDRESS_INVALID;
350 
351   int result = bind(socket_, storage.addr, storage.addr_len);
352   if (result < 0) {
353     PLOG(ERROR) << "bind() returned an error";
354     return MapSystemError(WSAGetLastError());
355   }
356 
357   return OK;
358 }
359 
Listen(int backlog)360 int TCPSocketWin::Listen(int backlog) {
361   DCHECK(CalledOnValidThread());
362   DCHECK_GT(backlog, 0);
363   DCHECK_NE(socket_, INVALID_SOCKET);
364   DCHECK_EQ(accept_event_, WSA_INVALID_EVENT);
365 
366   accept_event_ = WSACreateEvent();
367   if (accept_event_ == WSA_INVALID_EVENT) {
368     PLOG(ERROR) << "WSACreateEvent()";
369     return MapSystemError(WSAGetLastError());
370   }
371 
372   int result = listen(socket_, backlog);
373   if (result < 0) {
374     PLOG(ERROR) << "listen() returned an error";
375     return MapSystemError(WSAGetLastError());
376   }
377 
378   return OK;
379 }
380 
Accept(scoped_ptr<TCPSocketWin> * socket,IPEndPoint * address,const CompletionCallback & callback)381 int TCPSocketWin::Accept(scoped_ptr<TCPSocketWin>* socket,
382                          IPEndPoint* address,
383                          const CompletionCallback& callback) {
384   DCHECK(CalledOnValidThread());
385   DCHECK(socket);
386   DCHECK(address);
387   DCHECK(!callback.is_null());
388   DCHECK(accept_callback_.is_null());
389 
390   net_log_.BeginEvent(NetLog::TYPE_TCP_ACCEPT);
391 
392   int result = AcceptInternal(socket, address);
393 
394   if (result == ERR_IO_PENDING) {
395     // Start watching.
396     WSAEventSelect(socket_, accept_event_, FD_ACCEPT);
397     accept_watcher_.StartWatching(accept_event_, this);
398 
399     accept_socket_ = socket;
400     accept_address_ = address;
401     accept_callback_ = callback;
402   }
403 
404   return result;
405 }
406 
Connect(const IPEndPoint & address,const CompletionCallback & callback)407 int TCPSocketWin::Connect(const IPEndPoint& address,
408                           const CompletionCallback& callback) {
409   DCHECK(CalledOnValidThread());
410   DCHECK_NE(socket_, INVALID_SOCKET);
411   DCHECK(!waiting_connect_);
412 
413   // |peer_address_| and |core_| will be non-NULL if Connect() has been called.
414   // Unless Close() is called to reset the internal state, a second call to
415   // Connect() is not allowed.
416   // Please note that we enforce this even if the previous Connect() has
417   // completed and failed. Although it is allowed to connect the same |socket_|
418   // again after a connection attempt failed on Windows, it results in
419   // unspecified behavior according to POSIX. Therefore, we make it behave in
420   // the same way as TCPSocketLibevent.
421   DCHECK(!peer_address_ && !core_);
422 
423   if (!logging_multiple_connect_attempts_)
424     LogConnectBegin(AddressList(address));
425 
426   peer_address_.reset(new IPEndPoint(address));
427 
428   int rv = DoConnect();
429   if (rv == ERR_IO_PENDING) {
430     // Synchronous operation not supported.
431     DCHECK(!callback.is_null());
432     read_callback_ = callback;
433     waiting_connect_ = true;
434   } else {
435     DoConnectComplete(rv);
436   }
437 
438   return rv;
439 }
440 
IsConnected() const441 bool TCPSocketWin::IsConnected() const {
442   DCHECK(CalledOnValidThread());
443 
444   if (socket_ == INVALID_SOCKET || waiting_connect_)
445     return false;
446 
447   if (waiting_read_)
448     return true;
449 
450   // Check if connection is alive.
451   char c;
452   int rv = recv(socket_, &c, 1, MSG_PEEK);
453   if (rv == 0)
454     return false;
455   if (rv == SOCKET_ERROR && WSAGetLastError() != WSAEWOULDBLOCK)
456     return false;
457 
458   return true;
459 }
460 
IsConnectedAndIdle() const461 bool TCPSocketWin::IsConnectedAndIdle() const {
462   DCHECK(CalledOnValidThread());
463 
464   if (socket_ == INVALID_SOCKET || waiting_connect_)
465     return false;
466 
467   if (waiting_read_)
468     return true;
469 
470   // Check if connection is alive and we haven't received any data
471   // unexpectedly.
472   char c;
473   int rv = recv(socket_, &c, 1, MSG_PEEK);
474   if (rv >= 0)
475     return false;
476   if (WSAGetLastError() != WSAEWOULDBLOCK)
477     return false;
478 
479   return true;
480 }
481 
Read(IOBuffer * buf,int buf_len,const CompletionCallback & callback)482 int TCPSocketWin::Read(IOBuffer* buf,
483                        int buf_len,
484                        const CompletionCallback& callback) {
485   DCHECK(CalledOnValidThread());
486   DCHECK_NE(socket_, INVALID_SOCKET);
487   DCHECK(!waiting_read_);
488   DCHECK(read_callback_.is_null());
489   DCHECK(!core_->read_iobuffer_);
490 
491   return DoRead(buf, buf_len, callback);
492 }
493 
Write(IOBuffer * buf,int buf_len,const CompletionCallback & callback)494 int TCPSocketWin::Write(IOBuffer* buf,
495                         int buf_len,
496                         const CompletionCallback& callback) {
497   DCHECK(CalledOnValidThread());
498   DCHECK_NE(socket_, INVALID_SOCKET);
499   DCHECK(!waiting_write_);
500   DCHECK(write_callback_.is_null());
501   DCHECK_GT(buf_len, 0);
502   DCHECK(!core_->write_iobuffer_);
503 
504   base::StatsCounter writes("tcp.writes");
505   writes.Increment();
506 
507   WSABUF write_buffer;
508   write_buffer.len = buf_len;
509   write_buffer.buf = buf->data();
510 
511   // TODO(wtc): Remove the assertion after enough testing.
512   AssertEventNotSignaled(core_->write_overlapped_.hEvent);
513   DWORD num;
514   int rv = WSASend(socket_, &write_buffer, 1, &num, 0,
515                    &core_->write_overlapped_, NULL);
516   if (rv == 0) {
517     if (ResetEventIfSignaled(core_->write_overlapped_.hEvent)) {
518       rv = static_cast<int>(num);
519       if (rv > buf_len || rv < 0) {
520         // It seems that some winsock interceptors report that more was written
521         // than was available. Treat this as an error.  http://crbug.com/27870
522         LOG(ERROR) << "Detected broken LSP: Asked to write " << buf_len
523                    << " bytes, but " << rv << " bytes reported.";
524         return ERR_WINSOCK_UNEXPECTED_WRITTEN_BYTES;
525       }
526       base::StatsCounter write_bytes("tcp.write_bytes");
527       write_bytes.Add(rv);
528       net_log_.AddByteTransferEvent(NetLog::TYPE_SOCKET_BYTES_SENT, rv,
529                                     buf->data());
530       return rv;
531     }
532   } else {
533     int os_error = WSAGetLastError();
534     if (os_error != WSA_IO_PENDING) {
535       int net_error = MapSystemError(os_error);
536       net_log_.AddEvent(NetLog::TYPE_SOCKET_WRITE_ERROR,
537                         CreateNetLogSocketErrorCallback(net_error, os_error));
538       return net_error;
539     }
540   }
541   waiting_write_ = true;
542   write_callback_ = callback;
543   core_->write_iobuffer_ = buf;
544   core_->write_buffer_length_ = buf_len;
545   core_->WatchForWrite();
546   return ERR_IO_PENDING;
547 }
548 
GetLocalAddress(IPEndPoint * address) const549 int TCPSocketWin::GetLocalAddress(IPEndPoint* address) const {
550   DCHECK(CalledOnValidThread());
551   DCHECK(address);
552 
553   SockaddrStorage storage;
554   if (getsockname(socket_, storage.addr, &storage.addr_len))
555     return MapSystemError(WSAGetLastError());
556   if (!address->FromSockAddr(storage.addr, storage.addr_len))
557     return ERR_ADDRESS_INVALID;
558 
559   return OK;
560 }
561 
GetPeerAddress(IPEndPoint * address) const562 int TCPSocketWin::GetPeerAddress(IPEndPoint* address) const {
563   DCHECK(CalledOnValidThread());
564   DCHECK(address);
565   if (!IsConnected())
566     return ERR_SOCKET_NOT_CONNECTED;
567   *address = *peer_address_;
568   return OK;
569 }
570 
SetDefaultOptionsForServer()571 int TCPSocketWin::SetDefaultOptionsForServer() {
572   return SetExclusiveAddrUse();
573 }
574 
SetDefaultOptionsForClient()575 void TCPSocketWin::SetDefaultOptionsForClient() {
576   // Increase the socket buffer sizes from the default sizes for WinXP.  In
577   // performance testing, there is substantial benefit by increasing from 8KB
578   // to 64KB.
579   // See also:
580   //    http://support.microsoft.com/kb/823764/EN-US
581   // On Vista, if we manually set these sizes, Vista turns off its receive
582   // window auto-tuning feature.
583   //    http://blogs.msdn.com/wndp/archive/2006/05/05/Winhec-blog-tcpip-2.aspx
584   // Since Vista's auto-tune is better than any static value we can could set,
585   // only change these on pre-vista machines.
586   if (base::win::GetVersion() < base::win::VERSION_VISTA) {
587     const int32 kSocketBufferSize = 64 * 1024;
588     SetSocketReceiveBufferSize(socket_, kSocketBufferSize);
589     SetSocketSendBufferSize(socket_, kSocketBufferSize);
590   }
591 
592   DisableNagle(socket_, true);
593   SetTCPKeepAlive(socket_, true, kTCPKeepAliveSeconds);
594 }
595 
SetExclusiveAddrUse()596 int TCPSocketWin::SetExclusiveAddrUse() {
597   // On Windows, a bound end point can be hijacked by another process by
598   // setting SO_REUSEADDR. Therefore a Windows-only option SO_EXCLUSIVEADDRUSE
599   // was introduced in Windows NT 4.0 SP4. If the socket that is bound to the
600   // end point has SO_EXCLUSIVEADDRUSE enabled, it is not possible for another
601   // socket to forcibly bind to the end point until the end point is unbound.
602   // It is recommend that all server applications must use SO_EXCLUSIVEADDRUSE.
603   // MSDN: http://goo.gl/M6fjQ.
604   //
605   // Unlike on *nix, on Windows a TCP server socket can always bind to an end
606   // point in TIME_WAIT state without setting SO_REUSEADDR, therefore it is not
607   // needed here.
608   //
609   // SO_EXCLUSIVEADDRUSE will prevent a TCP client socket from binding to an end
610   // point in TIME_WAIT status. It does not have this effect for a TCP server
611   // socket.
612 
613   BOOL true_value = 1;
614   int rv = setsockopt(socket_, SOL_SOCKET, SO_EXCLUSIVEADDRUSE,
615                       reinterpret_cast<const char*>(&true_value),
616                       sizeof(true_value));
617   if (rv < 0)
618     return MapSystemError(errno);
619   return OK;
620 }
621 
SetReceiveBufferSize(int32 size)622 int TCPSocketWin::SetReceiveBufferSize(int32 size) {
623   DCHECK(CalledOnValidThread());
624   return SetSocketReceiveBufferSize(socket_, size);
625 }
626 
SetSendBufferSize(int32 size)627 int TCPSocketWin::SetSendBufferSize(int32 size) {
628   DCHECK(CalledOnValidThread());
629   return SetSocketSendBufferSize(socket_, size);
630 }
631 
SetKeepAlive(bool enable,int delay)632 bool TCPSocketWin::SetKeepAlive(bool enable, int delay) {
633   return SetTCPKeepAlive(socket_, enable, delay);
634 }
635 
SetNoDelay(bool no_delay)636 bool TCPSocketWin::SetNoDelay(bool no_delay) {
637   return DisableNagle(socket_, no_delay);
638 }
639 
Close()640 void TCPSocketWin::Close() {
641   DCHECK(CalledOnValidThread());
642 
643   if (socket_ != INVALID_SOCKET) {
644     // Only log the close event if there's actually a socket to close.
645     net_log_.AddEvent(NetLog::EventType::TYPE_SOCKET_CLOSED);
646 
647     // Note: don't use CancelIo to cancel pending IO because it doesn't work
648     // when there is a Winsock layered service provider.
649 
650     // In most socket implementations, closing a socket results in a graceful
651     // connection shutdown, but in Winsock we have to call shutdown explicitly.
652     // See the MSDN page "Graceful Shutdown, Linger Options, and Socket Closure"
653     // at http://msdn.microsoft.com/en-us/library/ms738547.aspx
654     shutdown(socket_, SD_SEND);
655 
656     // This cancels any pending IO.
657     if (closesocket(socket_) < 0)
658       PLOG(ERROR) << "closesocket";
659     socket_ = INVALID_SOCKET;
660   }
661 
662   if (!accept_callback_.is_null()) {
663     accept_watcher_.StopWatching();
664     accept_socket_ = NULL;
665     accept_address_ = NULL;
666     accept_callback_.Reset();
667   }
668 
669   if (accept_event_) {
670     WSACloseEvent(accept_event_);
671     accept_event_ = WSA_INVALID_EVENT;
672   }
673 
674   if (core_) {
675     if (waiting_connect_) {
676       // We closed the socket, so this notification will never come.
677       // From MSDN' WSAEventSelect documentation:
678       // "Closing a socket with closesocket also cancels the association and
679       // selection of network events specified in WSAEventSelect for the
680       // socket".
681       core_->Release();
682     }
683     core_->Detach();
684     core_ = NULL;
685   }
686 
687   waiting_connect_ = false;
688   waiting_read_ = false;
689   waiting_write_ = false;
690 
691   read_callback_.Reset();
692   write_callback_.Reset();
693   peer_address_.reset();
694   connect_os_error_ = 0;
695 }
696 
UsingTCPFastOpen() const697 bool TCPSocketWin::UsingTCPFastOpen() const {
698   // Not supported on windows.
699   return false;
700 }
701 
StartLoggingMultipleConnectAttempts(const AddressList & addresses)702 void TCPSocketWin::StartLoggingMultipleConnectAttempts(
703     const AddressList& addresses) {
704   if (!logging_multiple_connect_attempts_) {
705     logging_multiple_connect_attempts_ = true;
706     LogConnectBegin(addresses);
707   } else {
708     NOTREACHED();
709   }
710 }
711 
EndLoggingMultipleConnectAttempts(int net_error)712 void TCPSocketWin::EndLoggingMultipleConnectAttempts(int net_error) {
713   if (logging_multiple_connect_attempts_) {
714     LogConnectEnd(net_error);
715     logging_multiple_connect_attempts_ = false;
716   } else {
717     NOTREACHED();
718   }
719 }
720 
AcceptInternal(scoped_ptr<TCPSocketWin> * socket,IPEndPoint * address)721 int TCPSocketWin::AcceptInternal(scoped_ptr<TCPSocketWin>* socket,
722                                  IPEndPoint* address) {
723   SockaddrStorage storage;
724   int new_socket = accept(socket_, storage.addr, &storage.addr_len);
725   if (new_socket < 0) {
726     int net_error = MapSystemError(WSAGetLastError());
727     if (net_error != ERR_IO_PENDING)
728       net_log_.EndEventWithNetErrorCode(NetLog::TYPE_TCP_ACCEPT, net_error);
729     return net_error;
730   }
731 
732   IPEndPoint ip_end_point;
733   if (!ip_end_point.FromSockAddr(storage.addr, storage.addr_len)) {
734     NOTREACHED();
735     if (closesocket(new_socket) < 0)
736       PLOG(ERROR) << "closesocket";
737     int net_error = ERR_ADDRESS_INVALID;
738     net_log_.EndEventWithNetErrorCode(NetLog::TYPE_TCP_ACCEPT, net_error);
739     return net_error;
740   }
741   scoped_ptr<TCPSocketWin> tcp_socket(new TCPSocketWin(
742       net_log_.net_log(), net_log_.source()));
743   int adopt_result = tcp_socket->AdoptConnectedSocket(new_socket, ip_end_point);
744   if (adopt_result != OK) {
745     net_log_.EndEventWithNetErrorCode(NetLog::TYPE_TCP_ACCEPT, adopt_result);
746     return adopt_result;
747   }
748   *socket = tcp_socket.Pass();
749   *address = ip_end_point;
750   net_log_.EndEvent(NetLog::TYPE_TCP_ACCEPT,
751                     CreateNetLogIPEndPointCallback(&ip_end_point));
752   return OK;
753 }
754 
OnObjectSignaled(HANDLE object)755 void TCPSocketWin::OnObjectSignaled(HANDLE object) {
756   WSANETWORKEVENTS ev;
757   if (WSAEnumNetworkEvents(socket_, accept_event_, &ev) == SOCKET_ERROR) {
758     PLOG(ERROR) << "WSAEnumNetworkEvents()";
759     return;
760   }
761 
762   if (ev.lNetworkEvents & FD_ACCEPT) {
763     int result = AcceptInternal(accept_socket_, accept_address_);
764     if (result != ERR_IO_PENDING) {
765       accept_socket_ = NULL;
766       accept_address_ = NULL;
767       base::ResetAndReturn(&accept_callback_).Run(result);
768     }
769   } else {
770     // This happens when a client opens a connection and closes it before we
771     // have a chance to accept it.
772     DCHECK(ev.lNetworkEvents == 0);
773 
774     // Start watching the next FD_ACCEPT event.
775     WSAEventSelect(socket_, accept_event_, FD_ACCEPT);
776     accept_watcher_.StartWatching(accept_event_, this);
777   }
778 }
779 
DoConnect()780 int TCPSocketWin::DoConnect() {
781   DCHECK_EQ(connect_os_error_, 0);
782   DCHECK(!core_);
783 
784   net_log_.BeginEvent(NetLog::TYPE_TCP_CONNECT_ATTEMPT,
785                       CreateNetLogIPEndPointCallback(peer_address_.get()));
786 
787   core_ = new Core(this);
788   // WSAEventSelect sets the socket to non-blocking mode as a side effect.
789   // Our connect() and recv() calls require that the socket be non-blocking.
790   WSAEventSelect(socket_, core_->read_overlapped_.hEvent, FD_CONNECT);
791 
792   SockaddrStorage storage;
793   if (!peer_address_->ToSockAddr(storage.addr, &storage.addr_len))
794     return ERR_ADDRESS_INVALID;
795   if (!connect(socket_, storage.addr, storage.addr_len)) {
796     // Connected without waiting!
797     //
798     // The MSDN page for connect says:
799     //   With a nonblocking socket, the connection attempt cannot be completed
800     //   immediately. In this case, connect will return SOCKET_ERROR, and
801     //   WSAGetLastError will return WSAEWOULDBLOCK.
802     // which implies that for a nonblocking socket, connect never returns 0.
803     // It's not documented whether the event object will be signaled or not
804     // if connect does return 0.  So the code below is essentially dead code
805     // and we don't know if it's correct.
806     NOTREACHED();
807 
808     if (ResetEventIfSignaled(core_->read_overlapped_.hEvent))
809       return OK;
810   } else {
811     int os_error = WSAGetLastError();
812     if (os_error != WSAEWOULDBLOCK) {
813       LOG(ERROR) << "connect failed: " << os_error;
814       connect_os_error_ = os_error;
815       int rv = MapConnectError(os_error);
816       CHECK_NE(ERR_IO_PENDING, rv);
817       return rv;
818     }
819   }
820 
821   core_->WatchForRead();
822   return ERR_IO_PENDING;
823 }
824 
DoConnectComplete(int result)825 void TCPSocketWin::DoConnectComplete(int result) {
826   // Log the end of this attempt (and any OS error it threw).
827   int os_error = connect_os_error_;
828   connect_os_error_ = 0;
829   if (result != OK) {
830     net_log_.EndEvent(NetLog::TYPE_TCP_CONNECT_ATTEMPT,
831                       NetLog::IntegerCallback("os_error", os_error));
832   } else {
833     net_log_.EndEvent(NetLog::TYPE_TCP_CONNECT_ATTEMPT);
834   }
835 
836   if (!logging_multiple_connect_attempts_)
837     LogConnectEnd(result);
838 }
839 
LogConnectBegin(const AddressList & addresses)840 void TCPSocketWin::LogConnectBegin(const AddressList& addresses) {
841   base::StatsCounter connects("tcp.connect");
842   connects.Increment();
843 
844   net_log_.BeginEvent(NetLog::TYPE_TCP_CONNECT,
845                       addresses.CreateNetLogCallback());
846 }
847 
LogConnectEnd(int net_error)848 void TCPSocketWin::LogConnectEnd(int net_error) {
849   if (net_error == OK)
850     UpdateConnectionTypeHistograms(CONNECTION_ANY);
851 
852   if (net_error != OK) {
853     net_log_.EndEventWithNetErrorCode(NetLog::TYPE_TCP_CONNECT, net_error);
854     return;
855   }
856 
857   struct sockaddr_storage source_address;
858   socklen_t addrlen = sizeof(source_address);
859   int rv = getsockname(
860       socket_, reinterpret_cast<struct sockaddr*>(&source_address), &addrlen);
861   if (rv != 0) {
862     LOG(ERROR) << "getsockname() [rv: " << rv
863                << "] error: " << WSAGetLastError();
864     NOTREACHED();
865     net_log_.EndEventWithNetErrorCode(NetLog::TYPE_TCP_CONNECT, rv);
866     return;
867   }
868 
869   net_log_.EndEvent(
870       NetLog::TYPE_TCP_CONNECT,
871       CreateNetLogSourceAddressCallback(
872           reinterpret_cast<const struct sockaddr*>(&source_address),
873           sizeof(source_address)));
874 }
875 
DoRead(IOBuffer * buf,int buf_len,const CompletionCallback & callback)876 int TCPSocketWin::DoRead(IOBuffer* buf, int buf_len,
877                          const CompletionCallback& callback) {
878   if (!core_->non_blocking_reads_initialized_) {
879     WSAEventSelect(socket_, core_->read_overlapped_.hEvent,
880                    FD_READ | FD_CLOSE);
881     core_->non_blocking_reads_initialized_ = true;
882   }
883   int rv = recv(socket_, buf->data(), buf_len, 0);
884   if (rv == SOCKET_ERROR) {
885     int os_error = WSAGetLastError();
886     if (os_error != WSAEWOULDBLOCK) {
887       int net_error = MapSystemError(os_error);
888       net_log_.AddEvent(
889           NetLog::TYPE_SOCKET_READ_ERROR,
890           CreateNetLogSocketErrorCallback(net_error, os_error));
891       return net_error;
892     }
893   } else {
894     base::StatsCounter read_bytes("tcp.read_bytes");
895     if (rv > 0)
896       read_bytes.Add(rv);
897     net_log_.AddByteTransferEvent(NetLog::TYPE_SOCKET_BYTES_RECEIVED, rv,
898                                   buf->data());
899     return rv;
900   }
901 
902   waiting_read_ = true;
903   read_callback_ = callback;
904   core_->read_iobuffer_ = buf;
905   core_->read_buffer_length_ = buf_len;
906   core_->WatchForRead();
907   return ERR_IO_PENDING;
908 }
909 
DidCompleteConnect()910 void TCPSocketWin::DidCompleteConnect() {
911   DCHECK(waiting_connect_);
912   DCHECK(!read_callback_.is_null());
913   int result;
914 
915   WSANETWORKEVENTS events;
916   int rv = WSAEnumNetworkEvents(socket_, core_->read_overlapped_.hEvent,
917                                 &events);
918   int os_error = 0;
919   if (rv == SOCKET_ERROR) {
920     NOTREACHED();
921     os_error = WSAGetLastError();
922     result = MapSystemError(os_error);
923   } else if (events.lNetworkEvents & FD_CONNECT) {
924     os_error = events.iErrorCode[FD_CONNECT_BIT];
925     result = MapConnectError(os_error);
926   } else {
927     NOTREACHED();
928     result = ERR_UNEXPECTED;
929   }
930 
931   connect_os_error_ = os_error;
932   DoConnectComplete(result);
933   waiting_connect_ = false;
934 
935   DCHECK_NE(result, ERR_IO_PENDING);
936   base::ResetAndReturn(&read_callback_).Run(result);
937 }
938 
DidCompleteWrite()939 void TCPSocketWin::DidCompleteWrite() {
940   DCHECK(waiting_write_);
941   DCHECK(!write_callback_.is_null());
942 
943   DWORD num_bytes, flags;
944   BOOL ok = WSAGetOverlappedResult(socket_, &core_->write_overlapped_,
945                                    &num_bytes, FALSE, &flags);
946   WSAResetEvent(core_->write_overlapped_.hEvent);
947   waiting_write_ = false;
948   int rv;
949   if (!ok) {
950     int os_error = WSAGetLastError();
951     rv = MapSystemError(os_error);
952     net_log_.AddEvent(NetLog::TYPE_SOCKET_WRITE_ERROR,
953                       CreateNetLogSocketErrorCallback(rv, os_error));
954   } else {
955     rv = static_cast<int>(num_bytes);
956     if (rv > core_->write_buffer_length_ || rv < 0) {
957       // It seems that some winsock interceptors report that more was written
958       // than was available. Treat this as an error.  http://crbug.com/27870
959       LOG(ERROR) << "Detected broken LSP: Asked to write "
960                  << core_->write_buffer_length_ << " bytes, but " << rv
961                  << " bytes reported.";
962       rv = ERR_WINSOCK_UNEXPECTED_WRITTEN_BYTES;
963     } else {
964       base::StatsCounter write_bytes("tcp.write_bytes");
965       write_bytes.Add(num_bytes);
966       net_log_.AddByteTransferEvent(NetLog::TYPE_SOCKET_BYTES_SENT, num_bytes,
967                                     core_->write_iobuffer_->data());
968     }
969   }
970 
971   core_->write_iobuffer_ = NULL;
972 
973   DCHECK_NE(rv, ERR_IO_PENDING);
974   base::ResetAndReturn(&write_callback_).Run(rv);
975 }
976 
DidSignalRead()977 void TCPSocketWin::DidSignalRead() {
978   DCHECK(waiting_read_);
979   DCHECK(!read_callback_.is_null());
980 
981   int os_error = 0;
982   WSANETWORKEVENTS network_events;
983   int rv = WSAEnumNetworkEvents(socket_, core_->read_overlapped_.hEvent,
984                                 &network_events);
985   if (rv == SOCKET_ERROR) {
986     os_error = WSAGetLastError();
987     rv = MapSystemError(os_error);
988   } else if (network_events.lNetworkEvents) {
989     DCHECK_EQ(network_events.lNetworkEvents & ~(FD_READ | FD_CLOSE), 0);
990     // If network_events.lNetworkEvents is FD_CLOSE and
991     // network_events.iErrorCode[FD_CLOSE_BIT] is 0, it is a graceful
992     // connection closure. It is tempting to directly set rv to 0 in
993     // this case, but the MSDN pages for WSAEventSelect and
994     // WSAAsyncSelect recommend we still call DoRead():
995     //   FD_CLOSE should only be posted after all data is read from a
996     //   socket, but an application should check for remaining data upon
997     //   receipt of FD_CLOSE to avoid any possibility of losing data.
998     //
999     // If network_events.iErrorCode[FD_READ_BIT] or
1000     // network_events.iErrorCode[FD_CLOSE_BIT] is nonzero, still call
1001     // DoRead() because recv() reports a more accurate error code
1002     // (WSAECONNRESET vs. WSAECONNABORTED) when the connection was
1003     // reset.
1004     rv = DoRead(core_->read_iobuffer_, core_->read_buffer_length_,
1005                 read_callback_);
1006     if (rv == ERR_IO_PENDING)
1007       return;
1008   } else {
1009     // This may happen because Read() may succeed synchronously and
1010     // consume all the received data without resetting the event object.
1011     core_->WatchForRead();
1012     return;
1013   }
1014 
1015   waiting_read_ = false;
1016   core_->read_iobuffer_ = NULL;
1017   core_->read_buffer_length_ = 0;
1018 
1019   DCHECK_NE(rv, ERR_IO_PENDING);
1020   base::ResetAndReturn(&read_callback_).Run(rv);
1021 }
1022 
1023 }  // namespace net
1024 
1025