1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RECENT_REQUEST_IDS_H_ 17 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RECENT_REQUEST_IDS_H_ 18 19 #include <string> 20 #include <unordered_set> 21 #include <vector> 22 23 #include "tensorflow/core/distributed_runtime/message_wrappers.h" 24 #include "tensorflow/core/lib/core/status.h" 25 #include "tensorflow/core/platform/mutex.h" 26 #include "tensorflow/core/platform/protobuf.h" 27 #include "tensorflow/core/platform/thread_annotations.h" 28 #include "tensorflow/core/platform/types.h" 29 #include "tensorflow/core/protobuf/worker.pb.h" 30 31 namespace tensorflow { 32 33 // RecentRequestIds tracks recent 64-bit request_ids. When maximum capacity is 34 // reached, the oldest request_id is evicted. Thread safe. 35 // 36 // Some RPCs like RecvTensor are unsafe to retry. For example, RecvTensor pairs 37 // one sender and one receiver, and the receiver waits for the sender's tensor. 38 // Retried RecvTensor requests are problematic, because the original RecvTensor 39 // request may have consumed the sender's tensor, so a retried request might 40 // block forever. RecentRequestIds identifies retried requests, so we can fail 41 // them instead of blocking forever. 42 // 43 // Internally, recent request_ids are stored in two data structures: a set and a 44 // circular buffer. The set is used for efficient lookups, and the circular 45 // buffer tracks the oldest request_id. When the buffer is full, the new 46 // request_id replaces the oldest request_id in the circular buffer, and the 47 // oldest request_id is removed from the set. 48 class RecentRequestIds { 49 public: 50 // num_tracked_request_ids should be much larger than the number of RPCs that 51 // can be received in a small time window. For example, we observed a peak RPC 52 // rate of ~700 RecvTensor RPC/s when training inception v3 on TPUs, so we 53 // currently set num_tracked_request_ids to 100,000 for RecvTensor. 54 RecentRequestIds(int num_tracked_request_ids); 55 56 // Returns OK iff request_id has not been seen in the last 57 // num_tracked_request_ids insertions. For backwards compatibility, this 58 // always returns OK for request_id 0. The method_name and the request's 59 // ShortDebugString are added to returned errors. 60 Status TrackUnique(int64 request_id, const string& method_name, 61 const protobuf::Message& request); 62 // Overloaded versions of the above function for wrapped protos. 63 Status TrackUnique(int64 request_id, const string& method_name, 64 const RunStepRequestWrapper* wrapper); 65 66 private: 67 bool Insert(int64 request_id); 68 69 mutex mu_; 70 // next_index_ indexes into circular_buffer_, and points to the next storage 71 // space to use. When the buffer is full, next_index_ points at the oldest 72 // request_id. 73 int next_index_ GUARDED_BY(mu_) = 0; 74 std::vector<int64> circular_buffer_ GUARDED_BY(mu_); 75 std::unordered_set<int64> set_ GUARDED_BY(mu_); 76 }; 77 78 } // namespace tensorflow 79 80 #endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RECENT_REQUEST_IDS_H_ 81