• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 gRPC authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include <gmock/gmock.h>
16 #include <gtest/gtest.h>
17 
18 #include <chrono>
19 #include <string>
20 #include <thread>
21 #include <vector>
22 
23 #include "absl/log/check.h"
24 #include "envoy/config/cluster/v3/cluster.pb.h"
25 #include "envoy/config/cluster/v3/outlier_detection.pb.h"
26 #include "envoy/extensions/filters/http/fault/v3/fault.pb.h"
27 #include "envoy/extensions/filters/http/router/v3/router.pb.h"
28 #include "src/core/client_channel/backup_poller.h"
29 #include "src/core/config/config_vars.h"
30 #include "test/core/test_util/resolve_localhost_ip46.h"
31 #include "test/cpp/end2end/xds/xds_end2end_test_lib.h"
32 
33 namespace grpc {
34 namespace testing {
35 namespace {
36 
37 class OutlierDetectionTest : public XdsEnd2endTest {
38  protected:
CreateMetadataValueThatHashesToBackend(int index)39   std::string CreateMetadataValueThatHashesToBackend(int index) {
40     return absl::StrCat(grpc_core::LocalIp(), ":", backends_[index]->port(),
41                         "_0");
42   }
43 };
44 
45 INSTANTIATE_TEST_SUITE_P(XdsTest, OutlierDetectionTest,
46                          ::testing::Values(XdsTestType()), &XdsTestType::Name);
// TODO(donnadionne): add a non-xds version of these tests in a new file,
// test/cpp/end2end/outlier_detection_end2end_test.cc
49 
50 // Tests SuccessRateEjectionAndUnejection:
51 // 1. Use ring hash policy that hashes using a header value to ensure rpcs
52 //    go to all backends.
53 // 2. Cause a single error on 1 backend and wait for 1 outlier detection
54 //    interval to pass.
55 // 3. We should skip exactly 1 backend due to ejection and all the loads
56 //    sticky to that backend should go to 1 other backend.
57 // 4. Let the ejection period pass and verify we can go back to both backends
58 //    after the uneject.
TEST_P(OutlierDetectionTest,SuccessRateEjectionAndUnejection)59 TEST_P(OutlierDetectionTest, SuccessRateEjectionAndUnejection) {
60   CreateAndStartBackends(2);
61   auto cluster = default_cluster_;
62   cluster.set_lb_policy(Cluster::RING_HASH);
63   // Setup outlier failure percentage parameters.
64   // Any failure will cause an potential ejection with the probability of 100%
65   // (to eliminate flakiness of the test).
66   auto* outlier_detection = cluster.mutable_outlier_detection();
67   SetProtoDuration(grpc_core::Duration::Seconds(1),
68                    outlier_detection->mutable_interval());
69   SetProtoDuration(grpc_core::Duration::Seconds(1),
70                    outlier_detection->mutable_base_ejection_time());
71   outlier_detection->mutable_success_rate_stdev_factor()->set_value(100);
72   outlier_detection->mutable_enforcing_success_rate()->set_value(100);
73   outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1);
74   outlier_detection->mutable_success_rate_request_volume()->set_value(1);
75   balancer_->ads_service()->SetCdsResource(cluster);
76   auto new_route_config = default_route_config_;
77   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
78   auto* hash_policy = route->mutable_route()->add_hash_policy();
79   hash_policy->mutable_header()->set_header_name("address_hash");
80   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
81                                    new_route_config);
82   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
83   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
84   // Note each type of RPC will contain a header value that will always be
85   // hashed to a specific backend as the header value matches the value used
86   // to create the entry in the ring.
87   std::vector<std::pair<std::string, std::string>> metadata = {
88       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
89   std::vector<std::pair<std::string, std::string>> metadata1 = {
90       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
91   const auto rpc_options = RpcOptions().set_metadata(metadata);
92   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
93   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
94                  WaitForBackendOptions(), rpc_options);
95   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
96                  WaitForBackendOptions(), rpc_options1);
97   // Trigger an error to backend 0.
98   // The success rate enforcement_percentage is 100%, so this will cause
99   // the backend to be ejected when the ejection timer fires.
100   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
101                       RpcOptions()
102                           .set_metadata(std::move(metadata))
103                           .set_server_expected_error(StatusCode::CANCELLED));
104   // Wait for traffic aimed at backend 0 to start going to backend 1.
105   // This tells us that backend 0 has been ejected.
106   // It should take no more than one ejection timer interval.
107   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
108                  WaitForBackendOptions().set_timeout_ms(
109                      3000 * grpc_test_slowdown_factor()),
110                  rpc_options);
111   // Now wait for traffic aimed at backend 0 to switch back to backend 0.
112   // This tells us that backend 0 has been unejected.
113   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
114                  WaitForBackendOptions().set_timeout_ms(
115                      3000 * grpc_test_slowdown_factor()),
116                  rpc_options);
117 }
118 
119 // We don't eject more than max_ejection_percent (default 10%) of the backends
120 // beyond the first one.
TEST_P(OutlierDetectionTest,SuccessRateMaxPercent)121 TEST_P(OutlierDetectionTest, SuccessRateMaxPercent) {
122   CreateAndStartBackends(4);
123   auto cluster = default_cluster_;
124   cluster.set_lb_policy(Cluster::RING_HASH);
125   // Setup outlier failure percentage parameters.
126   // Any failure will cause an potential ejection with the probability of 100%
127   // (to eliminate flakiness of the test).
128   auto* outlier_detection = cluster.mutable_outlier_detection();
129   SetProtoDuration(grpc_core::Duration::Seconds(1),
130                    outlier_detection->mutable_interval());
131   outlier_detection->mutable_success_rate_stdev_factor()->set_value(100);
132   outlier_detection->mutable_enforcing_success_rate()->set_value(100);
133   outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1);
134   outlier_detection->mutable_success_rate_request_volume()->set_value(1);
135   balancer_->ads_service()->SetCdsResource(cluster);
136   auto new_route_config = default_route_config_;
137   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
138   auto* hash_policy = route->mutable_route()->add_hash_policy();
139   hash_policy->mutable_header()->set_header_name("address_hash");
140   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
141                                    new_route_config);
142   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
143   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
144   // Note each type of RPC will contain a header value that will always be
145   // hashed to a specific backend as the header value matches the value used
146   // to create the entry in the ring.
147   std::vector<std::pair<std::string, std::string>> metadata = {
148       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
149   std::vector<std::pair<std::string, std::string>> metadata1 = {
150       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
151   std::vector<std::pair<std::string, std::string>> metadata2 = {
152       {"address_hash", CreateMetadataValueThatHashesToBackend(2)}};
153   std::vector<std::pair<std::string, std::string>> metadata3 = {
154       {"address_hash", CreateMetadataValueThatHashesToBackend(3)}};
155   const auto rpc_options = RpcOptions().set_metadata(metadata);
156   const auto rpc_options1 = RpcOptions().set_metadata(metadata1);
157   const auto rpc_options2 = RpcOptions().set_metadata(metadata2);
158   const auto rpc_options3 = RpcOptions().set_metadata(metadata3);
159   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
160                  WaitForBackendOptions(), rpc_options);
161   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
162                  WaitForBackendOptions(), rpc_options1);
163   WaitForBackend(DEBUG_LOCATION, 2, /*check_status=*/nullptr,
164                  WaitForBackendOptions(), rpc_options2);
165   WaitForBackend(DEBUG_LOCATION, 3, /*check_status=*/nullptr,
166                  WaitForBackendOptions(), rpc_options3);
167   // Cause 2 errors and wait until one ejection happens.
168   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
169                       RpcOptions()
170                           .set_metadata(std::move(metadata))
171                           .set_server_expected_error(StatusCode::CANCELLED));
172   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
173                       RpcOptions()
174                           .set_metadata(std::move(metadata1))
175                           .set_server_expected_error(StatusCode::CANCELLED));
176   absl::Time deadline =
177       absl::Now() + absl::Seconds(3) * grpc_test_slowdown_factor();
178   while (true) {
179     ResetBackendCounters();
180     CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options);
181     CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options1);
182     if (!SeenAllBackends(0, 2)) {
183       break;
184     }
185     EXPECT_LE(absl::Now(), deadline);
186     if (absl::Now() >= deadline) break;
187   }
188   // 1 backend should be ejected, traffic picked up by another backend.
189   // No other backend should be ejected.
190   ResetBackendCounters();
191   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
192   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
193   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options2);
194   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options3);
195   size_t empty_load_backend_count = 0;
196   size_t double_load_backend_count = 0;
197   size_t regular_load_backend_count = 0;
198   for (size_t i = 0; i < backends_.size(); ++i) {
199     if (backends_[i]->backend_service()->request_count() == 0) {
200       ++empty_load_backend_count;
201     } else if (backends_[i]->backend_service()->request_count() == 200) {
202       ++double_load_backend_count;
203     } else if (backends_[i]->backend_service()->request_count() == 100) {
204       ++regular_load_backend_count;
205     } else {
206       CHECK(1);
207     }
208   }
209   EXPECT_EQ(1, empty_load_backend_count);
210   EXPECT_EQ(1, double_load_backend_count);
211   EXPECT_EQ(2, regular_load_backend_count);
212 }
213 
214 // Success rate stdev_factor is honored, a higher value would ensure ejection
215 // does not occur.
TEST_P(OutlierDetectionTest,SuccessRateStdevFactor)216 TEST_P(OutlierDetectionTest, SuccessRateStdevFactor) {
217   CreateAndStartBackends(2);
218   auto cluster = default_cluster_;
219   cluster.set_lb_policy(Cluster::RING_HASH);
220   // Setup outlier failure percentage parameters.
221   // Any failure will cause an potential ejection with the probability of 100%
222   // (to eliminate flakiness of the test).
223   auto* outlier_detection = cluster.mutable_outlier_detection();
224   SetProtoDuration(grpc_core::Duration::Seconds(1),
225                    outlier_detection->mutable_interval());
226   SetProtoDuration(grpc_core::Duration::Seconds(1),
227                    outlier_detection->mutable_base_ejection_time());
228   // We know a stdev factor of 100 will ensure the ejection occurs, so setting
229   // it to something higher like 1000 to test that ejection will not occur.
230   // Note this parameter is the only difference between this test and
231   // SuccessRateEjectionAndUnejection (ejection portion, value set to 100) and
232   // this one value changes means the difference between not ejecting in this
233   // test and ejecting in the other test.
234   outlier_detection->mutable_success_rate_stdev_factor()->set_value(1000);
235   outlier_detection->mutable_enforcing_success_rate()->set_value(100);
236   outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1);
237   outlier_detection->mutable_success_rate_request_volume()->set_value(1);
238   balancer_->ads_service()->SetCdsResource(cluster);
239   auto new_route_config = default_route_config_;
240   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
241   auto* hash_policy = route->mutable_route()->add_hash_policy();
242   hash_policy->mutable_header()->set_header_name("address_hash");
243   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
244                                    new_route_config);
245   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
246   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
247   // Note each type of RPC will contain a header value that will always be
248   // hashed to a specific backend as the header value matches the value used
249   // to create the entry in the ring.
250   std::vector<std::pair<std::string, std::string>> metadata = {
251       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
252   std::vector<std::pair<std::string, std::string>> metadata1 = {
253       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
254   const auto rpc_options = RpcOptions().set_metadata(metadata);
255   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
256   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
257                  WaitForBackendOptions(), rpc_options);
258   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
259                  WaitForBackendOptions(), rpc_options1);
260   // Cause an error and wait for 1 outlier detection interval to pass
261   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
262                       RpcOptions()
263                           .set_metadata(std::move(metadata))
264                           .set_server_expected_error(StatusCode::CANCELLED));
265   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
266       3000 * grpc_test_slowdown_factor()));
267   ResetBackendCounters();
268   // 1 backend experienced failure, but since the stdev_factor is high, no
269   // backend will be noticed as an outlier so no ejection.
270   // Both backends are still getting the RPCs intended for them.
271   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
272   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
273   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
274   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
275 }
276 
// Success rate enforcement percentage is honored: setting it to 0 guarantees
// that the randomized number between 1 and 100 will always be greater, so
// nothing will be ejected.
TEST_P(OutlierDetectionTest,SuccessRateEnforcementPercentage)280 TEST_P(OutlierDetectionTest, SuccessRateEnforcementPercentage) {
281   CreateAndStartBackends(2);
282   auto cluster = default_cluster_;
283   cluster.set_lb_policy(Cluster::RING_HASH);
284   auto* outlier_detection = cluster.mutable_outlier_detection();
285   SetProtoDuration(grpc_core::Duration::Seconds(1),
286                    outlier_detection->mutable_interval());
287   SetProtoDuration(grpc_core::Duration::Seconds(1),
288                    outlier_detection->mutable_base_ejection_time());
289   outlier_detection->mutable_success_rate_stdev_factor()->set_value(100);
290   // Setting enforcing_success_rate to 0 to ensure we will never eject.
291   // Note this parameter is the only difference between this test and
292   // SuccessRateEjectionAndUnejection (ejection portion, value set to 100) and
293   // this one value changes means the difference between guaranteed not ejecting
294   // in this test and guaranteed ejecting in the other test.
295   outlier_detection->mutable_enforcing_success_rate()->set_value(0);
296   outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1);
297   outlier_detection->mutable_success_rate_request_volume()->set_value(1);
298   balancer_->ads_service()->SetCdsResource(cluster);
299   auto new_route_config = default_route_config_;
300   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
301   auto* hash_policy = route->mutable_route()->add_hash_policy();
302   hash_policy->mutable_header()->set_header_name("address_hash");
303   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
304                                    new_route_config);
305   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
306   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
307   // Note each type of RPC will contain a header value that will always be
308   // hashed to a specific backend as the header value matches the value used
309   // to create the entry in the ring.
310   std::vector<std::pair<std::string, std::string>> metadata = {
311       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
312   std::vector<std::pair<std::string, std::string>> metadata1 = {
313       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
314   const auto rpc_options = RpcOptions().set_metadata(metadata);
315   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
316   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
317                  WaitForBackendOptions(), rpc_options);
318   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
319                  WaitForBackendOptions(), rpc_options1);
320   // Cause an error and wait for 1 outlier detection interval to pass
321   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
322                       RpcOptions()
323                           .set_metadata(std::move(metadata))
324                           .set_server_expected_error(StatusCode::CANCELLED));
325   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
326       3000 * grpc_test_slowdown_factor()));
327   ResetBackendCounters();
328   // 1 backend experienced failure, but since the enforcement percentage is 0,
329   // no backend will be ejected. Both backends are still getting the RPCs
330   // intended for them.
331   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
332   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
333   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
334   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
335 }
336 
337 // Success rate does not eject if there are less than minimum_hosts backends
338 // Set success_rate_minimum_hosts to 3 when we only have 2 backends
TEST_P(OutlierDetectionTest,SuccessRateMinimumHosts)339 TEST_P(OutlierDetectionTest, SuccessRateMinimumHosts) {
340   CreateAndStartBackends(2);
341   auto cluster = default_cluster_;
342   cluster.set_lb_policy(Cluster::RING_HASH);
343   // Setup outlier failure percentage parameters.
344   // Any failure will cause an potential ejection with the probability of 100%
345   // (to eliminate flakiness of the test).
346   auto* outlier_detection = cluster.mutable_outlier_detection();
347   SetProtoDuration(grpc_core::Duration::Seconds(1),
348                    outlier_detection->mutable_interval());
349   outlier_detection->mutable_success_rate_stdev_factor()->set_value(100);
350   outlier_detection->mutable_enforcing_success_rate()->set_value(100);
351   // Set success_rate_minimum_hosts to 3 when we only have 2 backends
352   // Note this parameter is the only difference between this test and
353   // SuccessRateEjectionAndUnejection (ejection portion, value set to 1) and
354   // this one value changes means the difference between not ejecting in this
355   // test and ejecting in the other test.
356   outlier_detection->mutable_success_rate_minimum_hosts()->set_value(3);
357   outlier_detection->mutable_success_rate_request_volume()->set_value(1);
358   balancer_->ads_service()->SetCdsResource(cluster);
359   auto new_route_config = default_route_config_;
360   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
361   auto* hash_policy = route->mutable_route()->add_hash_policy();
362   hash_policy->mutable_header()->set_header_name("address_hash");
363   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
364                                    new_route_config);
365   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
366   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
367   // Note each type of RPC will contain a header value that will always be
368   // hashed to a specific backend as the header value matches the value used
369   // to create the entry in the ring.
370   std::vector<std::pair<std::string, std::string>> metadata = {
371       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
372   std::vector<std::pair<std::string, std::string>> metadata1 = {
373       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
374   const auto rpc_options = RpcOptions().set_metadata(metadata);
375   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
376   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
377                  WaitForBackendOptions(), rpc_options);
378   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
379                  WaitForBackendOptions(), rpc_options1);
380   // Cause an error and wait for 1 outlier detection interval to pass
381   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
382                       RpcOptions()
383                           .set_metadata(std::move(metadata))
384                           .set_server_expected_error(StatusCode::CANCELLED));
385   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
386       3000 * grpc_test_slowdown_factor()));
387   ResetBackendCounters();
388   // All traffic still reaching the original backends and no backends are
389   // ejected.
390   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
391   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
392   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
393   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
394 }
395 
// Success rate does not eject if there are fewer than request_volume
// requests.  Set success_rate_request_volume to 4 when we only send 3 RPCs in
// the interval.
TEST_P(OutlierDetectionTest,SuccessRateRequestVolume)399 TEST_P(OutlierDetectionTest, SuccessRateRequestVolume) {
400   CreateAndStartBackends(2);
401   auto cluster = default_cluster_;
402   cluster.set_lb_policy(Cluster::RING_HASH);
403   // Setup outlier failure percentage parameters.
404   // Any failure will cause an potential ejection with the probability of 100%
405   // (to eliminate flakiness of the test).
406   auto* outlier_detection = cluster.mutable_outlier_detection();
407   SetProtoDuration(grpc_core::Duration::Seconds(1),
408                    outlier_detection->mutable_interval());
409   outlier_detection->mutable_success_rate_stdev_factor()->set_value(100);
410   outlier_detection->mutable_enforcing_success_rate()->set_value(100);
411   outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1);
412   // Set success_rate_request_volume to 4 when we only send 3 RPC in the
413   // interval.
414   // Note this parameter is the only difference between this test and
415   // SuccessRateEjectionAndUnejection (ejection portion, value set to 1) and
416   // this one value changes means the difference between not ejecting in this
417   // test and ejecting in the other test.
418   outlier_detection->mutable_success_rate_request_volume()->set_value(4);
419   balancer_->ads_service()->SetCdsResource(cluster);
420   auto new_route_config = default_route_config_;
421   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
422   auto* hash_policy = route->mutable_route()->add_hash_policy();
423   hash_policy->mutable_header()->set_header_name("address_hash");
424   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
425                                    new_route_config);
426   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
427   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
428   // Note each type of RPC will contain a header value that will always be
429   // hashed to a specific backend as the header value matches the value used
430   // to create the entry in the ring.
431   std::vector<std::pair<std::string, std::string>> metadata = {
432       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
433   std::vector<std::pair<std::string, std::string>> metadata1 = {
434       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
435   const auto rpc_options = RpcOptions().set_metadata(metadata);
436   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
437   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
438                  WaitForBackendOptions(), rpc_options);
439   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
440                  WaitForBackendOptions(), rpc_options1);
441   // Cause an error and wait for 1 outlier detection interval to pass
442   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
443                       RpcOptions()
444                           .set_metadata(std::move(metadata))
445                           .set_server_expected_error(StatusCode::CANCELLED));
446   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
447       3000 * grpc_test_slowdown_factor()));
448   ResetBackendCounters();
449   // All traffic still reaching the original backends and no backends are
450   // ejected.
451   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
452   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
453   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
454   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
455 }
456 
457 // Tests FailurePercentageEjectionAndUnejection:
458 // 1. Use ring hash policy that hashes using a header value to ensure RPCs
459 //    go to all backends.
460 // 2. Cause a single error on 1 backend and wait for 1 outlier detection
461 //    interval to pass.
462 // 3. We should skip exactly 1 backend due to ejection and all the loads
463 //    sticky to that backend should go to 1 other backend.
// 4. Let the ejection period pass and verify that traffic will again go to
//    both backends as we have unejected the backend.
TEST_P(OutlierDetectionTest,FailurePercentageEjectionAndUnejection)466 TEST_P(OutlierDetectionTest, FailurePercentageEjectionAndUnejection) {
467   CreateAndStartBackends(2);
468   auto cluster = default_cluster_;
469   cluster.set_lb_policy(Cluster::RING_HASH);
470   // Setup outlier failure percentage parameters.
471   // Any failure will cause an potential ejection with the probability of 100%
472   // (to eliminate flakiness of the test).
473   auto* outlier_detection = cluster.mutable_outlier_detection();
474   SetProtoDuration(grpc_core::Duration::Seconds(1),
475                    outlier_detection->mutable_interval());
476   SetProtoDuration(grpc_core::Duration::Seconds(3),
477                    outlier_detection->mutable_base_ejection_time());
478   outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
479   outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
480   outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
481   outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
482   balancer_->ads_service()->SetCdsResource(cluster);
483   auto new_route_config = default_route_config_;
484   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
485   auto* hash_policy = route->mutable_route()->add_hash_policy();
486   hash_policy->mutable_header()->set_header_name("address_hash");
487   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
488                                    new_route_config);
489   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
490   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
491   // Note each type of RPC will contain a header value that will always be
492   // hashed to a specific backend as the header value matches the value used
493   // to create the entry in the ring.
494   std::vector<std::pair<std::string, std::string>> metadata = {
495       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
496   std::vector<std::pair<std::string, std::string>> metadata1 = {
497       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
498   const auto rpc_options = RpcOptions().set_metadata(metadata);
499   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
500   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
501                  WaitForBackendOptions(), rpc_options);
502   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
503                  WaitForBackendOptions(), rpc_options1);
504   // Cause an error and wait for traffic aimed at backend 0 to start going to
505   // backend 1.
506   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
507                       RpcOptions()
508                           .set_metadata(std::move(metadata))
509                           .set_server_expected_error(StatusCode::CANCELLED));
510   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
511                  WaitForBackendOptions().set_timeout_ms(
512                      3000 * grpc_test_slowdown_factor()),
513                  rpc_options);
514   // 1 backend is ejected all traffic going to the ejected backend should now
515   // all be going to the other backend.
516   // failure percentage enforcement_percentage of 100% is honored as this test
517   // will consistently reject 1 backend.
518   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
519   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
520   // Now wait for traffic aimed at backend 0 to switch back to backend 0.
521   // This tells us that backend 0 has been unejected.
522   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
523                  WaitForBackendOptions().set_timeout_ms(
524                      30000 * grpc_test_slowdown_factor()),
525                  rpc_options);
526   // Verify that rpcs go to their expectedly hashed backends.
527   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
528   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
529   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
530   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
531 }
532 
533 // We don't eject more than max_ejection_percent (default 10%) of the backends
534 // beyond the first one.
TEST_P(OutlierDetectionTest,FailurePercentageMaxPercentage)535 TEST_P(OutlierDetectionTest, FailurePercentageMaxPercentage) {
536   CreateAndStartBackends(4);
537   auto cluster = default_cluster_;
538   cluster.set_lb_policy(Cluster::RING_HASH);
539   // Setup outlier failure percentage parameters.
540   // Any failure will cause an potential ejection with the probability of 100%
541   // (to eliminate flakiness of the test).
542   auto* outlier_detection = cluster.mutable_outlier_detection();
543   SetProtoDuration(grpc_core::Duration::Seconds(1),
544                    outlier_detection->mutable_interval());
545   outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
546   outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
547   outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
548   outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
549   balancer_->ads_service()->SetCdsResource(cluster);
550   auto new_route_config = default_route_config_;
551   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
552   auto* hash_policy = route->mutable_route()->add_hash_policy();
553   hash_policy->mutable_header()->set_header_name("address_hash");
554   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
555                                    new_route_config);
556   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
557   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
558   // Note each type of RPC will contain a header value that will always be
559   // hashed to a specific backend as the header value matches the value used
560   // to create the entry in the ring.
561   std::vector<std::pair<std::string, std::string>> metadata = {
562       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
563   std::vector<std::pair<std::string, std::string>> metadata1 = {
564       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
565   std::vector<std::pair<std::string, std::string>> metadata2 = {
566       {"address_hash", CreateMetadataValueThatHashesToBackend(2)}};
567   std::vector<std::pair<std::string, std::string>> metadata3 = {
568       {"address_hash", CreateMetadataValueThatHashesToBackend(3)}};
569   const auto rpc_options = RpcOptions().set_metadata(metadata);
570   const auto rpc_options1 = RpcOptions().set_metadata(metadata1);
571   const auto rpc_options2 = RpcOptions().set_metadata(metadata2);
572   const auto rpc_options3 = RpcOptions().set_metadata(metadata3);
573   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
574                  WaitForBackendOptions(), rpc_options);
575   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
576                  WaitForBackendOptions(), rpc_options1);
577   WaitForBackend(DEBUG_LOCATION, 2, /*check_status=*/nullptr,
578                  WaitForBackendOptions(), rpc_options2);
579   WaitForBackend(DEBUG_LOCATION, 3, /*check_status=*/nullptr,
580                  WaitForBackendOptions(), rpc_options3);
581   // Cause 2 errors and wait until one ejection happens.
582   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
583                       RpcOptions()
584                           .set_metadata(std::move(metadata))
585                           .set_server_expected_error(StatusCode::CANCELLED));
586   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
587                       RpcOptions()
588                           .set_metadata(std::move(metadata1))
589                           .set_server_expected_error(StatusCode::CANCELLED));
590   absl::Time deadline =
591       absl::Now() + absl::Seconds(3) * grpc_test_slowdown_factor();
592   while (true) {
593     ResetBackendCounters();
594     CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options);
595     CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options1);
596     if (!SeenAllBackends(0, 2)) {
597       break;
598     }
599     EXPECT_LE(absl::Now(), deadline);
600     if (absl::Now() >= deadline) break;
601   }
602   // 1 backend should be ejected, traffic picked up by another backend.
603   // No other backend should be ejected.
604   ResetBackendCounters();
605   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
606   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
607   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options2);
608   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options3);
609   size_t empty_load_backend_count = 0;
610   size_t double_load_backend_count = 0;
611   size_t regular_load_backend_count = 0;
612   for (size_t i = 0; i < backends_.size(); ++i) {
613     if (backends_[i]->backend_service()->request_count() == 0) {
614       ++empty_load_backend_count;
615     } else if (backends_[i]->backend_service()->request_count() == 200) {
616       ++double_load_backend_count;
617     } else if (backends_[i]->backend_service()->request_count() == 100) {
618       ++regular_load_backend_count;
619     } else {
620       CHECK(1);
621     }
622   }
623   EXPECT_EQ(1, empty_load_backend_count);
624   EXPECT_EQ(1, double_load_backend_count);
625   EXPECT_EQ(2, regular_load_backend_count);
626 }
627 
628 // Failure percentage threshold is honored, a higher value would ensure ejection
629 // does not occur
TEST_P(OutlierDetectionTest,FailurePercentageThreshold)630 TEST_P(OutlierDetectionTest, FailurePercentageThreshold) {
631   CreateAndStartBackends(2);
632   auto cluster = default_cluster_;
633   cluster.set_lb_policy(Cluster::RING_HASH);
634   auto* outlier_detection = cluster.mutable_outlier_detection();
635   SetProtoDuration(grpc_core::Duration::Seconds(1),
636                    outlier_detection->mutable_interval());
637   SetProtoDuration(grpc_core::Duration::Seconds(1),
638                    outlier_detection->mutable_base_ejection_time());
639   // Setup outlier failure percentage parameter to 50
640   // Note this parameter is the only difference between this test and
641   // FailurePercentageEjectionAndUnejection (ejection portion, value set to 0)
642   // and this one value changes means the difference between not ejecting in
643   // this test and ejecting in the other test.
644   outlier_detection->mutable_failure_percentage_threshold()->set_value(50);
645   outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
646   outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
647   outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
648   balancer_->ads_service()->SetCdsResource(cluster);
649   auto new_route_config = default_route_config_;
650   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
651   auto* hash_policy = route->mutable_route()->add_hash_policy();
652   hash_policy->mutable_header()->set_header_name("address_hash");
653   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
654                                    new_route_config);
655   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
656   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
657   // Note each type of RPC will contain a header value that will always be
658   // hashed to a specific backend as the header value matches the value used
659   // to create the entry in the ring.
660   std::vector<std::pair<std::string, std::string>> metadata = {
661       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
662   std::vector<std::pair<std::string, std::string>> metadata1 = {
663       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
664   const auto rpc_options = RpcOptions().set_metadata(metadata);
665   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
666   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
667                  WaitForBackendOptions(), rpc_options);
668   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
669                  WaitForBackendOptions(), rpc_options1);
670   // Cause an error and wait for 1 outlier detection interval to pass to cause
671   // the backend to be ejected.
672   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
673                       RpcOptions()
674                           .set_metadata(std::move(metadata))
675                           .set_server_expected_error(StatusCode::CANCELLED));
676   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
677       3000 * grpc_test_slowdown_factor()));
678   ResetBackendCounters();
679   // 1 backend experienced 1 failure, but since the threshold is 50 % no
680   // backend will be noticed as an outlier so no ejection.
681   // Both backends are still getting the RPCs intended for them.
682   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
683   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
684   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
685   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
686 }
687 
688 // Failure percentage enforcement percentage is honored, setting it to 0 so
689 // guarantee the randomized number between 1 to 100 will always be great, so
690 // nothing will be ejected.
TEST_P(OutlierDetectionTest,FailurePercentageEnforcementPercentage)691 TEST_P(OutlierDetectionTest, FailurePercentageEnforcementPercentage) {
692   CreateAndStartBackends(2);
693   auto cluster = default_cluster_;
694   cluster.set_lb_policy(Cluster::RING_HASH);
695   auto* outlier_detection = cluster.mutable_outlier_detection();
696   SetProtoDuration(grpc_core::Duration::Seconds(1),
697                    outlier_detection->mutable_interval());
698   SetProtoDuration(grpc_core::Duration::Seconds(1),
699                    outlier_detection->mutable_base_ejection_time());
700   outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
701   // Setting enforcing_success_rate to 0 to ensure we will never eject.
702   // Note this parameter is the only difference between this test and
703   // FailurePercentageEjectionAndUnejection (ejection portion, value set to 100)
704   // and this one value changes means the difference between guaranteed not
705   // ejecting in this test and guaranteed ejecting in the other test.
706   outlier_detection->mutable_enforcing_failure_percentage()->set_value(0);
707   outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
708   outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
709   balancer_->ads_service()->SetCdsResource(cluster);
710   auto new_route_config = default_route_config_;
711   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
712   auto* hash_policy = route->mutable_route()->add_hash_policy();
713   hash_policy->mutable_header()->set_header_name("address_hash");
714   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
715                                    new_route_config);
716   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
717   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
718   // Note each type of RPC will contain a header value that will always be
719   // hashed to a specific backend as the header value matches the value used
720   // to create the entry in the ring.
721   std::vector<std::pair<std::string, std::string>> metadata = {
722       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
723   std::vector<std::pair<std::string, std::string>> metadata1 = {
724       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
725   const auto rpc_options = RpcOptions().set_metadata(metadata);
726   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
727   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
728                  WaitForBackendOptions(), rpc_options);
729   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
730                  WaitForBackendOptions(), rpc_options1);
731   // Cause an error and wait for 1 outlier detection interval to pass to cause
732   // the backend to be ejected.
733   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
734                       RpcOptions()
735                           .set_metadata(std::move(metadata))
736                           .set_server_expected_error(StatusCode::CANCELLED));
737   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
738       3000 * grpc_test_slowdown_factor()));
739   ResetBackendCounters();
740   // 1 backend experienced failure, but since the enforcement percentage is 0,
741   // no backend will be ejected. Both backends are still getting the RPCs
742   // intended for them.
743   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
744   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
745   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
746   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
747 }
748 
749 // Failure percentage does not eject if there are less than minimum_hosts
750 // backends Set success_rate_minimum_hosts to 3 when we only have 2 backends
TEST_P(OutlierDetectionTest,FailurePercentageMinimumHosts)751 TEST_P(OutlierDetectionTest, FailurePercentageMinimumHosts) {
752   CreateAndStartBackends(2);
753   auto cluster = default_cluster_;
754   cluster.set_lb_policy(Cluster::RING_HASH);
755   // Setup outlier failure percentage parameters.
756   // Any failure will cause an potential ejection with the probability of 100%
757   // (to eliminate flakiness of the test).
758   auto* outlier_detection = cluster.mutable_outlier_detection();
759   SetProtoDuration(grpc_core::Duration::Seconds(1),
760                    outlier_detection->mutable_interval());
761   outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
762   outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
763   // Set failure_percentage_minimum_hosts to 3 when we only have 2 backends
764   // Note this parameter is the only difference between this test and
765   // FailurePercentageEjectionAndUnejection (ejection portion, value set to 1)
766   // and this one value changes means the difference between not ejecting in
767   // this test and ejecting in the other test.
768   cluster.mutable_outlier_detection()
769       ->mutable_failure_percentage_minimum_hosts()
770       ->set_value(3);
771   cluster.mutable_outlier_detection()
772       ->mutable_failure_percentage_request_volume()
773       ->set_value(1);
774   balancer_->ads_service()->SetCdsResource(cluster);
775   auto new_route_config = default_route_config_;
776   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
777   auto* hash_policy = route->mutable_route()->add_hash_policy();
778   hash_policy->mutable_header()->set_header_name("address_hash");
779   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
780                                    new_route_config);
781   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
782   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
783   // Note each type of RPC will contain a header value that will always be
784   // hashed to a specific backend as the header value matches the value used
785   // to create the entry in the ring.
786   std::vector<std::pair<std::string, std::string>> metadata = {
787       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
788   std::vector<std::pair<std::string, std::string>> metadata1 = {
789       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
790   const auto rpc_options = RpcOptions().set_metadata(metadata);
791   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
792   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
793                  WaitForBackendOptions(), rpc_options);
794   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
795                  WaitForBackendOptions(), rpc_options1);
796   // Cause an error and wait for 1 outlier detection interval to pass to cause
797   // the backend to be ejected.
798   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
799                       RpcOptions()
800                           .set_metadata(std::move(metadata))
801                           .set_server_expected_error(StatusCode::CANCELLED));
802   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
803       3000 * grpc_test_slowdown_factor()));
804   ResetBackendCounters();
805   // All traffic still reaching the original backends and no backends are
806   // ejected.
807   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
808   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
809   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
810   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
811 }
812 
813 // Failure percentage does not eject if there are less than request_volume
814 // requests
815 // Set success_rate_request_volume to 4 when we only send 3 RPC in the
816 // interval.
TEST_P(OutlierDetectionTest,FailurePercentageRequestVolume)817 TEST_P(OutlierDetectionTest, FailurePercentageRequestVolume) {
818   CreateAndStartBackends(2);
819   auto cluster = default_cluster_;
820   cluster.set_lb_policy(Cluster::RING_HASH);
821   // Setup outlier failure percentage parameters.
822   // Any failure will cause an potential ejection with the probability of 100%
823   // (to eliminate flakiness of the test).
824   auto* outlier_detection = cluster.mutable_outlier_detection();
825   SetProtoDuration(grpc_core::Duration::Seconds(1),
826                    outlier_detection->mutable_interval());
827   outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
828   outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
829   outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
830   // Set failure_percentage_request_volume to 4 when we only send 3 RPC in the
831   // interval.
832   // // Note this parameter is the only difference between this test and
833   // FailurePercentageEjectionAndUnejection (ejection portion, value set to 1)
834   // and this one value changes means the difference between not ejecting in
835   // this test and ejecting in the other test.
836   outlier_detection->mutable_failure_percentage_request_volume()->set_value(4);
837   balancer_->ads_service()->SetCdsResource(cluster);
838   auto new_route_config = default_route_config_;
839   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
840   auto* hash_policy = route->mutable_route()->add_hash_policy();
841   hash_policy->mutable_header()->set_header_name("address_hash");
842   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
843                                    new_route_config);
844   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
845   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
846   // Note each type of RPC will contain a header value that will always be
847   // hashed to a specific backend as the header value matches the value used
848   // to create the entry in the ring.
849   std::vector<std::pair<std::string, std::string>> metadata = {
850       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
851   std::vector<std::pair<std::string, std::string>> metadata1 = {
852       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
853   const auto rpc_options = RpcOptions().set_metadata(metadata);
854   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
855   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
856                  WaitForBackendOptions(), rpc_options);
857   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
858                  WaitForBackendOptions(), rpc_options1);
859   // Cause an error and wait for 1 outlier detection interval to pass to cause
860   // the backend to be ejected.
861   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
862                       RpcOptions()
863                           .set_metadata(std::move(metadata))
864                           .set_server_expected_error(StatusCode::CANCELLED));
865   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
866       3000 * grpc_test_slowdown_factor()));
867   ResetBackendCounters();
868   // All traffic still reaching the original backends and no backends are
869   // ejected.
870   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
871   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
872   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
873   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
874 }
875 
876 // Tests SuccessRate and FailurePercentage both configured
877 // Configure max_ejection_percent to 50% which means max 2/4 backends can be
878 // ejected.
879 // Configure success rate to eject 1 and failure percentage to eject 2.
880 // Verify that maximum 2 backends are ejected, not 3!
TEST_P(OutlierDetectionTest,SuccessRateAndFailurePercentage)881 TEST_P(OutlierDetectionTest, SuccessRateAndFailurePercentage) {
882   CreateAndStartBackends(4);
883   auto cluster = default_cluster_;
884   cluster.set_lb_policy(Cluster::RING_HASH);
885   // Setup outlier failure percentage parameters.
886   // Any failure will cause an potential ejection with the probability of 100%
887   // (to eliminate flakiness of the test).
888   auto* outlier_detection = cluster.mutable_outlier_detection();
889   SetProtoDuration(grpc_core::Duration::Seconds(1),
890                    outlier_detection->mutable_interval());
891   outlier_detection->mutable_max_ejection_percent()->set_value(50);
892   // This stdev of 500 will ensure the number of ok RPC and error RPC we send
893   // will make 1 outlier out of the 4 backends.
894   outlier_detection->mutable_success_rate_stdev_factor()->set_value(500);
895   outlier_detection->mutable_enforcing_success_rate()->set_value(100);
896   outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1);
897   outlier_detection->mutable_success_rate_request_volume()->set_value(1);
898   outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
899   outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
900   outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
901   outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
902   balancer_->ads_service()->SetCdsResource(cluster);
903   auto new_route_config = default_route_config_;
904   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
905   auto* hash_policy = route->mutable_route()->add_hash_policy();
906   hash_policy->mutable_header()->set_header_name("address_hash");
907   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
908                                    new_route_config);
909   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
910   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
911   // Note each type of RPC will contain a header value that will always be
912   // hashed to a specific backend as the header value matches the value used
913   // to create the entry in the ring.
914   std::vector<std::pair<std::string, std::string>> metadata = {
915       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
916   std::vector<std::pair<std::string, std::string>> metadata1 = {
917       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
918   std::vector<std::pair<std::string, std::string>> metadata2 = {
919       {"address_hash", CreateMetadataValueThatHashesToBackend(2)}};
920   std::vector<std::pair<std::string, std::string>> metadata3 = {
921       {"address_hash", CreateMetadataValueThatHashesToBackend(3)}};
922   const auto rpc_options = RpcOptions().set_metadata(metadata);
923   const auto rpc_options1 = RpcOptions().set_metadata(metadata1);
924   const auto rpc_options2 = RpcOptions().set_metadata(metadata2);
925   const auto rpc_options3 = RpcOptions().set_metadata(metadata3);
926   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
927                  WaitForBackendOptions(), rpc_options);
928   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
929                  WaitForBackendOptions(), rpc_options1);
930   WaitForBackend(DEBUG_LOCATION, 2, /*check_status=*/nullptr,
931                  WaitForBackendOptions(), rpc_options2);
932   WaitForBackend(DEBUG_LOCATION, 3, /*check_status=*/nullptr,
933                  WaitForBackendOptions(), rpc_options3);
934   // Cause 2 errors on 1 backend and 1 error on 2 backends and wait for 2
935   // backends to be ejected. The 2 errors to the 1 backend will make exactly 1
936   // outlier from the success rate algorithm; all 4 errors will make 3 outliers
937   // from the failure percentage algorithm because the threshold is set to 0. I
938   // have verified through debug logs we eject 1 backend because of success
939   // rate, 1 backend because of failure percentage; but as we attempt to eject
940   // another backend because of failure percentage we will stop as we have
941   // reached our 50% limit.
942   CheckRpcSendFailure(
943       DEBUG_LOCATION, StatusCode::CANCELLED, "",
944       RpcOptions().set_metadata(metadata).set_server_expected_error(
945           StatusCode::CANCELLED));
946   CheckRpcSendFailure(
947       DEBUG_LOCATION, StatusCode::CANCELLED, "",
948       RpcOptions().set_metadata(metadata).set_server_expected_error(
949           StatusCode::CANCELLED));
950   CheckRpcSendFailure(
951       DEBUG_LOCATION, StatusCode::CANCELLED, "",
952       RpcOptions().set_metadata(metadata1).set_server_expected_error(
953           StatusCode::CANCELLED));
954   CheckRpcSendFailure(
955       DEBUG_LOCATION, StatusCode::CANCELLED, "",
956       RpcOptions().set_metadata(metadata2).set_server_expected_error(
957           StatusCode::CANCELLED));
958   absl::Time deadline =
959       absl::Now() + absl::Seconds(3) * grpc_test_slowdown_factor();
960   std::vector<size_t> idx = {0, 1, 2, 3};
961   while (true) {
962     ResetBackendCounters();
963     CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options);
964     CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options1);
965     CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options2);
966     CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options3);
967     if (std::count_if(idx.begin(), idx.end(),
968                       [this](size_t i) { return SeenBackend(i); }) == 2) {
969       break;
970     }
971     EXPECT_LE(absl::Now(), deadline);
972     if (absl::Now() >= deadline) break;
973   }
974   ResetBackendCounters();
975   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
976   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
977   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options2);
978   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options3);
979   size_t empty_load_backend_count = 0;
980   size_t double_load_backend_count = 0;
981   for (size_t i = 0; i < backends_.size(); ++i) {
982     if (backends_[i]->backend_service()->request_count() == 0) {
983       ++empty_load_backend_count;
984     } else if (backends_[i]->backend_service()->request_count() >= 100) {
985       // The extra load could go to 2 remaining backends or just 1 of them.
986       ++double_load_backend_count;
987     } else if (backends_[i]->backend_service()->request_count() > 300) {
988       CHECK(1);
989     }
990   }
991   EXPECT_EQ(2, empty_load_backend_count);
992   EXPECT_EQ(2, double_load_backend_count);
993 }
994 
995 // Tests SuccessRate and FailurePercentage both unconfigured;
996 // This is the case where according to the gRFC we need to instruct the picker
997 // not to do counting or even start the timer. The result of not counting is
998 // that there will be no ejection taking place since we can't do any
999 // calculations.
TEST_P(OutlierDetectionTest,SuccessRateAndFailurePercentageBothDisabled)1000 TEST_P(OutlierDetectionTest, SuccessRateAndFailurePercentageBothDisabled) {
1001   CreateAndStartBackends(2);
1002   auto cluster = default_cluster_;
1003   cluster.set_lb_policy(Cluster::RING_HASH);
1004   auto* outlier_detection = cluster.mutable_outlier_detection();
1005   SetProtoDuration(grpc_core::Duration::Seconds(1),
1006                    outlier_detection->mutable_interval());
1007   SetProtoDuration(grpc_core::Duration::Seconds(1),
1008                    outlier_detection->mutable_base_ejection_time());
1009   balancer_->ads_service()->SetCdsResource(cluster);
1010   auto new_route_config = default_route_config_;
1011   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
1012   auto* hash_policy = route->mutable_route()->add_hash_policy();
1013   hash_policy->mutable_header()->set_header_name("address_hash");
1014   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
1015                                    new_route_config);
1016   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
1017   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
1018   // Note each type of RPC will contain a header value that will always be
1019   // hashed to a specific backend as the header value matches the value used
1020   // to create the entry in the ring.
1021   std::vector<std::pair<std::string, std::string>> metadata = {
1022       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
1023   std::vector<std::pair<std::string, std::string>> metadata1 = {
1024       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
1025   const auto rpc_options = RpcOptions().set_metadata(metadata);
1026   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
1027   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
1028                  WaitForBackendOptions(), rpc_options);
1029   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
1030                  WaitForBackendOptions(), rpc_options1);
1031   // Cause an error and wait for 1 outlier detection interval to pass
1032   CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
1033                       RpcOptions()
1034                           .set_metadata(std::move(metadata))
1035                           .set_server_expected_error(StatusCode::CANCELLED));
1036   gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(
1037       3000 * grpc_test_slowdown_factor()));
1038   ResetBackendCounters();
1039   // 1 backend experienced failure, but since there is no counting there is no
1040   // ejection.  Both backends are still getting the RPCs intended for them.
1041   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
1042   CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
1043   EXPECT_EQ(100, backends_[0]->backend_service()->request_count());
1044   EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
1045 }
1046 
1047 // Tests that we uneject any ejected addresses when the OD policy is
1048 // disabled.
TEST_P(OutlierDetectionTest,DisableOutlierDetectionWhileAddressesAreEjected)1049 TEST_P(OutlierDetectionTest, DisableOutlierDetectionWhileAddressesAreEjected) {
1050   CreateAndStartBackends(2);
1051   auto cluster = default_cluster_;
1052   cluster.set_lb_policy(Cluster::RING_HASH);
1053   // Setup outlier failure percentage parameters.
1054   // Any failure will cause an potential ejection with the probability of 100%
1055   // (to eliminate flakiness of the test).
1056   auto* outlier_detection = cluster.mutable_outlier_detection();
1057   SetProtoDuration(grpc_core::Duration::Seconds(1),
1058                    outlier_detection->mutable_interval());
1059   SetProtoDuration(grpc_core::Duration::Seconds(3),
1060                    outlier_detection->mutable_base_ejection_time());
1061   outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
1062   outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
1063   outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
1064   outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
1065   balancer_->ads_service()->SetCdsResource(cluster);
1066   auto new_route_config = default_route_config_;
1067   auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
1068   auto* hash_policy = route->mutable_route()->add_hash_policy();
1069   hash_policy->mutable_header()->set_header_name("address_hash");
1070   SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
1071                                    new_route_config);
1072   EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
1073   balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
1074   // Note each type of RPC will contain a header value that will always be
1075   // hashed to a specific backend as the header value matches the value used
1076   // to create the entry in the ring.
1077   std::vector<std::pair<std::string, std::string>> metadata = {
1078       {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
1079   std::vector<std::pair<std::string, std::string>> metadata1 = {
1080       {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
1081   const auto rpc_options = RpcOptions().set_metadata(metadata);
1082   const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
1083   WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
1084                  WaitForBackendOptions(), rpc_options);
1085   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
1086                  WaitForBackendOptions(), rpc_options1);
1087   // Cause an error and wait for traffic aimed at backend 0 to start going to
1088   // backend 1.
1089   CheckRpcSendFailure(
1090       DEBUG_LOCATION, StatusCode::CANCELLED, "",
1091       RpcOptions().set_metadata(metadata).set_server_expected_error(
1092           StatusCode::CANCELLED));
1093   WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
1094                  WaitForBackendOptions().set_timeout_ms(
1095                      3000 * grpc_test_slowdown_factor()),
1096                  rpc_options);
1097   // 1 backend is ejected all traffic going to the ejected backend should now
1098   // all be going to the other backend.
1099   // failure percentage enforcement_percentage of 100% is honored as this test
1100   // will consistently reject 1 backend.
1101   CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options);
1102   EXPECT_EQ(1, backends_[1]->backend_service()->request_count());
1103   // Send an update that disables outlier detection.
1104   cluster.clear_outlier_detection();
1105   balancer_->ads_service()->SetCdsResource(cluster);
1106   // Wait for the backend to start being used again.
1107   WaitForBackend(
1108       DEBUG_LOCATION, 0,
1109       [](const RpcResult& result) {
1110         EXPECT_EQ(result.status.error_code(), StatusCode::CANCELLED)
1111             << "Error: " << result.status.error_message();
1112       },
1113       WaitForBackendOptions(),
1114       RpcOptions()
1115           .set_metadata(std::move(metadata))
1116           .set_server_expected_error(StatusCode::CANCELLED));
1117 }
1118 
// Tests that an ejected backend remains ejected after an EDS update moves it
// to a different priority.
TEST_P(OutlierDetectionTest, EjectionRetainedAcrossPriorities) {
  CreateAndStartBackends(3);
  // Configure failure-percentage outlier detection so that any failure leads
  // to an ejection with a probability of 100% (to eliminate flakiness of the
  // test).  The base ejection time is 10 minutes.
  auto od_cluster = default_cluster_;
  auto* od_config = od_cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_interval());
  SetProtoDuration(grpc_core::Duration::Minutes(10),
                   od_config->mutable_base_ejection_time());
  od_config->mutable_failure_percentage_threshold()->set_value(0);
  od_config->mutable_enforcing_failure_percentage()->set_value(100);
  od_config->mutable_failure_percentage_minimum_hosts()->set_value(1);
  od_config->mutable_failure_percentage_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(od_cluster);
  // Initial EDS layout:
  //   priority 0 -> backend 0 plus a non-existent backend
  //   priority 1 -> backend 1
  EdsResourceArgs initial_args({
      {"locality0", {CreateEndpoint(0), MakeNonExistentEndpoint()}},
      {"locality1", {CreateEndpoint(1)}, kDefaultLocalityWeight, 1},
  });
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(initial_args));
  WaitForBackend(DEBUG_LOCATION, 0);
  // Make an RPC to backend 0 fail.  Failure-percentage enforcement is 100%,
  // so backend 0 gets ejected when the ejection timer fires.
  CheckRpcSendFailure(
      DEBUG_LOCATION, StatusCode::CANCELLED, "",
      RpcOptions().set_server_expected_error(StatusCode::CANCELLED));
  // Once traffic intended for backend 0 shows up at backend 1, we know that
  // backend 0 has been ejected.  That should take no more than one ejection
  // timer interval.
  const auto ejection_timeout_ms = 3000 * grpc_test_slowdown_factor();
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions().set_timeout_ms(ejection_timeout_ms));
  // Send an EDS update that moves backend 0 into priority 1.  Backend 2 is
  // added as well, so that seeing it tells us the client has the update.
  EdsResourceArgs updated_args({
      {"locality0", {MakeNonExistentEndpoint()}},
      {"locality1", CreateEndpointsForBackends(), kDefaultLocalityWeight, 1},
  });
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(updated_args));
  WaitForBackend(DEBUG_LOCATION, 2);
  // Backend 0 should still be ejected, so 100 RPCs must all land on
  // backends 1 and 2.
  CheckRpcSendOk(DEBUG_LOCATION, 100);
  EXPECT_EQ(0, backends_[0]->backend_service()->request_count());
  EXPECT_EQ(50, backends_[1]->backend_service()->request_count());
  EXPECT_EQ(50, backends_[2]->backend_service()->request_count());
}
1170 
1171 }  // namespace
1172 }  // namespace testing
1173 }  // namespace grpc
1174 
main(int argc,char ** argv)1175 int main(int argc, char** argv) {
1176   grpc::testing::TestEnvironment env(&argc, argv);
1177   ::testing::InitGoogleTest(&argc, argv);
1178   // Make the backup poller poll very frequently in order to pick up
1179   // updates from all the subchannels's FDs.
1180   grpc_core::ConfigVars::Overrides overrides;
1181   overrides.client_channel_backup_poll_interval_ms = 1;
1182   grpc_core::ConfigVars::SetOverrides(overrides);
1183 #if TARGET_OS_IPHONE
1184   // Workaround Apple CFStream bug
1185   grpc_core::SetEnv("grpc_cfstream", "0");
1186 #endif
1187   grpc_init();
1188   const auto result = RUN_ALL_TESTS();
1189   grpc_shutdown();
1190   return result;
1191 }
1192