// Copyright 2017 gRPC authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include #include #include #include #include #include "src/core/client_channel/backup_poller.h" #include "src/core/lib/config/config_vars.h" #include "src/proto/grpc/testing/xds/v3/cluster.grpc.pb.h" #include "src/proto/grpc/testing/xds/v3/fault.grpc.pb.h" #include "src/proto/grpc/testing/xds/v3/outlier_detection.grpc.pb.h" #include "src/proto/grpc/testing/xds/v3/router.grpc.pb.h" #include "test/core/util/resolve_localhost_ip46.h" #include "test/cpp/end2end/xds/xds_end2end_test_lib.h" namespace grpc { namespace testing { namespace { class OutlierDetectionTest : public XdsEnd2endTest { protected: std::string CreateMetadataValueThatHashesToBackend(int index) { return absl::StrCat(grpc_core::LocalIp(), ":", backends_[index]->port(), "_0"); } }; INSTANTIATE_TEST_SUITE_P(XdsTest, OutlierDetectionTest, ::testing::Values(XdsTestType()), &XdsTestType::Name); // TODO(donnadionne): add non-xds test a new // test/cpp/end2end/outlier_detection_end2end_test.cc // Tests SuccessRateEjectionAndUnejection: // 1. Use ring hash policy that hashes using a header value to ensure rpcs // go to all backends. // 2. Cause a single error on 1 backend and wait for 1 outlier detection // interval to pass. // 3. We should skip exactly 1 backend due to ejection and all the loads // sticky to that backend should go to 1 other backend. // 4. Let the ejection period pass and verify we can go back to both backends // after the uneject. TEST_P(OutlierDetectionTest, SuccessRateEjectionAndUnejection) { CreateAndStartBackends(2); auto cluster = default_cluster_; cluster.set_lb_policy(Cluster::RING_HASH); // Setup outlier failure percentage parameters. // Any failure will cause an potential ejection with the probability of 100% // (to eliminate flakiness of the test). auto* outlier_detection = cluster.mutable_outlier_detection(); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_interval()); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_base_ejection_time()); outlier_detection->mutable_success_rate_stdev_factor()->set_value(100); outlier_detection->mutable_enforcing_success_rate()->set_value(100); outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1); outlier_detection->mutable_success_rate_request_volume()->set_value(1); balancer_->ads_service()->SetCdsResource(cluster); auto new_route_config = default_route_config_; auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0); auto* hash_policy = route->mutable_route()->add_hash_policy(); hash_policy->mutable_header()->set_header_name("address_hash"); SetListenerAndRouteConfiguration(balancer_.get(), default_listener_, new_route_config); EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}}); balancer_->ads_service()->SetEdsResource(BuildEdsResource(args)); // Note each type of RPC will contain a header value that will always be // hashed to a specific backend as the header value matches the value used // to create the entry in the ring. std::vector> metadata = { {"address_hash", CreateMetadataValueThatHashesToBackend(0)}}; std::vector> metadata1 = { {"address_hash", CreateMetadataValueThatHashesToBackend(1)}}; const auto rpc_options = RpcOptions().set_metadata(metadata); const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1)); WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options); WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options1); // Trigger an error to backend 0. // The success rate enforcement_percentage is 100%, so this will cause // the backend to be ejected when the ejection timer fires. CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions() .set_metadata(std::move(metadata)) .set_server_expected_error(StatusCode::CANCELLED)); // Wait for traffic aimed at backend 0 to start going to backend 1. // This tells us that backend 0 has been ejected. // It should take no more than one ejection timer interval. WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions().set_timeout_ms( 3000 * grpc_test_slowdown_factor()), rpc_options); // Now wait for traffic aimed at backend 0 to switch back to backend 0. // This tells us that backend 0 has been unejected. WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, WaitForBackendOptions().set_timeout_ms( 3000 * grpc_test_slowdown_factor()), rpc_options); } // We don't eject more than max_ejection_percent (default 10%) of the backends // beyond the first one. TEST_P(OutlierDetectionTest, SuccessRateMaxPercent) { CreateAndStartBackends(4); auto cluster = default_cluster_; cluster.set_lb_policy(Cluster::RING_HASH); // Setup outlier failure percentage parameters. // Any failure will cause an potential ejection with the probability of 100% // (to eliminate flakiness of the test). auto* outlier_detection = cluster.mutable_outlier_detection(); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_interval()); outlier_detection->mutable_success_rate_stdev_factor()->set_value(100); outlier_detection->mutable_enforcing_success_rate()->set_value(100); outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1); outlier_detection->mutable_success_rate_request_volume()->set_value(1); balancer_->ads_service()->SetCdsResource(cluster); auto new_route_config = default_route_config_; auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0); auto* hash_policy = route->mutable_route()->add_hash_policy(); hash_policy->mutable_header()->set_header_name("address_hash"); SetListenerAndRouteConfiguration(balancer_.get(), default_listener_, new_route_config); EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}}); balancer_->ads_service()->SetEdsResource(BuildEdsResource(args)); // Note each type of RPC will contain a header value that will always be // hashed to a specific backend as the header value matches the value used // to create the entry in the ring. std::vector> metadata = { {"address_hash", CreateMetadataValueThatHashesToBackend(0)}}; std::vector> metadata1 = { {"address_hash", CreateMetadataValueThatHashesToBackend(1)}}; std::vector> metadata2 = { {"address_hash", CreateMetadataValueThatHashesToBackend(2)}}; std::vector> metadata3 = { {"address_hash", CreateMetadataValueThatHashesToBackend(3)}}; const auto rpc_options = RpcOptions().set_metadata(metadata); const auto rpc_options1 = RpcOptions().set_metadata(metadata1); const auto rpc_options2 = RpcOptions().set_metadata(metadata2); const auto rpc_options3 = RpcOptions().set_metadata(metadata3); WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options); WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options1); WaitForBackend(DEBUG_LOCATION, 2, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options2); WaitForBackend(DEBUG_LOCATION, 3, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options3); // Cause 2 errors and wait until one ejection happens. CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions() .set_metadata(std::move(metadata)) .set_server_expected_error(StatusCode::CANCELLED)); CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions() .set_metadata(std::move(metadata1)) .set_server_expected_error(StatusCode::CANCELLED)); absl::Time deadline = absl::Now() + absl::Seconds(3) * grpc_test_slowdown_factor(); while (true) { ResetBackendCounters(); CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options); CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options1); if (!SeenAllBackends(0, 2)) { break; } EXPECT_LE(absl::Now(), deadline); if (absl::Now() >= deadline) break; } // 1 backend should be ejected, trafficed picked up by another backend. // No other backend should be ejected. ResetBackendCounters(); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options2); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options3); size_t empty_load_backend_count = 0; size_t double_load_backend_count = 0; size_t regular_load_backend_count = 0; for (size_t i = 0; i < backends_.size(); ++i) { if (backends_[i]->backend_service()->request_count() == 0) { ++empty_load_backend_count; } else if (backends_[i]->backend_service()->request_count() == 200) { ++double_load_backend_count; } else if (backends_[i]->backend_service()->request_count() == 100) { ++regular_load_backend_count; } else { GPR_ASSERT(1); } } EXPECT_EQ(1, empty_load_backend_count); EXPECT_EQ(1, double_load_backend_count); EXPECT_EQ(2, regular_load_backend_count); } // Success rate stdev_factor is honored, a higher value would ensure ejection // does not occur. TEST_P(OutlierDetectionTest, SuccessRateStdevFactor) { CreateAndStartBackends(2); auto cluster = default_cluster_; cluster.set_lb_policy(Cluster::RING_HASH); // Setup outlier failure percentage parameters. // Any failure will cause an potential ejection with the probability of 100% // (to eliminate flakiness of the test). auto* outlier_detection = cluster.mutable_outlier_detection(); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_interval()); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_base_ejection_time()); // We know a stdev factor of 100 will ensure the ejection occurs, so setting // it to something higher like 1000 to test that ejection will not occur. // Note this parameter is the only difference between this test and // SuccessRateEjectionAndUnejection (ejection portion, value set to 100) and // this one value changes means the difference between not ejecting in this // test and ejecting in the other test. outlier_detection->mutable_success_rate_stdev_factor()->set_value(1000); outlier_detection->mutable_enforcing_success_rate()->set_value(100); outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1); outlier_detection->mutable_success_rate_request_volume()->set_value(1); balancer_->ads_service()->SetCdsResource(cluster); auto new_route_config = default_route_config_; auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0); auto* hash_policy = route->mutable_route()->add_hash_policy(); hash_policy->mutable_header()->set_header_name("address_hash"); SetListenerAndRouteConfiguration(balancer_.get(), default_listener_, new_route_config); EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}}); balancer_->ads_service()->SetEdsResource(BuildEdsResource(args)); // Note each type of RPC will contain a header value that will always be // hashed to a specific backend as the header value matches the value used // to create the entry in the ring. std::vector> metadata = { {"address_hash", CreateMetadataValueThatHashesToBackend(0)}}; std::vector> metadata1 = { {"address_hash", CreateMetadataValueThatHashesToBackend(1)}}; const auto rpc_options = RpcOptions().set_metadata(metadata); const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1)); WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options); WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options1); // Cause an error and wait for 1 outlier detection interval to pass CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions() .set_metadata(std::move(metadata)) .set_server_expected_error(StatusCode::CANCELLED)); gpr_sleep_until(grpc_timeout_milliseconds_to_deadline( 3000 * grpc_test_slowdown_factor())); ResetBackendCounters(); // 1 backend experenced failure, but since the stdev_factor is high, no // backend will be noticed as an outlier so no ejection. // Both backends are still getting the RPCs intended for them. CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1); EXPECT_EQ(100, backends_[0]->backend_service()->request_count()); EXPECT_EQ(100, backends_[1]->backend_service()->request_count()); } // Success rate enforcement percentage is honored, setting it to 0 so guarantee // the randomized number between 1 to 100 will always be great, so nothing will // be ejected. TEST_P(OutlierDetectionTest, SuccessRateEnforcementPercentage) { CreateAndStartBackends(2); auto cluster = default_cluster_; cluster.set_lb_policy(Cluster::RING_HASH); auto* outlier_detection = cluster.mutable_outlier_detection(); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_interval()); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_base_ejection_time()); outlier_detection->mutable_success_rate_stdev_factor()->set_value(100); // Setting enforcing_success_rate to 0 to ensure we will never eject. // Note this parameter is the only difference between this test and // SuccessRateEjectionAndUnejection (ejection portion, value set to 100) and // this one value changes means the difference between guaranteed not ejecting // in this test and guaranteed ejecting in the other test. outlier_detection->mutable_enforcing_success_rate()->set_value(0); outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1); outlier_detection->mutable_success_rate_request_volume()->set_value(1); balancer_->ads_service()->SetCdsResource(cluster); auto new_route_config = default_route_config_; auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0); auto* hash_policy = route->mutable_route()->add_hash_policy(); hash_policy->mutable_header()->set_header_name("address_hash"); SetListenerAndRouteConfiguration(balancer_.get(), default_listener_, new_route_config); EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}}); balancer_->ads_service()->SetEdsResource(BuildEdsResource(args)); // Note each type of RPC will contain a header value that will always be // hashed to a specific backend as the header value matches the value used // to create the entry in the ring. std::vector> metadata = { {"address_hash", CreateMetadataValueThatHashesToBackend(0)}}; std::vector> metadata1 = { {"address_hash", CreateMetadataValueThatHashesToBackend(1)}}; const auto rpc_options = RpcOptions().set_metadata(metadata); const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1)); WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options); WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options1); // Cause an error and wait for 1 outlier detection interval to pass CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions() .set_metadata(std::move(metadata)) .set_server_expected_error(StatusCode::CANCELLED)); gpr_sleep_until(grpc_timeout_milliseconds_to_deadline( 3000 * grpc_test_slowdown_factor())); ResetBackendCounters(); // 1 backend experenced failure, but since the enforcement percentage is 0, no // backend will be ejected. // Both backends are still getting the RPCs intended for them. CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1); EXPECT_EQ(100, backends_[0]->backend_service()->request_count()); EXPECT_EQ(100, backends_[1]->backend_service()->request_count()); } // Success rate does not eject if there are less than minimum_hosts backends // Set success_rate_minimum_hosts to 3 when we only have 2 backends TEST_P(OutlierDetectionTest, SuccessRateMinimumHosts) { CreateAndStartBackends(2); auto cluster = default_cluster_; cluster.set_lb_policy(Cluster::RING_HASH); // Setup outlier failure percentage parameters. // Any failure will cause an potential ejection with the probability of 100% // (to eliminate flakiness of the test). auto* outlier_detection = cluster.mutable_outlier_detection(); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_interval()); outlier_detection->mutable_success_rate_stdev_factor()->set_value(100); outlier_detection->mutable_enforcing_success_rate()->set_value(100); // Set success_rate_minimum_hosts to 3 when we only have 2 backends // Note this parameter is the only difference between this test and // SuccessRateEjectionAndUnejection (ejection portion, value set to 1) and // this one value changes means the difference between not ejecting in this // test and ejecting in the other test. outlier_detection->mutable_success_rate_minimum_hosts()->set_value(3); outlier_detection->mutable_success_rate_request_volume()->set_value(1); balancer_->ads_service()->SetCdsResource(cluster); auto new_route_config = default_route_config_; auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0); auto* hash_policy = route->mutable_route()->add_hash_policy(); hash_policy->mutable_header()->set_header_name("address_hash"); SetListenerAndRouteConfiguration(balancer_.get(), default_listener_, new_route_config); EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}}); balancer_->ads_service()->SetEdsResource(BuildEdsResource(args)); // Note each type of RPC will contain a header value that will always be // hashed to a specific backend as the header value matches the value used // to create the entry in the ring. std::vector> metadata = { {"address_hash", CreateMetadataValueThatHashesToBackend(0)}}; std::vector> metadata1 = { {"address_hash", CreateMetadataValueThatHashesToBackend(1)}}; const auto rpc_options = RpcOptions().set_metadata(metadata); const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1)); WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options); WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options1); // Cause an error and wait for 1 outlier detection interval to pass CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions() .set_metadata(std::move(metadata)) .set_server_expected_error(StatusCode::CANCELLED)); gpr_sleep_until(grpc_timeout_milliseconds_to_deadline( 3000 * grpc_test_slowdown_factor())); ResetBackendCounters(); // All traffic still reaching the original backends and no backends are // ejected. CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1); EXPECT_EQ(100, backends_[0]->backend_service()->request_count()); EXPECT_EQ(100, backends_[1]->backend_service()->request_count()); } // Success rate does not eject if there are less than request_volume requests // Set success_rate_request_volume to 4 when we only send 3 RPC in the // interval. TEST_P(OutlierDetectionTest, SuccessRateRequestVolume) { CreateAndStartBackends(2); auto cluster = default_cluster_; cluster.set_lb_policy(Cluster::RING_HASH); // Setup outlier failure percentage parameters. // Any failure will cause an potential ejection with the probability of 100% // (to eliminate flakiness of the test). auto* outlier_detection = cluster.mutable_outlier_detection(); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_interval()); outlier_detection->mutable_success_rate_stdev_factor()->set_value(100); outlier_detection->mutable_enforcing_success_rate()->set_value(100); outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1); // Set success_rate_request_volume to 4 when we only send 3 RPC in the // interval. // Note this parameter is the only difference between this test and // SuccessRateEjectionAndUnejection (ejection portion, value set to 1) and // this one value changes means the difference between not ejecting in this // test and ejecting in the other test. outlier_detection->mutable_success_rate_request_volume()->set_value(4); balancer_->ads_service()->SetCdsResource(cluster); auto new_route_config = default_route_config_; auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0); auto* hash_policy = route->mutable_route()->add_hash_policy(); hash_policy->mutable_header()->set_header_name("address_hash"); SetListenerAndRouteConfiguration(balancer_.get(), default_listener_, new_route_config); EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}}); balancer_->ads_service()->SetEdsResource(BuildEdsResource(args)); // Note each type of RPC will contain a header value that will always be // hashed to a specific backend as the header value matches the value used // to create the entry in the ring. std::vector> metadata = { {"address_hash", CreateMetadataValueThatHashesToBackend(0)}}; std::vector> metadata1 = { {"address_hash", CreateMetadataValueThatHashesToBackend(1)}}; const auto rpc_options = RpcOptions().set_metadata(metadata); const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1)); WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options); WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options1); // Cause an error and wait for 1 outlier detection interval to pass CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions() .set_metadata(std::move(metadata)) .set_server_expected_error(StatusCode::CANCELLED)); gpr_sleep_until(grpc_timeout_milliseconds_to_deadline( 3000 * grpc_test_slowdown_factor())); ResetBackendCounters(); // All traffic still reaching the original backends and no backends are // ejected. CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1); EXPECT_EQ(100, backends_[0]->backend_service()->request_count()); EXPECT_EQ(100, backends_[1]->backend_service()->request_count()); } // Tests FailurePercentageEjectionAndUnejection: // 1. Use ring hash policy that hashes using a header value to ensure RPCs // go to all backends. // 2. Cause a single error on 1 backend and wait for 1 outlier detection // interval to pass. // 3. We should skip exactly 1 backend due to ejection and all the loads // sticky to that backend should go to 1 other backend. // 4. Let the ejection period pass and verify that traffic will again go both // backends as we have unejected the backend. TEST_P(OutlierDetectionTest, FailurePercentageEjectionAndUnejection) { CreateAndStartBackends(2); auto cluster = default_cluster_; cluster.set_lb_policy(Cluster::RING_HASH); // Setup outlier failure percentage parameters. // Any failure will cause an potential ejection with the probability of 100% // (to eliminate flakiness of the test). auto* outlier_detection = cluster.mutable_outlier_detection(); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_interval()); SetProtoDuration(grpc_core::Duration::Seconds(3), outlier_detection->mutable_base_ejection_time()); outlier_detection->mutable_failure_percentage_threshold()->set_value(0); outlier_detection->mutable_enforcing_failure_percentage()->set_value(100); outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1); outlier_detection->mutable_failure_percentage_request_volume()->set_value(1); balancer_->ads_service()->SetCdsResource(cluster); auto new_route_config = default_route_config_; auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0); auto* hash_policy = route->mutable_route()->add_hash_policy(); hash_policy->mutable_header()->set_header_name("address_hash"); SetListenerAndRouteConfiguration(balancer_.get(), default_listener_, new_route_config); EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}}); balancer_->ads_service()->SetEdsResource(BuildEdsResource(args)); // Note each type of RPC will contain a header value that will always be // hashed to a specific backend as the header value matches the value used // to create the entry in the ring. std::vector> metadata = { {"address_hash", CreateMetadataValueThatHashesToBackend(0)}}; std::vector> metadata1 = { {"address_hash", CreateMetadataValueThatHashesToBackend(1)}}; const auto rpc_options = RpcOptions().set_metadata(metadata); const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1)); WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options); WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options1); // Cause an error and wait for traffic aimed at backend 0 to start going to // backend 1. CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions() .set_metadata(std::move(metadata)) .set_server_expected_error(StatusCode::CANCELLED)); WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions().set_timeout_ms( 3000 * grpc_test_slowdown_factor()), rpc_options); // 1 backend is ejected all traffic going to the ejected backend should now // all be going to the other backend. // failure percentage enforcement_percentage of 100% is honored as this test // will consistently reject 1 backend. CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options); EXPECT_EQ(100, backends_[1]->backend_service()->request_count()); // Now wait for traffic aimed at backend 0 to switch back to backend 0. // This tells us that backend 0 has been unejected. WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, WaitForBackendOptions().set_timeout_ms( 30000 * grpc_test_slowdown_factor()), rpc_options); // Verify that rpcs go to their expectedly hashed backends. CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1); EXPECT_EQ(100, backends_[0]->backend_service()->request_count()); EXPECT_EQ(100, backends_[1]->backend_service()->request_count()); } // We don't eject more than max_ejection_percent (default 10%) of the backends // beyond the first one. TEST_P(OutlierDetectionTest, FailurePercentageMaxPercentage) { CreateAndStartBackends(4); auto cluster = default_cluster_; cluster.set_lb_policy(Cluster::RING_HASH); // Setup outlier failure percentage parameters. // Any failure will cause an potential ejection with the probability of 100% // (to eliminate flakiness of the test). auto* outlier_detection = cluster.mutable_outlier_detection(); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_interval()); outlier_detection->mutable_failure_percentage_threshold()->set_value(0); outlier_detection->mutable_enforcing_failure_percentage()->set_value(100); outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1); outlier_detection->mutable_failure_percentage_request_volume()->set_value(1); balancer_->ads_service()->SetCdsResource(cluster); auto new_route_config = default_route_config_; auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0); auto* hash_policy = route->mutable_route()->add_hash_policy(); hash_policy->mutable_header()->set_header_name("address_hash"); SetListenerAndRouteConfiguration(balancer_.get(), default_listener_, new_route_config); EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}}); balancer_->ads_service()->SetEdsResource(BuildEdsResource(args)); // Note each type of RPC will contain a header value that will always be // hashed to a specific backend as the header value matches the value used // to create the entry in the ring. std::vector> metadata = { {"address_hash", CreateMetadataValueThatHashesToBackend(0)}}; std::vector> metadata1 = { {"address_hash", CreateMetadataValueThatHashesToBackend(1)}}; std::vector> metadata2 = { {"address_hash", CreateMetadataValueThatHashesToBackend(2)}}; std::vector> metadata3 = { {"address_hash", CreateMetadataValueThatHashesToBackend(3)}}; const auto rpc_options = RpcOptions().set_metadata(metadata); const auto rpc_options1 = RpcOptions().set_metadata(metadata1); const auto rpc_options2 = RpcOptions().set_metadata(metadata2); const auto rpc_options3 = RpcOptions().set_metadata(metadata3); WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options); WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options1); WaitForBackend(DEBUG_LOCATION, 2, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options2); WaitForBackend(DEBUG_LOCATION, 3, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options3); // Cause 2 errors and wait until one ejection happens. CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions() .set_metadata(std::move(metadata)) .set_server_expected_error(StatusCode::CANCELLED)); CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions() .set_metadata(std::move(metadata1)) .set_server_expected_error(StatusCode::CANCELLED)); absl::Time deadline = absl::Now() + absl::Seconds(3) * grpc_test_slowdown_factor(); while (true) { ResetBackendCounters(); CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options); CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options1); if (!SeenAllBackends(0, 2)) { break; } EXPECT_LE(absl::Now(), deadline); if (absl::Now() >= deadline) break; } // 1 backend should be ejected, trafficed picked up by another backend. // No other backend should be ejected. ResetBackendCounters(); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options2); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options3); size_t empty_load_backend_count = 0; size_t double_load_backend_count = 0; size_t regular_load_backend_count = 0; for (size_t i = 0; i < backends_.size(); ++i) { if (backends_[i]->backend_service()->request_count() == 0) { ++empty_load_backend_count; } else if (backends_[i]->backend_service()->request_count() == 200) { ++double_load_backend_count; } else if (backends_[i]->backend_service()->request_count() == 100) { ++regular_load_backend_count; } else { GPR_ASSERT(1); } } EXPECT_EQ(1, empty_load_backend_count); EXPECT_EQ(1, double_load_backend_count); EXPECT_EQ(2, regular_load_backend_count); } // Failure percentage threshold is honored, a higher value would ensure ejection // does not occur TEST_P(OutlierDetectionTest, FailurePercentageThreshold) { CreateAndStartBackends(2); auto cluster = default_cluster_; cluster.set_lb_policy(Cluster::RING_HASH); auto* outlier_detection = cluster.mutable_outlier_detection(); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_interval()); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_base_ejection_time()); // Setup outlier failure percentage parameter to 50 // Note this parameter is the only difference between this test and // FailurePercentageEjectionAndUnejection (ejection portion, value set to 0) // and this one value changes means the difference between not ejecting in // this test and ejecting in the other test. outlier_detection->mutable_failure_percentage_threshold()->set_value(50); outlier_detection->mutable_enforcing_failure_percentage()->set_value(100); outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1); outlier_detection->mutable_failure_percentage_request_volume()->set_value(1); balancer_->ads_service()->SetCdsResource(cluster); auto new_route_config = default_route_config_; auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0); auto* hash_policy = route->mutable_route()->add_hash_policy(); hash_policy->mutable_header()->set_header_name("address_hash"); SetListenerAndRouteConfiguration(balancer_.get(), default_listener_, new_route_config); EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}}); balancer_->ads_service()->SetEdsResource(BuildEdsResource(args)); // Note each type of RPC will contain a header value that will always be // hashed to a specific backend as the header value matches the value used // to create the entry in the ring. std::vector> metadata = { {"address_hash", CreateMetadataValueThatHashesToBackend(0)}}; std::vector> metadata1 = { {"address_hash", CreateMetadataValueThatHashesToBackend(1)}}; const auto rpc_options = RpcOptions().set_metadata(metadata); const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1)); WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options); WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options1); // Cause an error and wait for 1 outlier detection interval to pass to cause // the backend to be ejected. CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions() .set_metadata(std::move(metadata)) .set_server_expected_error(StatusCode::CANCELLED)); gpr_sleep_until(grpc_timeout_milliseconds_to_deadline( 3000 * grpc_test_slowdown_factor())); ResetBackendCounters(); // 1 backend experenced 1 failure, but since the threshold is 50 % no // backend will be noticed as an outlier so no ejection. // Both backends are still getting the RPCs intended for them. CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1); EXPECT_EQ(100, backends_[0]->backend_service()->request_count()); EXPECT_EQ(100, backends_[1]->backend_service()->request_count()); } // Failure percentage enforcement percentage is honored, setting it to 0 so // guarantee the randomized number between 1 to 100 will always be great, so // nothing will be ejected. TEST_P(OutlierDetectionTest, FailurePercentageEnforcementPercentage) { CreateAndStartBackends(2); auto cluster = default_cluster_; cluster.set_lb_policy(Cluster::RING_HASH); auto* outlier_detection = cluster.mutable_outlier_detection(); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_interval()); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_base_ejection_time()); outlier_detection->mutable_failure_percentage_threshold()->set_value(0); // Setting enforcing_success_rate to 0 to ensure we will never eject. // Note this parameter is the only difference between this test and // FailurePercentageEjectionAndUnejection (ejection portion, value set to 100) // and this one value changes means the difference between guaranteed not // ejecting in this test and guaranteed ejecting in the other test. outlier_detection->mutable_enforcing_failure_percentage()->set_value(0); outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1); outlier_detection->mutable_failure_percentage_request_volume()->set_value(1); balancer_->ads_service()->SetCdsResource(cluster); auto new_route_config = default_route_config_; auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0); auto* hash_policy = route->mutable_route()->add_hash_policy(); hash_policy->mutable_header()->set_header_name("address_hash"); SetListenerAndRouteConfiguration(balancer_.get(), default_listener_, new_route_config); EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}}); balancer_->ads_service()->SetEdsResource(BuildEdsResource(args)); // Note each type of RPC will contain a header value that will always be // hashed to a specific backend as the header value matches the value used // to create the entry in the ring. std::vector> metadata = { {"address_hash", CreateMetadataValueThatHashesToBackend(0)}}; std::vector> metadata1 = { {"address_hash", CreateMetadataValueThatHashesToBackend(1)}}; const auto rpc_options = RpcOptions().set_metadata(metadata); const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1)); WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options); WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options1); // Cause an error and wait for 1 outlier detection interval to pass to cause // the backend to be ejected. CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions() .set_metadata(std::move(metadata)) .set_server_expected_error(StatusCode::CANCELLED)); gpr_sleep_until(grpc_timeout_milliseconds_to_deadline( 3000 * grpc_test_slowdown_factor())); ResetBackendCounters(); // 1 backend experenced failure, but since the enforcement percentage is 0, no // backend will be ejected. // Both backends are still getting the RPCs intended for them. CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1); EXPECT_EQ(100, backends_[0]->backend_service()->request_count()); EXPECT_EQ(100, backends_[1]->backend_service()->request_count()); } // Failure percentage does not eject if there are less than minimum_hosts // backends Set success_rate_minimum_hosts to 3 when we only have 2 backends TEST_P(OutlierDetectionTest, FailurePercentageMinimumHosts) { CreateAndStartBackends(2); auto cluster = default_cluster_; cluster.set_lb_policy(Cluster::RING_HASH); // Setup outlier failure percentage parameters. // Any failure will cause an potential ejection with the probability of 100% // (to eliminate flakiness of the test). auto* outlier_detection = cluster.mutable_outlier_detection(); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_interval()); outlier_detection->mutable_failure_percentage_threshold()->set_value(0); outlier_detection->mutable_enforcing_failure_percentage()->set_value(100); // Set failure_percentage_minimum_hosts to 3 when we only have 2 backends // Note this parameter is the only difference between this test and // FailurePercentageEjectionAndUnejection (ejection portion, value set to 1) // and this one value changes means the difference between not ejecting in // this test and ejecting in the other test. cluster.mutable_outlier_detection() ->mutable_failure_percentage_minimum_hosts() ->set_value(3); cluster.mutable_outlier_detection() ->mutable_failure_percentage_request_volume() ->set_value(1); balancer_->ads_service()->SetCdsResource(cluster); auto new_route_config = default_route_config_; auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0); auto* hash_policy = route->mutable_route()->add_hash_policy(); hash_policy->mutable_header()->set_header_name("address_hash"); SetListenerAndRouteConfiguration(balancer_.get(), default_listener_, new_route_config); EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}}); balancer_->ads_service()->SetEdsResource(BuildEdsResource(args)); // Note each type of RPC will contain a header value that will always be // hashed to a specific backend as the header value matches the value used // to create the entry in the ring. std::vector> metadata = { {"address_hash", CreateMetadataValueThatHashesToBackend(0)}}; std::vector> metadata1 = { {"address_hash", CreateMetadataValueThatHashesToBackend(1)}}; const auto rpc_options = RpcOptions().set_metadata(metadata); const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1)); WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options); WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options1); // Cause an error and wait for 1 outlier detection interval to pass to cause // the backend to be ejected. CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions() .set_metadata(std::move(metadata)) .set_server_expected_error(StatusCode::CANCELLED)); gpr_sleep_until(grpc_timeout_milliseconds_to_deadline( 3000 * grpc_test_slowdown_factor())); ResetBackendCounters(); // All traffic still reaching the original backends and no backends are // ejected. CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1); EXPECT_EQ(100, backends_[0]->backend_service()->request_count()); EXPECT_EQ(100, backends_[1]->backend_service()->request_count()); } // Failure percentage does not eject if there are less than request_volume // requests // Set success_rate_request_volume to 4 when we only send 3 RPC in the // interval. TEST_P(OutlierDetectionTest, FailurePercentageRequestVolume) { CreateAndStartBackends(2); auto cluster = default_cluster_; cluster.set_lb_policy(Cluster::RING_HASH); // Setup outlier failure percentage parameters. // Any failure will cause an potential ejection with the probability of 100% // (to eliminate flakiness of the test). auto* outlier_detection = cluster.mutable_outlier_detection(); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_interval()); outlier_detection->mutable_failure_percentage_threshold()->set_value(0); outlier_detection->mutable_enforcing_failure_percentage()->set_value(100); outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1); // Set failure_percentage_request_volume to 4 when we only send 3 RPC in the // interval. // // Note this parameter is the only difference between this test and // FailurePercentageEjectionAndUnejection (ejection portion, value set to 1) // and this one value changes means the difference between not ejecting in // this test and ejecting in the other test. outlier_detection->mutable_failure_percentage_request_volume()->set_value(4); balancer_->ads_service()->SetCdsResource(cluster); auto new_route_config = default_route_config_; auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0); auto* hash_policy = route->mutable_route()->add_hash_policy(); hash_policy->mutable_header()->set_header_name("address_hash"); SetListenerAndRouteConfiguration(balancer_.get(), default_listener_, new_route_config); EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}}); balancer_->ads_service()->SetEdsResource(BuildEdsResource(args)); // Note each type of RPC will contain a header value that will always be // hashed to a specific backend as the header value matches the value used // to create the entry in the ring. std::vector> metadata = { {"address_hash", CreateMetadataValueThatHashesToBackend(0)}}; std::vector> metadata1 = { {"address_hash", CreateMetadataValueThatHashesToBackend(1)}}; const auto rpc_options = RpcOptions().set_metadata(metadata); const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1)); WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options); WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options1); // Cause an error and wait for 1 outlier detection interval to pass to cause // the backend to be ejected. CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions() .set_metadata(std::move(metadata)) .set_server_expected_error(StatusCode::CANCELLED)); gpr_sleep_until(grpc_timeout_milliseconds_to_deadline( 3000 * grpc_test_slowdown_factor())); ResetBackendCounters(); // All traffic still reaching the original backends and no backends are // ejected. CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1); EXPECT_EQ(100, backends_[0]->backend_service()->request_count()); EXPECT_EQ(100, backends_[1]->backend_service()->request_count()); } // Tests SuccessRate and FailurePercentage both configured // Configure max_ejection_percent to 50% which means max 2/4 backends can be // ejected. // Configure success rate to eject 1 and failure percentage to eject 2. // Verify that maximum 2 backends are ejected, not 3! TEST_P(OutlierDetectionTest, SuccessRateAndFailurePercentage) { CreateAndStartBackends(4); auto cluster = default_cluster_; cluster.set_lb_policy(Cluster::RING_HASH); // Setup outlier failure percentage parameters. // Any failure will cause an potential ejection with the probability of 100% // (to eliminate flakiness of the test). auto* outlier_detection = cluster.mutable_outlier_detection(); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_interval()); outlier_detection->mutable_max_ejection_percent()->set_value(50); // This stdev of 500 will ensure the number of ok RPC and error RPC we send // will make 1 outlier out of the 4 backends. outlier_detection->mutable_success_rate_stdev_factor()->set_value(500); outlier_detection->mutable_enforcing_success_rate()->set_value(100); outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1); outlier_detection->mutable_success_rate_request_volume()->set_value(1); outlier_detection->mutable_failure_percentage_threshold()->set_value(0); outlier_detection->mutable_enforcing_failure_percentage()->set_value(100); outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1); outlier_detection->mutable_failure_percentage_request_volume()->set_value(1); balancer_->ads_service()->SetCdsResource(cluster); auto new_route_config = default_route_config_; auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0); auto* hash_policy = route->mutable_route()->add_hash_policy(); hash_policy->mutable_header()->set_header_name("address_hash"); SetListenerAndRouteConfiguration(balancer_.get(), default_listener_, new_route_config); EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}}); balancer_->ads_service()->SetEdsResource(BuildEdsResource(args)); // Note each type of RPC will contain a header value that will always be // hashed to a specific backend as the header value matches the value used // to create the entry in the ring. std::vector> metadata = { {"address_hash", CreateMetadataValueThatHashesToBackend(0)}}; std::vector> metadata1 = { {"address_hash", CreateMetadataValueThatHashesToBackend(1)}}; std::vector> metadata2 = { {"address_hash", CreateMetadataValueThatHashesToBackend(2)}}; std::vector> metadata3 = { {"address_hash", CreateMetadataValueThatHashesToBackend(3)}}; const auto rpc_options = RpcOptions().set_metadata(metadata); const auto rpc_options1 = RpcOptions().set_metadata(metadata1); const auto rpc_options2 = RpcOptions().set_metadata(metadata2); const auto rpc_options3 = RpcOptions().set_metadata(metadata3); WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options); WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options1); WaitForBackend(DEBUG_LOCATION, 2, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options2); WaitForBackend(DEBUG_LOCATION, 3, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options3); // Cause 2 errors on 1 backend and 1 error on 2 backends and wait for 2 // backends to be ejected. The 2 errors to the 1 backend will make exactly 1 // outlier from the success rate algorithm; all 4 errors will make 3 outliers // from the failure pecentage algorithm because the threahold is set to 0. I // have verified through debug logs we eject 1 backend because of success // rate, 1 backend because of failure percentage; but as we attempt to eject // another backend because of failure percentage we will stop as we have // reached our 50% limit. CheckRpcSendFailure( DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions().set_metadata(metadata).set_server_expected_error( StatusCode::CANCELLED)); CheckRpcSendFailure( DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions().set_metadata(metadata).set_server_expected_error( StatusCode::CANCELLED)); CheckRpcSendFailure( DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions().set_metadata(metadata1).set_server_expected_error( StatusCode::CANCELLED)); CheckRpcSendFailure( DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions().set_metadata(metadata2).set_server_expected_error( StatusCode::CANCELLED)); absl::Time deadline = absl::Now() + absl::Seconds(3) * grpc_test_slowdown_factor(); std::vector idx = {0, 1, 2, 3}; while (true) { ResetBackendCounters(); CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options); CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options1); CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options2); CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options3); if (std::count_if(idx.begin(), idx.end(), [this](size_t i) { return SeenBackend(i); }) == 2) { break; } EXPECT_LE(absl::Now(), deadline); if (absl::Now() >= deadline) break; } ResetBackendCounters(); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options2); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options3); size_t empty_load_backend_count = 0; size_t double_load_backend_count = 0; for (size_t i = 0; i < backends_.size(); ++i) { if (backends_[i]->backend_service()->request_count() == 0) { ++empty_load_backend_count; } else if (backends_[i]->backend_service()->request_count() >= 100) { // The extra load could go to 2 remaining backends or just 1 of them. ++double_load_backend_count; } else if (backends_[i]->backend_service()->request_count() > 300) { GPR_ASSERT(1); } } EXPECT_EQ(2, empty_load_backend_count); EXPECT_EQ(2, double_load_backend_count); } // Tests SuccessRate and FailurePercentage both unconfigured; // This is the case where according to the gRFC we need to instruct the picker // not to do counting or even start the timer. The result of not counting is // that there will be no ejection taking place since we can't do any // calculations. TEST_P(OutlierDetectionTest, SuccessRateAndFailurePercentageBothDisabled) { CreateAndStartBackends(2); auto cluster = default_cluster_; cluster.set_lb_policy(Cluster::RING_HASH); auto* outlier_detection = cluster.mutable_outlier_detection(); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_interval()); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_base_ejection_time()); balancer_->ads_service()->SetCdsResource(cluster); auto new_route_config = default_route_config_; auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0); auto* hash_policy = route->mutable_route()->add_hash_policy(); hash_policy->mutable_header()->set_header_name("address_hash"); SetListenerAndRouteConfiguration(balancer_.get(), default_listener_, new_route_config); EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}}); balancer_->ads_service()->SetEdsResource(BuildEdsResource(args)); // Note each type of RPC will contain a header value that will always be // hashed to a specific backend as the header value matches the value used // to create the entry in the ring. std::vector> metadata = { {"address_hash", CreateMetadataValueThatHashesToBackend(0)}}; std::vector> metadata1 = { {"address_hash", CreateMetadataValueThatHashesToBackend(1)}}; const auto rpc_options = RpcOptions().set_metadata(metadata); const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1)); WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options); WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options1); // Cause an error and wait for 1 outlier detection interval to pass CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions() .set_metadata(std::move(metadata)) .set_server_expected_error(StatusCode::CANCELLED)); gpr_sleep_until(grpc_timeout_milliseconds_to_deadline( 3000 * grpc_test_slowdown_factor())); ResetBackendCounters(); // 1 backend experenced failure, but since there is no counting there is no // ejection. Both backends are still getting the RPCs intended for them. CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options); CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1); EXPECT_EQ(100, backends_[0]->backend_service()->request_count()); EXPECT_EQ(100, backends_[1]->backend_service()->request_count()); } // Tests that we uneject any ejected addresses when the OD policy is // disabled. TEST_P(OutlierDetectionTest, DisableOutlierDetectionWhileAddressesAreEjected) { CreateAndStartBackends(2); auto cluster = default_cluster_; cluster.set_lb_policy(Cluster::RING_HASH); // Setup outlier failure percentage parameters. // Any failure will cause an potential ejection with the probability of 100% // (to eliminate flakiness of the test). auto* outlier_detection = cluster.mutable_outlier_detection(); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_interval()); SetProtoDuration(grpc_core::Duration::Seconds(3), outlier_detection->mutable_base_ejection_time()); outlier_detection->mutable_failure_percentage_threshold()->set_value(0); outlier_detection->mutable_enforcing_failure_percentage()->set_value(100); outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1); outlier_detection->mutable_failure_percentage_request_volume()->set_value(1); balancer_->ads_service()->SetCdsResource(cluster); auto new_route_config = default_route_config_; auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0); auto* hash_policy = route->mutable_route()->add_hash_policy(); hash_policy->mutable_header()->set_header_name("address_hash"); SetListenerAndRouteConfiguration(balancer_.get(), default_listener_, new_route_config); EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}}); balancer_->ads_service()->SetEdsResource(BuildEdsResource(args)); // Note each type of RPC will contain a header value that will always be // hashed to a specific backend as the header value matches the value used // to create the entry in the ring. std::vector> metadata = { {"address_hash", CreateMetadataValueThatHashesToBackend(0)}}; std::vector> metadata1 = { {"address_hash", CreateMetadataValueThatHashesToBackend(1)}}; const auto rpc_options = RpcOptions().set_metadata(metadata); const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1)); WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options); WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions(), rpc_options1); // Cause an error and wait for traffic aimed at backend 0 to start going to // backend 1. CheckRpcSendFailure( DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions().set_metadata(metadata).set_server_expected_error( StatusCode::CANCELLED)); WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions().set_timeout_ms( 3000 * grpc_test_slowdown_factor()), rpc_options); // 1 backend is ejected all traffic going to the ejected backend should now // all be going to the other backend. // failure percentage enforcement_percentage of 100% is honored as this test // will consistently reject 1 backend. CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options); EXPECT_EQ(1, backends_[1]->backend_service()->request_count()); // Send an update that disables outlier detection. cluster.clear_outlier_detection(); balancer_->ads_service()->SetCdsResource(cluster); // Wait for the backend to start being used again. WaitForBackend( DEBUG_LOCATION, 0, [](const RpcResult& result) { EXPECT_EQ(result.status.error_code(), StatusCode::CANCELLED) << "Error: " << result.status.error_message(); }, WaitForBackendOptions(), RpcOptions() .set_metadata(std::move(metadata)) .set_server_expected_error(StatusCode::CANCELLED)); } TEST_P(OutlierDetectionTest, EjectionRetainedAcrossPriorities) { CreateAndStartBackends(3); auto cluster = default_cluster_; // Setup outlier failure percentage parameters. // Any failure will cause an potential ejection with the probability of 100% // (to eliminate flakiness of the test). auto* outlier_detection = cluster.mutable_outlier_detection(); SetProtoDuration(grpc_core::Duration::Seconds(1), outlier_detection->mutable_interval()); SetProtoDuration(grpc_core::Duration::Minutes(10), outlier_detection->mutable_base_ejection_time()); outlier_detection->mutable_failure_percentage_threshold()->set_value(0); outlier_detection->mutable_enforcing_failure_percentage()->set_value(100); outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1); outlier_detection->mutable_failure_percentage_request_volume()->set_value(1); balancer_->ads_service()->SetCdsResource(cluster); // Priority 0: backend 0 and a non-existent backend. // Priority 1: backend 1. EdsResourceArgs args({ {"locality0", {CreateEndpoint(0), MakeNonExistantEndpoint()}}, {"locality1", {CreateEndpoint(1)}, kDefaultLocalityWeight, 1}, }); balancer_->ads_service()->SetEdsResource(BuildEdsResource(args)); WaitForBackend(DEBUG_LOCATION, 0); // Trigger an error to backend 0. // The success rate enforcement_percentage is 100%, so this will cause // the backend to be ejected when the ejection timer fires. CheckRpcSendFailure( DEBUG_LOCATION, StatusCode::CANCELLED, "", RpcOptions().set_server_expected_error(StatusCode::CANCELLED)); // Wait for traffic aimed at backend 0 to start going to backend 1. // This tells us that backend 0 has been ejected. // It should take no more than one ejection timer interval. WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, WaitForBackendOptions().set_timeout_ms( 3000 * grpc_test_slowdown_factor())); // Now send an EDS update that moves backend 0 to priority 1. // We also add backend 2, so that we know when the client sees the update. args = EdsResourceArgs({ {"locality0", {MakeNonExistantEndpoint()}}, {"locality1", CreateEndpointsForBackends(), kDefaultLocalityWeight, 1}, }); balancer_->ads_service()->SetEdsResource(BuildEdsResource(args)); WaitForBackend(DEBUG_LOCATION, 2); // Now send 100 RPCs and make sure they all go to backends 1 and 2, // because backend 0 should still be ejected. CheckRpcSendOk(DEBUG_LOCATION, 100); EXPECT_EQ(0, backends_[0]->backend_service()->request_count()); EXPECT_EQ(50, backends_[1]->backend_service()->request_count()); EXPECT_EQ(50, backends_[2]->backend_service()->request_count()); } } // namespace } // namespace testing } // namespace grpc int main(int argc, char** argv) { grpc::testing::TestEnvironment env(&argc, argv); ::testing::InitGoogleTest(&argc, argv); // Make the backup poller poll very frequently in order to pick up // updates from all the subchannels's FDs. grpc_core::ConfigVars::Overrides overrides; overrides.client_channel_backup_poll_interval_ms = 1; grpc_core::ConfigVars::SetOverrides(overrides); #if TARGET_OS_IPHONE // Workaround Apple CFStream bug grpc_core::SetEnv("grpc_cfstream", "0"); #endif grpc_init(); const auto result = RUN_ALL_TESTS(); grpc_shutdown(); return result; }