1 // Copyright 2017 gRPC authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include <gmock/gmock.h>
16 #include <gtest/gtest.h>
17
18 #include <chrono>
19 #include <string>
20 #include <thread>
21 #include <vector>
22
23 #include "absl/log/check.h"
24 #include "envoy/config/cluster/v3/cluster.pb.h"
25 #include "envoy/config/cluster/v3/outlier_detection.pb.h"
26 #include "envoy/extensions/filters/http/fault/v3/fault.pb.h"
27 #include "envoy/extensions/filters/http/router/v3/router.pb.h"
28 #include "src/core/client_channel/backup_poller.h"
29 #include "src/core/config/config_vars.h"
30 #include "test/core/test_util/resolve_localhost_ip46.h"
31 #include "test/cpp/end2end/xds/xds_end2end_test_lib.h"
32
33 namespace grpc {
34 namespace testing {
35 namespace {
36
37 class OutlierDetectionTest : public XdsEnd2endTest {
38 protected:
CreateMetadataValueThatHashesToBackend(int index)39 std::string CreateMetadataValueThatHashesToBackend(int index) {
40 return absl::StrCat(grpc_core::LocalIp(), ":", backends_[index]->port(),
41 "_0");
42 }
43 };
44
45 INSTANTIATE_TEST_SUITE_P(XdsTest, OutlierDetectionTest,
46 ::testing::Values(XdsTestType()), &XdsTestType::Name);
// TODO(donnadionne): Add non-xDS versions of these tests in a new file,
// test/cpp/end2end/outlier_detection_end2end_test.cc.
49
50 // Tests SuccessRateEjectionAndUnejection:
51 // 1. Use ring hash policy that hashes using a header value to ensure rpcs
52 // go to all backends.
53 // 2. Cause a single error on 1 backend and wait for 1 outlier detection
54 // interval to pass.
55 // 3. We should skip exactly 1 backend due to ejection and all the loads
56 // sticky to that backend should go to 1 other backend.
57 // 4. Let the ejection period pass and verify we can go back to both backends
58 // after the uneject.
TEST_P(OutlierDetectionTest, SuccessRateEjectionAndUnejection) {
  using HeaderList = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  // Ring-hash cluster with success-rate outlier detection. Enforcement is
  // 100%, so a failure leads to a potential ejection deterministically
  // (this avoids flakiness).
  auto ring_hash_cluster = default_cluster_;
  ring_hash_cluster.set_lb_policy(Cluster::RING_HASH);
  auto& detection = *ring_hash_cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   detection.mutable_interval());
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   detection.mutable_base_ejection_time());
  detection.mutable_success_rate_stdev_factor()->set_value(100);
  detection.mutable_enforcing_success_rate()->set_value(100);
  detection.mutable_success_rate_minimum_hosts()->set_value(1);
  detection.mutable_success_rate_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(ring_hash_cluster);
  // Hash requests on the "address_hash" header.
  auto hashed_route_config = default_route_config_;
  hashed_route_config.mutable_virtual_hosts(0)
      ->mutable_routes(0)
      ->mutable_route()
      ->add_hash_policy()
      ->mutable_header()
      ->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   hashed_route_config);
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each header value always hashes to one specific backend, because it
  // matches the value used to create that backend's entry in the ring.
  HeaderList backend0_header = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  HeaderList backend1_header = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto backend0_rpc = RpcOptions().set_metadata(backend0_header);
  const auto backend1_rpc =
      RpcOptions().set_metadata(std::move(backend1_header));
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend0_rpc);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend1_rpc);
  // Make one RPC aimed at backend 0 fail. With enforcing_success_rate at
  // 100%, backend 0 gets ejected when the ejection timer fires.
  const auto failing_rpc =
      RpcOptions()
          .set_metadata(std::move(backend0_header))
          .set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", failing_rpc);
  const auto bounded_wait = WaitForBackendOptions().set_timeout_ms(
      3000 * grpc_test_slowdown_factor());
  // Traffic aimed at backend 0 reaching backend 1 tells us that backend 0 has
  // been ejected. It should take no more than one ejection timer interval.
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr, bounded_wait,
                 backend0_rpc);
  // Traffic aimed at backend 0 reaching backend 0 again tells us that
  // backend 0 has been unejected.
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr, bounded_wait,
                 backend0_rpc);
}
118
119 // We don't eject more than max_ejection_percent (default 10%) of the backends
120 // beyond the first one.
TEST_P(OutlierDetectionTest, SuccessRateMaxPercent) {
  using HeaderList = std::vector<std::pair<std::string, std::string>>;
  constexpr size_t kNumBackends = 4;
  constexpr size_t kRpcsPerBackend = 100;
  CreateAndStartBackends(kNumBackends);
  // Ring-hash cluster with success-rate outlier detection. Enforcement is
  // 100%, so a failure leads to a potential ejection deterministically
  // (this avoids flakiness).
  auto cluster = default_cluster_;
  cluster.set_lb_policy(Cluster::RING_HASH);
  auto* outlier_detection = cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   outlier_detection->mutable_interval());
  outlier_detection->mutable_success_rate_stdev_factor()->set_value(100);
  outlier_detection->mutable_enforcing_success_rate()->set_value(100);
  outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1);
  outlier_detection->mutable_success_rate_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(cluster);
  // Hash requests on the "address_hash" header.
  auto new_route_config = default_route_config_;
  auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
  auto* hash_policy = route->mutable_route()->add_hash_policy();
  hash_policy->mutable_header()->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   new_route_config);
  EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
  // rpcs_for_backend[i] carries a header value that always hashes to backend
  // i, because it matches the value used to create that backend's entry in
  // the ring.
  std::vector<RpcOptions> rpcs_for_backend;
  rpcs_for_backend.reserve(kNumBackends);
  for (size_t i = 0; i < kNumBackends; ++i) {
    HeaderList header = {
        {"address_hash",
         CreateMetadataValueThatHashesToBackend(static_cast<int>(i))}};
    rpcs_for_backend.push_back(RpcOptions().set_metadata(std::move(header)));
  }
  for (size_t i = 0; i < kNumBackends; ++i) {
    WaitForBackend(DEBUG_LOCATION, i, /*check_status=*/nullptr,
                   WaitForBackendOptions(), rpcs_for_backend[i]);
  }
  // Cause one error on backend 0 and one on backend 1.
  for (size_t i = 0; i < 2; ++i) {
    CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
                        RpcOptions(rpcs_for_backend[i])
                            .set_server_expected_error(StatusCode::CANCELLED));
  }
  // Poll until at least one of backends 0 and 1 stops receiving the traffic
  // aimed at it, i.e. until one ejection has happened.
  const absl::Time deadline =
      absl::Now() + absl::Seconds(3) * grpc_test_slowdown_factor();
  while (true) {
    ResetBackendCounters();
    CheckRpcSendOk(DEBUG_LOCATION, 1, rpcs_for_backend[0]);
    CheckRpcSendOk(DEBUG_LOCATION, 1, rpcs_for_backend[1]);
    if (!SeenAllBackends(0, 2)) break;
    // Read the clock once so the timeout is always reported when we give up.
    if (absl::Now() >= deadline) {
      ADD_FAILURE() << "Timed out waiting for backend 0 or 1 to be ejected";
      break;
    }
  }
  // 1 backend should be ejected, traffic picked up by another backend.
  // No other backend should be ejected.
  ResetBackendCounters();
  for (const RpcOptions& rpc : rpcs_for_backend) {
    CheckRpcSendOk(DEBUG_LOCATION, kRpcsPerBackend, rpc);
  }
  size_t empty_load_backend_count = 0;
  size_t double_load_backend_count = 0;
  size_t regular_load_backend_count = 0;
  for (size_t i = 0; i < backends_.size(); ++i) {
    const size_t requests = backends_[i]->backend_service()->request_count();
    if (requests == 0) {
      ++empty_load_backend_count;
    } else if (requests == 2 * kRpcsPerBackend) {
      ++double_load_backend_count;
    } else if (requests == kRpcsPerBackend) {
      ++regular_load_backend_count;
    } else {
      // Any other count means the load was split in an unexpected way.
      ADD_FAILURE() << "backend " << i
                    << " received an unexpected number of requests: "
                    << requests;
    }
  }
  EXPECT_EQ(1, empty_load_backend_count);
  EXPECT_EQ(1, double_load_backend_count);
  EXPECT_EQ(2, regular_load_backend_count);
}
213
// Success rate stdev_factor is honored: a higher value ensures that ejection
// does not occur.
TEST_P(OutlierDetectionTest, SuccessRateStdevFactor) {
  using HeaderList = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  // Ring-hash cluster with success-rate outlier detection at 100%
  // enforcement.
  auto ring_hash_cluster = default_cluster_;
  ring_hash_cluster.set_lb_policy(Cluster::RING_HASH);
  auto& detection = *ring_hash_cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   detection.mutable_interval());
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   detection.mutable_base_ejection_time());
  // A stdev factor of 100 is known to produce an ejection (see
  // SuccessRateEjectionAndUnejection), so use a higher value, 1000, to check
  // that no ejection occurs. This parameter is the only configuration
  // difference from the ejection portion of that test.
  detection.mutable_success_rate_stdev_factor()->set_value(1000);
  detection.mutable_enforcing_success_rate()->set_value(100);
  detection.mutable_success_rate_minimum_hosts()->set_value(1);
  detection.mutable_success_rate_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(ring_hash_cluster);
  // Hash requests on the "address_hash" header.
  auto hashed_route_config = default_route_config_;
  hashed_route_config.mutable_virtual_hosts(0)
      ->mutable_routes(0)
      ->mutable_route()
      ->add_hash_policy()
      ->mutable_header()
      ->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   hashed_route_config);
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each header value always hashes to one specific backend, because it
  // matches the value used to create that backend's entry in the ring.
  HeaderList backend0_header = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  HeaderList backend1_header = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto backend0_rpc = RpcOptions().set_metadata(backend0_header);
  const auto backend1_rpc =
      RpcOptions().set_metadata(std::move(backend1_header));
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend0_rpc);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend1_rpc);
  // Make one RPC aimed at backend 0 fail, then sleep long enough for an
  // outlier detection interval to pass.
  const auto failing_rpc =
      RpcOptions()
          .set_metadata(std::move(backend0_header))
          .set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", failing_rpc);
  const auto settle_deadline = grpc_timeout_milliseconds_to_deadline(
      3000 * grpc_test_slowdown_factor());
  gpr_sleep_until(settle_deadline);
  ResetBackendCounters();
  // One backend saw a failure, but with the high stdev_factor no backend is
  // considered an outlier, so nothing is ejected and each backend still
  // receives the RPCs intended for it.
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend0_rpc);
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend1_rpc);
  for (size_t i = 0; i < 2; ++i) {
    EXPECT_EQ(100, backends_[i]->backend_service()->request_count());
  }
}
276
// Success rate enforcement percentage is honored: setting it to 0 guarantees
// that the randomized number between 1 and 100 will always be greater, so
// nothing will be ejected.
TEST_P(OutlierDetectionTest, SuccessRateEnforcementPercentage) {
  using HeaderList = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  // Ring-hash cluster with success-rate outlier detection configured.
  auto ring_hash_cluster = default_cluster_;
  ring_hash_cluster.set_lb_policy(Cluster::RING_HASH);
  auto& detection = *ring_hash_cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   detection.mutable_interval());
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   detection.mutable_base_ejection_time());
  detection.mutable_success_rate_stdev_factor()->set_value(100);
  // enforcing_success_rate of 0 ensures we never eject. This parameter is the
  // only configuration difference from the ejection portion of
  // SuccessRateEjectionAndUnejection (which uses 100): it separates
  // guaranteed ejection there from guaranteed non-ejection here.
  detection.mutable_enforcing_success_rate()->set_value(0);
  detection.mutable_success_rate_minimum_hosts()->set_value(1);
  detection.mutable_success_rate_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(ring_hash_cluster);
  // Hash requests on the "address_hash" header.
  auto hashed_route_config = default_route_config_;
  hashed_route_config.mutable_virtual_hosts(0)
      ->mutable_routes(0)
      ->mutable_route()
      ->add_hash_policy()
      ->mutable_header()
      ->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   hashed_route_config);
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each header value always hashes to one specific backend, because it
  // matches the value used to create that backend's entry in the ring.
  HeaderList backend0_header = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  HeaderList backend1_header = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto backend0_rpc = RpcOptions().set_metadata(backend0_header);
  const auto backend1_rpc =
      RpcOptions().set_metadata(std::move(backend1_header));
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend0_rpc);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend1_rpc);
  // Make one RPC aimed at backend 0 fail, then sleep long enough for an
  // outlier detection interval to pass.
  const auto failing_rpc =
      RpcOptions()
          .set_metadata(std::move(backend0_header))
          .set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", failing_rpc);
  const auto settle_deadline = grpc_timeout_milliseconds_to_deadline(
      3000 * grpc_test_slowdown_factor());
  gpr_sleep_until(settle_deadline);
  ResetBackendCounters();
  // One backend saw a failure, but with enforcement at 0% nothing is ejected:
  // each backend still receives the RPCs intended for it.
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend0_rpc);
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend1_rpc);
  for (size_t i = 0; i < 2; ++i) {
    EXPECT_EQ(100, backends_[i]->backend_service()->request_count());
  }
}
336
// Success rate does not eject if there are fewer than minimum_hosts backends.
// Sets success_rate_minimum_hosts to 3 when we only have 2 backends.
TEST_P(OutlierDetectionTest, SuccessRateMinimumHosts) {
  using HeaderList = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  // Ring-hash cluster with success-rate outlier detection at 100%
  // enforcement.
  auto ring_hash_cluster = default_cluster_;
  ring_hash_cluster.set_lb_policy(Cluster::RING_HASH);
  auto& detection = *ring_hash_cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   detection.mutable_interval());
  detection.mutable_success_rate_stdev_factor()->set_value(100);
  detection.mutable_enforcing_success_rate()->set_value(100);
  // Require 3 hosts while only 2 backends exist. SuccessRateEjectionAndUnejection
  // sets this parameter to 1 and ejects; the value 3 here is what prevents
  // ejection in this test.
  detection.mutable_success_rate_minimum_hosts()->set_value(3);
  detection.mutable_success_rate_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(ring_hash_cluster);
  // Hash requests on the "address_hash" header.
  auto hashed_route_config = default_route_config_;
  hashed_route_config.mutable_virtual_hosts(0)
      ->mutable_routes(0)
      ->mutable_route()
      ->add_hash_policy()
      ->mutable_header()
      ->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   hashed_route_config);
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each header value always hashes to one specific backend, because it
  // matches the value used to create that backend's entry in the ring.
  HeaderList backend0_header = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  HeaderList backend1_header = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto backend0_rpc = RpcOptions().set_metadata(backend0_header);
  const auto backend1_rpc =
      RpcOptions().set_metadata(std::move(backend1_header));
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend0_rpc);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend1_rpc);
  // Make one RPC aimed at backend 0 fail, then sleep long enough for an
  // outlier detection interval to pass.
  const auto failing_rpc =
      RpcOptions()
          .set_metadata(std::move(backend0_header))
          .set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", failing_rpc);
  const auto settle_deadline = grpc_timeout_milliseconds_to_deadline(
      3000 * grpc_test_slowdown_factor());
  gpr_sleep_until(settle_deadline);
  ResetBackendCounters();
  // No backend is ejected: all traffic still reaches the backend it was
  // originally hashed to.
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend0_rpc);
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend1_rpc);
  for (size_t i = 0; i < 2; ++i) {
    EXPECT_EQ(100, backends_[i]->backend_service()->request_count());
  }
}
395
// Success rate does not eject if there are fewer than request_volume requests.
// Sets success_rate_request_volume to 4 when we only send 3 RPCs in the
// interval.
TEST_P(OutlierDetectionTest, SuccessRateRequestVolume) {
  using HeaderList = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  // Ring-hash cluster with success-rate outlier detection at 100%
  // enforcement.
  auto ring_hash_cluster = default_cluster_;
  ring_hash_cluster.set_lb_policy(Cluster::RING_HASH);
  auto& detection = *ring_hash_cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   detection.mutable_interval());
  detection.mutable_success_rate_stdev_factor()->set_value(100);
  detection.mutable_enforcing_success_rate()->set_value(100);
  detection.mutable_success_rate_minimum_hosts()->set_value(1);
  // Require a request volume of 4 when only 3 RPCs are sent in the interval.
  // SuccessRateEjectionAndUnejection sets this parameter to 1 and ejects; the
  // value 4 here is what prevents ejection in this test.
  detection.mutable_success_rate_request_volume()->set_value(4);
  balancer_->ads_service()->SetCdsResource(ring_hash_cluster);
  // Hash requests on the "address_hash" header.
  auto hashed_route_config = default_route_config_;
  hashed_route_config.mutable_virtual_hosts(0)
      ->mutable_routes(0)
      ->mutable_route()
      ->add_hash_policy()
      ->mutable_header()
      ->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   hashed_route_config);
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each header value always hashes to one specific backend, because it
  // matches the value used to create that backend's entry in the ring.
  HeaderList backend0_header = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  HeaderList backend1_header = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto backend0_rpc = RpcOptions().set_metadata(backend0_header);
  const auto backend1_rpc =
      RpcOptions().set_metadata(std::move(backend1_header));
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend0_rpc);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend1_rpc);
  // Make one RPC aimed at backend 0 fail, then sleep long enough for an
  // outlier detection interval to pass.
  const auto failing_rpc =
      RpcOptions()
          .set_metadata(std::move(backend0_header))
          .set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", failing_rpc);
  const auto settle_deadline = grpc_timeout_milliseconds_to_deadline(
      3000 * grpc_test_slowdown_factor());
  gpr_sleep_until(settle_deadline);
  ResetBackendCounters();
  // No backend is ejected: all traffic still reaches the backend it was
  // originally hashed to.
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend0_rpc);
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend1_rpc);
  for (size_t i = 0; i < 2; ++i) {
    EXPECT_EQ(100, backends_[i]->backend_service()->request_count());
  }
}
456
457 // Tests FailurePercentageEjectionAndUnejection:
458 // 1. Use ring hash policy that hashes using a header value to ensure RPCs
459 // go to all backends.
460 // 2. Cause a single error on 1 backend and wait for 1 outlier detection
461 // interval to pass.
462 // 3. We should skip exactly 1 backend due to ejection and all the loads
463 // sticky to that backend should go to 1 other backend.
// 4. Let the ejection period pass and verify that traffic will again go to
// both backends, as we have unejected the backend.
TEST_P(OutlierDetectionTest, FailurePercentageEjectionAndUnejection) {
  using HeaderList = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  // Ring-hash cluster with failure-percentage outlier detection. Enforcement
  // is 100%, so a failure leads to a potential ejection deterministically
  // (this avoids flakiness).
  auto ring_hash_cluster = default_cluster_;
  ring_hash_cluster.set_lb_policy(Cluster::RING_HASH);
  auto& detection = *ring_hash_cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   detection.mutable_interval());
  SetProtoDuration(grpc_core::Duration::Seconds(3),
                   detection.mutable_base_ejection_time());
  detection.mutable_failure_percentage_threshold()->set_value(0);
  detection.mutable_enforcing_failure_percentage()->set_value(100);
  detection.mutable_failure_percentage_minimum_hosts()->set_value(1);
  detection.mutable_failure_percentage_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(ring_hash_cluster);
  // Hash requests on the "address_hash" header.
  auto hashed_route_config = default_route_config_;
  hashed_route_config.mutable_virtual_hosts(0)
      ->mutable_routes(0)
      ->mutable_route()
      ->add_hash_policy()
      ->mutable_header()
      ->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   hashed_route_config);
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each header value always hashes to one specific backend, because it
  // matches the value used to create that backend's entry in the ring.
  HeaderList backend0_header = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  HeaderList backend1_header = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto backend0_rpc = RpcOptions().set_metadata(backend0_header);
  const auto backend1_rpc =
      RpcOptions().set_metadata(std::move(backend1_header));
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend0_rpc);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend1_rpc);
  // Make one RPC aimed at backend 0 fail, then wait for traffic aimed at
  // backend 0 to start going to backend 1.
  const auto failing_rpc =
      RpcOptions()
          .set_metadata(std::move(backend0_header))
          .set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "", failing_rpc);
  const auto slowdown = grpc_test_slowdown_factor();
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions().set_timeout_ms(3000 * slowdown),
                 backend0_rpc);
  // Backend 0 is ejected, so all traffic aimed at it should now reach
  // backend 1. This shows the 100% failure-percentage enforcement is honored,
  // as the test consistently ejects one backend.
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend0_rpc);
  EXPECT_EQ(100, backends_[1]->backend_service()->request_count());
  // Traffic aimed at backend 0 reaching backend 0 again tells us that
  // backend 0 has been unejected.
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions().set_timeout_ms(30000 * slowdown),
                 backend0_rpc);
  // RPCs go to the backends they are expected to hash to.
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend0_rpc);
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend1_rpc);
  for (size_t i = 0; i < 2; ++i) {
    EXPECT_EQ(100, backends_[i]->backend_service()->request_count());
  }
}
532
533 // We don't eject more than max_ejection_percent (default 10%) of the backends
534 // beyond the first one.
TEST_P(OutlierDetectionTest, FailurePercentageMaxPercentage) {
  using HeaderList = std::vector<std::pair<std::string, std::string>>;
  constexpr size_t kNumBackends = 4;
  constexpr size_t kRpcsPerBackend = 100;
  CreateAndStartBackends(kNumBackends);
  // Ring-hash cluster with failure-percentage outlier detection. Enforcement
  // is 100%, so a failure leads to a potential ejection deterministically
  // (this avoids flakiness).
  auto cluster = default_cluster_;
  cluster.set_lb_policy(Cluster::RING_HASH);
  auto* outlier_detection = cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   outlier_detection->mutable_interval());
  outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
  outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
  outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
  outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(cluster);
  // Hash requests on the "address_hash" header.
  auto new_route_config = default_route_config_;
  auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
  auto* hash_policy = route->mutable_route()->add_hash_policy();
  hash_policy->mutable_header()->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   new_route_config);
  EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
  // rpcs_for_backend[i] carries a header value that always hashes to backend
  // i, because it matches the value used to create that backend's entry in
  // the ring.
  std::vector<RpcOptions> rpcs_for_backend;
  rpcs_for_backend.reserve(kNumBackends);
  for (size_t i = 0; i < kNumBackends; ++i) {
    HeaderList header = {
        {"address_hash",
         CreateMetadataValueThatHashesToBackend(static_cast<int>(i))}};
    rpcs_for_backend.push_back(RpcOptions().set_metadata(std::move(header)));
  }
  for (size_t i = 0; i < kNumBackends; ++i) {
    WaitForBackend(DEBUG_LOCATION, i, /*check_status=*/nullptr,
                   WaitForBackendOptions(), rpcs_for_backend[i]);
  }
  // Cause one error on backend 0 and one on backend 1.
  for (size_t i = 0; i < 2; ++i) {
    CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
                        RpcOptions(rpcs_for_backend[i])
                            .set_server_expected_error(StatusCode::CANCELLED));
  }
  // Poll until at least one of backends 0 and 1 stops receiving the traffic
  // aimed at it, i.e. until one ejection has happened.
  const absl::Time deadline =
      absl::Now() + absl::Seconds(3) * grpc_test_slowdown_factor();
  while (true) {
    ResetBackendCounters();
    CheckRpcSendOk(DEBUG_LOCATION, 1, rpcs_for_backend[0]);
    CheckRpcSendOk(DEBUG_LOCATION, 1, rpcs_for_backend[1]);
    if (!SeenAllBackends(0, 2)) break;
    // Read the clock once so the timeout is always reported when we give up.
    if (absl::Now() >= deadline) {
      ADD_FAILURE() << "Timed out waiting for backend 0 or 1 to be ejected";
      break;
    }
  }
  // 1 backend should be ejected, traffic picked up by another backend.
  // No other backend should be ejected.
  ResetBackendCounters();
  for (const RpcOptions& rpc : rpcs_for_backend) {
    CheckRpcSendOk(DEBUG_LOCATION, kRpcsPerBackend, rpc);
  }
  size_t empty_load_backend_count = 0;
  size_t double_load_backend_count = 0;
  size_t regular_load_backend_count = 0;
  for (size_t i = 0; i < backends_.size(); ++i) {
    const size_t requests = backends_[i]->backend_service()->request_count();
    if (requests == 0) {
      ++empty_load_backend_count;
    } else if (requests == 2 * kRpcsPerBackend) {
      ++double_load_backend_count;
    } else if (requests == kRpcsPerBackend) {
      ++regular_load_backend_count;
    } else {
      // Any other count means the load was split in an unexpected way.
      ADD_FAILURE() << "backend " << i
                    << " received an unexpected number of requests: "
                    << requests;
    }
  }
  EXPECT_EQ(1, empty_load_backend_count);
  EXPECT_EQ(1, double_load_backend_count);
  EXPECT_EQ(2, regular_load_backend_count);
}
627
628 // Failure percentage threshold is honored, a higher value would ensure ejection
629 // does not occur
TEST_P(OutlierDetectionTest, FailurePercentageThreshold) {
  using Metadata = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  // Cluster resource: ring hash LB with outlier detection running on a 1s
  // interval and using a 1s base ejection time.
  auto cds_resource = default_cluster_;
  cds_resource.set_lb_policy(Cluster::RING_HASH);
  auto* od_config = cds_resource.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_interval());
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_base_ejection_time());
  // The failure percentage threshold is 50 here.  This is the only parameter
  // that differs from the ejection portion of
  // FailurePercentageEjectionAndUnejection (which uses 0), and that single
  // value is what separates "no ejection" in this test from "ejection" in
  // the other one.
  od_config->mutable_failure_percentage_threshold()->set_value(50);
  od_config->mutable_enforcing_failure_percentage()->set_value(100);
  od_config->mutable_failure_percentage_minimum_hosts()->set_value(1);
  od_config->mutable_failure_percentage_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(cds_resource);
  // Route resource: hash RPCs on the "address_hash" request header.
  auto rds_resource = default_route_config_;
  rds_resource.mutable_virtual_hosts(0)
      ->mutable_routes(0)
      ->mutable_route()
      ->add_hash_policy()
      ->mutable_header()
      ->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   rds_resource);
  // Endpoint resource: all backends in a single locality.
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each set of RPC options carries a header value matching the value used to
  // create a backend's ring entry, so those RPCs always hash to that backend.
  const Metadata hash_to_backend0 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  const Metadata hash_to_backend1 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto backend0_options = RpcOptions().set_metadata(hash_to_backend0);
  const auto backend1_options = RpcOptions().set_metadata(hash_to_backend1);
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend0_options);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend1_options);
  // Make one RPC aimed at backend 0 fail with CANCELLED, then sleep for
  // 3000ms (scaled by the test slowdown factor) so that the 1s outlier
  // detection interval has time to elapse.
  const auto failing_backend0_options =
      RpcOptions()
          .set_metadata(hash_to_backend0)
          .set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
                      failing_backend0_options);
  const auto wake_up_time = grpc_timeout_milliseconds_to_deadline(
      3000 * grpc_test_slowdown_factor());
  gpr_sleep_until(wake_up_time);
  ResetBackendCounters();
  // One backend saw one failure, but with the threshold at 50% the test
  // expects no backend to be treated as an outlier, so nothing is ejected and
  // each backend keeps receiving the RPCs intended for it.
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend0_options);
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend1_options);
  const auto backend0_requests =
      backends_[0]->backend_service()->request_count();
  const auto backend1_requests =
      backends_[1]->backend_service()->request_count();
  EXPECT_EQ(100, backend0_requests);
  EXPECT_EQ(100, backend1_requests);
}
687
// Failure percentage enforcement percentage is honored: setting it to 0
// guarantees that the randomized number between 1 and 100 will always be
// greater, so nothing will be ejected.
TEST_P(OutlierDetectionTest, FailurePercentageEnforcementPercentage) {
  using Metadata = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  // Cluster resource: ring hash LB with outlier detection running on a 1s
  // interval and using a 1s base ejection time.
  auto cds_resource = default_cluster_;
  cds_resource.set_lb_policy(Cluster::RING_HASH);
  auto* od_config = cds_resource.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_interval());
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_base_ejection_time());
  od_config->mutable_failure_percentage_threshold()->set_value(0);
  // enforcing_failure_percentage is 0 so that we never eject.  This is the
  // only parameter that differs from the ejection portion of
  // FailurePercentageEjectionAndUnejection (which uses 100), and that single
  // value is what separates guaranteed non-ejection in this test from
  // guaranteed ejection in the other one.
  od_config->mutable_enforcing_failure_percentage()->set_value(0);
  od_config->mutable_failure_percentage_minimum_hosts()->set_value(1);
  od_config->mutable_failure_percentage_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(cds_resource);
  // Route resource: hash RPCs on the "address_hash" request header.
  auto rds_resource = default_route_config_;
  rds_resource.mutable_virtual_hosts(0)
      ->mutable_routes(0)
      ->mutable_route()
      ->add_hash_policy()
      ->mutable_header()
      ->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   rds_resource);
  // Endpoint resource: all backends in a single locality.
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each set of RPC options carries a header value matching the value used to
  // create a backend's ring entry, so those RPCs always hash to that backend.
  const Metadata hash_to_backend0 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  const Metadata hash_to_backend1 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto backend0_options = RpcOptions().set_metadata(hash_to_backend0);
  const auto backend1_options = RpcOptions().set_metadata(hash_to_backend1);
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend0_options);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend1_options);
  // Make one RPC aimed at backend 0 fail with CANCELLED, then sleep for
  // 3000ms (scaled by the test slowdown factor) so that the 1s outlier
  // detection interval has time to elapse.
  const auto failing_backend0_options =
      RpcOptions()
          .set_metadata(hash_to_backend0)
          .set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
                      failing_backend0_options);
  const auto wake_up_time = grpc_timeout_milliseconds_to_deadline(
      3000 * grpc_test_slowdown_factor());
  gpr_sleep_until(wake_up_time);
  ResetBackendCounters();
  // One backend saw a failure, but with the enforcement percentage at 0 no
  // backend is expected to be ejected: each backend keeps receiving the RPCs
  // intended for it.
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend0_options);
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend1_options);
  const auto backend0_requests =
      backends_[0]->backend_service()->request_count();
  const auto backend1_requests =
      backends_[1]->backend_service()->request_count();
  EXPECT_EQ(100, backend0_requests);
  EXPECT_EQ(100, backend1_requests);
}
748
// Failure percentage does not eject if there are fewer than minimum_hosts
// backends. Set failure_percentage_minimum_hosts to 3 when we only have 2
// backends.
TEST_P(OutlierDetectionTest, FailurePercentageMinimumHosts) {
  using Metadata = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  auto cds_resource = default_cluster_;
  cds_resource.set_lb_policy(Cluster::RING_HASH);
  // Failure percentage parameters: a threshold of 0 with 100% enforcement
  // means any failure is a potential ejection with probability 100% (to
  // eliminate flakiness of the test).
  auto* od_config = cds_resource.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_interval());
  od_config->mutable_failure_percentage_threshold()->set_value(0);
  od_config->mutable_enforcing_failure_percentage()->set_value(100);
  // failure_percentage_minimum_hosts is 3 while only 2 backends exist.  This
  // is the only parameter that differs from the ejection portion of
  // FailurePercentageEjectionAndUnejection (which uses 1), and that single
  // value is what separates "no ejection" in this test from "ejection" in
  // the other one.
  od_config->mutable_failure_percentage_minimum_hosts()->set_value(3);
  od_config->mutable_failure_percentage_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(cds_resource);
  // Route resource: hash RPCs on the "address_hash" request header.
  auto rds_resource = default_route_config_;
  rds_resource.mutable_virtual_hosts(0)
      ->mutable_routes(0)
      ->mutable_route()
      ->add_hash_policy()
      ->mutable_header()
      ->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   rds_resource);
  // Endpoint resource: all backends in a single locality.
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each set of RPC options carries a header value matching the value used to
  // create a backend's ring entry, so those RPCs always hash to that backend.
  const Metadata hash_to_backend0 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  const Metadata hash_to_backend1 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto backend0_options = RpcOptions().set_metadata(hash_to_backend0);
  const auto backend1_options = RpcOptions().set_metadata(hash_to_backend1);
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend0_options);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend1_options);
  // Make one RPC aimed at backend 0 fail with CANCELLED, then sleep for
  // 3000ms (scaled by the test slowdown factor) so that the 1s outlier
  // detection interval has time to elapse.
  const auto failing_backend0_options =
      RpcOptions()
          .set_metadata(hash_to_backend0)
          .set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
                      failing_backend0_options);
  const auto wake_up_time = grpc_timeout_milliseconds_to_deadline(
      3000 * grpc_test_slowdown_factor());
  gpr_sleep_until(wake_up_time);
  ResetBackendCounters();
  // No backend is expected to be ejected, so all traffic must still reach
  // the backend it was originally aimed at.
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend0_options);
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend1_options);
  const auto backend0_requests =
      backends_[0]->backend_service()->request_count();
  const auto backend1_requests =
      backends_[1]->backend_service()->request_count();
  EXPECT_EQ(100, backend0_requests);
  EXPECT_EQ(100, backend1_requests);
}
812
// Failure percentage does not eject if there are fewer than request_volume
// requests.
// Set failure_percentage_request_volume to 4 when we only send 3 RPCs in the
// interval.
TEST_P(OutlierDetectionTest, FailurePercentageRequestVolume) {
  using Metadata = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  auto cds_resource = default_cluster_;
  cds_resource.set_lb_policy(Cluster::RING_HASH);
  // Failure percentage parameters: a threshold of 0 with 100% enforcement
  // means any failure is a potential ejection with probability 100% (to
  // eliminate flakiness of the test).
  auto* od_config = cds_resource.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_interval());
  od_config->mutable_failure_percentage_threshold()->set_value(0);
  od_config->mutable_enforcing_failure_percentage()->set_value(100);
  od_config->mutable_failure_percentage_minimum_hosts()->set_value(1);
  // failure_percentage_request_volume is 4 while only 3 RPCs are sent in the
  // interval.  This is the only parameter that differs from the ejection
  // portion of FailurePercentageEjectionAndUnejection (which uses 1), and
  // that single value is what separates "no ejection" in this test from
  // "ejection" in the other one.
  od_config->mutable_failure_percentage_request_volume()->set_value(4);
  balancer_->ads_service()->SetCdsResource(cds_resource);
  // Route resource: hash RPCs on the "address_hash" request header.
  auto rds_resource = default_route_config_;
  rds_resource.mutable_virtual_hosts(0)
      ->mutable_routes(0)
      ->mutable_route()
      ->add_hash_policy()
      ->mutable_header()
      ->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   rds_resource);
  // Endpoint resource: all backends in a single locality.
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each set of RPC options carries a header value matching the value used to
  // create a backend's ring entry, so those RPCs always hash to that backend.
  const Metadata hash_to_backend0 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  const Metadata hash_to_backend1 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto backend0_options = RpcOptions().set_metadata(hash_to_backend0);
  const auto backend1_options = RpcOptions().set_metadata(hash_to_backend1);
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend0_options);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend1_options);
  // Make one RPC aimed at backend 0 fail with CANCELLED, then sleep for
  // 3000ms (scaled by the test slowdown factor) so that the 1s outlier
  // detection interval has time to elapse.
  const auto failing_backend0_options =
      RpcOptions()
          .set_metadata(hash_to_backend0)
          .set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
                      failing_backend0_options);
  const auto wake_up_time = grpc_timeout_milliseconds_to_deadline(
      3000 * grpc_test_slowdown_factor());
  gpr_sleep_until(wake_up_time);
  ResetBackendCounters();
  // No backend is expected to be ejected, so all traffic must still reach
  // the backend it was originally aimed at.
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend0_options);
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend1_options);
  const auto backend0_requests =
      backends_[0]->backend_service()->request_count();
  const auto backend1_requests =
      backends_[1]->backend_service()->request_count();
  EXPECT_EQ(100, backend0_requests);
  EXPECT_EQ(100, backend1_requests);
}
875
876 // Tests SuccessRate and FailurePercentage both configured
877 // Configure max_ejection_percent to 50% which means max 2/4 backends can be
878 // ejected.
879 // Configure success rate to eject 1 and failure percentage to eject 2.
880 // Verify that maximum 2 backends are ejected, not 3!
TEST_P(OutlierDetectionTest, SuccessRateAndFailurePercentage) {
  CreateAndStartBackends(4);
  auto cluster = default_cluster_;
  cluster.set_lb_policy(Cluster::RING_HASH);
  // Set up both the success rate and the failure percentage algorithms.
  // Enforcement is 100% for both, so any detected outlier is a potential
  // ejection with probability 100% (to eliminate flakiness of the test).
  auto* outlier_detection = cluster.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   outlier_detection->mutable_interval());
  // At most 50% of the backends (2 out of 4) may be ejected at once.
  outlier_detection->mutable_max_ejection_percent()->set_value(50);
  // This stdev of 500 will ensure the number of ok RPC and error RPC we send
  // will make 1 outlier out of the 4 backends.
  outlier_detection->mutable_success_rate_stdev_factor()->set_value(500);
  outlier_detection->mutable_enforcing_success_rate()->set_value(100);
  outlier_detection->mutable_success_rate_minimum_hosts()->set_value(1);
  outlier_detection->mutable_success_rate_request_volume()->set_value(1);
  outlier_detection->mutable_failure_percentage_threshold()->set_value(0);
  outlier_detection->mutable_enforcing_failure_percentage()->set_value(100);
  outlier_detection->mutable_failure_percentage_minimum_hosts()->set_value(1);
  outlier_detection->mutable_failure_percentage_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(cluster);
  // Hash RPCs on the "address_hash" request header.
  auto new_route_config = default_route_config_;
  auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
  auto* hash_policy = route->mutable_route()->add_hash_policy();
  hash_policy->mutable_header()->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   new_route_config);
  EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
  // Note each type of RPC will contain a header value that will always be
  // hashed to a specific backend as the header value matches the value used
  // to create the entry in the ring.
  std::vector<std::pair<std::string, std::string>> metadata = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  std::vector<std::pair<std::string, std::string>> metadata1 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  std::vector<std::pair<std::string, std::string>> metadata2 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(2)}};
  std::vector<std::pair<std::string, std::string>> metadata3 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(3)}};
  const auto rpc_options = RpcOptions().set_metadata(metadata);
  const auto rpc_options1 = RpcOptions().set_metadata(metadata1);
  const auto rpc_options2 = RpcOptions().set_metadata(metadata2);
  const auto rpc_options3 = RpcOptions().set_metadata(metadata3);
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), rpc_options);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), rpc_options1);
  WaitForBackend(DEBUG_LOCATION, 2, /*check_status=*/nullptr,
                 WaitForBackendOptions(), rpc_options2);
  WaitForBackend(DEBUG_LOCATION, 3, /*check_status=*/nullptr,
                 WaitForBackendOptions(), rpc_options3);
  // Cause 2 errors on 1 backend and 1 error on 2 backends and wait for 2
  // backends to be ejected.  The 2 errors to the 1 backend will make exactly 1
  // outlier from the success rate algorithm; all 4 errors will make 3 outliers
  // from the failure percentage algorithm because the threshold is set to 0.
  // Per the original author's debug logs, we eject 1 backend because of
  // success rate and 1 backend because of failure percentage; the attempt to
  // eject another backend because of failure percentage stops because the 50%
  // limit has been reached.
  CheckRpcSendFailure(
      DEBUG_LOCATION, StatusCode::CANCELLED, "",
      RpcOptions().set_metadata(metadata).set_server_expected_error(
          StatusCode::CANCELLED));
  CheckRpcSendFailure(
      DEBUG_LOCATION, StatusCode::CANCELLED, "",
      RpcOptions().set_metadata(metadata).set_server_expected_error(
          StatusCode::CANCELLED));
  CheckRpcSendFailure(
      DEBUG_LOCATION, StatusCode::CANCELLED, "",
      RpcOptions().set_metadata(metadata1).set_server_expected_error(
          StatusCode::CANCELLED));
  CheckRpcSendFailure(
      DEBUG_LOCATION, StatusCode::CANCELLED, "",
      RpcOptions().set_metadata(metadata2).set_server_expected_error(
          StatusCode::CANCELLED));
  // Poll until exactly 2 of the 4 backends still receive traffic, i.e. the
  // other 2 have been ejected, or until the deadline expires.
  const absl::Time deadline =
      absl::Now() + absl::Seconds(3) * grpc_test_slowdown_factor();
  const std::vector<size_t> idx = {0, 1, 2, 3};
  while (true) {
    ResetBackendCounters();
    CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options);
    CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options1);
    CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options2);
    CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options3);
    if (std::count_if(idx.begin(), idx.end(),
                      [this](size_t i) { return SeenBackend(i); }) == 2) {
      break;
    }
    // Read the clock once so that a timeout is always reported as a failure
    // rather than depending on two separate clock reads agreeing.
    if (absl::Now() >= deadline) {
      ADD_FAILURE() << "timed out waiting for exactly 2 backends to be ejected";
      break;
    }
  }
  ResetBackendCounters();
  CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options);
  CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options1);
  CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options2);
  CheckRpcSendOk(DEBUG_LOCATION, 100, rpc_options3);
  size_t empty_load_backend_count = 0;
  size_t double_load_backend_count = 0;
  for (size_t i = 0; i < backends_.size(); ++i) {
    const auto request_count =
        backends_[i]->backend_service()->request_count();
    if (request_count == 0) {
      ++empty_load_backend_count;
    } else if (request_count >= 100 && request_count <= 300) {
      // A remaining backend keeps its own 100 RPCs; the 200 RPCs aimed at the
      // ejected backends could go to both remaining backends or just 1 of
      // them, so each remaining backend sees between 100 and 300 requests.
      ++double_load_backend_count;
    } else {
      // Any other count means traffic was not redistributed as expected.
      ADD_FAILURE() << "backend " << i
                    << " received an unexpected number of requests: "
                    << request_count;
    }
  }
  EXPECT_EQ(2, empty_load_backend_count);
  EXPECT_EQ(2, double_load_backend_count);
}
994
995 // Tests SuccessRate and FailurePercentage both unconfigured;
996 // This is the case where according to the gRFC we need to instruct the picker
997 // not to do counting or even start the timer. The result of not counting is
998 // that there will be no ejection taking place since we can't do any
999 // calculations.
TEST_P(OutlierDetectionTest, SuccessRateAndFailurePercentageBothDisabled) {
  using Metadata = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  // Cluster resource: ring hash LB with an outlier detection config that
  // only sets the interval and base ejection time; neither the success rate
  // nor the failure percentage parameters are configured.
  auto cds_resource = default_cluster_;
  cds_resource.set_lb_policy(Cluster::RING_HASH);
  auto* od_config = cds_resource.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_interval());
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_base_ejection_time());
  balancer_->ads_service()->SetCdsResource(cds_resource);
  // Route resource: hash RPCs on the "address_hash" request header.
  auto rds_resource = default_route_config_;
  rds_resource.mutable_virtual_hosts(0)
      ->mutable_routes(0)
      ->mutable_route()
      ->add_hash_policy()
      ->mutable_header()
      ->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   rds_resource);
  // Endpoint resource: all backends in a single locality.
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each set of RPC options carries a header value matching the value used to
  // create a backend's ring entry, so those RPCs always hash to that backend.
  const Metadata hash_to_backend0 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  const Metadata hash_to_backend1 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto backend0_options = RpcOptions().set_metadata(hash_to_backend0);
  const auto backend1_options = RpcOptions().set_metadata(hash_to_backend1);
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend0_options);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend1_options);
  // Make one RPC aimed at backend 0 fail with CANCELLED, then sleep for
  // 3000ms (scaled by the test slowdown factor) so that the 1s outlier
  // detection interval has time to elapse.
  const auto failing_backend0_options =
      RpcOptions()
          .set_metadata(hash_to_backend0)
          .set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
                      failing_backend0_options);
  const auto wake_up_time = grpc_timeout_milliseconds_to_deadline(
      3000 * grpc_test_slowdown_factor());
  gpr_sleep_until(wake_up_time);
  ResetBackendCounters();
  // One backend saw a failure, but since nothing is being counted there is
  // no ejection: each backend keeps receiving the RPCs intended for it.
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend0_options);
  CheckRpcSendOk(DEBUG_LOCATION, 100, backend1_options);
  const auto backend0_requests =
      backends_[0]->backend_service()->request_count();
  const auto backend1_requests =
      backends_[1]->backend_service()->request_count();
  EXPECT_EQ(100, backend0_requests);
  EXPECT_EQ(100, backend1_requests);
}
1046
1047 // Tests that we uneject any ejected addresses when the OD policy is
1048 // disabled.
TEST_P(OutlierDetectionTest, DisableOutlierDetectionWhileAddressesAreEjected) {
  using Metadata = std::vector<std::pair<std::string, std::string>>;
  CreateAndStartBackends(2);
  auto cds_resource = default_cluster_;
  cds_resource.set_lb_policy(Cluster::RING_HASH);
  // Failure percentage parameters: a threshold of 0 with 100% enforcement
  // means any failure is a potential ejection with probability 100% (to
  // eliminate flakiness of the test).  The interval is 1s and the base
  // ejection time is 3s.
  auto* od_config = cds_resource.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_interval());
  SetProtoDuration(grpc_core::Duration::Seconds(3),
                   od_config->mutable_base_ejection_time());
  od_config->mutable_failure_percentage_threshold()->set_value(0);
  od_config->mutable_enforcing_failure_percentage()->set_value(100);
  od_config->mutable_failure_percentage_minimum_hosts()->set_value(1);
  od_config->mutable_failure_percentage_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(cds_resource);
  // Route resource: hash RPCs on the "address_hash" request header.
  auto rds_resource = default_route_config_;
  rds_resource.mutable_virtual_hosts(0)
      ->mutable_routes(0)
      ->mutable_route()
      ->add_hash_policy()
      ->mutable_header()
      ->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   rds_resource);
  // Endpoint resource: all backends in a single locality.
  EdsResourceArgs eds_args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(eds_args));
  // Each set of RPC options carries a header value matching the value used to
  // create a backend's ring entry, so those RPCs always hash to that backend.
  const Metadata hash_to_backend0 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  const Metadata hash_to_backend1 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto backend0_options = RpcOptions().set_metadata(hash_to_backend0);
  const auto backend1_options = RpcOptions().set_metadata(hash_to_backend1);
  // Same as backend0_options, but the server is told to fail the RPC with
  // CANCELLED.
  const auto failing_backend0_options =
      RpcOptions()
          .set_metadata(hash_to_backend0)
          .set_server_expected_error(StatusCode::CANCELLED);
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend0_options);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), backend1_options);
  // Cause an error, then wait for traffic aimed at backend 0 to start
  // arriving at backend 1 instead.
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
                      failing_backend0_options);
  const auto ejection_wait_options = WaitForBackendOptions().set_timeout_ms(
      3000 * grpc_test_slowdown_factor());
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 ejection_wait_options, backend0_options);
  // Backend 0 is now ejected, so an RPC hashed to backend 0 should be served
  // by backend 1.  This also shows that the 100% failure percentage
  // enforcement is honored, since the test consistently ejects 1 backend.
  CheckRpcSendOk(DEBUG_LOCATION, 1, backend0_options);
  EXPECT_EQ(1, backends_[1]->backend_service()->request_count());
  // Send an update that disables outlier detection.
  cds_resource.clear_outlier_detection();
  balancer_->ads_service()->SetCdsResource(cds_resource);
  // Wait for backend 0 to start being used again.  The RPCs sent while
  // waiting ask the server to fail with CANCELLED, so that is the status
  // each result is checked against.
  const auto expect_cancelled = [](const RpcResult& result) {
    EXPECT_EQ(result.status.error_code(), StatusCode::CANCELLED)
        << "Error: " << result.status.error_message();
  };
  WaitForBackend(DEBUG_LOCATION, 0, expect_cancelled, WaitForBackendOptions(),
                 failing_backend0_options);
}
1118
// Tests that a backend ejected while in priority 0 remains ejected after an
// EDS update moves it to priority 1.
TEST_P(OutlierDetectionTest, EjectionRetainedAcrossPriorities) {
  CreateAndStartBackends(3);
  auto cds_resource = default_cluster_;
  // Failure percentage parameters: a threshold of 0 with 100% enforcement
  // means any failure is a potential ejection with probability 100% (to
  // eliminate flakiness of the test).  The base ejection time is 10 minutes.
  auto* od_config = cds_resource.mutable_outlier_detection();
  SetProtoDuration(grpc_core::Duration::Seconds(1),
                   od_config->mutable_interval());
  SetProtoDuration(grpc_core::Duration::Minutes(10),
                   od_config->mutable_base_ejection_time());
  od_config->mutable_failure_percentage_threshold()->set_value(0);
  od_config->mutable_enforcing_failure_percentage()->set_value(100);
  od_config->mutable_failure_percentage_minimum_hosts()->set_value(1);
  od_config->mutable_failure_percentage_request_volume()->set_value(1);
  balancer_->ads_service()->SetCdsResource(cds_resource);
  // Initial endpoints:
  //   priority 0: backend 0 and a non-existent backend.
  //   priority 1: backend 1.
  const EdsResourceArgs initial_eds_args({
      {"locality0", {CreateEndpoint(0), MakeNonExistentEndpoint()}},
      {"locality1", {CreateEndpoint(1)}, kDefaultLocalityWeight, 1},
  });
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(initial_eds_args));
  WaitForBackend(DEBUG_LOCATION, 0);
  // Trigger an error to backend 0.
  // The failure percentage enforcement is 100%, so this will cause the
  // backend to be ejected when the ejection timer fires.
  const auto failing_options =
      RpcOptions().set_server_expected_error(StatusCode::CANCELLED);
  CheckRpcSendFailure(DEBUG_LOCATION, StatusCode::CANCELLED, "",
                      failing_options);
  // Wait for traffic aimed at backend 0 to start going to backend 1, which
  // tells us that backend 0 has been ejected.  It should take no more than
  // one ejection timer interval.
  const auto ejection_wait_options = WaitForBackendOptions().set_timeout_ms(
      3000 * grpc_test_slowdown_factor());
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 ejection_wait_options);
  // Now send an EDS update that moves backend 0 to priority 1.
  // Backend 2 is added as well, so that we can tell when the client has seen
  // the update.
  const EdsResourceArgs updated_eds_args({
      {"locality0", {MakeNonExistentEndpoint()}},
      {"locality1", CreateEndpointsForBackends(), kDefaultLocalityWeight, 1},
  });
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(updated_eds_args));
  WaitForBackend(DEBUG_LOCATION, 2);
  // Send 100 RPCs and check that they are split evenly between backends 1
  // and 2, because backend 0 should still be ejected.
  CheckRpcSendOk(DEBUG_LOCATION, 100);
  const auto backend0_requests =
      backends_[0]->backend_service()->request_count();
  const auto backend1_requests =
      backends_[1]->backend_service()->request_count();
  const auto backend2_requests =
      backends_[2]->backend_service()->request_count();
  EXPECT_EQ(0, backend0_requests);
  EXPECT_EQ(50, backend1_requests);
  EXPECT_EQ(50, backend2_requests);
}
1170
1171 } // namespace
1172 } // namespace testing
1173 } // namespace grpc
1174
main(int argc,char ** argv)1175 int main(int argc, char** argv) {
1176 grpc::testing::TestEnvironment env(&argc, argv);
1177 ::testing::InitGoogleTest(&argc, argv);
1178 // Make the backup poller poll very frequently in order to pick up
1179 // updates from all the subchannels's FDs.
1180 grpc_core::ConfigVars::Overrides overrides;
1181 overrides.client_channel_backup_poll_interval_ms = 1;
1182 grpc_core::ConfigVars::SetOverrides(overrides);
1183 #if TARGET_OS_IPHONE
1184 // Workaround Apple CFStream bug
1185 grpc_core::SetEnv("grpc_cfstream", "0");
1186 #endif
1187 grpc_init();
1188 const auto result = RUN_ALL_TESTS();
1189 grpc_shutdown();
1190 return result;
1191 }
1192