1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2019 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * Example program for Host Bandwidth Management
9 *
10 * This program loads a cgroup skb BPF program to enforce cgroup output
11 * (egress) or input (ingress) bandwidth limits.
12 *
13 * USAGE: hbm [-d] [-l] [-n <id>] [--no_cn] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog]
14 * Where:
15 * -d Print BPF trace debug buffer
16 * -l Also limit flows doing loopback
17 * -n <#> To create cgroup \"/hbm#\" and attach prog
18 * Default is /hbm1
19 * --no_cn Do not return cn notifications
20 * -r <rate> Rate limit in Mbps
21 * -s Get HBM stats (marked, dropped, etc.)
22 * -t <time> Exit after specified seconds (default is 1)
23 * -w Work conserving flag. cgroup can increase its bandwidth
24 * beyond the rate limit specified while there is available
25 * bandwidth. Current implementation assumes there is only
26 * one NIC (eth0), but can be extended to support multiple NICs.
27 * Currently only supported for egress.
28 * -h Print this info
29 * prog BPF program file name. Name defaults to hbm_out_kern.o
30 */
31
32 #define _GNU_SOURCE
33
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <assert.h>
37 #include <sys/resource.h>
38 #include <sys/time.h>
39 #include <unistd.h>
40 #include <errno.h>
41 #include <fcntl.h>
42 #include <linux/unistd.h>
43
44 #include <linux/bpf.h>
45 #include <bpf/bpf.h>
46 #include <getopt.h>
47
48 #include "bpf_load.h"
49 #include "bpf_rlimit.h"
50 #include "cgroup_helpers.h"
51 #include "hbm.h"
52 #include "bpf_util.h"
53 #include "bpf.h"
54 #include "libbpf.h"
55
/*
 * Run-time configuration, filled in by main() from the command line and
 * consumed by run_bpf_prog().
 */
/* Direction selector: run_bpf_prog() attaches as ingress when this is false.
 * Nothing in this file currently clears it (the -o option is a no-op).
 */
56 bool outFlag = true;
57 int minRate = 1000; /* cgroup rate limit in Mbps */
58 int rate = 1000; /* can grow if rate conserving is enabled */
/* Run time in seconds (-t); used for sleep() and the work-conserving loop. */
59 int dur = 1;
/* -s: copied into the map's "stats" field and enables the stats report. */
60 bool stats_flag;
/* -l: copied into the map's "loopback" field. */
61 bool loopback_flag;
/* -d: after the run, stream the trace pipe via read_trace_pipe2(). */
62 bool debugFlag;
/* -w: periodically adjust "rate" based on eth0 and cgroup throughput. */
63 bool work_conserving_flag;
/* --no_cn: copied into the map's "no_cn" field. */
64 bool no_cn_flag;
/* --edt: selects hbm_edt_kern.o and the "avg credit_ms" stats format. */
65 bool edt_flag;
66
/* Forward declarations for helpers defined later in this file. */
67 static void Usage(void);
68 static void read_trace_pipe2(void);
69 static void do_error(char *msg, bool errno_flag);
70
/* Directory prefix used to build the trace_pipe path. */
71 #define DEBUGFS "/sys/kernel/debug/tracing/"
72
/* Loaded BPF object and program fd; both are set by prog_load(). */
73 struct bpf_object *obj;
74 int bpfprog_fd;
/* NOTE(review): not referenced anywhere in the visible part of this file. */
75 int cgroup_storage_fd;
76
read_trace_pipe2(void)77 static void read_trace_pipe2(void)
78 {
79 int trace_fd;
80 FILE *outf;
81 char *outFname = "hbm_out.log";
82
83 trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
84 if (trace_fd < 0) {
85 printf("Error opening trace_pipe\n");
86 return;
87 }
88
89 // Future support of ingress
90 // if (!outFlag)
91 // outFname = "hbm_in.log";
92 outf = fopen(outFname, "w");
93
94 if (outf == NULL)
95 printf("Error creating %s\n", outFname);
96
97 while (1) {
98 static char buf[4097];
99 ssize_t sz;
100
101 sz = read(trace_fd, buf, sizeof(buf) - 1);
102 if (sz > 0) {
103 buf[sz] = 0;
104 puts(buf);
105 if (outf != NULL) {
106 fprintf(outf, "%s\n", buf);
107 fflush(outf);
108 }
109 }
110 }
111 }
112
/*
 * Report a fatal error on stdout and terminate the process with status 1.
 *
 * @msg:        text describing what failed
 * @errno_flag: when true, the current errno value is appended to the
 *              message
 *
 * This function does not return.
 */
static void do_error(char *msg, bool errno_flag)
{
	if (!errno_flag) {
		printf("ERROR: %s\n", msg);
		exit(1);
	}

	printf("ERROR: %s, errno: %d\n", msg, errno);
	exit(1);
}
121
prog_load(char * prog)122 static int prog_load(char *prog)
123 {
124 struct bpf_prog_load_attr prog_load_attr = {
125 .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
126 .file = prog,
127 .expected_attach_type = BPF_CGROUP_INET_EGRESS,
128 };
129 int map_fd;
130 struct bpf_map *map;
131
132 int ret = 0;
133
134 if (access(prog, O_RDONLY) < 0) {
135 printf("Error accessing file %s: %s\n", prog, strerror(errno));
136 return 1;
137 }
138 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd))
139 ret = 1;
140 if (!ret) {
141 map = bpf_object__find_map_by_name(obj, "queue_stats");
142 map_fd = bpf_map__fd(map);
143 if (map_fd < 0) {
144 printf("Map not found: %s\n", strerror(map_fd));
145 ret = 1;
146 }
147 }
148
149 if (ret) {
150 printf("ERROR: load_bpf_file failed for: %s\n", prog);
151 printf(" Output from verifier:\n%s\n------\n", bpf_log_buf);
152 ret = -1;
153 } else {
154 ret = map_fd;
155 }
156
157 return ret;
158 }
159
run_bpf_prog(char * prog,int cg_id)160 static int run_bpf_prog(char *prog, int cg_id)
161 {
162 int map_fd;
163 int rc = 0;
164 int key = 0;
165 int cg1 = 0;
166 int type = BPF_CGROUP_INET_EGRESS;
167 char cg_dir[100];
168 struct hbm_queue_stats qstats = {0};
169
170 sprintf(cg_dir, "/hbm%d", cg_id);
171 map_fd = prog_load(prog);
172 if (map_fd == -1)
173 return 1;
174
175 if (setup_cgroup_environment()) {
176 printf("ERROR: setting cgroup environment\n");
177 goto err;
178 }
179 cg1 = create_and_get_cgroup(cg_dir);
180 if (!cg1) {
181 printf("ERROR: create_and_get_cgroup\n");
182 goto err;
183 }
184 if (join_cgroup(cg_dir)) {
185 printf("ERROR: join_cgroup\n");
186 goto err;
187 }
188
189 qstats.rate = rate;
190 qstats.stats = stats_flag ? 1 : 0;
191 qstats.loopback = loopback_flag ? 1 : 0;
192 qstats.no_cn = no_cn_flag ? 1 : 0;
193 if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) {
194 printf("ERROR: Could not update map element\n");
195 goto err;
196 }
197
198 if (!outFlag)
199 type = BPF_CGROUP_INET_INGRESS;
200 if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) {
201 printf("ERROR: bpf_prog_attach fails!\n");
202 log_err("Attaching prog");
203 goto err;
204 }
205
206 if (work_conserving_flag) {
207 struct timeval t0, t_last, t_new;
208 FILE *fin;
209 unsigned long long last_eth_tx_bytes, new_eth_tx_bytes;
210 signed long long last_cg_tx_bytes, new_cg_tx_bytes;
211 signed long long delta_time, delta_bytes, delta_rate;
212 int delta_ms;
213 #define DELTA_RATE_CHECK 10000 /* in us */
214 #define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */
215
216 bpf_map_lookup_elem(map_fd, &key, &qstats);
217 if (gettimeofday(&t0, NULL) < 0)
218 do_error("gettimeofday failed", true);
219 t_last = t0;
220 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r");
221 if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1)
222 do_error("fscanf fails", false);
223 fclose(fin);
224 last_cg_tx_bytes = qstats.bytes_total;
225 while (true) {
226 usleep(DELTA_RATE_CHECK);
227 if (gettimeofday(&t_new, NULL) < 0)
228 do_error("gettimeofday failed", true);
229 delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 +
230 (t_new.tv_usec - t0.tv_usec)/1000;
231 if (delta_ms > dur * 1000)
232 break;
233 delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 +
234 (t_new.tv_usec - t_last.tv_usec);
235 if (delta_time == 0)
236 continue;
237 t_last = t_new;
238 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes",
239 "r");
240 if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1)
241 do_error("fscanf fails", false);
242 fclose(fin);
243 printf(" new_eth_tx_bytes:%llu\n",
244 new_eth_tx_bytes);
245 bpf_map_lookup_elem(map_fd, &key, &qstats);
246 new_cg_tx_bytes = qstats.bytes_total;
247 delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes;
248 last_eth_tx_bytes = new_eth_tx_bytes;
249 delta_rate = (delta_bytes * 8000000) / delta_time;
250 printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps",
251 delta_ms, delta_rate/1000000000.0,
252 rate/1000.0);
253 if (delta_rate < RATE_THRESHOLD) {
254 /* can increase cgroup rate limit, but first
255 * check if we are using the current limit.
256 * Currently increasing by 6.25%, unknown
257 * if that is the optimal rate.
258 */
259 int rate_diff100;
260
261 delta_bytes = new_cg_tx_bytes -
262 last_cg_tx_bytes;
263 last_cg_tx_bytes = new_cg_tx_bytes;
264 delta_rate = (delta_bytes * 8000000) /
265 delta_time;
266 printf(" rate:%.3fGbps",
267 delta_rate/1000000000.0);
268 rate_diff100 = (((long long)rate)*1000000 -
269 delta_rate) * 100 /
270 (((long long) rate) * 1000000);
271 printf(" rdiff:%d", rate_diff100);
272 if (rate_diff100 <= 3) {
273 rate += (rate >> 4);
274 if (rate > RATE_THRESHOLD / 1000000)
275 rate = RATE_THRESHOLD / 1000000;
276 qstats.rate = rate;
277 printf(" INC\n");
278 } else {
279 printf("\n");
280 }
281 } else {
282 /* Need to decrease cgroup rate limit.
283 * Currently decreasing by 12.5%, unknown
284 * if that is optimal
285 */
286 printf(" DEC\n");
287 rate -= (rate >> 3);
288 if (rate < minRate)
289 rate = minRate;
290 qstats.rate = rate;
291 }
292 if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY))
293 do_error("update map element fails", false);
294 }
295 } else {
296 sleep(dur);
297 }
298 // Get stats!
299 if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) {
300 char fname[100];
301 FILE *fout;
302
303 if (!outFlag)
304 sprintf(fname, "hbm.%d.in", cg_id);
305 else
306 sprintf(fname, "hbm.%d.out", cg_id);
307 fout = fopen(fname, "w");
308 fprintf(fout, "id:%d\n", cg_id);
309 fprintf(fout, "ERROR: Could not lookup queue_stats\n");
310 } else if (stats_flag && qstats.lastPacketTime >
311 qstats.firstPacketTime) {
312 long long delta_us = (qstats.lastPacketTime -
313 qstats.firstPacketTime)/1000;
314 unsigned int rate_mbps = ((qstats.bytes_total -
315 qstats.bytes_dropped) * 8 /
316 delta_us);
317 double percent_pkts, percent_bytes;
318 char fname[100];
319 FILE *fout;
320 int k;
321 static const char *returnValNames[] = {
322 "DROP_PKT",
323 "ALLOW_PKT",
324 "DROP_PKT_CWR",
325 "ALLOW_PKT_CWR"
326 };
327 #define RET_VAL_COUNT 4
328
329 // Future support of ingress
330 // if (!outFlag)
331 // sprintf(fname, "hbm.%d.in", cg_id);
332 // else
333 sprintf(fname, "hbm.%d.out", cg_id);
334 fout = fopen(fname, "w");
335 fprintf(fout, "id:%d\n", cg_id);
336 fprintf(fout, "rate_mbps:%d\n", rate_mbps);
337 fprintf(fout, "duration:%.1f secs\n",
338 (qstats.lastPacketTime - qstats.firstPacketTime) /
339 1000000000.0);
340 fprintf(fout, "packets:%d\n", (int)qstats.pkts_total);
341 fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total /
342 1000000));
343 fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped);
344 fprintf(fout, "bytes_dropped_MB:%d\n",
345 (int)(qstats.bytes_dropped /
346 1000000));
347 // Marked Pkts and Bytes
348 percent_pkts = (qstats.pkts_marked * 100.0) /
349 (qstats.pkts_total + 1);
350 percent_bytes = (qstats.bytes_marked * 100.0) /
351 (qstats.bytes_total + 1);
352 fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts);
353 fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes);
354
355 // Dropped Pkts and Bytes
356 percent_pkts = (qstats.pkts_dropped * 100.0) /
357 (qstats.pkts_total + 1);
358 percent_bytes = (qstats.bytes_dropped * 100.0) /
359 (qstats.bytes_total + 1);
360 fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts);
361 fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes);
362
363 // ECN CE markings
364 percent_pkts = (qstats.pkts_ecn_ce * 100.0) /
365 (qstats.pkts_total + 1);
366 fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts,
367 (int)qstats.pkts_ecn_ce);
368
369 // Average cwnd
370 fprintf(fout, "avg cwnd:%d\n",
371 (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1)));
372 // Average rtt
373 fprintf(fout, "avg rtt:%d\n",
374 (int)(qstats.sum_rtt / (qstats.pkts_total + 1)));
375 // Average credit
376 if (edt_flag)
377 fprintf(fout, "avg credit_ms:%.03f\n",
378 (qstats.sum_credit /
379 (qstats.pkts_total + 1.0)) / 1000000.0);
380 else
381 fprintf(fout, "avg credit:%d\n",
382 (int)(qstats.sum_credit /
383 (1500 * ((int)qstats.pkts_total ) + 1)));
384
385 // Return values stats
386 for (k = 0; k < RET_VAL_COUNT; k++) {
387 percent_pkts = (qstats.returnValCount[k] * 100.0) /
388 (qstats.pkts_total + 1);
389 fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k],
390 percent_pkts, (int)qstats.returnValCount[k]);
391 }
392 fclose(fout);
393 }
394
395 if (debugFlag)
396 read_trace_pipe2();
397 return rc;
398 err:
399 rc = 1;
400
401 if (cg1)
402 close(cg1);
403 cleanup_cgroup_environment();
404
405 return rc;
406 }
407
/*
 * Help text printed by Usage().  Kept as a separate object so the text
 * can be inspected without capturing stdout.  The -t default matches the
 * initial value of "dur" (1 second) and the synopsis lists --edt, which
 * is described in the option list.
 */
static const char hbm_usage_text[] =
	"This program loads a cgroup skb BPF program to enforce\n"
	"cgroup output (egress) bandwidth limits.\n\n"
	"USAGE: hbm [-o] [-d] [--edt] [-l] [-n <id>] [--no_cn] [-r <rate>]\n"
	" [-s] [-t <secs>] [-w] [-h] [prog]\n"
	" Where:\n"
	" -o indicates egress direction (default)\n"
	" -d print BPF trace debug buffer\n"
	" --edt use fq's Earliest Departure Time\n"
	" -l also limit flows using loopback\n"
	" -n <#> to create cgroup \"/hbm#\" and attach prog\n"
	" Default is /hbm1\n"
	" --no_cn disable CN notifications\n"
	" -r <rate> Rate in Mbps\n"
	" -s Update HBM stats\n"
	" -t <time> Exit after specified seconds (default is 1)\n"
	" -w Work conserving flag. cgroup can increase\n"
	" bandwidth beyond the rate limit specified\n"
	" while there is available bandwidth. Current\n"
	" implementation assumes there is only eth0\n"
	" but can be extended to support multiple NICs\n"
	" -h print this info\n"
	" prog BPF program file name. Name defaults to\n"
	" hbm_out_kern.o\n";

/* Print the command-line help to stdout. */
static void Usage(void)
{
	fputs(hbm_usage_text, stdout);
}
434
main(int argc,char ** argv)435 int main(int argc, char **argv)
436 {
437 char *prog = "hbm_out_kern.o";
438 int k;
439 int cg_id = 1;
440 char *optstring = "iodln:r:st:wh";
441 struct option loptions[] = {
442 {"no_cn", 0, NULL, 1},
443 {"edt", 0, NULL, 2},
444 {NULL, 0, NULL, 0}
445 };
446
447 while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) {
448 switch (k) {
449 case 1:
450 no_cn_flag = true;
451 break;
452 case 2:
453 prog = "hbm_edt_kern.o";
454 edt_flag = true;
455 break;
456 case'o':
457 break;
458 case 'd':
459 debugFlag = true;
460 break;
461 case 'l':
462 loopback_flag = true;
463 break;
464 case 'n':
465 cg_id = atoi(optarg);
466 break;
467 case 'r':
468 minRate = atoi(optarg) * 1.024;
469 rate = minRate;
470 break;
471 case 's':
472 stats_flag = true;
473 break;
474 case 't':
475 dur = atoi(optarg);
476 break;
477 case 'w':
478 work_conserving_flag = true;
479 break;
480 case '?':
481 if (optopt == 'n' || optopt == 'r' || optopt == 't')
482 fprintf(stderr,
483 "Option -%c requires an argument.\n\n",
484 optopt);
485 case 'h':
486 // fallthrough
487 default:
488 Usage();
489 return 0;
490 }
491 }
492
493 if (optind < argc)
494 prog = argv[optind];
495 printf("HBM prog: %s\n", prog != NULL ? prog : "NULL");
496
497 return run_bpf_prog(prog, cg_id);
498 }
499