From 43736ec3e02789795d4e7b9cb49a005fa56c0171 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 13 Jun 2025 00:21:46 -0700 Subject: [PATCH 1/2] bpf: Include verifier memory allocations in memcg statistics This commit adds __GFP_ACCOUNT flag to verifier induced memory allocations. The intent is to account for all allocations reachable from BPF_PROG_LOAD command, which is needed to track verifier memory consumption in veristat. This includes allocations done in verifier.c, and some allocations in btf.c, functions in log.c do not allocate. There is also a utility function bpf_memcg_flags() which selectively adds GFP_ACCOUNT flag depending on the `cgroup.memory=nobpf` option. As far as I understand [1], the idea is to remove bpf_prog instances and maps from memcg accounting as these objects do not strictly belong to cgroup, hence it should not apply here. (btf_parse_fields() is reachable from both program load and map creation, but allocated record is not persistent as is freed as soon as map_check_btf() exits). [1] https://lore.kernel.org/all/20230210154734.4416-1-laoar.shao@gmail.com/ Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250613072147.3938139-2-eddyz87@gmail.com --- kernel/bpf/btf.c | 15 +++++----- kernel/bpf/verifier.c | 69 ++++++++++++++++++++++--------------------- 2 files changed, 43 insertions(+), 41 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 1d2cf898e21e..682acb1ed234 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3443,7 +3443,8 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt, node_field_name = strstr(value_type, ":"); if (!node_field_name) return -EINVAL; - value_type = kstrndup(value_type, node_field_name - value_type, GFP_KERNEL | __GFP_NOWARN); + value_type = kstrndup(value_type, node_field_name - value_type, + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!value_type) return -ENOMEM; id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT); @@ -3958,7 +3959,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* This needs to be kzalloc to zero out padding and unused fields, see * comment in btf_record_equal. */ - rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL | __GFP_NOWARN); + rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!rec) return ERR_PTR(-ENOMEM); @@ -9019,7 +9020,7 @@ static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands, bpf_free_cands_from_cache(*cc); *cc = NULL; } - new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL); + new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL_ACCOUNT); if (!new_cands) { bpf_free_cands(cands); return ERR_PTR(-ENOMEM); @@ -9027,7 +9028,7 @@ static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands, /* strdup the name, since it will stay in cache. * the cands->name points to strings in prog's BTF and the prog can be unloaded. */ - new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL); + new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL_ACCOUNT); bpf_free_cands(cands); if (!new_cands->name) { kfree(new_cands); @@ -9111,7 +9112,7 @@ bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf, continue; /* most of the time there is only one candidate for a given kind+name pair */ - new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL); + new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL_ACCOUNT); if (!new_cands) { bpf_free_cands(cands); return ERR_PTR(-ENOMEM); @@ -9228,7 +9229,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5" * into arrays of btf_ids of struct fields and array indices. */ - specs = kcalloc(3, sizeof(*specs), GFP_KERNEL); + specs = kcalloc(3, sizeof(*specs), GFP_KERNEL_ACCOUNT); if (!specs) return -ENOMEM; @@ -9253,7 +9254,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, goto out; } if (cc->cnt) { - cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL); + cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL_ACCOUNT); if (!cands.cands) { err = -ENOMEM; goto out; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c378074516cf..fd81cff11617 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1404,7 +1404,7 @@ static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size) goto out; alloc_size = kmalloc_size_roundup(size_mul(new_n, size)); - new_arr = krealloc(arr, alloc_size, GFP_KERNEL); + new_arr = krealloc(arr, alloc_size, GFP_KERNEL_ACCOUNT); if (!new_arr) { kfree(arr); return NULL; @@ -1421,7 +1421,7 @@ static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size) static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf_verifier_state *src) { dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs, - sizeof(struct bpf_reference_state), GFP_KERNEL); + sizeof(struct bpf_reference_state), GFP_KERNEL_ACCOUNT); if (!dst->refs) return -ENOMEM; @@ -1440,7 +1440,7 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st size_t n = src->allocated_stack / BPF_REG_SIZE; dst->stack = copy_array(dst->stack, src->stack, n, sizeof(struct bpf_stack_state), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!dst->stack) return -ENOMEM; @@ -1731,7 +1731,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history, src->jmp_history_cnt, sizeof(*dst_state->jmp_history), - GFP_USER); + GFP_KERNEL_ACCOUNT); if (!dst_state->jmp_history) return -ENOMEM; dst_state->jmp_history_cnt = src->jmp_history_cnt; @@ -1760,7 +1760,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { - dst = kzalloc(sizeof(*dst), GFP_KERNEL); + dst = kzalloc(sizeof(*dst), GFP_KERNEL_ACCOUNT); if (!dst) return -ENOMEM; dst_state->frame[i] = dst; @@ -1874,7 +1874,7 @@ static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env, info = env->scc_info[scc]; num_visits = info ? info->num_visits : 0; new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1); - info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL); + info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL_ACCOUNT); if (!info) return NULL; env->scc_info[scc] = info; @@ -2095,7 +2095,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, struct bpf_verifier_stack_elem *elem; int err; - elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) goto err; @@ -2862,7 +2862,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, struct bpf_verifier_stack_elem *elem; struct bpf_func_state *frame; - elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) goto err; @@ -2885,7 +2885,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, */ elem->st.branches = 1; elem->st.in_sleepable = is_sleepable; - frame = kzalloc(sizeof(*frame), GFP_KERNEL); + frame = kzalloc(sizeof(*frame), GFP_KERNEL_ACCOUNT); if (!frame) goto err; init_func_state(env, frame, @@ -3237,7 +3237,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return -EINVAL; } - tab = kzalloc(sizeof(*tab), GFP_KERNEL); + tab = kzalloc(sizeof(*tab), GFP_KERNEL_ACCOUNT); if (!tab) return -ENOMEM; prog_aux->kfunc_tab = tab; @@ -3253,7 +3253,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return 0; if (!btf_tab && offset) { - btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL); + btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL_ACCOUNT); if (!btf_tab) return -ENOMEM; prog_aux->kfunc_btf_tab = btf_tab; @@ -3939,7 +3939,7 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st cnt++; alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); - p = krealloc(cur->jmp_history, alloc_size, GFP_USER); + p = krealloc(cur->jmp_history, alloc_size, GFP_KERNEL_ACCOUNT); if (!p) return -ENOMEM; cur->jmp_history = p; @@ -10356,7 +10356,7 @@ static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int calls } caller = state->frame[state->curframe]; - callee = kzalloc(sizeof(*callee), GFP_KERNEL); + callee = kzalloc(sizeof(*callee), GFP_KERNEL_ACCOUNT); if (!callee) return -ENOMEM; state->frame[state->curframe + 1] = callee; @@ -17693,17 +17693,18 @@ static int check_cfg(struct bpf_verifier_env *env) int *insn_stack, *insn_state, *insn_postorder; int ex_insn_beg, i, ret = 0; - insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); if (!insn_state) return -ENOMEM; - insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); if (!insn_stack) { kvfree(insn_state); return -ENOMEM; } - insn_postorder = env->cfg.insn_postorder = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_postorder = env->cfg.insn_postorder = + kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); if (!insn_postorder) { kvfree(insn_state); kvfree(insn_stack); @@ -17837,7 +17838,7 @@ static int check_btf_func_early(struct bpf_verifier_env *env, urecord = make_bpfptr(attr->func_info, uattr.is_kernel); min_size = min_t(u32, krec_size, urec_size); - krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); + krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!krecord) return -ENOMEM; @@ -17937,7 +17938,7 @@ static int check_btf_func(struct bpf_verifier_env *env, urecord = make_bpfptr(attr->func_info, uattr.is_kernel); krecord = prog->aux->func_info; - info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN); + info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!info_aux) return -ENOMEM; @@ -18023,7 +18024,7 @@ static int check_btf_line(struct bpf_verifier_env *env, * pass in a smaller bpf_line_info object. */ linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info), - GFP_KERNEL | __GFP_NOWARN); + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!linfo) return -ENOMEM; @@ -19408,7 +19409,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) if (loop) { struct bpf_scc_backedge *backedge; - backedge = kzalloc(sizeof(*backedge), GFP_KERNEL); + backedge = kzalloc(sizeof(*backedge), GFP_KERNEL_ACCOUNT); if (!backedge) return -ENOMEM; err = copy_verifier_state(&backedge->state, cur); @@ -19472,7 +19473,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * When looping the sl->state.branches will be > 0 and this state * will not be considered for equivalence until branches == 0. */ - new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); + new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL_ACCOUNT); if (!new_sl) return -ENOMEM; env->total_states++; @@ -22976,13 +22977,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) env->prev_linfo = NULL; env->pass_cnt++; - state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL_ACCOUNT); if (!state) return -ENOMEM; state->curframe = 0; state->speculative = false; state->branches = 1; - state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); + state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL_ACCOUNT); if (!state->frame[0]) { kfree(state); return -ENOMEM; @@ -23208,7 +23209,7 @@ static void print_verification_stats(struct bpf_verifier_env *env) int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt) { - prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL); + prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL_ACCOUNT); prog->aux->ctx_arg_info_size = cnt; return prog->aux->ctx_arg_info ? 0 : -ENOMEM; @@ -24152,7 +24153,7 @@ static int compute_live_registers(struct bpf_verifier_env *env) * - repeat the computation while {in,out} fields changes for * any instruction. */ - state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL); + state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL_ACCOUNT); if (!state) { err = -ENOMEM; goto out; @@ -24244,10 +24245,10 @@ static int compute_scc(struct bpf_verifier_env *env) * - 'low[t] == n' => smallest preorder number of the vertex reachable from 't' is 'n'; * - 'dfs' DFS traversal stack, used to emulate explicit recursion. */ - stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); - pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); - low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); - dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL); + stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); + pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); + low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); + dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL_ACCOUNT); if (!stack || !pre || !low || !dfs) { err = -ENOMEM; goto exit; @@ -24381,7 +24382,7 @@ static int compute_scc(struct bpf_verifier_env *env) dfs_sz--; } } - env->scc_info = kvcalloc(next_scc_id, sizeof(*env->scc_info), GFP_KERNEL); + env->scc_info = kvcalloc(next_scc_id, sizeof(*env->scc_info), GFP_KERNEL_ACCOUNT); if (!env->scc_info) { err = -ENOMEM; goto exit; @@ -24409,7 +24410,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ - env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); + env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL_ACCOUNT); if (!env) return -ENOMEM; @@ -24472,7 +24473,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 env->explored_states = kvcalloc(state_htab_size(env), sizeof(struct list_head), - GFP_USER); + GFP_KERNEL_ACCOUNT); ret = -ENOMEM; if (!env->explored_states) goto skip_full_check; @@ -24603,7 +24604,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 /* if program passed verifier, update used_maps in bpf_prog_info */ env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, sizeof(env->used_maps[0]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!env->prog->aux->used_maps) { ret = -ENOMEM; @@ -24618,7 +24619,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 /* if program passed verifier, update used_btfs in bpf_prog_aux */ env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt, sizeof(env->used_btfs[0]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!env->prog->aux->used_btfs) { ret = -ENOMEM; goto err_release_maps; From 67cdcc405b46c13446d6d220a108daa2f8de3436 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 13 Jun 2025 00:21:47 -0700 Subject: [PATCH 2/2] veristat: Memory accounting for bpf programs This commit adds a new field mem_peak / "Peak memory (MiB)" field to a set of gathered statistics. The field is intended as an estimate for peak verifier memory consumption for processing of a given program. Mechanically stat is collected as follows: - At the beginning of handle_verif_mode() a new cgroup is created and veristat process is moved into this cgroup. - At each program load: - bpf_object__load() is split into bpf_object__prepare() and bpf_object__load() to avoid accounting for memory allocated for maps; - before bpf_object__load(): - a write to "memory.peak" file of the new cgroup is used to reset cgroup statistics; - updated value is read from "memory.peak" file and stashed; - after bpf_object__load() "memory.peak" is read again and difference between new and stashed values is used as a metric. If any of the above steps fails veristat proceeds w/o collecting mem_peak information for a program, reporting mem_peak as -1. While memcg provides data in bytes (converted from pages), veristat converts it to megabytes to avoid jitter when comparing results of different executions. The change has no measurable impact on veristat running time. A correlation between "Peak states" and "Peak memory" fields provides a sanity check for gathered statistics, e.g. a sample of data for sched_ext programs: Program Peak states Peak memory (MiB) ------------------------ ----------- ----------------- lavd_select_cpu 2153 44 lavd_enqueue 1982 41 lavd_dispatch 3480 28 layered_dispatch 1417 17 layered_enqueue 760 11 lavd_cpu_offline 349 6 lavd_cpu_online 349 6 lavd_init 394 6 rusty_init 350 5 layered_select_cpu 391 4 ... rusty_stopping 134 1 arena_topology_node_init 170 0 Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250613072147.3938139-3-eddyz87@gmail.com --- tools/testing/selftests/bpf/Makefile | 5 + tools/testing/selftests/bpf/veristat.c | 248 ++++++++++++++++++++++++- 2 files changed, 246 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index cf5ed3bee573..97013c49920b 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -841,6 +841,11 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ +# This works around GCC warning about snprintf truncating strings like: +# +# char a[PATH_MAX], b[PATH_MAX]; +# snprintf(a, "%s/foo", b); // triggers -Wformat-truncation +$(OUTPUT)/veristat.o: CFLAGS += -Wno-format-truncation $(OUTPUT)/veristat.o: $(BPFOBJ) $(OUTPUT)/veristat: $(OUTPUT)/veristat.o $(call msg,BINARY,,$@) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 70cdabd88c50..56771650ee8c 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -49,6 +49,7 @@ enum stat_id { STACK, PROG_TYPE, ATTACH_TYPE, + MEMORY_PEAK, FILE_NAME, PROG_NAME, @@ -208,6 +209,9 @@ static struct env { int top_src_lines; struct var_preset *presets; int npresets; + char orig_cgroup[PATH_MAX]; + char stat_cgroup[PATH_MAX]; + int memory_peak_fd; } env; static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) @@ -219,6 +223,22 @@ static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va return vfprintf(stderr, format, args); } +#define log_errno(fmt, ...) log_errno_aux(__FILE__, __LINE__, fmt, ##__VA_ARGS__) + +__printf(3, 4) +static int log_errno_aux(const char *file, int line, const char *fmt, ...) +{ + int err = -errno; + va_list ap; + + va_start(ap, fmt); + fprintf(stderr, "%s:%d: ", file, line); + vfprintf(stderr, fmt, ap); + fprintf(stderr, " failed with error '%s'.\n", strerror(errno)); + va_end(ap); + return err; +} + #ifndef VERISTAT_VERSION #define VERISTAT_VERSION "" #endif @@ -734,13 +754,13 @@ static int append_file_from_file(const char *path) } static const struct stat_specs default_csv_output_spec = { - .spec_cnt = 14, + .spec_cnt = 15, .ids = { FILE_NAME, PROG_NAME, VERDICT, DURATION, TOTAL_INSNS, TOTAL_STATES, PEAK_STATES, MAX_STATES_PER_INSN, MARK_READ_MAX_LEN, SIZE, JITED_SIZE, PROG_TYPE, ATTACH_TYPE, - STACK, + STACK, MEMORY_PEAK, }, }; @@ -781,6 +801,7 @@ static struct stat_def { [STACK] = {"Stack depth", {"stack_depth", "stack"}, }, [PROG_TYPE] = { "Program type", {"prog_type"}, }, [ATTACH_TYPE] = { "Attach type", {"attach_type", }, }, + [MEMORY_PEAK] = { "Peak memory (MiB)", {"mem_peak", }, }, }; static bool parse_stat_id_var(const char *name, size_t len, int *id, @@ -1279,16 +1300,214 @@ static int max_verifier_log_size(void) return log_size; } +static bool output_stat_enabled(int id) +{ + int i; + + for (i = 0; i < env.output_spec.spec_cnt; i++) + if (env.output_spec.ids[i] == id) + return true; + return false; +} + +__printf(2, 3) +static int write_one_line(const char *file, const char *fmt, ...) +{ + int err, saved_errno; + va_list ap; + FILE *f; + + f = fopen(file, "w"); + if (!f) + return -1; + + va_start(ap, fmt); + errno = 0; + err = vfprintf(f, fmt, ap); + saved_errno = errno; + va_end(ap); + fclose(f); + errno = saved_errno; + return err < 0 ? -1 : 0; +} + +__scanf(3, 4) +static int scanf_one_line(const char *file, int fields_expected, const char *fmt, ...) +{ + int res = 0, saved_errno = 0; + char *line = NULL; + size_t line_len; + va_list ap; + FILE *f; + + f = fopen(file, "r"); + if (!f) + return -1; + + va_start(ap, fmt); + while (getline(&line, &line_len, f) > 0) { + res = vsscanf(line, fmt, ap); + if (res == fields_expected) + goto out; + } + if (ferror(f)) { + saved_errno = errno; + res = -1; + } + +out: + va_end(ap); + free(line); + fclose(f); + errno = saved_errno; + return res; +} + +static void destroy_stat_cgroup(void) +{ + char buf[PATH_MAX]; + int err; + + close(env.memory_peak_fd); + + if (env.orig_cgroup[0]) { + snprintf(buf, sizeof(buf), "%s/cgroup.procs", env.orig_cgroup); + err = write_one_line(buf, "%d\n", getpid()); + if (err < 0) + log_errno("moving self to original cgroup %s\n", env.orig_cgroup); + } + + if (env.stat_cgroup[0]) { + err = rmdir(env.stat_cgroup); + if (err < 0) + log_errno("deletion of cgroup %s", env.stat_cgroup); + } + + env.memory_peak_fd = -1; + env.orig_cgroup[0] = 0; + env.stat_cgroup[0] = 0; +} + +/* + * Creates a cgroup at /sys/fs/cgroup/veristat-accounting-, + * moves current process to this cgroup. + */ +static void create_stat_cgroup(void) +{ + char cgroup_fs_mount[4096]; + char buf[4096]; + int err; + + env.memory_peak_fd = -1; + + if (!output_stat_enabled(MEMORY_PEAK)) + return; + + err = scanf_one_line("/proc/self/mounts", 2, "%*s %4095s cgroup2 %s", + cgroup_fs_mount, buf); + if (err != 2) { + if (err < 0) + log_errno("reading /proc/self/mounts"); + else if (!env.quiet) + fprintf(stderr, "Can't find cgroupfs v2 mount point.\n"); + goto err_out; + } + + /* cgroup-v2.rst promises the line "0::" for cgroups v2 */ + err = scanf_one_line("/proc/self/cgroup", 1, "0::%4095s", buf); + if (err != 1) { + if (err < 0) + log_errno("reading /proc/self/cgroup"); + else if (!env.quiet) + fprintf(stderr, "Can't infer veristat process cgroup."); + goto err_out; + } + + snprintf(env.orig_cgroup, sizeof(env.orig_cgroup), "%s/%s", cgroup_fs_mount, buf); + + snprintf(buf, sizeof(buf), "%s/veristat-accounting-%d", cgroup_fs_mount, getpid()); + err = mkdir(buf, 0777); + if (err < 0) { + log_errno("creation of cgroup %s", buf); + goto err_out; + } + strcpy(env.stat_cgroup, buf); + + snprintf(buf, sizeof(buf), "%s/cgroup.procs", env.stat_cgroup); + err = write_one_line(buf, "%d\n", getpid()); + if (err < 0) { + log_errno("entering cgroup %s", buf); + goto err_out; + } + + snprintf(buf, sizeof(buf), "%s/memory.peak", env.stat_cgroup); + env.memory_peak_fd = open(buf, O_RDWR | O_APPEND); + if (env.memory_peak_fd < 0) { + log_errno("opening %s", buf); + goto err_out; + } + + return; + +err_out: + if (!env.quiet) + fprintf(stderr, "Memory usage metric unavailable.\n"); + destroy_stat_cgroup(); +} + +/* Current value of /sys/fs/cgroup/veristat-accounting-/memory.peak */ +static long cgroup_memory_peak(void) +{ + long err, memory_peak; + char buf[32]; + + if (env.memory_peak_fd < 0) + return -1; + + err = pread(env.memory_peak_fd, buf, sizeof(buf) - 1, 0); + if (err <= 0) { + log_errno("pread(%s/memory.peak)", env.stat_cgroup); + return -1; + } + + buf[err] = 0; + errno = 0; + memory_peak = strtoll(buf, NULL, 10); + if (errno) { + log_errno("%s/memory.peak:strtoll(%s)", env.stat_cgroup, buf); + return -1; + } + + return memory_peak; +} + +static int reset_stat_cgroup(void) +{ + char buf[] = "r\n"; + int err; + + if (env.memory_peak_fd < 0) + return -1; + + err = pwrite(env.memory_peak_fd, buf, sizeof(buf), 0); + if (err <= 0) { + log_errno("pwrite(%s/memory.peak)", env.stat_cgroup); + return -1; + } + return 0; +} + static int process_prog(const char *filename, struct bpf_object *obj, struct bpf_program *prog) { const char *base_filename = basename(strdupa(filename)); const char *prog_name = bpf_program__name(prog); + long mem_peak_a, mem_peak_b, mem_peak = -1; char *buf; int buf_sz, log_level; struct verif_stats *stats; struct bpf_prog_info info; __u32 info_len = sizeof(info); - int err = 0; + int err = 0, cgroup_err; void *tmp; int fd; @@ -1333,7 +1552,15 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf if (env.force_reg_invariants) bpf_program__set_flags(prog, bpf_program__flags(prog) | BPF_F_TEST_REG_INVARIANTS); - err = bpf_object__load(obj); + err = bpf_object__prepare(obj); + if (!err) { + cgroup_err = reset_stat_cgroup(); + mem_peak_a = cgroup_memory_peak(); + err = bpf_object__load(obj); + mem_peak_b = cgroup_memory_peak(); + if (!cgroup_err && mem_peak_a >= 0 && mem_peak_b >= 0) + mem_peak = mem_peak_b - mem_peak_a; + } env.progs_processed++; stats->file_name = strdup(base_filename); @@ -1342,6 +1569,7 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf stats->stats[SIZE] = bpf_program__insn_cnt(prog); stats->stats[PROG_TYPE] = bpf_program__type(prog); stats->stats[ATTACH_TYPE] = bpf_program__expected_attach_type(prog); + stats->stats[MEMORY_PEAK] = mem_peak < 0 ? -1 : mem_peak / (1024 * 1024); memset(&info, 0, info_len); fd = bpf_program__fd(prog); @@ -1825,6 +2053,7 @@ static int cmp_stat(const struct verif_stats *s1, const struct verif_stats *s2, case TOTAL_STATES: case PEAK_STATES: case MAX_STATES_PER_INSN: + case MEMORY_PEAK: case MARK_READ_MAX_LEN: { long v1 = s1->stats[id]; long v2 = s2->stats[id]; @@ -2054,6 +2283,7 @@ static void prepare_value(const struct verif_stats *s, enum stat_id id, case STACK: case SIZE: case JITED_SIZE: + case MEMORY_PEAK: *val = s ? s->stats[id] : 0; break; default: @@ -2140,6 +2370,7 @@ static int parse_stat_value(const char *str, enum stat_id id, struct verif_stats case MARK_READ_MAX_LEN: case SIZE: case JITED_SIZE: + case MEMORY_PEAK: case STACK: { long val; int err, n; @@ -2777,7 +3008,7 @@ static void output_prog_stats(void) static int handle_verif_mode(void) { - int i, err; + int i, err = 0; if (env.filename_cnt == 0) { fprintf(stderr, "Please provide path to BPF object file!\n\n"); @@ -2785,11 +3016,12 @@ static int handle_verif_mode(void) return -EINVAL; } + create_stat_cgroup(); for (i = 0; i < env.filename_cnt; i++) { err = process_obj(env.filenames[i]); if (err) { fprintf(stderr, "Failed to process '%s': %d\n", env.filenames[i], err); - return err; + goto out; } } @@ -2797,7 +3029,9 @@ static int handle_verif_mode(void) output_prog_stats(); - return 0; +out: + destroy_stat_cgroup(); + return err; } static int handle_replay_mode(void)