Merge branch 'allow-mmap-of-sys-kernel-btf-vmlinux'

Lorenz Bauer says:

====================
Allow mmap of /sys/kernel/btf/vmlinux

I'd like to cut down the memory usage of parsing vmlinux BTF in ebpf-go.
With some upcoming changes the library is sitting at 5MiB for a parse.
Most of that memory is simply copying the BTF blob into user space.
By allowing vmlinux BTF to be mmapped read-only into user space I can
cut memory usage by about 75%.

Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
---
Changes in v5:
- Fix error return of btf_parse_raw_mmap (Andrii)
- Link to v4: https://lore.kernel.org/r/20250510-vmlinux-mmap-v4-0-69e424b2a672@isovalent.com

Changes in v4:
- Go back to remap_pfn_range for aarch64 compat
- Dropped btf_new_no_copy (Andrii)
- Fixed nits in selftests (Andrii)
- Clearer error handling in the mmap handler (Andrii)
- Fixed build on s390
- Link to v3: https://lore.kernel.org/r/20250505-vmlinux-mmap-v3-0-5d53afa060e8@isovalent.com

Changes in v3:
- Remove slightly confusing calculation of trailing (Alexei)
- Use vm_insert_page (Alexei)
- Simplified libbpf code
- Link to v2: https://lore.kernel.org/r/20250502-vmlinux-mmap-v2-0-95c271434519@isovalent.com

Changes in v2:
- Use btf__new in selftest
- Avoid vm_iomap_memory in btf_vmlinux_mmap
- Add VM_DONTDUMP
- Add support to libbpf
- Link to v1: https://lore.kernel.org/r/20250501-vmlinux-mmap-v1-0-aa2724572598@isovalent.com

---
====================

Link: https://patch.msgid.link/20250520-vmlinux-mmap-v5-0-e8c941acc414@isovalent.com
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
This commit is contained in:
Andrii Nakryiko
2025-05-23 10:06:28 -07:00
4 changed files with 186 additions and 19 deletions

View File

@@ -667,10 +667,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
*/
#ifdef CONFIG_DEBUG_INFO_BTF
#define BTF \
. = ALIGN(PAGE_SIZE); \
.BTF : AT(ADDR(.BTF) - LOAD_OFFSET) { \
BOUNDED_SECTION_BY(.BTF, _BTF) \
} \
. = ALIGN(4); \
. = ALIGN(PAGE_SIZE); \
.BTF_ids : AT(ADDR(.BTF_ids) - LOAD_OFFSET) { \
*(.BTF_ids) \
}

View File

@@ -7,14 +7,46 @@
#include <linux/kobject.h>
#include <linux/init.h>
#include <linux/sysfs.h>
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/btf.h>
/* See scripts/link-vmlinux.sh, gen_btf() func for details */
extern char __start_BTF[];
extern char __stop_BTF[];
static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
const struct bin_attribute *attr,
struct vm_area_struct *vma)
{
unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
size_t vm_size = vma->vm_end - vma->vm_start;
phys_addr_t addr = virt_to_phys(__start_BTF);
unsigned long pfn = addr >> PAGE_SHIFT;
if (attr->private != __start_BTF || !PAGE_ALIGNED(addr))
return -EINVAL;
if (vma->vm_pgoff)
return -EINVAL;
if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE))
return -EACCES;
if (pfn + pages < pfn)
return -EINVAL;
if ((vm_size >> PAGE_SHIFT) > pages)
return -EINVAL;
vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE);
return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot);
}
static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
.attr = { .name = "vmlinux", .mode = 0444, },
.read_new = sysfs_bin_attr_simple_read,
.mmap = btf_sysfs_vmlinux_mmap,
};
struct kobject *btf_kobj;

View File

@@ -12,6 +12,7 @@
#include <sys/utsname.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/btf.h>
@@ -120,6 +121,9 @@ struct btf {
/* whether base_btf should be freed in btf_free for this instance */
bool owns_base;
/* whether raw_data is a (read-only) mmap */
bool raw_data_is_mmap;
/* BTF object FD, if loaded into kernel */
int fd;
@@ -951,6 +955,17 @@ static bool btf_is_modifiable(const struct btf *btf)
return (void *)btf->hdr != btf->raw_data;
}
static void btf_free_raw_data(struct btf *btf)
{
if (btf->raw_data_is_mmap) {
munmap(btf->raw_data, btf->raw_size);
btf->raw_data_is_mmap = false;
} else {
free(btf->raw_data);
}
btf->raw_data = NULL;
}
void btf__free(struct btf *btf)
{
if (IS_ERR_OR_NULL(btf))
@@ -970,7 +985,7 @@ void btf__free(struct btf *btf)
free(btf->types_data);
strset__free(btf->strs_set);
}
free(btf->raw_data);
btf_free_raw_data(btf);
free(btf->raw_data_swapped);
free(btf->type_offs);
if (btf->owns_base)
@@ -1030,7 +1045,7 @@ struct btf *btf__new_empty_split(struct btf *base_btf)
return libbpf_ptr(btf_new_empty(base_btf));
}
static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf)
static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf, bool is_mmap)
{
struct btf *btf;
int err;
@@ -1050,12 +1065,18 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf)
btf->start_str_off = base_btf->hdr->str_len;
}
btf->raw_data = malloc(size);
if (!btf->raw_data) {
err = -ENOMEM;
goto done;
if (is_mmap) {
btf->raw_data = (void *)data;
btf->raw_data_is_mmap = true;
} else {
btf->raw_data = malloc(size);
if (!btf->raw_data) {
err = -ENOMEM;
goto done;
}
memcpy(btf->raw_data, data, size);
}
memcpy(btf->raw_data, data, size);
btf->raw_size = size;
btf->hdr = btf->raw_data;
@@ -1083,12 +1104,12 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf)
struct btf *btf__new(const void *data, __u32 size)
{
return libbpf_ptr(btf_new(data, size, NULL));
return libbpf_ptr(btf_new(data, size, NULL, false));
}
struct btf *btf__new_split(const void *data, __u32 size, struct btf *base_btf)
{
return libbpf_ptr(btf_new(data, size, base_btf));
return libbpf_ptr(btf_new(data, size, base_btf, false));
}
struct btf_elf_secs {
@@ -1209,7 +1230,7 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf,
if (secs.btf_base_data) {
dist_base_btf = btf_new(secs.btf_base_data->d_buf, secs.btf_base_data->d_size,
NULL);
NULL, false);
if (IS_ERR(dist_base_btf)) {
err = PTR_ERR(dist_base_btf);
dist_base_btf = NULL;
@@ -1218,7 +1239,7 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf,
}
btf = btf_new(secs.btf_data->d_buf, secs.btf_data->d_size,
dist_base_btf ?: base_btf);
dist_base_btf ?: base_btf, false);
if (IS_ERR(btf)) {
err = PTR_ERR(btf);
goto done;
@@ -1335,7 +1356,7 @@ static struct btf *btf_parse_raw(const char *path, struct btf *base_btf)
}
/* finally parse BTF data */
btf = btf_new(data, sz, base_btf);
btf = btf_new(data, sz, base_btf, false);
err_out:
free(data);
@@ -1354,6 +1375,37 @@ struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf)
return libbpf_ptr(btf_parse_raw(path, base_btf));
}
static struct btf *btf_parse_raw_mmap(const char *path, struct btf *base_btf)
{
struct stat st;
void *data;
struct btf *btf;
int fd, err;
fd = open(path, O_RDONLY);
if (fd < 0)
return libbpf_err_ptr(-errno);
if (fstat(fd, &st) < 0) {
err = -errno;
close(fd);
return libbpf_err_ptr(err);
}
data = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
err = -errno;
close(fd);
if (data == MAP_FAILED)
return libbpf_err_ptr(err);
btf = btf_new(data, st.st_size, base_btf, true);
if (IS_ERR(btf))
munmap(data, st.st_size);
return btf;
}
static struct btf *btf_parse(const char *path, struct btf *base_btf, struct btf_ext **btf_ext)
{
struct btf *btf;
@@ -1618,7 +1670,7 @@ struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf)
goto exit_free;
}
btf = btf_new(ptr, btf_info.btf_size, base_btf);
btf = btf_new(ptr, btf_info.btf_size, base_btf, false);
exit_free:
free(ptr);
@@ -1658,10 +1710,8 @@ struct btf *btf__load_from_kernel_by_id(__u32 id)
static void btf_invalidate_raw_data(struct btf *btf)
{
if (btf->raw_data) {
free(btf->raw_data);
btf->raw_data = NULL;
}
if (btf->raw_data)
btf_free_raw_data(btf);
if (btf->raw_data_swapped) {
free(btf->raw_data_swapped);
btf->raw_data_swapped = NULL;
@@ -5331,7 +5381,10 @@ struct btf *btf__load_vmlinux_btf(void)
pr_warn("kernel BTF is missing at '%s', was CONFIG_DEBUG_INFO_BTF enabled?\n",
sysfs_btf_path);
} else {
btf = btf__parse(sysfs_btf_path, NULL);
btf = btf_parse_raw_mmap(sysfs_btf_path, NULL);
if (IS_ERR(btf))
btf = btf__parse(sysfs_btf_path, NULL);
if (!btf) {
err = -errno;
pr_warn("failed to read kernel BTF from '%s': %s\n",

View File

@@ -0,0 +1,81 @@
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/* Copyright (c) 2025 Isovalent */
#include <test_progs.h>
#include <bpf/btf.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
static void test_btf_mmap_sysfs(const char *path, struct btf *base)
{
struct stat st;
__u64 btf_size, end;
void *raw_data = NULL;
int fd = -1;
long page_size;
struct btf *btf = NULL;
page_size = sysconf(_SC_PAGESIZE);
if (!ASSERT_GE(page_size, 0, "get_page_size"))
goto cleanup;
if (!ASSERT_OK(stat(path, &st), "stat_btf"))
goto cleanup;
btf_size = st.st_size;
end = (btf_size + page_size - 1) / page_size * page_size;
fd = open(path, O_RDONLY);
if (!ASSERT_GE(fd, 0, "open_btf"))
goto cleanup;
raw_data = mmap(NULL, btf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_writable"))
goto cleanup;
raw_data = mmap(NULL, btf_size, PROT_READ, MAP_SHARED, fd, 0);
if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_shared"))
goto cleanup;
raw_data = mmap(NULL, end + 1, PROT_READ, MAP_PRIVATE, fd, 0);
if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_invalid_size"))
goto cleanup;
raw_data = mmap(NULL, end, PROT_READ, MAP_PRIVATE, fd, 0);
if (!ASSERT_OK_PTR(raw_data, "mmap_btf"))
goto cleanup;
if (!ASSERT_EQ(mprotect(raw_data, btf_size, PROT_READ | PROT_WRITE), -1,
"mprotect_writable"))
goto cleanup;
if (!ASSERT_EQ(mprotect(raw_data, btf_size, PROT_READ | PROT_EXEC), -1,
"mprotect_executable"))
goto cleanup;
/* Check padding is zeroed */
for (int i = btf_size; i < end; i++) {
if (((__u8 *)raw_data)[i] != 0) {
PRINT_FAIL("tail of BTF is not zero at page offset %d\n", i);
goto cleanup;
}
}
btf = btf__new_split(raw_data, btf_size, base);
if (!ASSERT_OK_PTR(btf, "parse_btf"))
goto cleanup;
cleanup:
btf__free(btf);
if (raw_data && raw_data != MAP_FAILED)
munmap(raw_data, btf_size);
if (fd >= 0)
close(fd);
}
void test_btf_sysfs(void)
{
test_btf_mmap_sysfs("/sys/kernel/btf/vmlinux", NULL);
}