bpf: arena: populate vm_area without allocating memory

vm_area_map_pages() may allocate memory while inserting pages into the
bpf arena's vm_area. In order to make the bpf_arena_alloc_pages() kfunc
non-sleepable, change bpf arena to populate pages without allocating
memory (a standalone sketch of the pattern follows this list):
- at arena creation time, populate all page table levels except
  the last level;
- when new pages need to be inserted, call apply_to_page_range() again
  with apply_range_set_cb(), which will only set_pte_at() those pages and
  will not allocate memory;
- when freeing pages, call apply_to_existing_page_range() with
  apply_range_clear_cb() to clear the pte for the page to be removed. This
  doesn't free intermediate page table levels.
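
To make the scheme concrete, here is a minimal, self-contained sketch
of the same two-phase pattern outside of the arena code. The demo_*
names are hypothetical illustrations (assuming the usual <linux/mm.h>
and <asm/tlbflush.h> includes); apply_to_page_range(),
apply_to_existing_page_range() and the pte helpers are the real APIs:

  /* Phase 1 vs phase 2 are told apart by a NULL closure: with
   * data == NULL the walk only forces allocation of the intermediate
   * page-table levels (and may sleep); with a real page the callback
   * fills the leaf PTE and allocates nothing.
   */
  static int demo_set_cb(pte_t *pte, unsigned long addr, void *data)
  {
  	struct page **pagep = data;

  	if (!data)		/* phase 1: levels now exist, PTE stays empty */
  		return 0;
  	if (!pte_none(ptep_get(pte)))
  		return -EBUSY;	/* phase 2 expects an empty slot */
  	set_pte_at(&init_mm, addr, pte, mk_pte(*pagep, PAGE_KERNEL));
  	return 0;
  }

  static int demo_clear_cb(pte_t *pte, unsigned long addr, void *data)
  {
  	pte_t old = ptep_get(pte);

  	if (pte_none(old) || !pte_present(old))
  		return 0;
  	pte_clear(&init_mm, addr, pte);
  	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
  	__free_page(pte_page(old));
  	return 0;
  }

  /* sleepable setup: allocate every level down to, not including, PTEs */
  static int demo_prepopulate(unsigned long start, unsigned long len)
  {
  	return apply_to_page_range(&init_mm, start, len, demo_set_cb, NULL);
  }

  /* safe in non-sleepable context: walks only already-populated tables */
  static int demo_map_one(unsigned long addr, struct page *page)
  {
  	return apply_to_page_range(&init_mm, addr, PAGE_SIZE,
  				   demo_set_cb, &page);
  }

  static void demo_unmap_one(unsigned long addr)
  {
  	apply_to_existing_page_range(&init_mm, addr, PAGE_SIZE,
  				     demo_clear_cb, NULL);
  }

Note that the second apply_to_page_range() call would still allocate
intermediate levels if they were missing; the scheme relies on the
pre-population pass having run in sleepable context first, which is
what populate_pgtable_except_pte() guarantees in this patch.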

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20251222195022.431211-2-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
@@ -2,11 +2,13 @@
 /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
 #include <linux/bpf.h>
 #include <linux/btf.h>
+#include <linux/cacheflush.h>
 #include <linux/err.h>
 #include "linux/filter.h"
 #include <linux/btf_ids.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
+#include <asm/tlbflush.h>
 #include "range_tree.h"
 
 /*
@@ -92,6 +94,68 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr)
 	return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
 }
 
+struct apply_range_data {
+	struct page **pages;
+	int i;
+};
+
+static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
+{
+	struct apply_range_data *d = data;
+	struct page *page;
+
+	if (!data)
+		return 0;
+	/* sanity check */
+	if (unlikely(!pte_none(ptep_get(pte))))
+		return -EBUSY;
+
+	page = d->pages[d->i];
+	/* paranoia, similar to vmap_pages_pte_range() */
+	if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page))))
+		return -EINVAL;
+
+	set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
+	d->i++;
+	return 0;
+}
+
+static void flush_vmap_cache(unsigned long start, unsigned long size)
+{
+	flush_cache_vmap(start, start + size);
+}
+
+static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
+{
+	pte_t old_pte;
+	struct page *page;
+
+	/* sanity check */
+	old_pte = ptep_get(pte);
+	if (pte_none(old_pte) || !pte_present(old_pte))
+		return 0; /* nothing to do */
+
+	/* get page and free it */
+	page = pte_page(old_pte);
+	if (WARN_ON_ONCE(!page))
+		return -EINVAL;
+
+	pte_clear(&init_mm, addr, pte);
+	/* ensure no stale TLB entries */
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+
+	__free_page(page);
+	return 0;
+}
+
+static int populate_pgtable_except_pte(struct bpf_arena *arena)
+{
+	return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
+				   KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
+}
+
 static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
 {
 	struct vm_struct *kern_vm;
@@ -144,6 +208,12 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
 		goto err;
 	}
 	mutex_init(&arena->lock);
+	err = populate_pgtable_except_pte(arena);
+	if (err) {
+		range_tree_destroy(&arena->rt);
+		bpf_map_area_free(arena);
+		goto err;
+	}
 
 	return &arena->map;
 err:
@@ -286,6 +356,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 	if (ret)
 		return VM_FAULT_SIGSEGV;
 
+	struct apply_range_data data = { .pages = &page, .i = 0 };
 	/* Account into memcg of the process that created bpf_arena */
 	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
 	if (ret) {
@@ -293,12 +364,13 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 		return VM_FAULT_SIGSEGV;
 	}
 
-	ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
+	ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
 	if (ret) {
 		range_tree_set(&arena->rt, vmf->pgoff, 1);
 		__free_page(page);
 		return VM_FAULT_SIGSEGV;
 	}
+	flush_vmap_cache(kaddr, PAGE_SIZE);
 out:
 	page_ref_add(page, 1);
 	vmf->page = page;
@@ -428,7 +500,8 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	/* user_vm_end/start are fixed before bpf prog runs */
 	long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
 	u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
-	struct page **pages;
+	struct page **pages = NULL;
+	long mapped = 0;
 	long pgoff = 0;
 	u32 uaddr32;
 	int ret, i;
@@ -450,7 +523,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	if (!pages)
 		return 0;
 
-	guard(mutex)(&arena->lock);
+	mutex_lock(&arena->lock);
 	if (uaddr) {
 		ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
@@ -465,6 +538,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	if (ret)
 		goto out_free_pages;
 
+	struct apply_range_data data = { .pages = pages, .i = 0 };
 	ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
 	if (ret)
 		goto out;
@@ -477,18 +551,24 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	 * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
 	 * lower 32-bit and it's ok.
 	 */
-	ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
-				kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
-	if (ret) {
-		for (i = 0; i < page_cnt; i++)
+	apply_to_page_range(&init_mm, kern_vm_start + uaddr32,
+			    page_cnt << PAGE_SHIFT, apply_range_set_cb, &data);
+	mapped = data.i;
+	flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
+	if (mapped < page_cnt) {
+		for (i = mapped; i < page_cnt; i++)
 			__free_page(pages[i]);
 		goto out;
 	}
+	mutex_unlock(&arena->lock);
 	kvfree(pages);
 	return clear_lo32(arena->user_vm_start) + uaddr32;
 out:
-	range_tree_set(&arena->rt, pgoff, page_cnt);
+	range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped);
 out_free_pages:
+	mutex_unlock(&arena->lock);
+	if (mapped)
+		arena_free_pages(arena, uaddr32, mapped);
 	kvfree(pages);
 	return 0;
 }
@@ -545,8 +625,8 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
 		 * page_cnt is big it's faster to do the batched zap.
 		 */
 		zap_pages(arena, full_uaddr, 1);
-		vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
-		__free_page(page);
+		apply_to_existing_page_range(&init_mm, kaddr, PAGE_SIZE,
+					     apply_range_clear_cb, NULL);
 	}
 }