bpf: arena: populate vm_area without allocating memory
vm_area_map_pages() may allocate memory while inserting pages into bpf arena's vm_area. In order to make the bpf_arena_alloc_pages() kfunc non-sleepable, change bpf arena to populate pages without allocating memory:

- at arena creation time, populate all page table levels except the last level
- when new pages need to be inserted, call apply_to_page_range() again with apply_range_set_cb(), which will only set_pte_at() those pages and will not allocate memory
- when freeing pages, call apply_to_existing_page_range() with apply_range_clear_cb() to clear the pte for the page to be removed; this doesn't free intermediate page table levels

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20251222195022.431211-2-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
committed by Alexei Starovoitov
parent ac1c5bc7c4
commit c336b0b327
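Before the diff, a minimal sketch of the idea (illustration only, not part of the patch; my_set_pte_cb(), my_prepopulate() and my_map_one() are hypothetical names): apply_to_page_range() allocates any missing intermediate page-table levels while it walks the range and calls the callback for every pte slot. If the callback treats a NULL data pointer as "nothing to do", a first walk at setup time only builds the intermediate levels, and a later walk over the same range reduces to set_pte_at() and so should not need to allocate memory.

/* Illustrative sketch only, mirroring the patch's approach. */
static int my_set_pte_cb(pte_t *pte, unsigned long addr, void *data)
{
	struct page *page = data;

	if (!data)		/* phase 1: just walk, building missing table levels */
		return 0;
	if (unlikely(!pte_none(ptep_get(pte))))
		return -EBUSY;	/* slot already in use */
	set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
	return 0;
}

/* Phase 1, at setup time (may sleep): pre-allocate intermediate levels. */
static int my_prepopulate(unsigned long start, unsigned long len)
{
	return apply_to_page_range(&init_mm, start, len, my_set_pte_cb, NULL);
}

/* Phase 2, later, possibly from a non-sleepable context: only sets the pte,
 * since every intermediate level already exists after phase 1.
 */
static int my_map_one(unsigned long addr, struct page *page)
{
	int ret = apply_to_page_range(&init_mm, addr, PAGE_SIZE, my_set_pte_cb, page);

	if (!ret)
		flush_cache_vmap(addr, addr + PAGE_SIZE);
	return ret;
}

The patch below applies exactly this split: populate_pgtable_except_pte() is the phase-1 walk over the whole arena, and the fault and alloc paths reuse apply_range_set_cb() with real pages as phase 2.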
@@ -2,11 +2,13 @@
 /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
 #include <linux/bpf.h>
 #include <linux/btf.h>
+#include <linux/cacheflush.h>
 #include <linux/err.h>
 #include "linux/filter.h"
 #include <linux/btf_ids.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
+#include <asm/tlbflush.h>
 #include "range_tree.h"
 
 /*
@@ -92,6 +94,68 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr)
 	return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
 }
 
+struct apply_range_data {
+	struct page **pages;
+	int i;
+};
+
+static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
+{
+	struct apply_range_data *d = data;
+	struct page *page;
+
+	if (!data)
+		return 0;
+	/* sanity check */
+	if (unlikely(!pte_none(ptep_get(pte))))
+		return -EBUSY;
+
+	page = d->pages[d->i];
+	/* paranoia, similar to vmap_pages_pte_range() */
+	if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page))))
+		return -EINVAL;
+
+	set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
+	d->i++;
+	return 0;
+}
+
+static void flush_vmap_cache(unsigned long start, unsigned long size)
+{
+	flush_cache_vmap(start, start + size);
+}
+
+static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
+{
+	pte_t old_pte;
+	struct page *page;
+
+	/* sanity check */
+	old_pte = ptep_get(pte);
+	if (pte_none(old_pte) || !pte_present(old_pte))
+		return 0; /* nothing to do */
+
+	/* get page and free it */
+	page = pte_page(old_pte);
+	if (WARN_ON_ONCE(!page))
+		return -EINVAL;
+
+	pte_clear(&init_mm, addr, pte);
+
+	/* ensure no stale TLB entries */
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+
+	__free_page(page);
+
+	return 0;
+}
+
+static int populate_pgtable_except_pte(struct bpf_arena *arena)
+{
+	return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
+				   KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
+}
+
 static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
 {
 	struct vm_struct *kern_vm;
@@ -144,6 +208,12 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
 		goto err;
 	}
 	mutex_init(&arena->lock);
+	err = populate_pgtable_except_pte(arena);
+	if (err) {
+		range_tree_destroy(&arena->rt);
+		bpf_map_area_free(arena);
+		goto err;
+	}
 
 	return &arena->map;
 err:
@@ -286,6 +356,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 	if (ret)
 		return VM_FAULT_SIGSEGV;
 
+	struct apply_range_data data = { .pages = &page, .i = 0 };
 	/* Account into memcg of the process that created bpf_arena */
 	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
 	if (ret) {
@@ -293,12 +364,13 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 		return VM_FAULT_SIGSEGV;
 	}
 
-	ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
+	ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
 	if (ret) {
 		range_tree_set(&arena->rt, vmf->pgoff, 1);
 		__free_page(page);
 		return VM_FAULT_SIGSEGV;
 	}
+	flush_vmap_cache(kaddr, PAGE_SIZE);
 out:
 	page_ref_add(page, 1);
 	vmf->page = page;
@@ -428,7 +500,8 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	/* user_vm_end/start are fixed before bpf prog runs */
 	long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
 	u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
-	struct page **pages;
+	struct page **pages = NULL;
+	long mapped = 0;
 	long pgoff = 0;
 	u32 uaddr32;
 	int ret, i;
@@ -450,7 +523,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	if (!pages)
 		return 0;
 
-	guard(mutex)(&arena->lock);
+	mutex_lock(&arena->lock);
 
 	if (uaddr) {
 		ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
@@ -465,6 +538,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	if (ret)
 		goto out_free_pages;
 
+	struct apply_range_data data = { .pages = pages, .i = 0 };
 	ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
 	if (ret)
 		goto out;
@@ -477,18 +551,24 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	 * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
 	 * lower 32-bit and it's ok.
 	 */
-	ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
-				kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
-	if (ret) {
-		for (i = 0; i < page_cnt; i++)
+	apply_to_page_range(&init_mm, kern_vm_start + uaddr32,
+			    page_cnt << PAGE_SHIFT, apply_range_set_cb, &data);
+	mapped = data.i;
+	flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
+	if (mapped < page_cnt) {
+		for (i = mapped; i < page_cnt; i++)
 			__free_page(pages[i]);
 		goto out;
 	}
+	mutex_unlock(&arena->lock);
 	kvfree(pages);
 	return clear_lo32(arena->user_vm_start) + uaddr32;
 out:
-	range_tree_set(&arena->rt, pgoff, page_cnt);
+	range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped);
 out_free_pages:
+	mutex_unlock(&arena->lock);
+	if (mapped)
+		arena_free_pages(arena, uaddr32, mapped);
 	kvfree(pages);
 	return 0;
 }
@@ -545,8 +625,8 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
 			 * page_cnt is big it's faster to do the batched zap.
 			 */
 			zap_pages(arena, full_uaddr, 1);
-		vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
-		__free_page(page);
+		apply_to_existing_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_clear_cb,
+					     NULL);
 	}
 }
 