Merge branch 'rework/console-list-lock' into for-linus

Author: Petr Mladek
Date: 2022-12-08 11:46:56 +01:00
2944 changed files with 142350 additions and 36220 deletions


@@ -38,6 +38,7 @@ KCOV_INSTRUMENT_kcov.o := n
KASAN_SANITIZE_kcov.o := n
KCSAN_SANITIZE_kcov.o := n
UBSAN_SANITIZE_kcov.o := n
KMSAN_SANITIZE_kcov.o := n
CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
# Don't instrument error handlers


@@ -555,15 +555,14 @@ void acct_collect(long exitcode, int group_dead)
unsigned long vsize = 0;
if (group_dead && current->mm) {
struct mm_struct *mm = current->mm;
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- mmap_read_lock(current->mm);
- vma = current->mm->mmap;
- while (vma) {
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma)
vsize += vma->vm_end - vma->vm_start;
- vma = vma->vm_next;
- }
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
}
spin_lock_irq(&current->sighand->siglock);
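
The acct_collect() hunk above is part of the maple-tree conversion running through this merge: the mm->mmap/vm_next linked-list walk becomes a VMA iterator. A minimal sketch of the new pattern (kernel context assumed, not a standalone program):

#include <linux/mm.h>

/* Sum the sizes of all VMAs in an mm with the VMA iterator.
 * VMA_ITERATOR() declares a maple-tree cursor starting at address 0;
 * for_each_vma() then yields VMAs in address order. The caller must
 * hold at least mmap_read_lock(), as acct_collect() does above.
 */
static unsigned long vma_total_size(struct mm_struct *mm)
{
	VMA_ITERATOR(vmi, mm, 0);
	struct vm_area_struct *vma;
	unsigned long vsize = 0;

	mmap_read_lock(mm);
	for_each_vma(vmi, vma)
		vsize += vma->vm_end - vma->vm_start;
	mmap_read_unlock(mm);

	return vsize;
}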


@@ -22,6 +22,13 @@ int main(void)
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
#ifdef CONFIG_LRU_GEN
DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
#else
DEFINE(LRU_GEN_WIDTH, 0);
DEFINE(__LRU_REFS_WIDTH, 0);
#endif
/* End of constants */
return 0;


@@ -158,7 +158,7 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
attr->value_size / sizeof(u32);
if (!(attr->map_flags & BPF_F_ZERO_SEED))
- bloom->hash_seed = get_random_int();
+ bloom->hash_seed = get_random_u32();
return &bloom->map;
}


@@ -1032,7 +1032,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
hdr->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
- start = (get_random_int() % hole) & ~(alignment - 1);
+ start = prandom_u32_max(hole) & ~(alignment - 1);
/* Leave a random number of instructions before BPF code. */
*image_ptr = &hdr->image[start];
@@ -1094,7 +1094,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
- start = (get_random_int() % hole) & ~(alignment - 1);
+ start = prandom_u32_max(hole) & ~(alignment - 1);
*image_ptr = &ro_header->image[start];
*rw_image = &(*rw_header)->image[start];
@@ -1216,7 +1216,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
bool emit_zext)
{
struct bpf_insn *to = to_buff;
- u32 imm_rnd = get_random_int();
+ u32 imm_rnd = get_random_u32();
s16 off;
BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
@@ -2007,7 +2007,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
{ \
u64 stack[stack_size / sizeof(u64)]; \
- u64 regs[MAX_BPF_EXT_REG]; \
+ u64 regs[MAX_BPF_EXT_REG] = {}; \
\
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
ARG1 = (u64) (unsigned long) ctx; \
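
Two different replacements appear in the hunks above: get_random_u32() where a full 32-bit value is wanted, and prandom_u32_max() where the old code computed get_random_int() % hole. The latter avoids a division and reduces the bias of %, by taking the high 32 bits of a widening multiply. A standalone sketch of that trick (bounded_rand() is our name, not a kernel API):

#include <stdint.h>
#include <stdio.h>

/* Map a uniform 32-bit value x onto [0, range): the high 32 bits of
 * x * range are close enough to uniform for non-cryptographic use.
 * prandom_u32_max() is built on the same multiply-shift idea.
 */
static uint32_t bounded_rand(uint32_t x, uint32_t range)
{
	return (uint32_t)(((uint64_t)x * range) >> 32);
}

int main(void)
{
	uint32_t samples[] = { 0u, 0x40000000u, 0x80000000u, 0xffffffffu };

	for (int i = 0; i < 4; i++)
		printf("%u\n", bounded_rand(samples[i], 10)); /* 0 2 5 9 */
	return 0;
}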


@@ -527,7 +527,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (htab->map.map_flags & BPF_F_ZERO_SEED)
htab->hashrnd = 0;
else
- htab->hashrnd = get_random_int();
+ htab->hashrnd = get_random_u32();
htab_init_buckets(htab);


@@ -445,8 +445,8 @@ struct bpf_iter_seq_task_vma_info {
};
enum bpf_task_vma_iter_find_op {
- task_vma_iter_first_vma, /* use mm->mmap */
- task_vma_iter_next_vma, /* use curr_vma->vm_next */
+ task_vma_iter_first_vma, /* use find_vma() with addr 0 */
+ task_vma_iter_next_vma, /* use vma_next() with curr_vma */
task_vma_iter_find_vma, /* use find_vma() to find next vma */
};
@@ -544,10 +544,10 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
switch (op) {
case task_vma_iter_first_vma:
- curr_vma = curr_task->mm->mmap;
+ curr_vma = find_vma(curr_task->mm, 0);
break;
case task_vma_iter_next_vma:
- curr_vma = curr_vma->vm_next;
+ curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
break;
case task_vma_iter_find_vma:
/* We dropped mmap_lock so it is necessary to use find_vma
@@ -561,7 +561,7 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
if (curr_vma &&
curr_vma->vm_start == info->prev_vm_start &&
curr_vma->vm_end == info->prev_vm_end)
- curr_vma = curr_vma->vm_next;
+ curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
break;
}
if (!curr_vma) {


@@ -13350,7 +13350,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
aux[adj_idx].ptr_type == PTR_TO_CTX)
continue;
- imm_rnd = get_random_int();
+ imm_rnd = get_random_u32();
rnd_hi32_patch[0] = insn;
rnd_hi32_patch[1].imm = imm_rnd;
rnd_hi32_patch[3].dst_reg = load_reg;


@@ -164,7 +164,6 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;


@@ -3698,27 +3698,27 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_CPU);
}
- static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, enum psi_res res)
+ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, enum psi_res res)
{
struct cgroup_file_ctx *ctx = of->priv;
struct psi_trigger *new;
@@ -3738,7 +3738,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
return -EBUSY;
}
- psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+ psi = cgroup_psi(cgrp);
new = psi_trigger_create(psi, buf, res);
if (IS_ERR(new)) {
cgroup_put(cgrp);
@@ -3755,21 +3755,86 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+ return pressure_write(of, buf, nbytes, PSI_IO);
}
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+ return pressure_write(of, buf, nbytes, PSI_MEM);
}
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+ return pressure_write(of, buf, nbytes, PSI_CPU);
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_IRQ);
}
static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
return pressure_write(of, buf, nbytes, PSI_IRQ);
}
#endif
static int cgroup_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup_psi(cgrp);
seq_printf(seq, "%d\n", psi->enabled);
return 0;
}
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
ssize_t ret;
int enable;
struct cgroup *cgrp;
struct psi_group *psi;
ret = kstrtoint(strstrip(buf), 0, &enable);
if (ret)
return ret;
if (enable < 0 || enable > 1)
return -ERANGE;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENOENT;
psi = cgroup_psi(cgrp);
if (psi->enabled != enable) {
int i;
/* show or hide {cpu,memory,io,irq}.pressure files */
for (i = 0; i < NR_PSI_RESOURCES; i++)
cgroup_file_show(&cgrp->psi_files[i], enable);
psi->enabled = enable;
if (enable)
psi_cgroup_restart(psi);
}
cgroup_kn_unlock(of->kn);
return nbytes;
}
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
@@ -3789,6 +3854,9 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
bool cgroup_psi_enabled(void)
{
if (static_branch_likely(&psi_disabled))
return false;
return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
}
@@ -5175,6 +5243,7 @@ static struct cftype cgroup_psi_files[] = {
#ifdef CONFIG_PSI
{
.name = "io.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5182,6 +5251,7 @@ static struct cftype cgroup_psi_files[] = {
},
{
.name = "memory.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5189,11 +5259,27 @@ static struct cftype cgroup_psi_files[] = {
},
{
.name = "cpu.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
{
.name = "irq.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
.seq_show = cgroup_irq_pressure_show,
.write = cgroup_irq_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
#endif
{
.name = "cgroup.pressure",
.seq_show = cgroup_pressure_show,
.write = cgroup_pressure_write,
},
#endif /* CONFIG_PSI */
{ } /* terminate */
};
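
The new cgroup.pressure file takes 0 or 1: writing 0 hides the per-resource pressure files and stops PSI aggregation for that cgroup, writing 1 re-exposes them and restarts aggregation via psi_cgroup_restart(). A minimal userspace sketch (the cgroup path is hypothetical):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Disable PSI accounting for one cgroup by writing "0" to its
 * cgroup.pressure file. Substitute a real cgroup2 directory for the
 * example path below.
 */
int main(void)
{
	const char *path = "/sys/fs/cgroup/test/cgroup.pressure";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "0", 1) != 1)
		perror("write");
	close(fd);
	return 0;
}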


@@ -50,7 +50,6 @@
#include <linux/pid.h>
#include <linux/smp.h>
#include <linux/mm.h>
- #include <linux/vmacache.h>
#include <linux/rcupdate.h>
#include <linux/irq.h>
#include <linux/security.h>
@@ -283,17 +282,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
if (!CACHE_FLUSH_IS_SAFE)
return;
- if (current->mm) {
- int i;
- for (i = 0; i < VMACACHE_SIZE; i++) {
- if (!current->vmacache.vmas[i])
- continue;
- flush_cache_range(current->vmacache.vmas[i],
- addr, addr + BREAK_INSTR_SIZE);
- }
- }
/* Force flush instruction cache if it was outside the mm */
flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
}


@@ -545,6 +545,7 @@ static void kdb_msg_write(const char *msg, int msg_len)
{
struct console *c;
const char *cp;
int cookie;
int len;
if (msg_len == 0)
@@ -558,8 +559,20 @@ static void kdb_msg_write(const char *msg, int msg_len)
cp++;
}
- for_each_console(c) {
- if (!(c->flags & CON_ENABLED))
+ /*
+ * The console_srcu_read_lock() only provides safe console list
+ * traversal. The use of the ->write() callback relies on all other
+ * CPUs being stopped at the moment and console drivers being able to
+ * handle reentrance when @oops_in_progress is set.
+ *
+ * There is no guarantee that every console driver can handle
+ * reentrance in this way; the developer deploying the debugger
+ * is responsible for ensuring that the console drivers they
+ * have selected handle reentrance appropriately.
+ */
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if (!(console_srcu_read_flags(c) & CON_ENABLED))
continue;
if (c == dbg_io_ops->cons)
continue;
@@ -577,6 +590,7 @@ static void kdb_msg_write(const char *msg, int msg_len)
--oops_in_progress;
touch_nmi_watchdog();
}
console_srcu_read_unlock(cookie);
}
int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)


@@ -214,13 +214,22 @@ void __delayacct_freepages_end(void)
&current->delays->freepages_count);
}
- void __delayacct_thrashing_start(void)
+ void __delayacct_thrashing_start(bool *in_thrashing)
{
*in_thrashing = !!current->in_thrashing;
if (*in_thrashing)
return;
current->in_thrashing = 1;
current->delays->thrashing_start = local_clock();
}
- void __delayacct_thrashing_end(void)
+ void __delayacct_thrashing_end(bool *in_thrashing)
{
if (*in_thrashing)
return;
current->in_thrashing = 0;
delayacct_end(&current->delays->lock,
&current->delays->thrashing_start,
&current->delays->thrashing_delay,
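
The new bool *in_thrashing parameter makes nested start/end pairs no-ops: only the outermost pair accounts the delay, inner pairs see current->in_thrashing already set and return early. The caller pattern looks roughly like this sketch (kernel context assumed):

#include <linux/delayacct.h>

/* Sketch of a call site for the reworked thrashing accounting. The
 * flag lives on the caller's stack and must be passed to both calls
 * so a nested pair inside the same task does not double-account.
 */
static void wait_on_thrashing_page_sketch(void)
{
	bool in_thrashing;

	delayacct_thrashing_start(&in_thrashing);
	/* ... block on a page that is being thrashed ... */
	delayacct_thrashing_end(&in_thrashing);
}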


@@ -10,6 +10,7 @@
#include <linux/dma-map-ops.h>
#include <linux/export.h>
#include <linux/gfp.h>
#include <linux/kmsan.h>
#include <linux/of_device.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -156,6 +157,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
else
addr = ops->map_page(dev, page, offset, size, dir, attrs);
kmsan_handle_dma(page, offset, size, dir);
debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
return addr;
@@ -194,11 +196,13 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
else
ents = ops->map_sg(dev, sg, nents, dir, attrs);
- if (ents > 0)
+ if (ents > 0) {
+ kmsan_handle_dma_sg(sg, nents, dir);
debug_dma_map_sg(dev, sg, nents, ents, dir, attrs);
- else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
- ents != -EIO && ents != -EREMOTEIO))
+ } else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
+ ents != -EIO && ents != -EREMOTEIO)) {
return -EIO;
+ }
return ents;
}


@@ -346,22 +346,27 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
memblock_free(tlb, PAGE_ALIGN(bytes));
nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
- if (nslabs < IO_TLB_MIN_SLABS)
- panic("%s: Failed to remap %zu bytes\n",
- __func__, bytes);
- goto retry;
+ if (nslabs >= IO_TLB_MIN_SLABS)
+ goto retry;
+ pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes);
+ return;
}
alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
- if (!mem->slots)
- panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
- __func__, alloc_size, PAGE_SIZE);
+ if (!mem->slots) {
+ pr_warn("%s: Failed to allocate %zu bytes align=0x%lx\n",
+ __func__, alloc_size, PAGE_SIZE);
+ return;
+ }
mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area),
default_nareas), SMP_CACHE_BYTES);
- if (!mem->areas)
- panic("%s: Failed to allocate mem->areas.\n", __func__);
+ if (!mem->areas) {
+ pr_warn("%s: Failed to allocate mem->areas.\n", __func__);
+ return;
+ }
swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false,
default_nareas);
@@ -545,9 +550,8 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
}
if (PageHighMem(pfn_to_page(pfn))) {
/* The buffer does not have a mapping. Map it in and copy */
unsigned int offset = orig_addr & ~PAGE_MASK;
- char *buffer;
+ struct page *page;
unsigned int sz = 0;
unsigned long flags;
@@ -555,12 +559,11 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
sz = min_t(size_t, PAGE_SIZE - offset, size);
local_irq_save(flags);
- buffer = kmap_atomic(pfn_to_page(pfn));
+ page = pfn_to_page(pfn);
if (dir == DMA_TO_DEVICE)
- memcpy(vaddr, buffer + offset, sz);
+ memcpy_from_page(vaddr, page, offset, sz);
else
- memcpy(buffer + offset, vaddr, sz);
- kunmap_atomic(buffer);
+ memcpy_to_page(page, offset, vaddr, sz);
local_irq_restore(flags);
size -= sz;
@@ -731,8 +734,11 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
int index;
phys_addr_t tlb_addr;
- if (!mem || !mem->nslabs)
- panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
+ if (!mem || !mem->nslabs) {
+ dev_warn_ratelimited(dev,
+ "Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
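
memcpy_from_page()/memcpy_to_page() bundle the map-copy-unmap dance that swiotlb_bounce() previously spelled out with kmap_atomic(). A simplified sketch of what memcpy_from_page() does (the real helper in include/linux/highmem.h adds sanity checks):

#include <linux/highmem.h>
#include <linux/string.h>

/* Map the page CPU-locally, copy out, unmap. kmap_local_page() is the
 * modern replacement for kmap_atomic() and does not disable preemption
 * or page faults.
 */
static void memcpy_from_page_sketch(char *to, struct page *page,
				    size_t offset, size_t len)
{
	char *from = kmap_local_page(page);

	memcpy(to, from + offset, len);
	kunmap_local(from);
}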


@@ -5,6 +5,7 @@
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
#include <linux/kmsan.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>
@@ -24,6 +25,7 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
user_exit_irqoff();
instrumentation_begin();
kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
instrumentation_end();
}
@@ -352,6 +354,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
lockdep_hardirqs_off(CALLER_ADDR0);
ct_irq_enter();
instrumentation_begin();
kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
instrumentation_end();
@@ -367,6 +370,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
*/
lockdep_hardirqs_off(CALLER_ADDR0);
instrumentation_begin();
kmsan_unpoison_entry_regs(regs);
rcu_irq_enter_check_tick();
trace_hardirqs_off_finish();
instrumentation_end();
@@ -452,6 +456,7 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
ct_nmi_enter();
instrumentation_begin();
kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
ftrace_nmi_enter();
instrumentation_end();


@@ -10270,8 +10270,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter,
struct perf_addr_filter_range *fr)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;


@@ -19,7 +19,7 @@
#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
#include <linux/mmu_notifier.h> /* set_pte_at_notify */
- #include <linux/swap.h> /* try_to_free_swap */
+ #include <linux/swap.h> /* folio_free_swap */
#include <linux/ptrace.h> /* user_enable_single_step */
#include <linux/kdebug.h> /* notifier mechanism */
#include "../../mm/internal.h" /* munlock_vma_page */
@@ -154,8 +154,10 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
{
+ struct folio *old_folio = page_folio(old_page);
+ struct folio *new_folio;
struct mm_struct *mm = vma->vm_mm;
- DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0);
+ DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
int err;
struct mmu_notifier_range range;
@@ -163,14 +165,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
addr + PAGE_SIZE);
if (new_page) {
- err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm,
- GFP_KERNEL);
+ new_folio = page_folio(new_page);
+ err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL);
if (err)
return err;
}
- /* For try_to_free_swap() below */
- lock_page(old_page);
+ /* For folio_free_swap() below */
+ folio_lock(old_folio);
mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
@@ -179,14 +181,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
if (new_page) {
- get_page(new_page);
+ folio_get(new_folio);
page_add_new_anon_rmap(new_page, vma, addr);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ folio_add_lru_vma(new_folio, vma);
} else
/* no new page, just dec_mm_counter for old_page */
dec_mm_counter(mm, MM_ANONPAGES);
- if (!PageAnon(old_page)) {
+ if (!folio_test_anon(old_folio)) {
dec_mm_counter(mm, mm_counter_file(old_page));
inc_mm_counter(mm, MM_ANONPAGES);
}
@@ -198,15 +200,15 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mk_pte(new_page, vma->vm_page_prot));
page_remove_rmap(old_page, vma, false);
- if (!page_mapped(old_page))
- try_to_free_swap(old_page);
+ if (!folio_mapped(old_folio))
+ folio_free_swap(old_folio);
page_vma_mapped_walk_done(&pvmw);
- put_page(old_page);
+ folio_put(old_folio);
err = 0;
unlock:
mmu_notifier_invalidate_range_end(&range);
- unlock_page(old_page);
+ folio_unlock(old_folio);
return err;
}
@@ -349,9 +351,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe,
static struct vm_area_struct *
find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *tmp;
- for (tmp = mm->mmap; tmp; tmp = tmp->vm_next)
+ for_each_vma(vmi, tmp)
if (valid_ref_ctr_vma(uprobe, tmp))
return tmp;
@@ -552,7 +555,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
/* try collapse pmd for compound page */
if (!ret && orig_page_huge)
- collapse_pte_mapped_thp(mm, vaddr);
+ collapse_pte_mapped_thp(mm, vaddr, false);
return ret;
}
@@ -1231,11 +1234,12 @@ int uprobe_apply(struct inode *inode, loff_t offset,
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
int err = 0;
mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
unsigned long vaddr;
loff_t offset;
@@ -1983,9 +1987,10 @@ bool uprobe_deny_signal(void)
static void mmf_recalc_uprobes(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!valid_vma(vma, false))
continue;
/*


@@ -60,6 +60,7 @@
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
#include <linux/kmsan.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
@@ -183,6 +184,10 @@ void put_task_struct_rcu_user(struct task_struct *task)
call_rcu(&task->rcu, delayed_put_task_struct);
}
void __weak release_thread(struct task_struct *dead_task)
{
}
void release_task(struct task_struct *p)
{
struct task_struct *leader;
@@ -466,6 +471,7 @@ void mm_update_next_owner(struct mm_struct *mm)
goto retry;
}
WRITE_ONCE(mm->owner, c);
lru_gen_migrate_mm(mm);
task_unlock(c);
put_task_struct(c);
}
@@ -759,6 +765,7 @@ void __noreturn do_exit(long code)
WARN_ON(tsk->plug);
kcov_task_exit(tsk);
kmsan_task_exit(tsk);
coredump_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code);


@@ -247,15 +247,11 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
/* cut off if it is too long */
if (count > KSYM_NAME_LEN)
count = KSYM_NAME_LEN;
- buf = kmalloc(count + 1, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- if (copy_from_user(buf, buffer, count)) {
- ret = -EFAULT;
- goto out_free;
- }
- buf[count] = '\0';
+ buf = memdup_user_nul(buffer, count);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
sym = strstrip(buf);
mutex_lock(&fei_lock);
@@ -298,17 +294,15 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
}
ret = register_kprobe(&attr->kp);
- if (!ret)
- fei_debugfs_add_attr(attr);
- if (ret < 0)
- fei_attr_remove(attr);
- else {
- list_add_tail(&attr->list, &fei_attr_list);
- ret = count;
+ if (ret) {
+ fei_attr_free(attr);
+ goto out;
}
+ fei_debugfs_add_attr(attr);
+ list_add_tail(&attr->list, &fei_attr_list);
+ ret = count;
out:
mutex_unlock(&fei_lock);
- out_free:
kfree(buf);
return ret;
}


@@ -37,13 +37,13 @@
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/kmsan.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
- #include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -97,7 +97,6 @@
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
- #include <linux/sched/mm.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -475,7 +474,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
*/
*new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
- new->vm_next = new->vm_prev = NULL;
dup_anon_vma_name(orig, new);
}
return new;
@@ -580,11 +578,12 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
static __latent_entropy int dup_mmap(struct mm_struct *mm,
struct mm_struct *oldmm)
{
- struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
- struct rb_node **rb_link, *rb_parent;
+ struct vm_area_struct *mpnt, *tmp;
int retval;
- unsigned long charge;
+ unsigned long charge = 0;
LIST_HEAD(uf);
+ MA_STATE(old_mas, &oldmm->mm_mt, 0, 0);
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
uprobe_start_dup_mmap();
if (mmap_write_lock_killable(oldmm)) {
@@ -606,16 +605,16 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm;
- rb_link = &mm->mm_rb.rb_node;
- rb_parent = NULL;
- pprev = &mm->mmap;
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
khugepaged_fork(mm, oldmm);
- prev = NULL;
- for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+ retval = mas_expected_entries(&mas, oldmm->map_count);
+ if (retval)
+ goto out;
+ mas_for_each(&old_mas, mpnt, ULONG_MAX) {
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
@@ -629,7 +628,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
*/
if (fatal_signal_pending(current)) {
retval = -EINTR;
- goto out;
+ goto loop_out;
}
if (mpnt->vm_flags & VM_ACCOUNT) {
unsigned long len = vma_pages(mpnt);
@@ -675,24 +674,17 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
}
/*
- * Clear hugetlb-related page reserves for children. This only
- * affects MAP_PRIVATE mappings. Faults generated by the child
- * are not guaranteed to succeed, even if read-only
+ * Copy/update hugetlb private vma information.
*/
if (is_vm_hugetlb_page(tmp))
- reset_vma_resv_huge_pages(tmp);
+ hugetlb_dup_vma_private(tmp);
/*
* Link in the new vma and copy the page table entries.
*/
- *pprev = tmp;
- pprev = &tmp->vm_next;
- tmp->vm_prev = prev;
- prev = tmp;
- __vma_link_rb(mm, tmp, rb_link, rb_parent);
- rb_link = &tmp->vm_rb.rb_right;
- rb_parent = &tmp->vm_rb;
+ /* Link the vma into the MT */
+ mas.index = tmp->vm_start;
+ mas.last = tmp->vm_end - 1;
+ mas_store(&mas, tmp);
+ if (mas_is_err(&mas))
+ goto fail_nomem_mas_store;
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
@@ -702,10 +694,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp->vm_ops->open(tmp);
if (retval)
- goto out;
+ goto loop_out;
}
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
loop_out:
mas_destroy(&mas);
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
@@ -714,6 +708,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;
fail_nomem_mas_store:
unlink_anon_vmas(tmp);
fail_nomem_anon_vma_fork:
mpol_put(vma_policy(tmp));
fail_nomem_policy:
@@ -721,7 +718,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
fail_nomem:
retval = -ENOMEM;
vm_unacct_memory(charge);
- goto out;
+ goto loop_out;
}
static inline int mm_alloc_pgd(struct mm_struct *mm)
@@ -1026,6 +1023,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->worker_private = NULL;
kcov_task_init(tsk);
kmsan_task_create(tsk);
kmap_local_fork(tsk);
#ifdef CONFIG_FAULT_INJECTION
@@ -1109,9 +1107,8 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
- mm->mmap = NULL;
- mm->mm_rb = RB_ROOT;
- mm->vmacache_seqnum = 0;
+ mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
+ mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
seqcount_init(&mm->write_protect_seq);
@@ -1152,6 +1149,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
goto fail_nocontext;
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
return mm;
fail_nocontext:
@@ -1194,6 +1192,7 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
lru_gen_del_mm(mm);
mmdrop(mm);
}
@@ -1285,13 +1284,16 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
/* Forbid mm->exe_file change if old file still mapped. */
old_exe_file = get_mm_exe_file(mm);
if (old_exe_file) {
+ VMA_ITERATOR(vmi, mm, 0);
mmap_read_lock(mm);
- for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;
if (path_equal(&vma->vm_file->f_path,
- &old_exe_file->f_path))
+ &old_exe_file->f_path)) {
ret = -EBUSY;
+ break;
+ }
}
mmap_read_unlock(mm);
fput(old_exe_file);
@@ -1566,9 +1568,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
if (!oldmm)
return 0;
- /* initialize the new vmacache entries */
- vmacache_flush(tsk);
if (clone_flags & CLONE_VM) {
mmget(oldmm);
mm = oldmm;
@@ -2693,6 +2692,13 @@ pid_t kernel_clone(struct kernel_clone_args *args)
get_task_struct(p);
}
if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
/* lock the task to synchronize with memcg migration */
task_lock(p);
lru_gen_add_mm(p->mm);
task_unlock(p);
}
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
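
dup_mmap() now iterates the parent's maple tree directly and stores each copied VMA with mas_store() into a pre-sized tree (mas_expected_entries()). A hedged sketch of the basic cursor API used above (kernel context assumed):

#include <linux/mm_types.h>
#include <linux/maple_tree.h>

/* Walk every VMA in an mm via a maple-tree cursor. MA_STATE() declares
 * the cursor over mm->mm_mt; mas_for_each() yields entries up to the
 * given maximum index. Callers must hold the mmap lock, as dup_mmap()
 * holds the write lock above.
 */
static void walk_vmas_sketch(struct mm_struct *mm)
{
	MA_STATE(mas, &mm->mm_mt, 0, 0);
	struct vm_area_struct *vma;

	mas_for_each(&mas, vma, ULONG_MAX) {
		/* visit each vma in address order */
	}
}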


@@ -31,8 +31,8 @@ if [ "$building_out_of_srctree" ]; then
fi
all_dirs="$all_dirs $dir_list"
- # include/generated/compile.h is ignored because it is touched even when none
- # of the source files changed.
+ # include/generated/utsversion.h is ignored because it is generated after this
+ # script is executed. (utsversion.h is unneeded for kheaders)
#
# When Kconfig regenerates include/generated/autoconf.h, its timestamp is
# updated, but the contents might be still the same. When any CONFIG option is
@@ -42,7 +42,7 @@ all_dirs="$all_dirs $dir_list"
#
# Ignore them for md5 calculation to avoid pointless regeneration.
headers_md5="$(find $all_dirs -name "*.h" |
grep -v "include/generated/compile.h" |
grep -v "include/generated/utsversion.h" |
grep -v "include/generated/autoconf.h" |
xargs ls -l | md5sum | cut -d ' ' -f1)"


@@ -705,6 +705,30 @@ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
}
EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
/**
* generic_handle_domain_irq_safe - Invoke the handler for a HW irq belonging
* to a domain from any context.
* @domain: The domain where to perform the lookup
* @hwirq: The HW irq number to convert to a logical one
*
* Returns: 0 on success, a negative value on error.
*
* This function can be called from any context (IRQ or process
* context). If the interrupt is marked as 'enforce IRQ-context only' then
* the function must be invoked from hard interrupt context.
*/
int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq)
{
unsigned long flags;
int ret;
local_irq_save(flags);
ret = handle_irq_desc(irq_resolve_mapping(domain, hwirq));
local_irq_restore(flags);
return ret;
}
EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe);
/**
* generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging
* to a domain.
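
A typical caller of generic_handle_domain_irq_safe() is a demultiplexing driver that must inject interrupts from process context (a threaded handler, a workqueue); the _safe variant disables local interrupts around the dispatch so both contexts are legal. A usage sketch (the demo_* driver names and fields are hypothetical):

#include <linux/interrupt.h>
#include <linux/irqdomain.h>

/* Forward demuxed hardware irqs to their Linux handlers from a
 * threaded handler. struct demo_chip and demo_read_pending() are
 * made-up stand-ins for a real demux driver.
 */
static irqreturn_t demo_demux_thread_fn(int irq, void *data)
{
	struct demo_chip *chip = data;
	unsigned long pending = demo_read_pending(chip);
	unsigned int hwirq;

	for_each_set_bit(hwirq, &pending, chip->nr_irqs)
		generic_handle_domain_irq_safe(chip->domain, hwirq);

	return IRQ_HANDLED;
}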


@@ -11,6 +11,7 @@
#include <linux/fs.h>
#include <linux/hashtable.h>
#include <linux/init.h>
#include <linux/kmsan-checks.h>
#include <linux/mm.h>
#include <linux/preempt.h>
#include <linux/printk.h>
@@ -152,6 +153,12 @@ static void kcov_remote_area_put(struct kcov_remote_area *area,
INIT_LIST_HEAD(&area->list);
area->size = size;
list_add(&area->list, &kcov_remote_areas);
/*
* KMSAN doesn't instrument this file, so it may not know area->list
* is initialized. Unpoison it explicitly to avoid reports in
* kcov_remote_area_get().
*/
kmsan_unpoison_memory(&area->list, sizeof(area->list));
}
static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)


@@ -26,7 +26,7 @@
static bool __init test_requires(void)
{
/* random should be initialized for the below tests */
- return prandom_u32() + prandom_u32() != 0;
+ return get_random_u32() + get_random_u32() != 0;
}
/*
@@ -46,7 +46,7 @@ static bool __init test_encode_decode(void)
unsigned long addr;
size_t verif_size;
- prandom_bytes(&addr, sizeof(addr));
+ get_random_bytes(&addr, sizeof(addr));
if (addr < PAGE_SIZE)
addr = PAGE_SIZE;


@@ -93,13 +93,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
/*
* Because we write directly to the reserved memory region when loading
- * crash kernels we need a mutex here to prevent multiple crash kernels
- * from attempting to load simultaneously, and to prevent a crash kernel
- * from loading over the top of a in use crash kernel.
- *
- * KISS: always take the mutex.
+ * crash kernels we need a serialization here to prevent multiple crash
+ * kernels from attempting to load simultaneously.
*/
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
if (flags & KEXEC_ON_CRASH) {
@@ -165,7 +162,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
kimage_free(image);
out_unlock:
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return ret;
}


@@ -46,7 +46,7 @@
#include <crypto/hash.h>
#include "kexec_internal.h"
- DEFINE_MUTEX(kexec_mutex);
+ atomic_t __kexec_lock = ATOMIC_INIT(0);
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
@@ -809,7 +809,7 @@ static int kimage_load_normal_segment(struct kimage *image,
if (result < 0)
goto out;
- ptr = kmap(page);
+ ptr = kmap_local_page(page);
/* Start with a clear page */
clear_page(ptr);
ptr += maddr & ~PAGE_MASK;
@@ -822,7 +822,7 @@ static int kimage_load_normal_segment(struct kimage *image,
memcpy(ptr, kbuf, uchunk);
else
result = copy_from_user(ptr, buf, uchunk);
- kunmap(page);
+ kunmap_local(ptr);
if (result) {
result = -EFAULT;
goto out;
@@ -873,7 +873,7 @@ static int kimage_load_crash_segment(struct kimage *image,
goto out;
}
arch_kexec_post_alloc_pages(page_address(page), 1, 0);
- ptr = kmap(page);
+ ptr = kmap_local_page(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
PAGE_SIZE - (maddr & ~PAGE_MASK));
@@ -889,7 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image,
else
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
- kunmap(page);
+ kunmap_local(ptr);
arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
@@ -959,7 +959,7 @@ late_initcall(kexec_core_sysctl_init);
*/
void __noclone __crash_kexec(struct pt_regs *regs)
{
- /* Take the kexec_mutex here to prevent sys_kexec_load
+ /* Take the kexec_lock here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel
* we are using after a panic on a different cpu.
*
@@ -967,7 +967,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
* of memory the xchg(&kexec_crash_image) would be
* sufficient. But since I reuse the memory...
*/
- if (mutex_trylock(&kexec_mutex)) {
+ if (kexec_trylock()) {
if (kexec_crash_image) {
struct pt_regs fixed_regs;
@@ -976,7 +976,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
machine_crash_shutdown(&fixed_regs);
machine_kexec(kexec_crash_image);
}
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
}
}
STACK_FRAME_NON_STANDARD(__crash_kexec);
@@ -1004,14 +1004,17 @@ void crash_kexec(struct pt_regs *regs)
}
}
- size_t crash_get_memory_size(void)
+ ssize_t crash_get_memory_size(void)
{
- size_t size = 0;
+ ssize_t size = 0;
+ if (!kexec_trylock())
+ return -EBUSY;
- mutex_lock(&kexec_mutex);
if (crashk_res.end != crashk_res.start)
size = resource_size(&crashk_res);
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return size;
}
@@ -1022,7 +1025,8 @@ int crash_shrink_memory(unsigned long new_size)
unsigned long old_size;
struct resource *ram_res;
- mutex_lock(&kexec_mutex);
+ if (!kexec_trylock())
+ return -EBUSY;
if (kexec_crash_image) {
ret = -ENOENT;
@@ -1060,7 +1064,7 @@ int crash_shrink_memory(unsigned long new_size)
insert_resource(&iomem_resource, ram_res);
unlock:
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return ret;
}
@@ -1132,7 +1136,7 @@ int kernel_kexec(void)
{
int error = 0;
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
if (!kexec_image) {
error = -EINVAL;
@@ -1208,6 +1212,6 @@ int kernel_kexec(void)
#endif
Unlock:
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return error;
}


@@ -339,7 +339,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
image = NULL;
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
dest_image = &kexec_image;
@@ -411,7 +411,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
arch_kexec_protect_crashkres();
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
kimage_free(image);
return ret;
}


@@ -13,7 +13,20 @@ void kimage_terminate(struct kimage *image);
int kimage_is_destination_range(struct kimage *image,
unsigned long start, unsigned long end);
- extern struct mutex kexec_mutex;
/*
* Whatever is used to serialize accesses to the kexec_crash_image needs to be
* NMI safe, as __crash_kexec() can happen during nmi_panic(), so here we use a
* "simple" atomic variable that is acquired with a cmpxchg().
*/
extern atomic_t __kexec_lock;
static inline bool kexec_trylock(void)
{
return atomic_cmpxchg_acquire(&__kexec_lock, 0, 1) == 0;
}
static inline void kexec_unlock(void)
{
atomic_set_release(&__kexec_lock, 0);
}
#ifdef CONFIG_KEXEC_FILE
#include <linux/purgatory.h>


@@ -105,7 +105,12 @@ KERNEL_ATTR_RO(kexec_crash_loaded);
static ssize_t kexec_crash_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%zu\n", crash_get_memory_size());
ssize_t size = crash_get_memory_size();
if (size < 0)
return size;
return sprintf(buf, "%zd\n", size);
}
static ssize_t kexec_crash_size_store(struct kobject *kobj,
struct kobj_attribute *attr,


@@ -112,7 +112,7 @@ static void __sched
account_global_scheduler_latency(struct task_struct *tsk,
struct latency_record *lat)
{
- int firstnonnull = MAXLR + 1;
+ int firstnonnull = MAXLR;
int i;
/* skip kernel threads for now */
@@ -150,7 +150,7 @@ account_global_scheduler_latency(struct task_struct *tsk,
}
i = firstnonnull;
- if (i >= MAXLR - 1)
+ if (i >= MAXLR)
return;
/* Allocted a new one: */


@@ -325,6 +325,7 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
* /sys/kernel/livepatch/<patch>/transition
* /sys/kernel/livepatch/<patch>/force
* /sys/kernel/livepatch/<patch>/<object>
* /sys/kernel/livepatch/<patch>/<object>/patched
* /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
*/
static int __klp_disable_patch(struct klp_patch *patch);
@@ -431,6 +432,22 @@ static struct attribute *klp_patch_attrs[] = {
};
ATTRIBUTE_GROUPS(klp_patch);
static ssize_t patched_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct klp_object *obj;
obj = container_of(kobj, struct klp_object, kobj);
return sysfs_emit(buf, "%d\n", obj->patched);
}
static struct kobj_attribute patched_kobj_attr = __ATTR_RO(patched);
static struct attribute *klp_object_attrs[] = {
&patched_kobj_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(klp_object);
static void klp_free_object_dynamic(struct klp_object *obj)
{
kfree(obj->name);
@@ -576,6 +593,7 @@ static void klp_kobj_release_object(struct kobject *kobj)
static struct kobj_type klp_ktype_object = {
.release = klp_kobj_release_object,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = klp_object_groups,
};
static void klp_kobj_release_func(struct kobject *kobj)
@@ -1171,7 +1189,7 @@ int klp_module_coming(struct module *mod)
return -EINVAL;
if (!strcmp(mod->name, "vmlinux")) {
pr_err("vmlinux.ko: invalid module name");
pr_err("vmlinux.ko: invalid module name\n");
return -EINVAL;
}


@@ -610,9 +610,23 @@ void klp_reverse_transition(void)
/* Called from copy_process() during fork */
void klp_copy_process(struct task_struct *child)
{
- child->patch_state = current->patch_state;
- /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */
/*
* The parent process may have gone through a KLP transition since
* the thread flag was copied in setup_thread_stack earlier. Bring
* the task flag up to date with the parent here.
*
* The operation is serialized against all klp_*_transition()
* operations by the tasklist_lock. The only exception is
* klp_update_patch_state(current), but we cannot race with
* that because we are current.
*/
if (test_tsk_thread_flag(current, TIF_PATCH_PENDING))
set_tsk_thread_flag(child, TIF_PATCH_PENDING);
else
clear_tsk_thread_flag(child, TIF_PATCH_PENDING);
child->patch_state = current->patch_state;
}
/*


@@ -5,8 +5,9 @@ KCOV_INSTRUMENT := n
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
- # Avoid recursion lockdep -> KCSAN -> ... -> lockdep.
+ # Avoid recursion lockdep -> sanitizer -> ... -> lockdep.
KCSAN_SANITIZE_lockdep.o := n
KMSAN_SANITIZE_lockdep.o := n
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)


@@ -399,7 +399,7 @@ static int *get_random_order(int count)
order[n] = n;
for (n = count - 1; n > 1; n--) {
- r = get_random_int() % (n + 1);
+ r = prandom_u32_max(n + 1);
if (r != n) {
tmp = order[n];
order[n] = order[r];
@@ -538,7 +538,7 @@ static void stress_one_work(struct work_struct *work)
{
struct stress *stress = container_of(work, typeof(*stress), work);
const int nlocks = stress->nlocks;
- struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks);
+ struct ww_mutex *lock = stress->locks + prandom_u32_max(nlocks);
int err;
do {


@@ -256,7 +256,7 @@ void module_decompress_cleanup(struct load_info *info)
static ssize_t compression_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION));
return sysfs_emit(buf, __stringify(MODULE_COMPRESSION) "\n");
}
static struct kobj_attribute module_compression_attr = __ATTR_RO(compression);


@@ -10,6 +10,7 @@
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/debugfs.h>
#include <linux/rculist.h>
#include "internal.h"
@@ -21,6 +22,9 @@ int try_add_tainted_module(struct module *mod)
module_assert_mutex_or_preempt();
if (!mod->taints)
goto out;
list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules, list,
lockdep_is_held(&module_mutex)) {
if (!strcmp(mod_taint->name, mod->name) &&
@@ -59,3 +63,70 @@ void print_unloaded_tainted_modules(void)
}
}
}
#ifdef CONFIG_DEBUG_FS
static void *unloaded_tainted_modules_seq_start(struct seq_file *m, loff_t *pos)
__acquires(rcu)
{
rcu_read_lock();
return seq_list_start_rcu(&unloaded_tainted_modules, *pos);
}
static void *unloaded_tainted_modules_seq_next(struct seq_file *m, void *p, loff_t *pos)
{
return seq_list_next_rcu(p, &unloaded_tainted_modules, pos);
}
static void unloaded_tainted_modules_seq_stop(struct seq_file *m, void *p)
__releases(rcu)
{
rcu_read_unlock();
}
static int unloaded_tainted_modules_seq_show(struct seq_file *m, void *p)
{
struct mod_unload_taint *mod_taint;
char buf[MODULE_FLAGS_BUF_SIZE];
size_t l;
mod_taint = list_entry(p, struct mod_unload_taint, list);
l = module_flags_taint(mod_taint->taints, buf);
buf[l++] = '\0';
seq_printf(m, "%s (%s) %llu", mod_taint->name, buf, mod_taint->count);
seq_puts(m, "\n");
return 0;
}
static const struct seq_operations unloaded_tainted_modules_seq_ops = {
.start = unloaded_tainted_modules_seq_start,
.next = unloaded_tainted_modules_seq_next,
.stop = unloaded_tainted_modules_seq_stop,
.show = unloaded_tainted_modules_seq_show,
};
static int unloaded_tainted_modules_open(struct inode *inode, struct file *file)
{
return seq_open(file, &unloaded_tainted_modules_seq_ops);
}
static const struct file_operations unloaded_tainted_modules_fops = {
.open = unloaded_tainted_modules_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static int __init unloaded_tainted_modules_init(void)
{
struct dentry *dir;
dir = debugfs_create_dir("modules", NULL);
debugfs_create_file("unloaded_tainted", 0444, dir, NULL,
&unloaded_tainted_modules_fops);
return 0;
}
module_init(unloaded_tainted_modules_init);
#endif /* CONFIG_DEBUG_FS */


@@ -519,6 +519,7 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
return idr_get_next(&ns->idr, &nr);
}
EXPORT_SYMBOL_GPL(find_ge_pid);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{


@@ -79,13 +79,20 @@ int oops_in_progress;
EXPORT_SYMBOL(oops_in_progress);
/*
- * console_sem protects the console_drivers list, and also
- * provides serialisation for access to the entire console
- * driver system.
+ * console_mutex protects console_list updates and console->flags updates.
+ * The flags are synchronized only for consoles that are registered, i.e.
+ * accessible via the console list.
+ */
+ static DEFINE_MUTEX(console_mutex);
+ /*
+ * console_sem protects updates to console->seq and console_suspended,
+ * and also provides serialization for console printing.
*/
static DEFINE_SEMAPHORE(console_sem);
- struct console *console_drivers;
- EXPORT_SYMBOL_GPL(console_drivers);
+ HLIST_HEAD(console_list);
+ EXPORT_SYMBOL_GPL(console_list);
+ DEFINE_STATIC_SRCU(console_srcu);
/*
* System may need to suppress printk message under certain
@@ -103,6 +110,19 @@ static int __read_mostly suppress_panic_printk;
static struct lockdep_map console_lock_dep_map = {
.name = "console_lock"
};
void lockdep_assert_console_list_lock_held(void)
{
lockdep_assert_held(&console_mutex);
}
EXPORT_SYMBOL(lockdep_assert_console_list_lock_held);
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
bool console_srcu_read_lock_is_held(void)
{
return srcu_read_lock_held(&console_srcu);
}
#endif
enum devkmsg_log_bits {
@@ -220,6 +240,69 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
}
#endif /* CONFIG_PRINTK && CONFIG_SYSCTL */
/**
* console_list_lock - Lock the console list
*
* For console list or console->flags updates
*/
void console_list_lock(void)
{
/*
* In unregister_console() and console_force_preferred_locked(),
* synchronize_srcu() is called with the console_list_lock held.
* Therefore it is not allowed that the console_list_lock is taken
* with the srcu_lock held.
*
* Detecting if this context is really in the read-side critical
* section is only possible if the appropriate debug options are
* enabled.
*/
WARN_ON_ONCE(debug_lockdep_rcu_enabled() &&
srcu_read_lock_held(&console_srcu));
mutex_lock(&console_mutex);
}
EXPORT_SYMBOL(console_list_lock);
/**
* console_list_unlock - Unlock the console list
*
* Counterpart to console_list_lock()
*/
void console_list_unlock(void)
{
mutex_unlock(&console_mutex);
}
EXPORT_SYMBOL(console_list_unlock);
/**
* console_srcu_read_lock - Register a new reader for the
* SRCU-protected console list
*
* Use for_each_console_srcu() to iterate the console list
*
* Context: Any context.
* Return: A cookie to pass to console_srcu_read_unlock().
*/
int console_srcu_read_lock(void)
{
return srcu_read_lock_nmisafe(&console_srcu);
}
EXPORT_SYMBOL(console_srcu_read_lock);
/**
* console_srcu_read_unlock - Unregister an old reader from
* the SRCU-protected console list
* @cookie: cookie returned from console_srcu_read_lock()
*
* Counterpart to console_srcu_read_lock()
*/
void console_srcu_read_unlock(int cookie)
{
srcu_read_unlock_nmisafe(&console_srcu, cookie);
}
EXPORT_SYMBOL(console_srcu_read_unlock);
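
With these helpers, a console-list reader stores the cookie, walks the list with for_each_console_srcu(), and reads flags through console_srcu_read_flags() so the access pairs with console_srcu_write_flags() on the update side. A reader sketch (mirroring the kdb change earlier in this merge):

/* Count enabled consoles under SRCU list protection. */
static void count_enabled_consoles_sketch(void)
{
	struct console *con;
	int enabled = 0;
	int cookie;

	cookie = console_srcu_read_lock();
	for_each_console_srcu(con) {
		if (console_srcu_read_flags(con) & CON_ENABLED)
			enabled++;
	}
	console_srcu_read_unlock(cookie);

	pr_info("%d consoles enabled\n", enabled);
}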
/*
* Helper macros to handle lockdep when locking/unlocking console_sem. We use
* macros instead of functions so that _RET_IP_ contains useful information.
@@ -1814,13 +1897,13 @@ static void console_lock_spinning_enable(void)
* safe to start busy waiting for the lock. Second, it checks if
* there is a busy waiter and passes the lock rights to her.
*
- * Important: Callers lose the lock if there was a busy waiter.
- * They must not touch items synchronized by console_lock
- * in this case.
+ * Important: Callers lose both the console_lock and the SRCU read lock if
+ * there was a busy waiter. They must not touch items synchronized by
+ * console_lock or SRCU read lock in this case.
*
* Return: 1 if the lock rights were passed, 0 otherwise.
*/
- static int console_lock_spinning_disable_and_check(void)
+ static int console_lock_spinning_disable_and_check(int cookie)
{
int waiter;
@@ -1839,6 +1922,12 @@ static int console_lock_spinning_disable_and_check(void)
spin_release(&console_owner_dep_map, _THIS_IP_);
/*
* Preserve lockdep lock ordering. Release the SRCU read lock before
* releasing the console_lock.
*/
console_srcu_read_unlock(cookie);
/*
* Hand off console_lock to waiter. The waiter will perform
* the up(). After this, the waiter is the console_lock owner.
@@ -2322,7 +2411,7 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
char *text, size_t text_len,
struct dev_printk_info *dev_info) { return 0; }
static void console_lock_spinning_enable(void) { }
- static int console_lock_spinning_disable_and_check(void) { return 0; }
+ static int console_lock_spinning_disable_and_check(int cookie) { return 0; }
static void call_console_driver(struct console *con, const char *text, size_t len,
char *dropped_text)
{
@@ -2553,10 +2642,10 @@ static int console_cpu_notify(unsigned int cpu)
}
/**
- * console_lock - lock the console system for exclusive use.
+ * console_lock - block the console subsystem from printing
*
- * Acquires a lock which guarantees that the caller has
- * exclusive access to the console system and the console_drivers list.
+ * Acquires a lock which guarantees that no consoles will
+ * be in or enter their write() callback.
*
* Can sleep, returns nothing.
*/
@@ -2573,10 +2662,10 @@ void console_lock(void)
EXPORT_SYMBOL(console_lock);
/**
- * console_trylock - try to lock the console system for exclusive use.
+ * console_trylock - try to block the console subsystem from printing
*
- * Try to acquire a lock which guarantees that the caller has exclusive
- * access to the console system and the console_drivers list.
+ * Try to acquire a lock which guarantees that no consoles will
+ * be in or enter their write() callback.
*
* returns 1 on success, and 0 on failure to acquire the lock.
*/
@@ -2623,11 +2712,13 @@ static bool abandon_console_lock_in_panic(void)
* Check if the given console is currently capable and allowed to print
* records.
*
- * Requires the console_lock.
+ * Requires the console_srcu_read_lock.
*/
static inline bool console_is_usable(struct console *con)
{
- if (!(con->flags & CON_ENABLED))
+ short flags = console_srcu_read_flags(con);
+ if (!(flags & CON_ENABLED))
return false;
if (!con->write)
@@ -2638,8 +2729,7 @@ static inline bool console_is_usable(struct console *con)
* allocated. So unless they're explicitly marked as being able to
* cope (CON_ANYTIME) don't call them until this CPU is officially up.
*/
- if (!cpu_online(raw_smp_processor_id()) &&
- !(con->flags & CON_ANYTIME))
+ if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME))
return false;
return true;
@@ -2664,16 +2754,18 @@ static void __console_unlock(void)
* DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL.
*
* @handover will be set to true if a printk waiter has taken over the
- * console_lock, in which case the caller is no longer holding the
- * console_lock. Otherwise it is set to false.
+ * console_lock, in which case the caller is no longer holding both the
+ * console_lock and the SRCU read lock. Otherwise it is set to false.
*
+ * @cookie is the cookie from the SRCU read lock.
*
* Returns false if the given console has no next record to print, otherwise
* true.
*
- * Requires the console_lock.
+ * Requires the console_lock and the SRCU read lock.
*/
static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
- char *dropped_text, bool *handover)
+ char *dropped_text, bool *handover, int cookie)
{
static int panic_console_dropped;
struct printk_info info;
@@ -2733,7 +2825,7 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_
con->seq++;
- *handover = console_lock_spinning_disable_and_check();
+ *handover = console_lock_spinning_disable_and_check(cookie);
printk_safe_exit_irqrestore(flags);
skip:
return true;
@@ -2770,6 +2862,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
bool any_usable = false;
struct console *con;
bool any_progress;
int cookie;
*next_seq = 0;
*handover = false;
@@ -2777,23 +2870,29 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
do {
any_progress = false;
- for_each_console(con) {
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
bool progress;
if (!console_is_usable(con))
continue;
any_usable = true;
- if (con->flags & CON_EXTENDED) {
+ if (console_srcu_read_flags(con) & CON_EXTENDED) {
/* Extended consoles do not print "dropped messages". */
progress = console_emit_next_record(con, &text[0],
&ext_text[0], NULL,
- handover);
+ handover, cookie);
} else {
progress = console_emit_next_record(con, &text[0],
NULL, &dropped_text[0],
- handover);
+ handover, cookie);
}
/*
* If a handover has occurred, the SRCU read lock
* is already released.
*/
if (*handover)
return false;
@@ -2807,21 +2906,26 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
/* Allow panic_cpu to take over the consoles safely. */
if (abandon_console_lock_in_panic())
- return false;
+ goto abandon;
if (do_cond_resched)
cond_resched();
}
+ console_srcu_read_unlock(cookie);
} while (any_progress);
return any_usable;
+ abandon:
+ console_srcu_read_unlock(cookie);
+ return false;
}
/**
- * console_unlock - unlock the console system
+ * console_unlock - unblock the console subsystem from printing
*
- * Releases the console_lock which the caller holds on the console system
- * and the console driver list.
+ * Releases the console_lock which the caller holds to block printing of
+ * the console subsystem.
*
* While the console_lock was held, console output may have been buffered
* by printk(). If this is the case, console_unlock(); emits
@@ -2899,10 +3003,14 @@ EXPORT_SYMBOL(console_conditional_schedule);
void console_unblank(void)
{
struct console *c;
int cookie;
/*
- * console_unblank can no longer be called in interrupt context unless
- * oops_in_progress is set to 1..
+ * Stop console printing because the unblank() callback may
+ * assume the console is not within its write() callback.
+ *
+ * If @oops_in_progress is set, this may be an atomic context.
+ * In that case, attempt a trylock as best-effort.
*/
if (oops_in_progress) {
if (down_trylock_console_sem() != 0)
@@ -2912,9 +3020,14 @@ void console_unblank(void)
console_locked = 1;
console_may_schedule = 0;
- for_each_console(c)
- if ((c->flags & CON_ENABLED) && c->unblank)
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank)
c->unblank();
+ }
+ console_srcu_read_unlock(cookie);
console_unlock();
if (!oops_in_progress)
@@ -2941,11 +3054,21 @@ void console_flush_on_panic(enum con_flush_mode mode)
if (mode == CONSOLE_REPLAY_ALL) {
struct console *c;
int cookie;
u64 seq;
seq = prb_first_valid_seq(prb);
- for_each_console(c)
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ /*
+ * If the above console_trylock() failed, this is an
+ * unsynchronized assignment. But in that case, the
+ * kernel is in "hope and pray" mode anyway.
+ */
c->seq = seq;
+ }
+ console_srcu_read_unlock(cookie);
}
console_unlock();
}
@@ -2957,15 +3080,25 @@ struct tty_driver *console_device(int *index)
{
struct console *c;
struct tty_driver *driver = NULL;
int cookie;
/*
* Take console_lock to serialize device() callback with
* other console operations. For example, fg_console is
* modified under console_lock when switching vt.
*/
console_lock();
- for_each_console(c) {
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
if (!c->device)
continue;
driver = c->device(c, index);
if (driver)
break;
}
console_srcu_read_unlock(cookie);
console_unlock();
return driver;
}
@@ -2978,17 +3111,25 @@ struct tty_driver *console_device(int *index)
void console_stop(struct console *console)
{
__pr_flush(console, 1000, true);
- console_lock();
- console->flags &= ~CON_ENABLED;
- console_unlock();
+ console_list_lock();
+ console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
+ console_list_unlock();
/*
* Ensure that all SRCU list walks have completed. All contexts must
* be able to see that this console is disabled so that (for example)
* the caller can suspend the port without risk of another context
* using the port.
*/
synchronize_srcu(&console_srcu);
}
EXPORT_SYMBOL(console_stop);
void console_start(struct console *console)
{
- console_lock();
- console->flags |= CON_ENABLED;
- console_unlock();
+ console_list_lock();
+ console_srcu_write_flags(console, console->flags | CON_ENABLED);
+ console_list_unlock();
__pr_flush(console, 1000, true);
}
EXPORT_SYMBOL(console_start);
@@ -3081,6 +3222,72 @@ static void try_enable_default_console(struct console *newcon)
(con->flags & CON_BOOT) ? "boot" : "", \
con->name, con->index, ##__VA_ARGS__)
static void console_init_seq(struct console *newcon, bool bootcon_registered)
{
struct console *con;
bool handover;
if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) {
/* Get a consistent copy of @syslog_seq. */
mutex_lock(&syslog_lock);
newcon->seq = syslog_seq;
mutex_unlock(&syslog_lock);
} else {
/* Begin with next message added to ringbuffer. */
newcon->seq = prb_next_seq(prb);
/*
* If any enabled boot consoles are due to be unregistered
* shortly, some may not be caught up and may be the same
* device as @newcon. Since it is not known which boot console
* is the same device, flush all consoles and, if necessary,
* start with the message of the enabled boot console that is
* the furthest behind.
*/
if (bootcon_registered && !keep_bootcon) {
/*
* Hold the console_lock to stop console printing and
* guarantee safe access to console->seq.
*/
console_lock();
/*
* Flush all consoles and set the console to start at
* the next unprinted sequence number.
*/
if (!console_flush_all(true, &newcon->seq, &handover)) {
/*
* Flushing failed. Just choose the lowest
* sequence of the enabled boot consoles.
*/
/*
* If there was a handover, this context no
* longer holds the console_lock.
*/
if (handover)
console_lock();
newcon->seq = prb_next_seq(prb);
for_each_console(con) {
if ((con->flags & CON_BOOT) &&
(con->flags & CON_ENABLED) &&
con->seq < newcon->seq) {
newcon->seq = con->seq;
}
}
}
console_unlock();
}
}
}
#define console_first() \
hlist_entry(console_list.first, struct console, node)
static int unregister_console_locked(struct console *console);
/*
* The console driver calls this routine during kernel initialization
* to register the console printing procedure with printk() and to
@@ -3103,28 +3310,29 @@ static void try_enable_default_console(struct console *newcon)
void register_console(struct console *newcon)
{
struct console *con;
bool bootcon_enabled = false;
bool realcon_enabled = false;
bool bootcon_registered = false;
bool realcon_registered = false;
int err;
console_list_lock();
for_each_console(con) {
if (WARN(con == newcon, "console '%s%d' already registered\n",
con->name, con->index))
return;
}
con->name, con->index)) {
goto unlock;
}
for_each_console(con) {
if (con->flags & CON_BOOT)
bootcon_enabled = true;
bootcon_registered = true;
else
realcon_enabled = true;
realcon_registered = true;
}
/* Do not register boot consoles when there already is a real one. */
if (newcon->flags & CON_BOOT && realcon_enabled) {
if ((newcon->flags & CON_BOOT) && realcon_registered) {
pr_info("Too late to register bootconsole %s%d\n",
newcon->name, newcon->index);
return;
goto unlock;
}
/*
@@ -3140,8 +3348,8 @@ void register_console(struct console *newcon)
* flag set and will be first in the list.
*/
if (preferred_console < 0) {
if (!console_drivers || !console_drivers->device ||
console_drivers->flags & CON_BOOT) {
if (hlist_empty(&console_list) || !console_first()->device ||
console_first()->flags & CON_BOOT) {
try_enable_default_console(newcon);
}
}
@@ -3155,7 +3363,7 @@ void register_console(struct console *newcon)
/* printk() messages are not printed to the Braille console. */
if (err || newcon->flags & CON_BRL)
return;
goto unlock;
/*
* If we have a bootconsole, and are switching to a real console,
@@ -3163,39 +3371,38 @@ void register_console(struct console *newcon)
* the real console are the same physical device, it's annoying to
* see the beginning boot messages twice
*/
if (bootcon_enabled &&
if (bootcon_registered &&
((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
newcon->flags &= ~CON_PRINTBUFFER;
}
newcon->dropped = 0;
console_init_seq(newcon, bootcon_registered);
/*
* Put this console in the list - keep the
* preferred driver at the head of the list.
* Put this console in the list - keep the
* preferred driver at the head of the list.
*/
console_lock();
if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
newcon->next = console_drivers;
console_drivers = newcon;
if (newcon->next)
newcon->next->flags &= ~CON_CONSDEV;
/* Ensure this flag is always set for the head of the list */
if (hlist_empty(&console_list)) {
/* Ensure CON_CONSDEV is always set for the head. */
newcon->flags |= CON_CONSDEV;
hlist_add_head_rcu(&newcon->node, &console_list);
} else if (newcon->flags & CON_CONSDEV) {
/* Only the new head can have CON_CONSDEV set. */
console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV);
hlist_add_head_rcu(&newcon->node, &console_list);
} else {
newcon->next = console_drivers->next;
console_drivers->next = newcon;
hlist_add_behind_rcu(&newcon->node, console_list.first);
}
newcon->dropped = 0;
if (newcon->flags & CON_PRINTBUFFER) {
/* Get a consistent copy of @syslog_seq. */
mutex_lock(&syslog_lock);
newcon->seq = syslog_seq;
mutex_unlock(&syslog_lock);
} else {
/* Begin with next message. */
newcon->seq = prb_next_seq(prb);
}
console_unlock();
/*
* No need to synchronize SRCU here! The caller does not rely
* on all contexts being able to see the new console before
* register_console() completes.
*/
console_sysfs_notify();
/*
@@ -3206,21 +3413,28 @@ void register_console(struct console *newcon)
* went to the bootconsole (that they do not see on the real console)
*/
con_printk(KERN_INFO, newcon, "enabled\n");
if (bootcon_enabled &&
if (bootcon_registered &&
((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
!keep_bootcon) {
for_each_console(con)
struct hlist_node *tmp;
hlist_for_each_entry_safe(con, tmp, &console_list, node) {
if (con->flags & CON_BOOT)
unregister_console(con);
unregister_console_locked(con);
}
}
unlock:
console_list_unlock();
}
EXPORT_SYMBOL(register_console);
int unregister_console(struct console *console)
/* Must be called under console_list_lock(). */
static int unregister_console_locked(struct console *console)
{
struct console *con;
int res;
lockdep_assert_console_list_lock_held();
con_printk(KERN_INFO, console, "disabled\n");
res = _braille_unregister_console(console);
@@ -3229,48 +3443,94 @@ int unregister_console(struct console *console)
if (res > 0)
return 0;
res = -ENODEV;
console_lock();
if (console_drivers == console) {
console_drivers=console->next;
res = 0;
} else {
for_each_console(con) {
if (con->next == console) {
con->next = console->next;
res = 0;
break;
}
}
}
/* Disable it unconditionally */
console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
if (res)
goto out_disable_unlock;
if (!console_is_registered_locked(console))
return -ENODEV;
hlist_del_init_rcu(&console->node);
/*
* <HISTORICAL>
* If this isn't the last console and it has CON_CONSDEV set, we
* need to set it on the next preferred console.
* </HISTORICAL>
*
* The above makes no sense as there is no guarantee that the next
* console has any device attached. Oh well....
*/
if (console_drivers != NULL && console->flags & CON_CONSDEV)
console_drivers->flags |= CON_CONSDEV;
if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV)
console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV);
/*
* Ensure that all SRCU list walks have completed. No context may
* still see this console in the list, so that any exit/cleanup
* routines can be performed safely.
*/
synchronize_srcu(&console_srcu);
console->flags &= ~CON_ENABLED;
console_unlock();
console_sysfs_notify();
if (console->exit)
res = console->exit(console);
return res;
}
out_disable_unlock:
console->flags &= ~CON_ENABLED;
console_unlock();
int unregister_console(struct console *console)
{
int res;
console_list_lock();
res = unregister_console_locked(console);
console_list_unlock();
return res;
}
EXPORT_SYMBOL(unregister_console);
/**
* console_force_preferred_locked - force a registered console preferred
* @con: The registered console to force preferred.
*
* Must be called under console_list_lock().
*/
void console_force_preferred_locked(struct console *con)
{
struct console *cur_pref_con;
if (!console_is_registered_locked(con))
return;
cur_pref_con = console_first();
/* Already preferred? */
if (cur_pref_con == con)
return;
/*
* Delete, but do not re-initialize the entry. This allows the console
* to continue to appear registered (via any hlist_unhashed_lockless()
* checks), even though it was briefly removed from the console list.
*/
hlist_del_rcu(&con->node);
/*
* Ensure that all SRCU list walks have completed so that the console
* can be added to the beginning of the console list and its forward
* list pointer can be re-initialized.
*/
synchronize_srcu(&console_srcu);
con->flags |= CON_CONSDEV;
WARN_ON(!con->device);
/* Only the new head can have CON_CONSDEV set. */
console_srcu_write_flags(cur_pref_con, cur_pref_con->flags & ~CON_CONSDEV);
hlist_add_head_rcu(&con->node, &console_list);
}
EXPORT_SYMBOL(console_force_preferred_locked);
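/*
 * Usage sketch (illustrative): per the kernel-doc above, the caller
 * wraps the operation in the console list lock. "my_con" stands in
 * for some already-registered console.
 */
console_list_lock();
console_force_preferred_locked(&my_con);
console_list_unlock();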
/*
* Initialize the console device. This is called *early*, so
* we can't necessarily depend on lots of kernel help here.
@@ -3317,10 +3577,12 @@ void __init console_init(void)
*/
static int __init printk_late_init(void)
{
struct hlist_node *tmp;
struct console *con;
int ret;
for_each_console(con) {
console_list_lock();
hlist_for_each_entry_safe(con, tmp, &console_list, node) {
if (!(con->flags & CON_BOOT))
continue;
@@ -3337,9 +3599,11 @@ static int __init printk_late_init(void)
*/
pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n",
con->name, con->index);
unregister_console(con);
unregister_console_locked(con);
}
}
console_list_unlock();
ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
console_cpu_notify);
WARN_ON(ret < 0);
@@ -3359,6 +3623,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
struct console *c;
u64 last_diff = 0;
u64 printk_seq;
int cookie;
u64 diff;
u64 seq;
@@ -3369,9 +3634,15 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
for (;;) {
diff = 0;
/*
* Hold the console_lock to guarantee safe access to
* console->seq and to prevent changes to @console_suspended
* until all consoles have been processed.
*/
console_lock();
for_each_console(c) {
cookie = console_srcu_read_lock();
for_each_console_srcu(c) {
if (con && con != c)
continue;
if (!console_is_usable(c))
@@ -3380,6 +3651,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
if (printk_seq < seq)
diff += seq - printk_seq;
}
console_srcu_read_unlock(cookie);
/*
* If consoles are suspended, it cannot be expected that they

View File

@@ -59,43 +59,39 @@ int profile_setup(char *str)
static const char schedstr[] = "schedule";
static const char sleepstr[] = "sleep";
static const char kvmstr[] = "kvm";
const char *select = NULL;
int par;
if (!strncmp(str, sleepstr, strlen(sleepstr))) {
#ifdef CONFIG_SCHEDSTATS
force_schedstat_enabled();
prof_on = SLEEP_PROFILING;
if (str[strlen(sleepstr)] == ',')
str += strlen(sleepstr) + 1;
if (get_option(&str, &par))
prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
pr_info("kernel sleep profiling enabled (shift: %u)\n",
prof_shift);
select = sleepstr;
#else
pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
#endif /* CONFIG_SCHEDSTATS */
} else if (!strncmp(str, schedstr, strlen(schedstr))) {
prof_on = SCHED_PROFILING;
if (str[strlen(schedstr)] == ',')
str += strlen(schedstr) + 1;
if (get_option(&str, &par))
prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
pr_info("kernel schedule profiling enabled (shift: %u)\n",
prof_shift);
select = schedstr;
} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
prof_on = KVM_PROFILING;
if (str[strlen(kvmstr)] == ',')
str += strlen(kvmstr) + 1;
if (get_option(&str, &par))
prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
pr_info("kernel KVM profiling enabled (shift: %u)\n",
prof_shift);
select = kvmstr;
} else if (get_option(&str, &par)) {
prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
prof_on = CPU_PROFILING;
pr_info("kernel profiling enabled (shift: %u)\n",
prof_shift);
}
if (select) {
if (str[strlen(select)] == ',')
str += strlen(select) + 1;
if (get_option(&str, &par))
prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
pr_info("kernel %s profiling enabled (shift: %u)\n",
select, prof_shift);
}
return 1;
}
__setup("profile=", profile_setup);

View File

@@ -54,27 +54,25 @@ config RCU_EXPERT
Say N if you are unsure.
config SRCU
bool
help
This option selects the sleepable version of RCU. This version
permits arbitrary sleeping or blocking within RCU read-side critical
sections.
def_bool y
config TINY_SRCU
bool
default y if SRCU && TINY_RCU
default y if TINY_RCU
help
This option selects the single-CPU non-preemptible version of SRCU.
config TREE_SRCU
bool
default y if SRCU && !TINY_RCU
default y if !TINY_RCU
help
This option selects the full-fledged version of SRCU.
config NEED_SRCU_NMI_SAFE
def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU
config TASKS_RCU_GENERIC
def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU
select SRCU
help
This option enables generic infrastructure code supporting
task-based RCU implementations. Not for manual selection.

View File

@@ -27,7 +27,6 @@ config RCU_SCALE_TEST
tristate "performance tests for RCU"
depends on DEBUG_KERNEL
select TORTURE_TEST
select SRCU
default n
help
This option provides a kernel module that runs performance
@@ -43,7 +42,6 @@ config RCU_TORTURE_TEST
tristate "torture tests for RCU"
depends on DEBUG_KERNEL
select TORTURE_TEST
select SRCU
default n
help
This option provides a kernel module that runs torture tests
@@ -59,7 +57,6 @@ config RCU_REF_SCALE_TEST
tristate "Scalability tests for read-side synchronization (RCU and others)"
depends on DEBUG_KERNEL
select TORTURE_TEST
select SRCU
default n
help
This option provides a kernel module that runs performance tests

View File

@@ -286,7 +286,7 @@ void rcu_test_sync_prims(void);
*/
extern void resched_cpu(int cpu);
#if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU)
#if !defined(CONFIG_TINY_RCU)
#include <linux/rcu_node_tree.h>
@@ -375,6 +375,10 @@ extern void rcu_init_geometry(void);
(cpu) <= rnp->grphi; \
(cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask)))
#endif /* !defined(CONFIG_TINY_RCU) */
#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC)
/*
* Wrappers for the rcu_node::lock acquire and release.
*
@@ -437,7 +441,7 @@ do { \
#define raw_lockdep_assert_held_rcu_node(p) \
lockdep_assert_held(&ACCESS_PRIVATE(p, lock))
#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */
#endif // #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC)
#ifdef CONFIG_TINY_RCU
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */

View File

@@ -615,10 +615,14 @@ static struct rcu_torture_ops rcu_busted_ops = {
DEFINE_STATIC_SRCU(srcu_ctl);
static struct srcu_struct srcu_ctld;
static struct srcu_struct *srcu_ctlp = &srcu_ctl;
static struct rcu_torture_ops srcud_ops;
static int srcu_torture_read_lock(void) __acquires(srcu_ctlp)
{
return srcu_read_lock(srcu_ctlp);
if (cur_ops == &srcud_ops)
return srcu_read_lock_nmisafe(srcu_ctlp);
else
return srcu_read_lock(srcu_ctlp);
}
static void
@@ -642,7 +646,10 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
{
srcu_read_unlock(srcu_ctlp, idx);
if (cur_ops == &srcud_ops)
srcu_read_unlock_nmisafe(srcu_ctlp, idx);
else
srcu_read_unlock(srcu_ctlp, idx);
}
static int torture_srcu_read_lock_held(void)

View File

@@ -197,6 +197,16 @@ void synchronize_srcu(struct srcu_struct *ssp)
{
struct rcu_synchronize rs;
RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return;
might_sleep();
init_rcu_head_on_stack(&rs.head);
init_completion(&rs.completion);
call_srcu(ssp, &rs.head, wakeme_after_rcu);

View File

@@ -417,7 +417,7 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
for_each_possible_cpu(cpu) {
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
sum += READ_ONCE(cpuc->srcu_lock_count[idx]);
sum += atomic_long_read(&cpuc->srcu_lock_count[idx]);
}
return sum;
}
@@ -429,13 +429,18 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx)
{
int cpu;
unsigned long mask = 0;
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
sum += READ_ONCE(cpuc->srcu_unlock_count[idx]);
sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]);
if (IS_ENABLED(CONFIG_PROVE_RCU))
mask = mask | READ_ONCE(cpuc->srcu_nmi_safety);
}
WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)),
"Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp);
return sum;
}
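/*
 * Illustrative note: srcu_nmi_safety acts as a two-bit set. Bit 0
 * records that a non-NMI-safe reader was seen, bit 1 an NMI-safe one
 * (1 << nmi_safe in srcu_check_nmi_safety() below). "mask & (mask >> 1)"
 * is nonzero only when both bits are set, i.e. only when the two
 * reader flavors were mixed on the same srcu_struct.
 */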
@@ -503,10 +508,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
for_each_possible_cpu(cpu) {
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
sum += READ_ONCE(cpuc->srcu_lock_count[0]);
sum += READ_ONCE(cpuc->srcu_lock_count[1]);
sum -= READ_ONCE(cpuc->srcu_unlock_count[0]);
sum -= READ_ONCE(cpuc->srcu_unlock_count[1]);
sum += atomic_long_read(&cpuc->srcu_lock_count[0]);
sum += atomic_long_read(&cpuc->srcu_lock_count[1]);
sum -= atomic_long_read(&cpuc->srcu_unlock_count[0]);
sum -= atomic_long_read(&cpuc->srcu_unlock_count[1]);
}
return sum;
}
@@ -626,6 +631,29 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
#ifdef CONFIG_PROVE_RCU
/*
* Check for consistent NMI safety.
*/
void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe)
{
int nmi_safe_mask = 1 << nmi_safe;
int old_nmi_safe_mask;
struct srcu_data *sdp;
/* NMI-unsafe use in NMI is a bad sign */
WARN_ON_ONCE(!nmi_safe && in_nmi());
sdp = raw_cpu_ptr(ssp->sda);
old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety);
if (!old_nmi_safe_mask) {
WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask);
return;
}
WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask);
}
EXPORT_SYMBOL_GPL(srcu_check_nmi_safety);
#endif /* CONFIG_PROVE_RCU */
/*
* Counts the new reader in the appropriate per-CPU element of the
* srcu_struct.
@@ -636,7 +664,7 @@ int __srcu_read_lock(struct srcu_struct *ssp)
int idx;
idx = READ_ONCE(ssp->srcu_idx) & 0x1;
this_cpu_inc(ssp->sda->srcu_lock_count[idx]);
this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
return idx;
}
@@ -650,10 +678,45 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
smp_mb(); /* C */ /* Avoid leaking the critical section. */
this_cpu_inc(ssp->sda->srcu_unlock_count[idx]);
this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
#ifdef CONFIG_NEED_SRCU_NMI_SAFE
/*
* Counts the new reader in the appropriate per-CPU element of the
* srcu_struct, but in an NMI-safe manner using RMW atomics.
* Returns an index that must be passed to the matching srcu_read_unlock().
*/
int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
{
int idx;
struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
idx = READ_ONCE(ssp->srcu_idx) & 0x1;
atomic_long_inc(&sdp->srcu_lock_count[idx]);
smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */
return idx;
}
EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);
/*
* Removes the count for the old reader from the appropriate per-CPU
* element of the srcu_struct. Note that this may well be a different
* CPU than that which was incremented by the corresponding srcu_read_lock().
*/
void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
{
struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */
atomic_long_inc(&sdp->srcu_unlock_count[idx]);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe);
#endif // CONFIG_NEED_SRCU_NMI_SAFE
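/*
 * Minimal usage sketch (illustrative): the NMI-safe flavor pairs the
 * lock and unlock through the same _nmisafe API so that the per-CPU
 * counters are only updated with RMW atomics. A given srcu_struct
 * must stick to one flavor (see srcu_check_nmi_safety() above). The
 * handler below is hypothetical.
 */
DEFINE_STATIC_SRCU(example_srcu);

static void example_nmi_handler(void)
{
	int idx;

	idx = srcu_read_lock_nmisafe(&example_srcu);
	/* ... read SRCU-protected state from NMI context ... */
	srcu_read_unlock_nmisafe(&example_srcu, idx);
}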
/*
* Start an SRCU grace period.
*/
@@ -1090,7 +1153,12 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
int ss_state;
check_init_srcu_struct(ssp);
idx = srcu_read_lock(ssp);
/*
* While starting a new grace period, make sure we are in an
* SRCU read-side critical section so that the grace-period
* sequence number cannot wrap around in the meantime.
*/
idx = __srcu_read_lock_nmisafe(ssp);
ss_state = smp_load_acquire(&ssp->srcu_size_state);
if (ss_state < SRCU_SIZE_WAIT_CALL)
sdp = per_cpu_ptr(ssp->sda, 0);
@@ -1123,7 +1191,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
srcu_funnel_gp_start(ssp, sdp, s, do_norm);
else if (needexp)
srcu_funnel_exp_start(ssp, sdp_mynode, s);
srcu_read_unlock(ssp, idx);
__srcu_read_unlock_nmisafe(ssp, idx);
return s;
}
@@ -1427,13 +1495,13 @@ void srcu_barrier(struct srcu_struct *ssp)
/* Initial count prevents reaching zero until all CBs are posted. */
atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);
idx = srcu_read_lock(ssp);
idx = __srcu_read_lock_nmisafe(ssp);
if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
else
for_each_possible_cpu(cpu)
srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
srcu_read_unlock(ssp, idx);
__srcu_read_unlock_nmisafe(ssp, idx);
/* Remove the initial count, at which point reaching zero can happen. */
if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
@@ -1687,8 +1755,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
struct srcu_data *sdp;
sdp = per_cpu_ptr(ssp->sda, cpu);
u0 = data_race(sdp->srcu_unlock_count[!idx]);
u1 = data_race(sdp->srcu_unlock_count[idx]);
u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx]));
u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx]));
/*
* Make sure that a lock is always counted if the corresponding
@@ -1696,8 +1764,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
*/
smp_rmb();
l0 = data_race(sdp->srcu_lock_count[!idx]);
l1 = data_race(sdp->srcu_lock_count[idx]);
l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx]));
l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx]));
c0 = l0 - u0;
c1 = l1 - u1;

View File

@@ -224,7 +224,7 @@ void rcu_test_sync_prims(void)
synchronize_rcu_expedited();
}
#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU)
#if !defined(CONFIG_TINY_RCU)
/*
* Switch to run-time mode once RCU has fully initialized.
@@ -239,7 +239,7 @@ static int __init rcu_set_runtime_mode(void)
}
core_initcall(rcu_set_runtime_mode);
#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
#endif /* #if !defined(CONFIG_TINY_RCU) */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
@@ -559,10 +559,8 @@ static void early_boot_test_call_rcu(void)
struct early_boot_kfree_rcu *rhp;
call_rcu(&head, test_callback);
if (IS_ENABLED(CONFIG_SRCU)) {
early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
call_srcu(&early_srcu, &shead, test_callback);
}
early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
call_srcu(&early_srcu, &shead, test_callback);
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
if (!WARN_ON_ONCE(!rhp))
kfree_rcu(rhp, rh);
@@ -585,11 +583,9 @@ static int rcu_verify_early_boot_tests(void)
if (rcu_self_test) {
early_boot_test_counter++;
rcu_barrier();
if (IS_ENABLED(CONFIG_SRCU)) {
early_boot_test_counter++;
srcu_barrier(&early_srcu);
WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie));
}
early_boot_test_counter++;
srcu_barrier(&early_srcu);
WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie));
}
if (rcu_self_test_counter != early_boot_test_counter) {
WARN_ON(1);

View File

@@ -243,6 +243,17 @@ void migrate_to_reboot_cpu(void)
set_cpus_allowed_ptr(current, cpumask_of(cpu));
}
/*
* Notifier list for kernel code which wants to be called
* to prepare system for restart.
*/
static BLOCKING_NOTIFIER_HEAD(restart_prep_handler_list);
static void do_kernel_restart_prepare(void)
{
blocking_notifier_call_chain(&restart_prep_handler_list, 0, NULL);
}
/**
* kernel_restart - reboot the system
* @cmd: pointer to buffer containing command to execute for restart
@@ -254,6 +265,7 @@ void migrate_to_reboot_cpu(void)
void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
do_kernel_restart_prepare();
migrate_to_reboot_cpu();
syscore_shutdown();
if (!cmd)
@@ -396,6 +408,11 @@ register_sys_off_handler(enum sys_off_mode mode,
handler->list = &power_off_handler_list;
break;
case SYS_OFF_MODE_RESTART_PREPARE:
handler->list = &restart_prep_handler_list;
handler->blocking = true;
break;
case SYS_OFF_MODE_RESTART:
handler->list = &restart_handler_list;
break;
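/*
 * Usage sketch (illustrative, signature per the sys-off API and not
 * confirmed by this diff): with the RESTART_PREPARE case above, a
 * driver can hook the pre-restart phase; the callback is hypothetical:
 *
 *   register_sys_off_handler(SYS_OFF_MODE_RESTART_PREPARE,
 *                            SYS_OFF_PRIO_DEFAULT, example_prep_cb, NULL);
 */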

View File

@@ -60,10 +60,7 @@ static const struct vm_operations_struct relay_file_mmap_ops = {
*/
static struct page **relay_alloc_page_array(unsigned int n_pages)
{
const size_t pa_size = n_pages * sizeof(struct page *);
if (pa_size > PAGE_SIZE)
return vzalloc(pa_size);
return kzalloc(pa_size, GFP_KERNEL);
return kvcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
}
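/*
 * Illustrative note: kvcalloc() attempts a kmalloc() first and falls
 * back to vmalloc() for larger arrays, which replaces the open-coded
 * PAGE_SIZE check above. Either backing store is released with a
 * single call:
 *
 *   kvfree(array);
 */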
/*

View File

@@ -357,10 +357,7 @@ static void __sched_core_flip(bool enabled)
/*
* Toggle the offline CPUs.
*/
cpumask_copy(&sched_core_mask, cpu_possible_mask);
cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
for_each_cpu(cpu, &sched_core_mask)
for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask)
cpu_rq(cpu)->core_enabled = enabled;
cpus_read_unlock();
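/*
 * Illustrative equivalence: for_each_cpu_andnot(cpu, A, B) visits the
 * CPUs set in A but not in B without materializing a temporary mask,
 * i.e. roughly the removed sequence:
 *
 *   cpumask_copy(&tmp, A);
 *   cpumask_andnot(&tmp, &tmp, B);
 *   for_each_cpu(cpu, &tmp)
 *           ...;
 */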
@@ -704,6 +701,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
psi_account_irqtime(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
@@ -4392,6 +4390,17 @@ void set_numabalancing_state(bool enabled)
}
#ifdef CONFIG_PROC_SYSCTL
static void reset_memory_tiering(void)
{
struct pglist_data *pgdat;
for_each_online_pgdat(pgdat) {
pgdat->nbp_threshold = 0;
pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
pgdat->nbp_th_start = jiffies_to_msecs(jiffies);
}
}
int sysctl_numa_balancing(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -4408,6 +4417,9 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
if (err < 0)
return err;
if (write) {
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
(state & NUMA_BALANCING_MEMORY_TIERING))
reset_memory_tiering();
sysctl_numa_balancing_mode = state;
__set_numabalancing_state(state);
}
@@ -5162,6 +5174,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
lru_gen_use_mm(next->mm);
if (!prev->mm) { // from kernel
/* will mmdrop() in finish_task_switch(). */

View File

@@ -333,6 +333,7 @@ static __init int sched_init_debug(void)
debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min);
debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
#endif
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);

View File

@@ -40,6 +40,7 @@
#include <linux/cpuidle.h>
#include <linux/interrupt.h>
#include <linux/memory-tiers.h>
#include <linux/mempolicy.h>
#include <linux/mutex_api.h>
#include <linux/profile.h>
@@ -1090,6 +1091,12 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
/* A page whose hint page fault latency (in ms) is below the threshold is considered hot */
unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
struct numa_group {
refcount_t refcount;
@@ -1432,6 +1439,120 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
return 1000 * faults / total_faults;
}
/*
* If memory tiering mode is enabled, cpupid of slow memory page is
* used to record scan time instead of CPU and PID. When tiering mode
* is disabled at run time, the scan time (in cpupid) will be
* interpreted as CPU and PID. So the CPU needs to be checked to
* avoid out-of-bounds array access.
*/
static inline bool cpupid_valid(int cpupid)
{
return cpupid_to_cpu(cpupid) < nr_cpu_ids;
}
/*
* For memory tiering mode, if there are enough free pages (more than
* the watermark defined here) in the fast memory node, then in order
* to take full advantage of the fast memory capacity, all recently
* accessed slow memory pages will be migrated to the fast memory node
* without considering the hot threshold.
*/
static bool pgdat_free_space_enough(struct pglist_data *pgdat)
{
int z;
unsigned long enough_wmark;
enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
pgdat->node_present_pages >> 4);
for (z = pgdat->nr_zones - 1; z >= 0; z--) {
struct zone *zone = pgdat->node_zones + z;
if (!populated_zone(zone))
continue;
if (zone_watermark_ok(zone, 0,
wmark_pages(zone, WMARK_PROMO) + enough_wmark,
ZONE_MOVABLE, 0))
return true;
}
return false;
}
/*
* For memory tiering mode, when page tables are scanned, the scan
* time will be recorded in struct page in addition to making the page
* PROT_NONE for slow memory pages. So when the page is accessed, in
* the hint page fault handler, the hint page fault latency is calculated
* via,
*
* hint page fault latency = hint page fault time - scan time
*
* The smaller the hint page fault latency, the higher the possibility
* for the page to be hot.
*/
static int numa_hint_fault_latency(struct page *page)
{
int last_time, time;
time = jiffies_to_msecs(jiffies);
last_time = xchg_page_access_time(page, time);
return (time - last_time) & PAGE_ACCESS_TIME_MASK;
}
/*
* For memory tiering mode, too high promotion/demotion throughput may
* hurt application latency. So we provide a mechanism to rate limit
* the number of pages that we try to promote.
*/
static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
unsigned long rate_limit, int nr)
{
unsigned long nr_cand;
unsigned int now, start;
now = jiffies_to_msecs(jiffies);
mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
start = pgdat->nbp_rl_start;
if (now - start > MSEC_PER_SEC &&
cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
pgdat->nbp_rl_nr_cand = nr_cand;
if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
return true;
return false;
}
#define NUMA_MIGRATION_ADJUST_STEPS 16
static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
unsigned long rate_limit,
unsigned int ref_th)
{
unsigned int now, start, th_period, unit_th, th;
unsigned long nr_cand, ref_cand, diff_cand;
now = jiffies_to_msecs(jiffies);
th_period = sysctl_numa_balancing_scan_period_max;
start = pgdat->nbp_th_start;
if (now - start > th_period &&
cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
ref_cand = rate_limit *
sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
th = pgdat->nbp_threshold ? : ref_th;
if (diff_cand > ref_cand * 11 / 10)
th = max(th - unit_th, unit_th);
else if (diff_cand < ref_cand * 9 / 10)
th = min(th + unit_th, ref_th * 2);
pgdat->nbp_th_nr_cand = nr_cand;
pgdat->nbp_threshold = th;
}
}
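/*
 * Worked example (illustrative): with the default hot threshold
 * ref_th == 1000 ms and NUMA_MIGRATION_ADJUST_STEPS == 16, unit_th is
 * 2 * 1000 / 16 == 125 ms. If the last period saw more candidate
 * pages than 110% of what the rate limit allows, the threshold drops
 * by 125 ms (pages must be hotter to qualify), with a floor of one
 * step; below 90%, it rises by 125 ms, capped at 2 * ref_th.
 */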
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu)
{
@@ -1439,9 +1560,44 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
/*
* The pages in slow memory node should be migrated according
* to hot/cold instead of private/shared.
*/
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
!node_is_toptier(src_nid)) {
struct pglist_data *pgdat;
unsigned long rate_limit;
unsigned int latency, th, def_th;
pgdat = NODE_DATA(dst_nid);
if (pgdat_free_space_enough(pgdat)) {
/* workload changed, reset hot threshold */
pgdat->nbp_threshold = 0;
return true;
}
def_th = sysctl_numa_balancing_hot_threshold;
rate_limit = sysctl_numa_balancing_promote_rate_limit << \
(20 - PAGE_SHIFT);
numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
th = pgdat->nbp_threshold ? : def_th;
latency = numa_hint_fault_latency(page);
if (latency >= th)
return false;
return !numa_promotion_rate_limit(pgdat, rate_limit,
thp_nr_pages(page));
}
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
!node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
return false;
/*
* Allow first faults or private faults to migrate immediately early in
* the lifetime of a task. The magic number 4 is based on waiting for
@@ -2681,6 +2837,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (!p->mm)
return;
/*
* NUMA fault statistics are unnecessary for the slow memory
* node in memory tiering mode.
*/
if (!node_is_toptier(mem_node) &&
(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
!cpupid_valid(last_cpupid)))
return;
/* Allocate buffer to track faults on a per-node basis */
if (unlikely(!p->numa_faults)) {
int size = sizeof(*p->numa_faults) *
@@ -2761,6 +2926,7 @@ static void task_numa_work(struct callback_head *work)
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
u64 runtime = p->se.sum_exec_runtime;
MA_STATE(mas, &mm->mm_mt, 0, 0);
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
@@ -2817,13 +2983,16 @@ static void task_numa_work(struct callback_head *work)
if (!mmap_read_trylock(mm))
return;
vma = find_vma(mm, start);
mas_set(&mas, start);
vma = mas_find(&mas, ULONG_MAX);
if (!vma) {
reset_ptenuma_scan(p);
start = 0;
vma = mm->mmap;
mas_set(&mas, start);
vma = mas_find(&mas, ULONG_MAX);
}
for (; vma; vma = vma->vm_next) {
for (; vma; vma = mas_find(&mas, ULONG_MAX)) {
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
continue;

View File

@@ -181,6 +181,7 @@ static void group_init(struct psi_group *group)
{
int cpu;
group->enabled = true;
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
group->avg_last_update = sched_clock();
@@ -201,6 +202,7 @@ void __init psi_init(void)
{
if (!psi_enable) {
static_branch_enable(&psi_disabled);
static_branch_disable(&psi_cgroups_enabled);
return;
}
@@ -211,7 +213,7 @@ void __init psi_init(void)
group_init(&psi_system);
}
static bool test_state(unsigned int *tasks, enum psi_states state)
static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
{
switch (state) {
case PSI_IO_SOME:
@@ -224,9 +226,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
return unlikely(tasks[NR_MEMSTALL] &&
tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
case PSI_CPU_SOME:
return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
return unlikely(tasks[NR_RUNNING] > oncpu);
case PSI_CPU_FULL:
return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
return unlikely(tasks[NR_RUNNING] && !oncpu);
case PSI_NONIDLE:
return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
tasks[NR_RUNNING];
@@ -688,35 +690,53 @@ static void psi_group_change(struct psi_group *group, int cpu,
bool wake_clock)
{
struct psi_group_cpu *groupc;
u32 state_mask = 0;
unsigned int t, m;
enum psi_states s;
u32 state_mask;
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
* First we assess the aggregate resource states this CPU's
* tasks have been in since the last change, and account any
* SOME and FULL time these may have resulted in.
*
* Then we update the task counts according to the state
* First we update the task counts according to the state
* change requested through the @clear and @set bits.
*
* Then if the cgroup PSI stats accounting enabled, we
* assess the aggregate resource states this CPU's tasks
* have been in since the last change, and account any
* SOME and FULL time these may have resulted in.
*/
write_seqcount_begin(&groupc->seq);
record_times(groupc, now);
/*
* Start with TSK_ONCPU, which doesn't have a corresponding
* task count - it's just a boolean flag directly encoded in
* the state mask. Clear, set, or carry the current state if
* no changes are requested.
*/
if (unlikely(clear & TSK_ONCPU)) {
state_mask = 0;
clear &= ~TSK_ONCPU;
} else if (unlikely(set & TSK_ONCPU)) {
state_mask = PSI_ONCPU;
set &= ~TSK_ONCPU;
} else {
state_mask = groupc->state_mask & PSI_ONCPU;
}
/*
* The rest of the state mask is calculated based on the task
* counts. Update those first, then construct the mask.
*/
for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
if (!(m & (1 << t)))
continue;
if (groupc->tasks[t]) {
groupc->tasks[t]--;
} else if (!psi_bug) {
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
groupc->tasks[3], groupc->tasks[4],
clear, set);
groupc->tasks[3], clear, set);
psi_bug = 1;
}
}
@@ -725,9 +745,25 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (set & (1 << t))
groupc->tasks[t]++;
/* Calculate state mask representing active states */
if (!group->enabled) {
/*
* On the first group change after disabling PSI, conclude
* the current state and flush its time. This is unlikely
* to matter to the user, but aggregation (get_recent_times)
* may have already incorporated the live state into times_prev;
* avoid a delta sample underflow when PSI is later re-enabled.
*/
if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
record_times(groupc, now);
groupc->state_mask = state_mask;
write_seqcount_end(&groupc->seq);
return;
}
for (s = 0; s < NR_PSI_STATES; s++) {
if (test_state(groupc->tasks, s))
if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
state_mask |= (1 << s);
}
@@ -739,9 +775,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
* task in a cgroup is in_memstall, the corresponding groupc
* on that cpu is in PSI_MEM_FULL state.
*/
if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
state_mask |= (1 << PSI_MEM_FULL);
record_times(groupc, now);
groupc->state_mask = state_mask;
write_seqcount_end(&groupc->seq);
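/*
 * Illustrative reader sketch: groupc->seq is a seqcount, so consumers
 * such as get_recent_times() retry until they observe a consistent
 * snapshot of the per-CPU state written above, roughly:
 *
 *   do {
 *           seq = read_seqcount_begin(&groupc->seq);
 *           memcpy(times, groupc->times, sizeof(times));
 *   } while (read_seqcount_retry(&groupc->seq, seq));
 */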
@@ -753,27 +791,12 @@ static void psi_group_change(struct psi_group *group, int cpu,
schedule_delayed_work(&group->avgs_work, PSI_FREQ);
}
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
static inline struct psi_group *task_psi_group(struct task_struct *task)
{
if (*iter == &psi_system)
return NULL;
#ifdef CONFIG_CGROUPS
if (static_branch_likely(&psi_cgroups_enabled)) {
struct cgroup *cgroup = NULL;
if (!*iter)
cgroup = task->cgroups->dfl_cgrp;
else
cgroup = cgroup_parent(*iter);
if (cgroup && cgroup_parent(cgroup)) {
*iter = cgroup;
return cgroup_psi(cgroup);
}
}
if (static_branch_likely(&psi_cgroups_enabled))
return cgroup_psi(task_dfl_cgroup(task));
#endif
*iter = &psi_system;
return &psi_system;
}
@@ -796,8 +819,6 @@ void psi_task_change(struct task_struct *task, int clear, int set)
{
int cpu = task_cpu(task);
struct psi_group *group;
bool wake_clock = true;
void *iter = NULL;
u64 now;
if (!task->pid)
@@ -806,19 +827,11 @@ void psi_task_change(struct task_struct *task, int clear, int set)
psi_flags_change(task, clear, set);
now = cpu_clock(cpu);
/*
* Periodic aggregation shuts off if there is a period of no
* task changes, so we wake it back up if necessary. However,
* don't do this if the task change is the aggregation worker
* itself going to sleep, or we'll ping-pong forever.
*/
if (unlikely((clear & TSK_RUNNING) &&
(task->flags & PF_WQ_WORKER) &&
wq_worker_last_func(task) == psi_avgs_work))
wake_clock = false;
while ((group = iterate_groups(task, &iter)))
psi_group_change(group, cpu, clear, set, now, wake_clock);
group = task_psi_group(task);
do {
psi_group_change(group, cpu, clear, set, now, true);
} while ((group = group->parent));
}
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -826,34 +839,30 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
{
struct psi_group *group, *common = NULL;
int cpu = task_cpu(prev);
void *iter;
u64 now = cpu_clock(cpu);
if (next->pid) {
bool identical_state;
psi_flags_change(next, 0, TSK_ONCPU);
/*
* When switching between tasks that have an identical
* runtime state, the cgroup that contains both tasks does
* not change: we can stop updating once we reach the first
* common ancestor. Iterate @next's ancestors only until we
* encounter @prev's ONCPU.
* Set TSK_ONCPU on @next's cgroups. If @next shares any
* ancestors with @prev, those will already have @prev's
* TSK_ONCPU bit set, and we can stop the iteration there.
*/
identical_state = prev->psi_flags == next->psi_flags;
iter = NULL;
while ((group = iterate_groups(next, &iter))) {
if (identical_state &&
per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
group = task_psi_group(next);
do {
if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
PSI_ONCPU) {
common = group;
break;
}
psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
}
} while ((group = group->parent));
}
if (prev->pid) {
int clear = TSK_ONCPU, set = 0;
bool wake_clock = true;
/*
* When we're going to sleep, psi_dequeue() lets us
@@ -867,26 +876,74 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
clear |= TSK_MEMSTALL_RUNNING;
if (prev->in_iowait)
set |= TSK_IOWAIT;
/*
* Periodic aggregation shuts off if there is a period of no
* task changes, so we wake it back up if necessary. However,
* don't do this if the task change is the aggregation worker
* itself going to sleep, or we'll ping-pong forever.
*/
if (unlikely((prev->flags & PF_WQ_WORKER) &&
wq_worker_last_func(prev) == psi_avgs_work))
wake_clock = false;
}
psi_flags_change(prev, clear, set);
iter = NULL;
while ((group = iterate_groups(prev, &iter)) && group != common)
psi_group_change(group, cpu, clear, set, now, true);
group = task_psi_group(prev);
do {
if (group == common)
break;
psi_group_change(group, cpu, clear, set, now, wake_clock);
} while ((group = group->parent));
/*
* TSK_ONCPU is handled up to the common ancestor. If we're tasked
* with dequeuing too, finish that for the rest of the hierarchy.
* TSK_ONCPU is handled up to the common ancestor. If there are
* any other differences between the two tasks (e.g. prev goes
* to sleep, or only one task is memstall), finish propagating
* those differences all the way up to the root.
*/
if (sleep) {
if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
clear &= ~TSK_ONCPU;
for (; group; group = iterate_groups(prev, &iter))
psi_group_change(group, cpu, clear, set, now, true);
for (; group; group = group->parent)
psi_group_change(group, cpu, clear, set, now, wake_clock);
}
}
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
void psi_account_irqtime(struct task_struct *task, u32 delta)
{
int cpu = task_cpu(task);
struct psi_group *group;
struct psi_group_cpu *groupc;
u64 now;
if (!task->pid)
return;
now = cpu_clock(cpu);
group = task_psi_group(task);
do {
if (!group->enabled)
continue;
groupc = per_cpu_ptr(group->pcpu, cpu);
write_seqcount_begin(&groupc->seq);
record_times(groupc, now);
groupc->times[PSI_IRQ_FULL] += delta;
write_seqcount_end(&groupc->seq);
if (group->poll_states & (1 << PSI_IRQ_FULL))
psi_schedule_poll_work(group, 1);
} while ((group = group->parent));
}
#endif
/**
* psi_memstall_enter - mark the beginning of a memory stall section
* @flags: flags to handle nested sections
@@ -952,7 +1009,7 @@ EXPORT_SYMBOL_GPL(psi_memstall_leave);
#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgroup)
{
if (static_branch_likely(&psi_disabled))
if (!static_branch_likely(&psi_cgroups_enabled))
return 0;
cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
@@ -965,12 +1022,13 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
return -ENOMEM;
}
group_init(cgroup->psi);
cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
return 0;
}
void psi_cgroup_free(struct cgroup *cgroup)
{
if (static_branch_likely(&psi_disabled))
if (!static_branch_likely(&psi_cgroups_enabled))
return;
cancel_delayed_work_sync(&cgroup->psi->avgs_work);
@@ -998,7 +1056,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
struct rq_flags rf;
struct rq *rq;
if (static_branch_likely(&psi_disabled)) {
if (!static_branch_likely(&psi_cgroups_enabled)) {
/*
* Lame to do this here, but the scheduler cannot be locked
* from the outside, so we move cgroups from inside sched/.
@@ -1046,10 +1104,45 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
task_rq_unlock(rq, task, &rf);
}
void psi_cgroup_restart(struct psi_group *group)
{
int cpu;
/*
* After we disable psi_group->enabled, we don't actually stop
* per-CPU task accounting in each psi_group_cpu; we only stop the
* test_state() loop, record_times() and the averaging worker, see
* psi_group_change() for details.
*
* When disabling cgroup PSI, this function has nothing to sync
* since cgroup pressure files are hidden and the per-CPU
* psi_group_cpu would see !psi_group->enabled and only do task
* accounting.
*
* When re-enabling cgroup PSI, this function uses psi_group_change()
* to get the correct state mask from the test_state() loop on
* tasks[], and restarts groupc->state_start from now, using
* .clear = .set = 0 since no task state really changed.
*/
if (!group->enabled)
return;
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
u64 now;
rq_lock_irq(rq, &rf);
now = cpu_clock(cpu);
psi_group_change(group, cpu, 0, 0, now, true);
rq_unlock_irq(rq, &rf);
}
}
#endif /* CONFIG_CGROUPS */
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{
bool only_full = false;
int full;
u64 now;
@@ -1064,7 +1157,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
group->avg_next_update = update_averages(group, now);
mutex_unlock(&group->avgs_lock);
for (full = 0; full < 2; full++) {
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
only_full = res == PSI_IRQ;
#endif
for (full = 0; full < 2 - only_full; full++) {
unsigned long avg[3] = { 0, };
u64 total = 0;
int w;
@@ -1078,7 +1175,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
}
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
full ? "full" : "some",
full || only_full ? "full" : "some",
LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
@@ -1106,6 +1203,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
else
return ERR_PTR(-EINVAL);
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
return ERR_PTR(-EINVAL);
#endif
if (state >= PSI_NONIDLE)
return ERR_PTR(-EINVAL);
@@ -1390,6 +1492,33 @@ static const struct proc_ops psi_cpu_proc_ops = {
.proc_release = psi_fop_release,
};
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
static int psi_irq_show(struct seq_file *m, void *v)
{
return psi_show(m, &psi_system, PSI_IRQ);
}
static int psi_irq_open(struct inode *inode, struct file *file)
{
return psi_open(file, psi_irq_show);
}
static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
size_t nbytes, loff_t *ppos)
{
return psi_write(file, user_buf, nbytes, PSI_IRQ);
}
static const struct proc_ops psi_irq_proc_ops = {
.proc_open = psi_irq_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
.proc_write = psi_irq_write,
.proc_poll = psi_fop_poll,
.proc_release = psi_fop_release,
};
#endif
static int __init psi_proc_init(void)
{
if (psi_enable) {
@@ -1397,6 +1526,9 @@ static int __init psi_proc_init(void)
proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
#endif
}
return 0;
}
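/*
 * Illustrative example: with CONFIG_IRQ_TIME_ACCOUNTING, the new
 * /proc/pressure/irq file reports only the "full" line (see only_full
 * above), in the same format as the other pressure files. Sample
 * output (values hypothetical):
 *
 *   full avg10=0.00 avg60=0.12 avg300=0.05 total=1423843
 */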

View File

@@ -2446,6 +2446,7 @@ extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
extern unsigned int sysctl_numa_balancing_hot_threshold;
#endif
#ifdef CONFIG_SCHED_HRTICK

View File

@@ -107,6 +107,11 @@ __schedstats_from_se(struct sched_entity *se)
}
#ifdef CONFIG_PSI
void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
void psi_account_irqtime(struct task_struct *task, u32 delta);
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
@@ -201,6 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO

View File

@@ -1069,7 +1069,7 @@ static int __init nrcpus(char *str)
int nr_cpus;
if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids)
nr_cpu_ids = nr_cpus;
set_nr_cpu_ids(nr_cpus);
return 0;
}
@@ -1087,14 +1087,16 @@ static int __init maxcpus(char *str)
early_param("maxcpus", maxcpus);
#if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS)
/* Setup number of possible processor ids */
unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
#endif
/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
void __init setup_nr_cpu_ids(void)
{
nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1);
}
/* Called by boot processor to activate the rest. */

View File

@@ -433,7 +433,7 @@ bool cpu_wait_death(unsigned int cpu, int seconds)
/* The outgoing CPU will normally get done quite quickly. */
if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
goto update_state;
goto update_state_early;
udelay(5);
/* But if the outgoing CPU dawdles, wait increasingly long times. */
@@ -444,16 +444,17 @@ bool cpu_wait_death(unsigned int cpu, int seconds)
break;
sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
}
update_state:
update_state_early:
oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
update_state:
if (oldstate == CPU_DEAD) {
/* Outgoing CPU died normally, update state. */
smp_mb(); /* atomic_read() before update. */
atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
} else {
/* Outgoing CPU still hasn't died, set state accordingly. */
if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
oldstate, CPU_BROKEN) != oldstate)
if (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
&oldstate, CPU_BROKEN))
goto update_state;
ret = false;
}
@@ -475,14 +476,14 @@ bool cpu_report_death(void)
int newstate;
int cpu = smp_processor_id();
oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
do {
oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
if (oldstate != CPU_BROKEN)
newstate = CPU_DEAD;
else
newstate = CPU_DEAD_FROZEN;
} while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
oldstate, newstate) != oldstate);
} while (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
&oldstate, newstate));
return newstate == CPU_DEAD;
}

View File

@@ -9,9 +9,6 @@
#define KUNIT_PROC_READ 0
#define KUNIT_PROC_WRITE 1
static int i_zero;
static int i_one_hundred = 100;
/*
* Test that proc_dointvec will not try to use a NULL .data field even when the
* length is non-zero.
@@ -29,8 +26,8 @@ static void sysctl_test_api_dointvec_null_tbl_data(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &i_zero,
.extra2 = &i_one_hundred,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE_HUNDRED,
};
/*
* proc_dointvec expects a buffer in user space, so we allocate one. We
@@ -79,8 +76,8 @@ static void sysctl_test_api_dointvec_table_maxlen_unset(struct kunit *test)
.maxlen = 0,
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &i_zero,
.extra2 = &i_one_hundred,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE_HUNDRED,
};
void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int),
GFP_USER);
@@ -122,8 +119,8 @@ static void sysctl_test_api_dointvec_table_len_is_zero(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &i_zero,
.extra2 = &i_one_hundred,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE_HUNDRED,
};
void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int),
GFP_USER);
@@ -156,8 +153,8 @@ static void sysctl_test_api_dointvec_table_read_but_position_set(
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &i_zero,
.extra2 = &i_one_hundred,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE_HUNDRED,
};
void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int),
GFP_USER);
@@ -191,8 +188,8 @@ static void sysctl_test_dointvec_read_happy_single_positive(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &i_zero,
.extra2 = &i_one_hundred,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE_HUNDRED,
};
size_t len = 4;
loff_t pos = 0;
@@ -222,8 +219,8 @@ static void sysctl_test_dointvec_read_happy_single_negative(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &i_zero,
.extra2 = &i_one_hundred,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE_HUNDRED,
};
size_t len = 5;
loff_t pos = 0;
@@ -251,8 +248,8 @@ static void sysctl_test_dointvec_write_happy_single_positive(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &i_zero,
.extra2 = &i_one_hundred,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE_HUNDRED,
};
char input[] = "9";
size_t len = sizeof(input) - 1;
@@ -281,8 +278,8 @@ static void sysctl_test_dointvec_write_happy_single_negative(struct kunit *test)
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &i_zero,
.extra2 = &i_one_hundred,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE_HUNDRED,
};
char input[] = "-9";
size_t len = sizeof(input) - 1;
@@ -313,8 +310,8 @@ static void sysctl_test_api_dointvec_write_single_less_int_min(
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &i_zero,
.extra2 = &i_one_hundred,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE_HUNDRED,
};
size_t max_len = 32, len = max_len;
loff_t pos = 0;
@@ -351,8 +348,8 @@ static void sysctl_test_api_dointvec_write_single_greater_int_max(
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &i_zero,
.extra2 = &i_one_hundred,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE_HUNDRED,
};
size_t max_len = 32, len = max_len;
loff_t pos = 0;

View File

@@ -82,9 +82,16 @@
#include <linux/rtmutex.h>
#endif
/* shared constants to be used in various sysctls */
const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 };
EXPORT_SYMBOL(sysctl_vals);
const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX };
EXPORT_SYMBOL_GPL(sysctl_long_vals);
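/*
 * Illustrative note (assumed to match include/linux/sysctl.h): the
 * SYSCTL_* bounds used in the tables below are pointers into
 * sysctl_vals, e.g.:
 *
 *   #define SYSCTL_ZERO         ((void *)&sysctl_vals[0])
 *   #define SYSCTL_ONE_THOUSAND ((void *)&sysctl_vals[7])
 *
 * which lets .extra1/.extra2 share const storage instead of per-file
 * statics such as the removed max_extfrag_threshold.
 */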
#if defined(CONFIG_SYSCTL)
/* Constants used for minimum and maximum */
#ifdef CONFIG_PERF_EVENTS
static const int six_hundred_forty_kb = 640 * 1024;
@@ -129,11 +136,6 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
int sysctl_legacy_va_layout;
#endif
#ifdef CONFIG_COMPACTION
/* min_extfrag_threshold is SYSCTL_ZERO */;
static const int max_extfrag_threshold = 1000;
#endif
#endif /* CONFIG_SYSCTL */
/*
@@ -1052,9 +1054,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table,
return 0;
}
i = (unsigned long *) data;
min = (unsigned long *) table->extra1;
max = (unsigned long *) table->extra2;
i = data;
min = table->extra1;
max = table->extra2;
vleft = table->maxlen / sizeof(unsigned long);
left = *lenp;
@@ -1641,6 +1643,14 @@ static struct ctl_table kern_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_FOUR,
},
{
.procname = "numa_balancing_promote_rate_limit_MBps",
.data = &sysctl_numa_balancing_promote_rate_limit,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
#endif /* CONFIG_NUMA_BALANCING */
{
.procname = "panic",
@@ -2216,7 +2226,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = (void *)&max_extfrag_threshold,
.extra2 = SYSCTL_ONE_THOUSAND,
},
{
.procname = "compact_unevictable_allowed",

View File

@@ -47,12 +47,12 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
/* record the work call stack in order to print it in KASAN reports */
kasan_record_aux_stack(work);
head = READ_ONCE(task->task_works);
do {
head = READ_ONCE(task->task_works);
if (unlikely(head == &work_exited))
return -ESRCH;
work->next = head;
} while (cmpxchg(&task->task_works, head, work) != head);
} while (!try_cmpxchg(&task->task_works, &head, work));
switch (notify) {
case TWA_NONE:
@@ -100,10 +100,12 @@ task_work_cancel_match(struct task_struct *task,
* we raced with task_work_run(), *pprev == NULL/exited.
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
while ((work = READ_ONCE(*pprev))) {
if (!match(work, data))
work = READ_ONCE(*pprev);
while (work) {
if (!match(work, data)) {
pprev = &work->next;
else if (cmpxchg(pprev, work, work->next) == work)
work = READ_ONCE(*pprev);
} else if (try_cmpxchg(pprev, &work, work->next))
break;
}
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
@@ -151,16 +153,16 @@ void task_work_run(void)
* work->func() can do task_work_add(), do not set
* work_exited unless the list is empty.
*/
work = READ_ONCE(task->task_works);
do {
head = NULL;
work = READ_ONCE(task->task_works);
if (!work) {
if (task->flags & PF_EXITING)
head = &work_exited;
else
break;
}
} while (cmpxchg(&task->task_works, work, head) != work);
} while (!try_cmpxchg(&task->task_works, &work, head));
if (!work)
break;
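The try_cmpxchg() form is not just shorter: on failure it refreshes the expected-value argument in place, so the loop no longer needs its own READ_ONCE() on every iteration. A minimal sketch of the pattern (names here are illustrative, not from this file):

struct node { struct node *next; };

static void push(struct node **headp, struct node *new)
{
	struct node *head = READ_ONCE(*headp);

	do {
		new->next = head;
		/* on failure, try_cmpxchg() reloads "head" for us */
	} while (!try_cmpxchg(headp, &head, new));
}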

View File

@@ -310,7 +310,7 @@ static void clocksource_verify_choose_cpus(void)
* CPUs that are currently online.
*/
for (i = 1; i < n; i++) {
cpu = prandom_u32() % nr_cpu_ids;
cpu = prandom_u32_max(nr_cpu_ids);
cpu = cpumask_next(cpu - 1, cpu_online_mask);
if (cpu >= nr_cpu_ids)
cpu = cpumask_first(cpu_online_mask);
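prandom_u32_max() replaces the division implied by "prandom_u32() % n" with a cheaper multiply-and-shift that maps a 32-bit random value into [0, n). Roughly, assuming the helper in include/linux/prandom.h:

static inline u32 prandom_u32_max_sketch(u32 ep_ro)
{
	/* scale a uniform 32-bit value down into [0, ep_ro) */
	return (u32)(((u64)prandom_u32() * ep_ro) >> 32);
}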

View File
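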

@@ -1644,6 +1644,18 @@ ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_ex
static struct ftrace_ops *
ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops);
static bool skip_record(struct dyn_ftrace *rec)
{
/*
* At boot up, weak functions are set to disabled. Function tracing
* can be enabled before they are, and they still need to be disabled now.
* If the record is disabled, still continue if it is marked as already
* enabled (this is needed to keep the accounting working).
*/
return rec->flags & FTRACE_FL_DISABLED &&
!(rec->flags & FTRACE_FL_ENABLED);
}
static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
int filter_hash,
bool inc)
@@ -1693,7 +1705,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
int in_hash = 0;
int match = 0;
if (rec->flags & FTRACE_FL_DISABLED)
if (skip_record(rec))
continue;
if (all) {
@@ -2016,7 +2028,6 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
static void print_ip_ins(const char *fmt, const unsigned char *p)
{
char ins[MCOUNT_INSN_SIZE];
int i;
if (copy_from_kernel_nofault(ins, p, MCOUNT_INSN_SIZE)) {
printk(KERN_CONT "%s[FAULT] %px\n", fmt, p);
@@ -2024,9 +2035,7 @@ static void print_ip_ins(const char *fmt, const unsigned char *p)
}
printk(KERN_CONT "%s", fmt);
for (i = 0; i < MCOUNT_INSN_SIZE; i++)
printk(KERN_CONT "%s%02x", i ? ":" : "", ins[i]);
pr_cont("%*phC", MCOUNT_INSN_SIZE, ins);
}
enum ftrace_bug_type ftrace_bug_type;
@@ -2126,7 +2135,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
ftrace_bug_type = FTRACE_BUG_UNKNOWN;
if (rec->flags & FTRACE_FL_DISABLED)
if (skip_record(rec))
return FTRACE_UPDATE_IGNORE;
/*
@@ -2241,7 +2250,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
if (update) {
/* If there's no more users, clear all flags */
if (!ftrace_rec_count(rec))
rec->flags = 0;
rec->flags &= FTRACE_FL_DISABLED;
else
/*
* Just disable the record, but keep the ops TRAMP
@@ -2634,7 +2643,7 @@ void __weak ftrace_replace_code(int mod_flags)
do_for_each_ftrace_rec(pg, rec) {
if (rec->flags & FTRACE_FL_DISABLED)
if (skip_record(rec))
continue;
failed = __ftrace_replace_code(rec, enable);
@@ -5427,6 +5436,8 @@ static struct ftrace_ops stub_ops = {
* it is safe to modify the ftrace record, where it should be
* currently calling @old_addr directly, to call @new_addr.
*
* This is called with direct_mutex locked.
*
* Safety checks should be made to make sure that the code at
* @rec->ip is currently calling @old_addr. And this must
* also update entry->direct to @new_addr.
@@ -5439,6 +5450,8 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
unsigned long ip = rec->ip;
int ret;
lockdep_assert_held(&direct_mutex);
/*
* The ftrace_lock was used to determine if the record
* had more than one registered user to it. If it did,
@@ -5461,7 +5474,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
if (ret)
goto out_lock;
ret = register_ftrace_function(&stub_ops);
ret = register_ftrace_function_nolock(&stub_ops);
if (ret) {
ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
goto out_lock;
@@ -6081,8 +6094,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
if (filter_hash) {
orig_hash = &iter->ops->func_hash->filter_hash;
if (iter->tr && !list_empty(&iter->tr->mod_trace))
iter->hash->flags |= FTRACE_HASH_FL_MOD;
if (iter->tr) {
if (list_empty(&iter->tr->mod_trace))
iter->hash->flags &= ~FTRACE_HASH_FL_MOD;
else
iter->hash->flags |= FTRACE_HASH_FL_MOD;
}
} else
orig_hash = &iter->ops->func_hash->notrace_hash;

View File

@@ -35,6 +35,45 @@
static struct trace_event_file *gen_kprobe_test;
static struct trace_event_file *gen_kretprobe_test;
#define KPROBE_GEN_TEST_FUNC "do_sys_open"
/* X86 */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_32)
#define KPROBE_GEN_TEST_ARG0 "dfd=%ax"
#define KPROBE_GEN_TEST_ARG1 "filename=%dx"
#define KPROBE_GEN_TEST_ARG2 "flags=%cx"
#define KPROBE_GEN_TEST_ARG3 "mode=+4($stack)"
/* ARM64 */
#elif defined(CONFIG_ARM64)
#define KPROBE_GEN_TEST_ARG0 "dfd=%x0"
#define KPROBE_GEN_TEST_ARG1 "filename=%x1"
#define KPROBE_GEN_TEST_ARG2 "flags=%x2"
#define KPROBE_GEN_TEST_ARG3 "mode=%x3"
/* ARM */
#elif defined(CONFIG_ARM)
#define KPROBE_GEN_TEST_ARG0 "dfd=%r0"
#define KPROBE_GEN_TEST_ARG1 "filename=%r1"
#define KPROBE_GEN_TEST_ARG2 "flags=%r2"
#define KPROBE_GEN_TEST_ARG3 "mode=%r3"
/* RISCV */
#elif defined(CONFIG_RISCV)
#define KPROBE_GEN_TEST_ARG0 "dfd=%a0"
#define KPROBE_GEN_TEST_ARG1 "filename=%a1"
#define KPROBE_GEN_TEST_ARG2 "flags=%a2"
#define KPROBE_GEN_TEST_ARG3 "mode=%a3"
/* others */
#else
#define KPROBE_GEN_TEST_ARG0 NULL
#define KPROBE_GEN_TEST_ARG1 NULL
#define KPROBE_GEN_TEST_ARG2 NULL
#define KPROBE_GEN_TEST_ARG3 NULL
#endif
/*
* Test to make sure we can create a kprobe event, then add more
* fields.
@@ -58,14 +97,14 @@ static int __init test_gen_kprobe_cmd(void)
* fields.
*/
ret = kprobe_event_gen_cmd_start(&cmd, "gen_kprobe_test",
"do_sys_open",
"dfd=%ax", "filename=%dx");
KPROBE_GEN_TEST_FUNC,
KPROBE_GEN_TEST_ARG0, KPROBE_GEN_TEST_ARG1);
if (ret)
goto free;
/* Use kprobe_event_add_fields to add the rest of the fields */
ret = kprobe_event_add_fields(&cmd, "flags=%cx", "mode=+4($stack)");
ret = kprobe_event_add_fields(&cmd, KPROBE_GEN_TEST_ARG2, KPROBE_GEN_TEST_ARG3);
if (ret)
goto free;
@@ -128,7 +167,7 @@ static int __init test_gen_kretprobe_cmd(void)
* Define the kretprobe event.
*/
ret = kretprobe_event_gen_cmd_start(&cmd, "gen_kretprobe_test",
"do_sys_open",
KPROBE_GEN_TEST_FUNC,
"$retval");
if (ret)
goto free;
@@ -206,7 +245,7 @@ static void __exit kprobe_event_gen_test_exit(void)
WARN_ON(kprobe_event_delete("gen_kprobe_test"));
/* Disable the event or you can't remove it */
WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr,
WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr,
"kprobes",
"gen_kretprobe_test", false));

View File

@@ -413,6 +413,7 @@ struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
wait_queue_head_t full_waiters;
long wait_index;
bool waiters_pending;
bool full_waiters_pending;
bool wakeup_full;
@@ -884,7 +885,7 @@ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
}
/**
* ring_buffer_nr_pages_dirty - get the number of used pages in the ring buffer
* ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
* @cpu: The cpu of the ring_buffer to get the number of pages from
*
@@ -917,12 +918,44 @@ static void rb_wake_up_waiters(struct irq_work *work)
struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
wake_up_all(&rbwork->waiters);
if (rbwork->wakeup_full) {
if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
rbwork->wakeup_full = false;
rbwork->full_waiters_pending = false;
wake_up_all(&rbwork->full_waiters);
}
}
/**
* ring_buffer_wake_waiters - wake up any waiters on this ring buffer
* @buffer: The ring buffer to wake waiters on
*
* In the case that a file representing a ring buffer is closing,
* it is prudent to wake up any waiters that are on it.
*/
void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct rb_irq_work *rbwork;
if (cpu == RING_BUFFER_ALL_CPUS) {
/* Wake up individual ones too. One level recursion */
for_each_buffer_cpu(buffer, cpu)
ring_buffer_wake_waiters(buffer, cpu);
rbwork = &buffer->irq_work;
} else {
cpu_buffer = buffer->buffers[cpu];
rbwork = &cpu_buffer->irq_work;
}
rbwork->wait_index++;
/* make sure the waiters see the new index */
smp_wmb();
rb_wake_up_waiters(&rbwork->work);
}
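The wait_index increment and smp_wmb() here pair with the smp_rmb() added to ring_buffer_wait() below: a waiter snapshots wait_index before sleeping and bails out after waking if it changed. Schematically (a restatement of the code in this commit, not new logic):

/* waker (ring_buffer_wake_waiters)     waiter (ring_buffer_wait)
 * --------------------------------     -------------------------
 * rbwork->wait_index++;                wait_index = READ_ONCE(work->wait_index);
 * smp_wmb();                           schedule();
 * rb_wake_up_waiters(&rbwork->work);   smp_rmb();
 *                                      if (wait_index != work->wait_index)
 *                                              break;  /- forced wakeup -/
 */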
/**
* ring_buffer_wait - wait for input to the ring buffer
* @buffer: buffer to wait on
@@ -938,6 +971,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
struct rb_irq_work *work;
long wait_index;
int ret = 0;
/*
@@ -956,6 +990,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
work = &cpu_buffer->irq_work;
}
wait_index = READ_ONCE(work->wait_index);
while (true) {
if (full)
@@ -1011,7 +1046,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
nr_pages = cpu_buffer->nr_pages;
dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
if (!cpu_buffer->shortest_full ||
cpu_buffer->shortest_full < full)
cpu_buffer->shortest_full > full)
cpu_buffer->shortest_full = full;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
if (!pagebusy &&
@@ -1020,6 +1055,11 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
}
schedule();
/* Make sure to see the new wait index */
smp_rmb();
if (wait_index != work->wait_index)
break;
}
if (full)
@@ -2608,6 +2648,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* Mark the rest of the page with padding */
rb_event_set_padding(event);
/* Make sure the padding is visible before the write update */
smp_wmb();
/* Set the write back to the previous setting */
local_sub(length, &tail_page->write);
return;
@@ -2619,6 +2662,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* time delta must be non zero */
event->time_delta = 1;
/* Make sure the padding is visible before the tail_page->write update */
smp_wmb();
/* Set write to end of buffer */
length = (tail + length) - BUF_PAGE_SIZE;
local_sub(length, &tail_page->write);
@@ -4587,6 +4633,33 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
/*
* The writer has preemption disabled, so wait for it. But not forever:
* 1 second is pretty much "forever".
*/
#define USECS_WAIT 1000000
for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
/* If the write is past the end of page, a writer is still updating it */
if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
break;
udelay(1);
/* Get the latest version of the reader write value */
smp_rmb();
}
/* The writer is not moving forward? Something is wrong */
if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT))
reader = NULL;
/*
* Make sure we see any padding after the write update
* (see rb_reset_tail())
*/
smp_rmb();
return reader;
}
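The new smp_rmb() at the end of rb_get_reader_page() pairs with the smp_wmb() calls added to rb_reset_tail() above: the writer publishes the padding events before it moves the write pointer back, and the reader orders its load of the write pointer before it touches page contents. A schematic of the pairing (illustrative):

/* writer (rb_reset_tail)                 reader (rb_get_reader_page)
 * ----------------------                 ---------------------------
 * rb_event_set_padding(event);   (A)     load tail_page->write       (B)
 * smp_wmb();                             smp_rmb();
 * local_sub(length, &write);     (B)     read padding events         (A)
 */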
@@ -5232,7 +5305,7 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
/**
* ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
* ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer
* @buffer: The ring buffer to reset a per cpu buffer of
* @cpu: The CPU buffer to be reset
*/
@@ -5302,7 +5375,7 @@ void ring_buffer_reset(struct trace_buffer *buffer)
EXPORT_SYMBOL_GPL(ring_buffer_reset);
/**
* rind_buffer_empty - is the ring buffer empty?
* ring_buffer_empty - is the ring buffer empty?
* @buffer: The ring buffer to test
*/
bool ring_buffer_empty(struct trace_buffer *buffer)
@@ -5616,7 +5689,15 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
unsigned int pos = 0;
unsigned int size;
if (full)
/*
* If a full page is expected, this can still be returned
* if there's been a previous partial read, the rest of the
* page can be read, and the commit page is off the reader
* page.
*/
if (full &&
(!read || (len < (commit - read)) ||
cpu_buffer->reader_page == cpu_buffer->commit_page))
goto out_unlock;
if (len > (commit - read))

View File

@@ -16,7 +16,7 @@
#include "wip.h"
struct rv_monitor rv_wip;
static struct rv_monitor rv_wip;
DECLARE_DA_MON_PER_CPU(wip, unsigned char);
static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
@@ -60,7 +60,7 @@ static void disable_wip(void)
da_monitor_destroy_wip();
}
struct rv_monitor rv_wip = {
static struct rv_monitor rv_wip = {
.name = "wip",
.description = "wakeup in preemptive per-cpu testing monitor.",
.enable = enable_wip,
@@ -69,13 +69,13 @@ struct rv_monitor rv_wip = {
.enabled = 0,
};
static int register_wip(void)
static int __init register_wip(void)
{
rv_register_monitor(&rv_wip);
return 0;
}
static void unregister_wip(void)
static void __exit unregister_wip(void)
{
rv_unregister_monitor(&rv_wip);
}

View File

@@ -15,7 +15,7 @@
#include "wwnr.h"
struct rv_monitor rv_wwnr;
static struct rv_monitor rv_wwnr;
DECLARE_DA_MON_PER_TASK(wwnr, unsigned char);
static void handle_switch(void *data, bool preempt, struct task_struct *p,
@@ -59,7 +59,7 @@ static void disable_wwnr(void)
da_monitor_destroy_wwnr();
}
struct rv_monitor rv_wwnr = {
static struct rv_monitor rv_wwnr = {
.name = "wwnr",
.description = "wakeup while not running per-task testing model.",
.enable = enable_wwnr,
@@ -68,13 +68,13 @@ struct rv_monitor rv_wwnr = {
.enabled = 0,
};
static int register_wwnr(void)
static int __init register_wwnr(void)
{
rv_register_monitor(&rv_wwnr);
return 0;
}
static void unregister_wwnr(void)
static void __exit unregister_wwnr(void)
{
rv_unregister_monitor(&rv_wwnr);
}

View File

@@ -1193,12 +1193,14 @@ void *tracing_cond_snapshot_data(struct trace_array *tr)
{
void *cond_data = NULL;
local_irq_disable();
arch_spin_lock(&tr->max_lock);
if (tr->cond_snapshot)
cond_data = tr->cond_snapshot->cond_data;
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
return cond_data;
}
@@ -1334,9 +1336,11 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
goto fail_unlock;
}
local_irq_disable();
arch_spin_lock(&tr->max_lock);
tr->cond_snapshot = cond_snapshot;
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
mutex_unlock(&trace_types_lock);
@@ -1363,6 +1367,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
{
int ret = 0;
local_irq_disable();
arch_spin_lock(&tr->max_lock);
if (!tr->cond_snapshot)
@@ -1373,6 +1378,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
}
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
return ret;
}
@@ -2200,6 +2206,11 @@ static size_t tgid_map_max;
#define SAVED_CMDLINES_DEFAULT 128
#define NO_CMDLINE_MAP UINT_MAX
/*
* Preemption must be disabled before acquiring trace_cmdline_lock.
* The various trace_arrays' max_lock must be acquired in a context
* where interrupts are disabled.
*/
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
struct saved_cmdlines_buffer {
unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
@@ -2412,7 +2423,11 @@ static int trace_save_cmdline(struct task_struct *tsk)
* the lock, but we also don't want to spin
* nor do we want to disable interrupts,
* so if we miss here, then better luck next time.
*
* This is called from within the scheduler and from wake-ups, so
* interrupts had better be disabled and the run queue lock held.
*/
lockdep_assert_preemption_disabled();
if (!arch_spin_trylock(&trace_cmdline_lock))
return 0;
@@ -5890,9 +5905,11 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
char buf[64];
int r;
preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
arch_spin_unlock(&trace_cmdline_lock);
preempt_enable();
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
@@ -5917,10 +5934,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val)
return -ENOMEM;
}
preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
savedcmd_temp = savedcmd;
savedcmd = s;
arch_spin_unlock(&trace_cmdline_lock);
preempt_enable();
free_saved_cmdlines_buffer(savedcmd_temp);
return 0;
@@ -6373,10 +6392,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
#ifdef CONFIG_TRACER_SNAPSHOT
if (t->use_max_tr) {
local_irq_disable();
arch_spin_lock(&tr->max_lock);
if (tr->cond_snapshot)
ret = -EBUSY;
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
if (ret)
goto out;
}
@@ -6407,12 +6428,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (tr->current_trace->reset)
tr->current_trace->reset(tr);
#ifdef CONFIG_TRACER_MAX_TRACE
had_max_tr = tr->current_trace->use_max_tr;
/* Current trace needs to be nop_trace before synchronize_rcu */
tr->current_trace = &nop_trace;
#ifdef CONFIG_TRACER_MAX_TRACE
had_max_tr = tr->allocated_snapshot;
if (had_max_tr && !t->use_max_tr) {
/*
* We need to make sure that the update_max_tr sees that
@@ -6425,11 +6446,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
free_snapshot(tr);
}
if (t->use_max_tr && !had_max_tr) {
if (t->use_max_tr && !tr->allocated_snapshot) {
ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
goto out;
}
#else
tr->current_trace = &nop_trace;
#endif
if (t->init) {
@@ -7436,10 +7459,12 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
goto out;
}
local_irq_disable();
arch_spin_lock(&tr->max_lock);
if (tr->cond_snapshot)
ret = -EBUSY;
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
if (ret)
goto out;
@@ -8137,6 +8162,12 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
__trace_array_put(iter->tr);
iter->wait_index++;
/* Make sure the waiters see the new wait_index */
smp_wmb();
ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
if (info->spare)
ring_buffer_free_read_page(iter->array_buffer->buffer,
info->spare_cpu, info->spare);
@@ -8290,6 +8321,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
long wait_index;
if (ret)
goto out;
@@ -8297,10 +8330,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
goto out;
wait_index = READ_ONCE(iter->wait_index);
ret = wait_on_pipe(iter, iter->tr->buffer_percent);
if (ret)
goto out;
/* No need to wait after waking up when tracing is off */
if (!tracer_tracing_is_on(iter->tr))
goto out;
/* Make sure we see the new wait_index */
smp_rmb();
if (wait_index != iter->wait_index)
goto out;
goto again;
}
@@ -8311,12 +8355,34 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return ret;
}
/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
if (cmd)
return -ENOIOCTLCMD;
mutex_lock(&trace_types_lock);
iter->wait_index++;
/* Make sure the waiters see the new wait_index */
smp_wmb();
ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
mutex_unlock(&trace_types_lock);
return 0;
}
static const struct file_operations tracing_buffers_fops = {
.open = tracing_buffers_open,
.read = tracing_buffers_read,
.poll = tracing_buffers_poll,
.release = tracing_buffers_release,
.splice_read = tracing_buffers_splice_read,
.unlocked_ioctl = tracing_buffers_ioctl,
.llseek = no_llseek,
};
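Assuming the standard tracefs layout, a user program could use this to unblock readers stuck on a trace_pipe_raw file. A hypothetical sketch (the path and function name are illustrative, not part of this commit):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Nudge readers blocked on a trace_pipe_raw file; only cmd 0 is
 * accepted, anything else fails with ENOTTY in userspace. */
static int wake_trace_readers(const char *path)
{
	int fd = open(path, O_RDONLY);
	int ret;

	if (fd < 0)
		return -1;
	ret = ioctl(fd, 0);
	close(fd);
	return ret;
}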
@@ -9005,6 +9071,8 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
tracer_tracing_off(tr);
if (tr->current_trace->stop)
tr->current_trace->stop(tr);
/* Wake up any waiters */
ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS);
}
mutex_unlock(&trace_types_lock);
}
@@ -10091,7 +10159,7 @@ __init static int tracer_alloc_buffers(void)
* buffer. The memory will be removed once the "instance" is removed.
*/
ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE,
"trace/RB:preapre", trace_rb_cpu_prepare,
"trace/RB:prepare", trace_rb_cpu_prepare,
NULL);
if (ret < 0)
goto out_free_cpumask;

View File

@@ -1435,8 +1435,6 @@ event_trigger_unlock_commit(struct trace_event_file *file,
struct filter_pred;
struct regex;
typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
typedef int (*regex_match_func)(char *str, struct regex *r, int len);
enum regex_type {
@@ -1455,17 +1453,6 @@ struct regex {
regex_match_func match;
};
struct filter_pred {
filter_pred_fn_t fn;
u64 val;
struct regex regex;
unsigned short *ops;
struct ftrace_event_field *field;
int offset;
int not;
int op;
};
static inline bool is_string_field(struct ftrace_event_field *field)
{
return field->filter_type == FILTER_DYN_STRING ||

View File

@@ -51,7 +51,7 @@ static void trace_do_benchmark(void)
local_irq_disable();
start = trace_clock_local();
trace_benchmark_event(bm_str);
trace_benchmark_event(bm_str, bm_last);
stop = trace_clock_local();
local_irq_enable();

View File

@@ -14,19 +14,21 @@ extern void trace_benchmark_unreg(void);
TRACE_EVENT_FN(benchmark_event,
TP_PROTO(const char *str),
TP_PROTO(const char *str, u64 delta),
TP_ARGS(str),
TP_ARGS(str, delta),
TP_STRUCT__entry(
__array( char, str, BENCHMARK_EVENT_STRLEN )
__field( u64, delta)
),
TP_fast_assign(
memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN);
__entry->delta = delta;
),
TP_printk("%s", __entry->str),
TP_printk("%s delta=%llu", __entry->str, __entry->delta),
trace_benchmark_reg, trace_benchmark_unreg
);

View File

@@ -16,6 +16,7 @@
#include "trace_dynevent.h"
#include "trace_probe.h"
#include "trace_probe_tmpl.h"
#include "trace_probe_kernel.h"
#define EPROBE_EVENT_SYSTEM "eprobes"
@@ -26,6 +27,9 @@ struct trace_eprobe {
/* tracepoint event */
const char *event_name;
/* filter string for the tracepoint */
char *filter_str;
struct trace_event_call *event;
struct dyn_event devent;
@@ -453,29 +457,14 @@ NOKPROBE_SYMBOL(process_fetch_insn)
static nokprobe_inline int
fetch_store_strlen_user(unsigned long addr)
{
const void __user *uaddr = (__force const void __user *)addr;
return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
return kern_fetch_store_strlen_user(addr);
}
/* Return the length of string -- including null terminal byte */
static nokprobe_inline int
fetch_store_strlen(unsigned long addr)
{
int ret, len = 0;
u8 c;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
if (addr < TASK_SIZE)
return fetch_store_strlen_user(addr);
#endif
do {
ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
len++;
} while (c && ret == 0 && len < MAX_STRING_SIZE);
return (ret < 0) ? ret : len;
return kern_fetch_store_strlen(addr);
}
/*
@@ -485,21 +474,7 @@ fetch_store_strlen(unsigned long addr)
static nokprobe_inline int
fetch_store_string_user(unsigned long addr, void *dest, void *base)
{
const void __user *uaddr = (__force const void __user *)addr;
int maxlen = get_loc_len(*(u32 *)dest);
void *__dest;
long ret;
if (unlikely(!maxlen))
return -ENOMEM;
__dest = get_loc_data(dest, base);
ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
if (ret >= 0)
*(u32 *)dest = make_data_loc(ret, __dest - base);
return ret;
return kern_fetch_store_string_user(addr, dest, base);
}
/*
@@ -509,29 +484,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base)
static nokprobe_inline int
fetch_store_string(unsigned long addr, void *dest, void *base)
{
int maxlen = get_loc_len(*(u32 *)dest);
void *__dest;
long ret;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
if ((unsigned long)addr < TASK_SIZE)
return fetch_store_string_user(addr, dest, base);
#endif
if (unlikely(!maxlen))
return -ENOMEM;
__dest = get_loc_data(dest, base);
/*
* Try to get string again, since the string can be changed while
* probing.
*/
ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
if (ret >= 0)
*(u32 *)dest = make_data_loc(ret, __dest - base);
return ret;
return kern_fetch_store_string(addr, dest, base);
}
static nokprobe_inline int
@@ -664,14 +617,15 @@ static struct event_trigger_data *
new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
{
struct event_trigger_data *trigger;
struct event_filter *filter = NULL;
struct eprobe_data *edata;
int ret;
edata = kzalloc(sizeof(*edata), GFP_KERNEL);
trigger = kzalloc(sizeof(*trigger), GFP_KERNEL);
if (!trigger || !edata) {
kfree(edata);
kfree(trigger);
return ERR_PTR(-ENOMEM);
ret = -ENOMEM;
goto error;
}
trigger->flags = EVENT_TRIGGER_FL_PROBE;
@@ -686,13 +640,25 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
trigger->cmd_ops = &event_trigger_cmd;
INIT_LIST_HEAD(&trigger->list);
RCU_INIT_POINTER(trigger->filter, NULL);
if (ep->filter_str) {
ret = create_event_filter(file->tr, file->event_call,
ep->filter_str, false, &filter);
if (ret)
goto error;
}
RCU_INIT_POINTER(trigger->filter, filter);
edata->file = file;
edata->ep = ep;
trigger->private_data = edata;
return trigger;
error:
free_event_filter(filter);
kfree(edata);
kfree(trigger);
return ERR_PTR(ret);
}
static int enable_eprobe(struct trace_eprobe *ep,
@@ -726,6 +692,7 @@ static int disable_eprobe(struct trace_eprobe *ep,
{
struct event_trigger_data *trigger = NULL, *iter;
struct trace_event_file *file;
struct event_filter *filter;
struct eprobe_data *edata;
file = find_event_file(tr, ep->event_system, ep->event_name);
@@ -752,6 +719,10 @@ static int disable_eprobe(struct trace_eprobe *ep,
/* Make sure nothing is using the edata or trigger */
tracepoint_synchronize_unregister();
filter = rcu_access_pointer(trigger->filter);
if (filter)
free_event_filter(filter);
kfree(edata);
kfree(trigger);
@@ -927,12 +898,62 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[
return ret;
}
static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[])
{
struct event_filter *dummy;
int i, ret, len = 0;
char *p;
if (argc == 0) {
trace_probe_log_err(0, NO_EP_FILTER);
return -EINVAL;
}
/* Recover the filter string */
for (i = 0; i < argc; i++)
len += strlen(argv[i]) + 1;
ep->filter_str = kzalloc(len, GFP_KERNEL);
if (!ep->filter_str)
return -ENOMEM;
p = ep->filter_str;
for (i = 0; i < argc; i++) {
ret = snprintf(p, len, "%s ", argv[i]);
if (ret < 0)
goto error;
if (ret > len) {
ret = -E2BIG;
goto error;
}
p += ret;
len -= ret;
}
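/* replace the trailing space from the last snprintf() with a NUL */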
p[-1] = '\0';
/*
* Ensure the filter string can be parsed correctly. Note, this
* filter string is for the original event, not for the eprobe.
*/
ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str,
true, &dummy);
free_event_filter(dummy);
if (ret)
goto error;
return 0;
error:
kfree(ep->filter_str);
ep->filter_str = NULL;
return ret;
}
static int __trace_eprobe_create(int argc, const char *argv[])
{
/*
* Argument syntax:
* e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS]
* Fetch args:
* e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] [if FILTER]
* Fetch args (no space):
* <name>=$<field>[:TYPE]
*/
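With the new [if FILTER] suffix, an event probe can be gated on fields of the attached event. A hypothetical example of the syntax (event and field names are illustrative):

echo 'e:egroup/runtime_4core sched/sched_stat_runtime runtime=$runtime if cpu < 4' >> /sys/kernel/tracing/dynamic_events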
const char *event = NULL, *group = EPROBE_EVENT_SYSTEM;
@@ -942,8 +963,8 @@ static int __trace_eprobe_create(int argc, const char *argv[])
char buf1[MAX_EVENT_NAME_LEN];
char buf2[MAX_EVENT_NAME_LEN];
char gbuf[MAX_EVENT_NAME_LEN];
int ret = 0;
int i;
int ret = 0, filter_idx = 0;
int i, filter_cnt;
if (argc < 2 || argv[0][0] != 'e')
return -ECANCELED;
@@ -968,11 +989,19 @@ static int __trace_eprobe_create(int argc, const char *argv[])
}
if (!event) {
strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN);
sanitize_event_name(buf1);
strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN);
event = buf1;
}
for (i = 2; i < argc; i++) {
if (!strcmp(argv[i], "if")) {
filter_idx = i + 1;
filter_cnt = argc - filter_idx;
argc = i;
break;
}
}
mutex_lock(&event_mutex);
event_call = find_and_get_event(sys_name, sys_event);
ep = alloc_event_probe(group, event, event_call, argc - 2);
@@ -988,6 +1017,14 @@ static int __trace_eprobe_create(int argc, const char *argv[])
goto error;
}
if (filter_idx) {
trace_probe_log_set_index(filter_idx);
ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx);
if (ret)
goto parse_error;
} else
ep->filter_str = NULL;
argc -= 2; argv += 2;
/* parse arguments */
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {

View File

@@ -43,6 +43,42 @@ enum filter_op_ids { OPS };
static const char * ops[] = { OPS };
enum filter_pred_fn {
FILTER_PRED_FN_NOP,
FILTER_PRED_FN_64,
FILTER_PRED_FN_S64,
FILTER_PRED_FN_U64,
FILTER_PRED_FN_32,
FILTER_PRED_FN_S32,
FILTER_PRED_FN_U32,
FILTER_PRED_FN_16,
FILTER_PRED_FN_S16,
FILTER_PRED_FN_U16,
FILTER_PRED_FN_8,
FILTER_PRED_FN_S8,
FILTER_PRED_FN_U8,
FILTER_PRED_FN_COMM,
FILTER_PRED_FN_STRING,
FILTER_PRED_FN_STRLOC,
FILTER_PRED_FN_STRRELLOC,
FILTER_PRED_FN_PCHAR_USER,
FILTER_PRED_FN_PCHAR,
FILTER_PRED_FN_CPU,
FILTER_PRED_FN_,
FILTER_PRED_TEST_VISITED,
};
struct filter_pred {
enum filter_pred_fn fn_num;
u64 val;
struct regex regex;
unsigned short *ops;
struct ftrace_event_field *field;
int offset;
int not;
int op;
};
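The conversion from a stored function pointer (the old pred->fn) to an enum plus a direct switch trades an indirect call, which is costly under retpolines, for a predictable direct branch. A minimal sketch of the pattern, with illustrative names:

enum pred_fn { PRED_FN_NOP, PRED_FN_EQ };

struct pred { enum pred_fn fn_num; long val; };

static int pred_call(struct pred *p, long rec)
{
	switch (p->fn_num) {
	case PRED_FN_EQ:
		return rec == p->val;
	default:		/* PRED_FN_NOP */
		return 0;
	}
}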
/*
* pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND
* pred_funcs_##type below must match the order of them above.
@@ -590,45 +626,49 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
return ERR_PTR(ret);
}
#define DEFINE_COMPARISON_PRED(type) \
static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
return *addr < val; \
} \
static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
return *addr <= val; \
} \
static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
return *addr > val; \
} \
static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
return *addr >= val; \
} \
static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
return !!(*addr & val); \
} \
static const filter_pred_fn_t pred_funcs_##type[] = { \
filter_pred_LE_##type, \
filter_pred_LT_##type, \
filter_pred_GE_##type, \
filter_pred_GT_##type, \
filter_pred_BAND_##type, \
enum pred_cmp_types {
PRED_CMP_TYPE_NOP,
PRED_CMP_TYPE_LT,
PRED_CMP_TYPE_LE,
PRED_CMP_TYPE_GT,
PRED_CMP_TYPE_GE,
PRED_CMP_TYPE_BAND,
};
#define DEFINE_COMPARISON_PRED(type) \
static int filter_pred_##type(struct filter_pred *pred, void *event) \
{ \
switch (pred->op) { \
case OP_LT: { \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
return *addr < val; \
} \
case OP_LE: { \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
return *addr <= val; \
} \
case OP_GT: { \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
return *addr > val; \
} \
case OP_GE: { \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
return *addr >= val; \
} \
case OP_BAND: { \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
return !!(*addr & val); \
} \
default: \
return 0; \
} \
}
#define DEFINE_EQUALITY_PRED(size) \
static int filter_pred_##size(struct filter_pred *pred, void *event) \
{ \
@@ -836,11 +876,6 @@ static int filter_pred_comm(struct filter_pred *pred, void *event)
return cmp ^ pred->not;
}
static int filter_pred_none(struct filter_pred *pred, void *event)
{
return 0;
}
/*
* regex_match_foo - Basic regex callbacks
*
@@ -986,6 +1021,19 @@ static void filter_build_regex(struct filter_pred *pred)
}
}
#ifdef CONFIG_FTRACE_STARTUP_TEST
static int test_pred_visited_fn(struct filter_pred *pred, void *event);
#else
static int test_pred_visited_fn(struct filter_pred *pred, void *event)
{
return 0;
}
#endif
static int filter_pred_fn_call(struct filter_pred *pred, void *event);
/* return 1 if event matches, 0 otherwise (discard) */
int filter_match_preds(struct event_filter *filter, void *rec)
{
@@ -1003,7 +1051,7 @@ int filter_match_preds(struct event_filter *filter, void *rec)
for (i = 0; prog[i].pred; i++) {
struct filter_pred *pred = prog[i].pred;
int match = pred->fn(pred, rec);
int match = filter_pred_fn_call(pred, rec);
if (match == prog[i].when_to_branch)
i = prog[i].target;
}
@@ -1189,10 +1237,10 @@ int filter_assign_type(const char *type)
return FILTER_OTHER;
}
static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
int field_size, int field_is_signed)
static enum filter_pred_fn select_comparison_fn(enum filter_op_ids op,
int field_size, int field_is_signed)
{
filter_pred_fn_t fn = NULL;
enum filter_pred_fn fn = FILTER_PRED_FN_NOP;
int pred_func_index = -1;
switch (op) {
@@ -1201,50 +1249,99 @@ static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
break;
default:
if (WARN_ON_ONCE(op < PRED_FUNC_START))
return NULL;
return fn;
pred_func_index = op - PRED_FUNC_START;
if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX))
return NULL;
return fn;
}
switch (field_size) {
case 8:
if (pred_func_index < 0)
fn = filter_pred_64;
fn = FILTER_PRED_FN_64;
else if (field_is_signed)
fn = pred_funcs_s64[pred_func_index];
fn = FILTER_PRED_FN_S64;
else
fn = pred_funcs_u64[pred_func_index];
fn = FILTER_PRED_FN_U64;
break;
case 4:
if (pred_func_index < 0)
fn = filter_pred_32;
fn = FILTER_PRED_FN_32;
else if (field_is_signed)
fn = pred_funcs_s32[pred_func_index];
fn = FILTER_PRED_FN_S32;
else
fn = pred_funcs_u32[pred_func_index];
fn = FILTER_PRED_FN_U32;
break;
case 2:
if (pred_func_index < 0)
fn = filter_pred_16;
fn = FILTER_PRED_FN_16;
else if (field_is_signed)
fn = pred_funcs_s16[pred_func_index];
fn = FILTER_PRED_FN_S16;
else
fn = pred_funcs_u16[pred_func_index];
fn = FILTER_PRED_FN_U16;
break;
case 1:
if (pred_func_index < 0)
fn = filter_pred_8;
fn = FILTER_PRED_FN_8;
else if (field_is_signed)
fn = pred_funcs_s8[pred_func_index];
fn = FILTER_PRED_FN_S8;
else
fn = pred_funcs_u8[pred_func_index];
fn = FILTER_PRED_FN_U8;
break;
}
return fn;
}
static int filter_pred_fn_call(struct filter_pred *pred, void *event)
{
switch (pred->fn_num) {
case FILTER_PRED_FN_64:
return filter_pred_64(pred, event);
case FILTER_PRED_FN_S64:
return filter_pred_s64(pred, event);
case FILTER_PRED_FN_U64:
return filter_pred_u64(pred, event);
case FILTER_PRED_FN_32:
return filter_pred_32(pred, event);
case FILTER_PRED_FN_S32:
return filter_pred_s32(pred, event);
case FILTER_PRED_FN_U32:
return filter_pred_u32(pred, event);
case FILTER_PRED_FN_16:
return filter_pred_16(pred, event);
case FILTER_PRED_FN_S16:
return filter_pred_s16(pred, event);
case FILTER_PRED_FN_U16:
return filter_pred_u16(pred, event);
case FILTER_PRED_FN_8:
return filter_pred_8(pred, event);
case FILTER_PRED_FN_S8:
return filter_pred_s8(pred, event);
case FILTER_PRED_FN_U8:
return filter_pred_u8(pred, event);
case FILTER_PRED_FN_COMM:
return filter_pred_comm(pred, event);
case FILTER_PRED_FN_STRING:
return filter_pred_string(pred, event);
case FILTER_PRED_FN_STRLOC:
return filter_pred_strloc(pred, event);
case FILTER_PRED_FN_STRRELLOC:
return filter_pred_strrelloc(pred, event);
case FILTER_PRED_FN_PCHAR_USER:
return filter_pred_pchar_user(pred, event);
case FILTER_PRED_FN_PCHAR:
return filter_pred_pchar(pred, event);
case FILTER_PRED_FN_CPU:
return filter_pred_cpu(pred, event);
case FILTER_PRED_TEST_VISITED:
return test_pred_visited_fn(pred, event);
default:
return 0;
}
}
/* Called when a predicate is encountered by predicate_parse() */
static int parse_pred(const char *str, void *data,
int pos, struct filter_parse_error *pe,
@@ -1338,7 +1435,7 @@ static int parse_pred(const char *str, void *data,
parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i);
goto err_free;
}
pred->fn = filter_pred_none;
pred->fn_num = FILTER_PRED_FN_NOP;
/*
* Quotes are not required, but if they exist then we need
@@ -1416,16 +1513,16 @@ static int parse_pred(const char *str, void *data,
filter_build_regex(pred);
if (field->filter_type == FILTER_COMM) {
pred->fn = filter_pred_comm;
pred->fn_num = FILTER_PRED_FN_COMM;
} else if (field->filter_type == FILTER_STATIC_STRING) {
pred->fn = filter_pred_string;
pred->fn_num = FILTER_PRED_FN_STRING;
pred->regex.field_len = field->size;
} else if (field->filter_type == FILTER_DYN_STRING) {
pred->fn = filter_pred_strloc;
pred->fn_num = FILTER_PRED_FN_STRLOC;
} else if (field->filter_type == FILTER_RDYN_STRING)
pred->fn = filter_pred_strrelloc;
pred->fn_num = FILTER_PRED_FN_STRRELLOC;
else {
if (!ustring_per_cpu) {
@@ -1436,9 +1533,9 @@ static int parse_pred(const char *str, void *data,
}
if (ustring)
pred->fn = filter_pred_pchar_user;
pred->fn_num = FILTER_PRED_FN_PCHAR_USER;
else
pred->fn = filter_pred_pchar;
pred->fn_num = FILTER_PRED_FN_PCHAR;
}
/* go past the last quote */
i++;
@@ -1486,10 +1583,10 @@ static int parse_pred(const char *str, void *data,
pred->val = val;
if (field->filter_type == FILTER_CPU)
pred->fn = filter_pred_cpu;
pred->fn_num = FILTER_PRED_FN_CPU;
else {
pred->fn = select_comparison_fn(pred->op, field->size,
field->is_signed);
pred->fn_num = select_comparison_fn(pred->op, field->size,
field->is_signed);
if (pred->op == OP_NE)
pred->not = 1;
}
@@ -2296,7 +2393,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields)
struct filter_pred *pred = prog[i].pred;
struct ftrace_event_field *field = pred->field;
WARN_ON_ONCE(!pred->fn);
WARN_ON_ONCE(pred->fn_num == FILTER_PRED_FN_NOP);
if (!field) {
WARN_ONCE(1, "all leafs should have field defined %d", i);
@@ -2306,7 +2403,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields)
if (!strchr(fields, *field->name))
continue;
pred->fn = test_pred_visited_fn;
pred->fn_num = FILTER_PRED_TEST_VISITED;
}
}

View File

@@ -104,6 +104,38 @@ enum field_op_id {
FIELD_OP_MULT,
};
enum hist_field_fn {
HIST_FIELD_FN_NOP,
HIST_FIELD_FN_VAR_REF,
HIST_FIELD_FN_COUNTER,
HIST_FIELD_FN_CONST,
HIST_FIELD_FN_LOG2,
HIST_FIELD_FN_BUCKET,
HIST_FIELD_FN_TIMESTAMP,
HIST_FIELD_FN_CPU,
HIST_FIELD_FN_STRING,
HIST_FIELD_FN_DYNSTRING,
HIST_FIELD_FN_RELDYNSTRING,
HIST_FIELD_FN_PSTRING,
HIST_FIELD_FN_S64,
HIST_FIELD_FN_U64,
HIST_FIELD_FN_S32,
HIST_FIELD_FN_U32,
HIST_FIELD_FN_S16,
HIST_FIELD_FN_U16,
HIST_FIELD_FN_S8,
HIST_FIELD_FN_U8,
HIST_FIELD_FN_UMINUS,
HIST_FIELD_FN_MINUS,
HIST_FIELD_FN_PLUS,
HIST_FIELD_FN_DIV,
HIST_FIELD_FN_MULT,
HIST_FIELD_FN_DIV_POWER2,
HIST_FIELD_FN_DIV_NOT_POWER2,
HIST_FIELD_FN_DIV_MULT_SHIFT,
HIST_FIELD_FN_EXECNAME,
};
/*
* A hist_var (histogram variable) contains variable information for
* hist_fields having the HIST_FIELD_FL_VAR or HIST_FIELD_FL_VAR_REF
@@ -123,15 +155,15 @@ struct hist_var {
struct hist_field {
struct ftrace_event_field *field;
unsigned long flags;
hist_field_fn_t fn;
unsigned int ref;
unsigned int size;
unsigned int offset;
unsigned int is_signed;
unsigned long buckets;
const char *type;
struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
struct hist_trigger_data *hist_data;
enum hist_field_fn fn_num;
unsigned int ref;
unsigned int size;
unsigned int offset;
unsigned int is_signed;
/*
* Variable fields contain variable-specific info in var.
@@ -166,14 +198,11 @@ struct hist_field {
u64 div_multiplier;
};
static u64 hist_field_none(struct hist_field *field,
struct tracing_map_elt *elt,
struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
return 0;
}
static u64 hist_fn_call(struct hist_field *hist_field,
struct tracing_map_elt *elt,
struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event);
static u64 hist_field_const(struct hist_field *field,
struct tracing_map_elt *elt,
@@ -250,7 +279,7 @@ static u64 hist_field_log2(struct hist_field *hist_field,
{
struct hist_field *operand = hist_field->operands[0];
u64 val = operand->fn(operand, elt, buffer, rbe, event);
u64 val = hist_fn_call(operand, elt, buffer, rbe, event);
return (u64) ilog2(roundup_pow_of_two(val));
}
@@ -264,7 +293,7 @@ static u64 hist_field_bucket(struct hist_field *hist_field,
struct hist_field *operand = hist_field->operands[0];
unsigned long buckets = hist_field->buckets;
u64 val = operand->fn(operand, elt, buffer, rbe, event);
u64 val = hist_fn_call(operand, elt, buffer, rbe, event);
if (WARN_ON_ONCE(!buckets))
return val;
@@ -285,8 +314,8 @@ static u64 hist_field_plus(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
return val1 + val2;
}
@@ -300,8 +329,8 @@ static u64 hist_field_minus(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
return val1 - val2;
}
@@ -315,8 +344,8 @@ static u64 hist_field_div(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
/* Return -1 for the undefined case */
if (!val2)
@@ -338,7 +367,7 @@ static u64 div_by_power_of_two(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
return val1 >> __ffs64(operand2->constant);
}
@@ -352,7 +381,7 @@ static u64 div_by_not_power_of_two(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
return div64_u64(val1, operand2->constant);
}
@@ -366,7 +395,7 @@ static u64 div_by_mult_and_shift(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
/*
* If the divisor is a constant, do a multiplication and shift instead.
@@ -400,8 +429,8 @@ static u64 hist_field_mult(struct hist_field *hist_field,
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
return val1 * val2;
}
@@ -414,7 +443,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field,
{
struct hist_field *operand = hist_field->operands[0];
s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event);
s64 sval = (s64)hist_fn_call(operand, elt, buffer, rbe, event);
u64 val = (u64)-sval;
return val;
@@ -657,19 +686,19 @@ struct snapshot_context {
* Returns the specific division function to use if the divisor
* is constant. This avoids extra branches when the trigger is hit.
*/
static hist_field_fn_t hist_field_get_div_fn(struct hist_field *divisor)
static enum hist_field_fn hist_field_get_div_fn(struct hist_field *divisor)
{
u64 div = divisor->constant;
if (!(div & (div - 1)))
return div_by_power_of_two;
return HIST_FIELD_FN_DIV_POWER2;
/* If the divisor is too large, do a regular division */
if (div > (1 << HIST_DIV_SHIFT))
return div_by_not_power_of_two;
return HIST_FIELD_FN_DIV_NOT_POWER2;
divisor->div_multiplier = div64_u64((u64)(1 << HIST_DIV_SHIFT), div);
return div_by_mult_and_shift;
return HIST_FIELD_FN_DIV_MULT_SHIFT;
}
static void track_data_free(struct track_data *track_data)
@@ -1334,38 +1363,32 @@ static const char *hist_field_name(struct hist_field *field,
return field_name;
}
static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
static enum hist_field_fn select_value_fn(int field_size, int field_is_signed)
{
hist_field_fn_t fn = NULL;
switch (field_size) {
case 8:
if (field_is_signed)
fn = hist_field_s64;
return HIST_FIELD_FN_S64;
else
fn = hist_field_u64;
break;
return HIST_FIELD_FN_U64;
case 4:
if (field_is_signed)
fn = hist_field_s32;
return HIST_FIELD_FN_S32;
else
fn = hist_field_u32;
break;
return HIST_FIELD_FN_U32;
case 2:
if (field_is_signed)
fn = hist_field_s16;
return HIST_FIELD_FN_S16;
else
fn = hist_field_u16;
break;
return HIST_FIELD_FN_U16;
case 1:
if (field_is_signed)
fn = hist_field_s8;
return HIST_FIELD_FN_S8;
else
fn = hist_field_u8;
break;
return HIST_FIELD_FN_U8;
}
return fn;
return HIST_FIELD_FN_NOP;
}
static int parse_map_size(char *str)
@@ -1922,19 +1945,19 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
goto out; /* caller will populate */
if (flags & HIST_FIELD_FL_VAR_REF) {
hist_field->fn = hist_field_var_ref;
hist_field->fn_num = HIST_FIELD_FN_VAR_REF;
goto out;
}
if (flags & HIST_FIELD_FL_HITCOUNT) {
hist_field->fn = hist_field_counter;
hist_field->fn_num = HIST_FIELD_FN_COUNTER;
hist_field->size = sizeof(u64);
hist_field->type = "u64";
goto out;
}
if (flags & HIST_FIELD_FL_CONST) {
hist_field->fn = hist_field_const;
hist_field->fn_num = HIST_FIELD_FN_CONST;
hist_field->size = sizeof(u64);
hist_field->type = kstrdup("u64", GFP_KERNEL);
if (!hist_field->type)
@@ -1943,14 +1966,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
}
if (flags & HIST_FIELD_FL_STACKTRACE) {
hist_field->fn = hist_field_none;
hist_field->fn_num = HIST_FIELD_FN_NOP;
goto out;
}
if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) {
unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET);
hist_field->fn = flags & HIST_FIELD_FL_LOG2 ? hist_field_log2 :
hist_field_bucket;
hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? HIST_FIELD_FN_LOG2 :
HIST_FIELD_FN_BUCKET;
hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
hist_field->size = hist_field->operands[0]->size;
hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL);
@@ -1960,14 +1983,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
}
if (flags & HIST_FIELD_FL_TIMESTAMP) {
hist_field->fn = hist_field_timestamp;
hist_field->fn_num = HIST_FIELD_FN_TIMESTAMP;
hist_field->size = sizeof(u64);
hist_field->type = "u64";
goto out;
}
if (flags & HIST_FIELD_FL_CPU) {
hist_field->fn = hist_field_cpu;
hist_field->fn_num = HIST_FIELD_FN_CPU;
hist_field->size = sizeof(int);
hist_field->type = "unsigned int";
goto out;
@@ -1987,14 +2010,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
goto free;
if (field->filter_type == FILTER_STATIC_STRING) {
hist_field->fn = hist_field_string;
hist_field->fn_num = HIST_FIELD_FN_STRING;
hist_field->size = field->size;
} else if (field->filter_type == FILTER_DYN_STRING) {
hist_field->fn = hist_field_dynstring;
hist_field->fn_num = HIST_FIELD_FN_DYNSTRING;
} else if (field->filter_type == FILTER_RDYN_STRING)
hist_field->fn = hist_field_reldynstring;
hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING;
else
hist_field->fn = hist_field_pstring;
hist_field->fn_num = HIST_FIELD_FN_PSTRING;
} else {
hist_field->size = field->size;
hist_field->is_signed = field->is_signed;
@@ -2002,9 +2025,9 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (!hist_field->type)
goto free;
hist_field->fn = select_value_fn(field->size,
field->is_signed);
if (!hist_field->fn) {
hist_field->fn_num = select_value_fn(field->size,
field->is_signed);
if (hist_field->fn_num == HIST_FIELD_FN_NOP) {
destroy_hist_field(hist_field, 0);
return NULL;
}
@@ -2340,7 +2363,7 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
if (!alias)
return NULL;
alias->fn = var_ref->fn;
alias->fn_num = var_ref->fn_num;
alias->operands[0] = var_ref;
if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
@@ -2523,7 +2546,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
expr->flags |= operand1->flags &
(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
expr->fn = hist_field_unary_minus;
expr->fn_num = HIST_FIELD_FN_UMINUS;
expr->operands[0] = operand1;
expr->size = operand1->size;
expr->is_signed = operand1->is_signed;
@@ -2595,7 +2618,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
unsigned long operand_flags, operand2_flags;
int field_op, ret = -EINVAL;
char *sep, *operand1_str;
hist_field_fn_t op_fn;
enum hist_field_fn op_fn;
bool combine_consts;
if (*n_subexprs > 3) {
@@ -2654,16 +2677,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
switch (field_op) {
case FIELD_OP_MINUS:
op_fn = hist_field_minus;
op_fn = HIST_FIELD_FN_MINUS;
break;
case FIELD_OP_PLUS:
op_fn = hist_field_plus;
op_fn = HIST_FIELD_FN_PLUS;
break;
case FIELD_OP_DIV:
op_fn = hist_field_div;
op_fn = HIST_FIELD_FN_DIV;
break;
case FIELD_OP_MULT:
op_fn = hist_field_mult;
op_fn = HIST_FIELD_FN_MULT;
break;
default:
ret = -EINVAL;
@@ -2719,13 +2742,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
op_fn = hist_field_get_div_fn(operand2);
}
expr->fn_num = op_fn;
if (combine_consts) {
if (var1)
expr->operands[0] = var1;
if (var2)
expr->operands[1] = var2;
expr->constant = op_fn(expr, NULL, NULL, NULL, NULL);
expr->constant = hist_fn_call(expr, NULL, NULL, NULL, NULL);
expr->fn_num = HIST_FIELD_FN_CONST;
expr->operands[0] = NULL;
expr->operands[1] = NULL;
@@ -2739,8 +2765,6 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
expr->name = expr_str(expr, 0);
} else {
expr->fn = op_fn;
/* The operand sizes should be the same, so just pick one */
expr->size = operand1->size;
expr->is_signed = operand1->is_signed;
@@ -3065,7 +3089,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
struct hist_field *var = field_var->var;
struct hist_field *val = field_var->val;
var_val = val->fn(val, elt, buffer, rbe, rec);
var_val = hist_fn_call(val, elt, buffer, rbe, rec);
var_idx = var->var.idx;
if (val->flags & HIST_FIELD_FL_STRING) {
@@ -4186,6 +4210,74 @@ static u64 hist_field_execname(struct hist_field *hist_field,
return (u64)(unsigned long)(elt_data->comm);
}
static u64 hist_fn_call(struct hist_field *hist_field,
struct tracing_map_elt *elt,
struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
switch (hist_field->fn_num) {
case HIST_FIELD_FN_VAR_REF:
return hist_field_var_ref(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_COUNTER:
return hist_field_counter(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_CONST:
return hist_field_const(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_LOG2:
return hist_field_log2(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_BUCKET:
return hist_field_bucket(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_TIMESTAMP:
return hist_field_timestamp(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_CPU:
return hist_field_cpu(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_STRING:
return hist_field_string(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_DYNSTRING:
return hist_field_dynstring(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_RELDYNSTRING:
return hist_field_reldynstring(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_PSTRING:
return hist_field_pstring(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_S64:
return hist_field_s64(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_U64:
return hist_field_u64(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_S32:
return hist_field_s32(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_U32:
return hist_field_u32(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_S16:
return hist_field_s16(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_U16:
return hist_field_u16(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_S8:
return hist_field_s8(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_U8:
return hist_field_u8(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_UMINUS:
return hist_field_unary_minus(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_MINUS:
return hist_field_minus(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_PLUS:
return hist_field_plus(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_DIV:
return hist_field_div(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_MULT:
return hist_field_mult(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_DIV_POWER2:
return div_by_power_of_two(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_DIV_NOT_POWER2:
return div_by_not_power_of_two(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_DIV_MULT_SHIFT:
return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_EXECNAME:
return hist_field_execname(hist_field, elt, buffer, rbe, event);
default:
return 0;
}
}
/* Convert a var that points to common_pid.execname to a string */
static void update_var_execname(struct hist_field *hist_field)
{
@@ -4197,7 +4289,7 @@ static void update_var_execname(struct hist_field *hist_field)
kfree_const(hist_field->type);
hist_field->type = "char[]";
hist_field->fn = hist_field_execname;
hist_field->fn_num = HIST_FIELD_FN_EXECNAME;
}
static int create_var_field(struct hist_trigger_data *hist_data,
@@ -4956,7 +5048,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
for_each_hist_val_field(i, hist_data) {
hist_field = hist_data->fields[i];
hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec);
if (hist_field->flags & HIST_FIELD_FL_VAR) {
var_idx = hist_field->var.idx;
@@ -4987,7 +5079,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
for_each_hist_key_field(i, hist_data) {
hist_field = hist_data->fields[i];
if (hist_field->flags & HIST_FIELD_FL_VAR) {
hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec);
var_idx = hist_field->var.idx;
tracing_map_set_var(elt, var_idx, hist_val);
}
@@ -5062,7 +5154,7 @@ static void event_hist_trigger(struct event_trigger_data *data,
HIST_STACKTRACE_SKIP);
key = entries;
} else {
field_contents = key_field->fn(key_field, elt, buffer, rbe, rec);
field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec);
if (key_field->flags & HIST_FIELD_FL_STRING) {
key = (void *)(unsigned long)field_contents;
use_compound_key = true;

View File

@@ -17,6 +17,8 @@
/* for gfp flag names */
#include <linux/trace_events.h>
#include <trace/events/mmflags.h>
#include "trace_probe.h"
#include "trace_probe_kernel.h"
#include "trace_synth.h"
@@ -409,6 +411,7 @@ static unsigned int trace_string(struct synth_trace_event *entry,
{
unsigned int len = 0;
char *str_field;
int ret;
if (is_dynamic) {
u32 data_offset;
@@ -417,19 +420,27 @@ static unsigned int trace_string(struct synth_trace_event *entry,
data_offset += event->n_u64 * sizeof(u64);
data_offset += data_size;
str_field = (char *)entry + data_offset;
len = strlen(str_val) + 1;
strscpy(str_field, str_val, len);
len = kern_fetch_store_strlen((unsigned long)str_val);
data_offset |= len << 16;
*(u32 *)&entry->fields[*n_u64] = data_offset;
ret = kern_fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry);
(*n_u64)++;
} else {
str_field = (char *)&entry->fields[*n_u64];
strscpy(str_field, str_val, STR_VAR_LEN_MAX);
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
if ((unsigned long)str_val < TASK_SIZE)
ret = strncpy_from_user_nofault(str_field, str_val, STR_VAR_LEN_MAX);
else
#endif
ret = strncpy_from_kernel_nofault(str_field, str_val, STR_VAR_LEN_MAX);
if (ret < 0)
strcpy(str_field, FAULT_STRING);
(*n_u64) += STR_VAR_LEN_MAX / sizeof(u64);
}
@@ -462,7 +473,7 @@ static notrace void trace_event_raw_event_synth(void *__data,
val_idx = var_ref_idx[field_pos];
str_val = (char *)(long)var_ref_vals[val_idx];
len = strlen(str_val) + 1;
len = kern_fetch_store_strlen((unsigned long)str_val);
fields_size += len;
}

View File

@@ -14,6 +14,7 @@
#include <linux/uio.h>
#include <linux/ioctl.h>
#include <linux/jhash.h>
#include <linux/refcount.h>
#include <linux/trace_events.h>
#include <linux/tracefs.h>
#include <linux/types.h>
@@ -39,28 +40,69 @@
*/
#define MAX_PAGE_ORDER 0
#define MAX_PAGES (1 << MAX_PAGE_ORDER)
#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
#define MAX_BYTES (MAX_PAGES * PAGE_SIZE)
#define MAX_EVENTS (MAX_BYTES * 8)
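A quick sizing check, as a hedged aside assuming the common 4 KiB PAGE_SIZE: one order-0 page yields 4096 status bytes, and at one bit per event that caps the table at 32768 events.

/* Illustrative arithmetic only; PAGE_SIZE == 4096 is an assumption here. */
#define EX_MAX_PAGES	(1 << 0)		/* MAX_PAGE_ORDER == 0	*/
#define EX_MAX_BYTES	(EX_MAX_PAGES * 4096)	/* 4096 bytes		*/
#define EX_MAX_EVENTS	(EX_MAX_BYTES * 8)	/* 32768 status bits	*/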
/* Limit on how long an event name plus its args may be within the subsystem. */
#define MAX_EVENT_DESC 512
#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
#define MAX_FIELD_ARRAY_SIZE 1024
#define MAX_FIELD_ARG_NAME 256
static char *register_page_data;
/*
* The MAP_STATUS_* macros are used for taking an index and determining the
* appropriate byte and the bit in the byte to set/reset for an event.
*
* The lower 3 bits of the index decide which bit to set.
* The remaining upper bits of the index decide which byte to use for the bit.
*
* This is used when a probe is attached to or removed from an event, to
* reflect to user programs, via the shared memory map, whether the event
* currently wants tracing.
*/
#define MAP_STATUS_BYTE(index) ((index) >> 3)
#define MAP_STATUS_MASK(index) BIT((index) & 7)
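Worked example, not part of the patch: for event index 13, MAP_STATUS_BYTE(13) is 13 >> 3 == 1 and MAP_STATUS_MASK(13) is BIT(13 & 7) == BIT(5) == 0x20, so a reader tests byte 1 of the mapped page against 0x20. A hedged sketch of that test:

/* Sketch: test one event's live status; 'status_page' stands in for
 * the mmap'ed register page. */
static inline bool ex_status_is_set(const char *status_page, int index)
{
	return status_page[index >> 3] & (1 << (index & 7));
}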
static DEFINE_MUTEX(reg_mutex);
static DEFINE_HASHTABLE(register_table, 4);
static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
/*
* Internal bits (kernel side only) to keep track of connected probes:
* These are used when an event's status is requested in text form. These
* bits are compared against the event's internal status byte to determine
* which probes to print out to the user.
*
* These do not reflect the mapped bytes between the user and kernel space.
*/
#define EVENT_STATUS_FTRACE BIT(0)
#define EVENT_STATUS_PERF BIT(1)
#define EVENT_STATUS_OTHER BIT(7)
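These kernel-internal bits never reach the mapped page. A minimal sketch of decoding them for text output; the helper name is invented, and the real printing lives in user_seq_show() further down:

/* Illustrative decode of the kernel-side status byte. */
static void ex_print_probes(char status)
{
	if (status & EVENT_STATUS_FTRACE)
		pr_info("ftrace probe attached\n");
	if (status & EVENT_STATUS_PERF)
		pr_info("perf probe attached\n");
	if (status & EVENT_STATUS_OTHER)
		pr_info("other probe attached\n");
}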
/*
* Stores the pages, tables, and locks for a group of events.
* Each logical grouping of events has its own group, with a
* matching page for status checks within user programs. This
* allows events to be isolated between user programs by various
* means.
*/
struct user_event_group {
struct page *pages;
char *register_page_data;
char *system_name;
struct hlist_node node;
struct mutex reg_mutex;
DECLARE_HASHTABLE(register_table, 8);
DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
};
/* Group for init_user_ns mapping, top-most group */
static struct user_event_group *init_group;
/*
* Stores per-event properties. As users register events within a
* file, a user_event is created if it does not already exist.
* These are globally used and their lifetime
* is tied to the refcnt member. These cannot go away until the
* refcnt reaches zero.
* refcnt reaches one.
*/
struct user_event {
struct user_event_group *group;
struct tracepoint tracepoint;
struct trace_event_call call;
struct trace_event_class class;
@@ -68,10 +110,11 @@ struct user_event {
struct hlist_node node;
struct list_head fields;
struct list_head validators;
atomic_t refcnt;
refcount_t refcnt;
int index;
int flags;
int min_size;
char status;
};
/*
@@ -86,6 +129,11 @@ struct user_event_refs {
struct user_event *events[];
};
struct user_event_file_info {
struct user_event_group *group;
struct user_event_refs *refs;
};
#define VALIDATOR_ENSURE_NULL (1 << 0)
#define VALIDATOR_REL (1 << 1)
@@ -98,7 +146,8 @@ struct user_event_validator {
typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
void *tpdata, bool *faulted);
static int user_event_parse(char *name, char *args, char *flags,
static int user_event_parse(struct user_event_group *group, char *name,
char *args, char *flags,
struct user_event **newuser);
static u32 user_event_key(char *name)
@@ -106,6 +155,144 @@ static u32 user_event_key(char *name)
return jhash(name, strlen(name), 0);
}
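The key is a jhash() over the full name (from <linux/jhash.h>, seeded with 0), and different names can still collide, so every lookup pairs the hash with a strcmp(). A condensed restatement of the lookup find_user_event() performs below:

/* Restates the hash-then-compare lookup used by find_user_event(). */
static struct user_event *ex_lookup(struct user_event_group *group, char *name)
{
	struct user_event *user;
	u32 key = user_event_key(name);

	hash_for_each_possible(group->register_table, user, node, key)
		if (strcmp(EVENT_NAME(user), name) == 0)
			return user;	/* hash narrows, strcmp confirms */

	return NULL;
}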
static void set_page_reservations(char *pages, bool set)
{
int page;
for (page = 0; page < MAX_PAGES; ++page) {
void *addr = pages + (PAGE_SIZE * page);
if (set)
SetPageReserved(virt_to_page(addr));
else
ClearPageReserved(virt_to_page(addr));
}
}
static void user_event_group_destroy(struct user_event_group *group)
{
if (group->register_page_data)
set_page_reservations(group->register_page_data, false);
if (group->pages)
__free_pages(group->pages, MAX_PAGE_ORDER);
kfree(group->system_name);
kfree(group);
}
static char *user_event_group_system_name(struct user_namespace *user_ns)
{
char *system_name;
int len = sizeof(USER_EVENTS_SYSTEM) + 1;
if (user_ns != &init_user_ns) {
/*
* Unexpected at this point:
* We only currently support init_user_ns.
* When more namespaces are supported, this path will start failing, so log it.
*/
pr_warn("user_events: Namespace other than init_user_ns!\n");
return NULL;
}
system_name = kmalloc(len, GFP_KERNEL);
if (!system_name)
return NULL;
snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM);
return system_name;
}
static inline struct user_event_group
*user_event_group_from_user_ns(struct user_namespace *user_ns)
{
if (user_ns == &init_user_ns)
return init_group;
return NULL;
}
static struct user_event_group *current_user_event_group(void)
{
struct user_namespace *user_ns = current_user_ns();
struct user_event_group *group = NULL;
while (user_ns) {
group = user_event_group_from_user_ns(user_ns);
if (group)
break;
user_ns = user_ns->parent;
}
return group;
}
static struct user_event_group
*user_event_group_create(struct user_namespace *user_ns)
{
struct user_event_group *group;
group = kzalloc(sizeof(*group), GFP_KERNEL);
if (!group)
return NULL;
group->system_name = user_event_group_system_name(user_ns);
if (!group->system_name)
goto error;
group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
if (!group->pages)
goto error;
group->register_page_data = page_address(group->pages);
set_page_reservations(group->register_page_data, true);
/* Zero all bits besides bit 0, which is reserved for failures */
bitmap_zero(group->page_bitmap, MAX_EVENTS);
set_bit(0, group->page_bitmap);
mutex_init(&group->reg_mutex);
hash_init(group->register_table);
return group;
error:
if (group)
user_event_group_destroy(group);
return NULL;
};
static __always_inline
void user_event_register_set(struct user_event *user)
{
int i = user->index;
user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);
}
static __always_inline
void user_event_register_clear(struct user_event *user)
{
int i = user->index;
user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i);
}
static __always_inline __must_check
bool user_event_last_ref(struct user_event *user)
{
return refcount_read(&user->refcnt) == 1;
}
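The refcount discipline after this patch: every event permanently holds one reference to itself, so "unused" means the count has dropped back to exactly 1, never 0. A hedged sketch of the lifecycle using the helpers above:

/* Sketch of the self-reference lifecycle (error handling omitted). */
static void ex_lifecycle(struct user_event *user)
{
	refcount_set(&user->refcnt, 2);	/* at parse: self ref + caller ref */
	refcount_dec(&user->refcnt);	/* caller drops its ref */
	if (user_event_last_ref(user))	/* == 1: only the self ref remains */
		destroy_user_event(user);
}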
static __always_inline __must_check
size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i)
{
@@ -141,7 +328,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call)
*
* Upon success user_event has its ref count increased by 1.
*/
static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
static int user_event_parse_cmd(struct user_event_group *group,
char *raw_command, struct user_event **newuser)
{
char *name = raw_command;
char *args = strpbrk(name, " ");
@@ -155,7 +343,7 @@ static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
if (flags)
*flags++ = '\0';
return user_event_parse(name, args, flags, newuser);
return user_event_parse(group, name, args, flags, newuser);
}
static int user_field_array_size(const char *type)
@@ -277,7 +465,7 @@ static int user_event_add_field(struct user_event *user, const char *type,
goto add_field;
add_validator:
if (strstr(type, "char") != 0)
if (strstr(type, "char") != NULL)
validator_flags |= VALIDATOR_ENSURE_NULL;
validator = kmalloc(sizeof(*validator), GFP_KERNEL);
@@ -458,7 +646,7 @@ static const char *user_field_format(const char *type)
return "%d";
if (strcmp(type, "unsigned char") == 0)
return "%u";
if (strstr(type, "char[") != 0)
if (strstr(type, "char[") != NULL)
return "%s";
/* Unknown, likely struct, allowed treat as 64-bit */
@@ -479,10 +667,52 @@ static bool user_field_is_dyn_string(const char *type, const char **str_func)
return false;
check:
return strstr(type, "char") != 0;
return strstr(type, "char") != NULL;
}
#define LEN_OR_ZERO (len ? len - pos : 0)
static int user_dyn_field_set_string(int argc, const char **argv, int *iout,
char *buf, int len, bool *colon)
{
int pos = 0, i = *iout;
*colon = false;
for (; i < argc; ++i) {
if (i != *iout)
pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", argv[i]);
if (strchr(argv[i], ';')) {
++i;
*colon = true;
break;
}
}
/* Actual set, advance i */
if (len != 0)
*iout = i;
return pos + 1;
}
static int user_field_set_string(struct ftrace_event_field *field,
char *buf, int len, bool colon)
{
int pos = 0;
pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->type);
pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name);
if (colon)
pos += snprintf(buf + pos, LEN_OR_ZERO, ";");
return pos + 1;
}
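Both helpers follow the two-pass snprintf() idiom: a first call with len == 0 makes LEN_OR_ZERO suppress every write so only the needed size (including the NUL) is returned, then the caller allocates and calls again to fill. user_field_match() below does exactly this; a condensed sketch:

/* Sketch of the size-then-fill idiom; a NULL buffer is fine on pass 1
 * because len == 0 turns every snprintf() into a pure length query. */
static char *ex_field_string(struct ftrace_event_field *field, bool colon)
{
	int len = user_field_set_string(field, NULL, 0, colon);
	char *buf = kmalloc(len, GFP_KERNEL);

	if (buf)
		user_field_set_string(field, buf, len, colon);

	return buf;
}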
static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
{
struct ftrace_event_field *field, *next;
@@ -600,8 +830,8 @@ static int destroy_user_event(struct user_event *user)
dyn_event_remove(&user->devent);
register_page_data[user->index] = 0;
clear_bit(user->index, page_bitmap);
user_event_register_clear(user);
clear_bit(user->index, user->group->page_bitmap);
hash_del(&user->node);
user_event_destroy_validators(user);
@@ -612,16 +842,17 @@ static int destroy_user_event(struct user_event *user)
return ret;
}
static struct user_event *find_user_event(char *name, u32 *outkey)
static struct user_event *find_user_event(struct user_event_group *group,
char *name, u32 *outkey)
{
struct user_event *user;
u32 key = user_event_key(name);
*outkey = key;
hash_for_each_possible(register_table, user, node, key)
hash_for_each_possible(group->register_table, user, node, key)
if (!strcmp(EVENT_NAME(user), name)) {
atomic_inc(&user->refcnt);
refcount_inc(&user->refcnt);
return user;
}
@@ -779,7 +1010,12 @@ static void update_reg_page_for(struct user_event *user)
rcu_read_unlock_sched();
}
register_page_data[user->index] = status;
if (status)
user_event_register_set(user);
else
user_event_register_clear(user);
user->status = status;
}
/*
@@ -835,17 +1071,18 @@ static int user_event_reg(struct trace_event_call *call,
return ret;
inc:
atomic_inc(&user->refcnt);
refcount_inc(&user->refcnt);
update_reg_page_for(user);
return 0;
dec:
update_reg_page_for(user);
atomic_dec(&user->refcnt);
refcount_dec(&user->refcnt);
return 0;
}
static int user_event_create(const char *raw_command)
{
struct user_event_group *group;
struct user_event *user;
char *name;
int ret;
@@ -861,14 +1098,19 @@ static int user_event_create(const char *raw_command)
if (!name)
return -ENOMEM;
mutex_lock(&reg_mutex);
group = current_user_event_group();
ret = user_event_parse_cmd(name, &user);
if (!group)
return -ENOENT;
mutex_lock(&group->reg_mutex);
ret = user_event_parse_cmd(group, name, &user);
if (!ret)
atomic_dec(&user->refcnt);
refcount_dec(&user->refcnt);
mutex_unlock(&reg_mutex);
mutex_unlock(&group->reg_mutex);
if (ret)
kfree(name);
@@ -910,14 +1152,14 @@ static bool user_event_is_busy(struct dyn_event *ev)
{
struct user_event *user = container_of(ev, struct user_event, devent);
return atomic_read(&user->refcnt) != 0;
return !user_event_last_ref(user);
}
static int user_event_free(struct dyn_event *ev)
{
struct user_event *user = container_of(ev, struct user_event, devent);
if (atomic_read(&user->refcnt) != 0)
if (!user_event_last_ref(user))
return -EBUSY;
return destroy_user_event(user);
@@ -926,49 +1168,35 @@ static int user_event_free(struct dyn_event *ev)
static bool user_field_match(struct ftrace_event_field *field, int argc,
const char **argv, int *iout)
{
char *field_name, *arg_name;
int len, pos, i = *iout;
char *field_name = NULL, *dyn_field_name = NULL;
bool colon = false, match = false;
int dyn_len, len;
if (i >= argc)
if (*iout >= argc)
return false;
len = MAX_FIELD_ARG_NAME;
field_name = kmalloc(len, GFP_KERNEL);
arg_name = kmalloc(len, GFP_KERNEL);
dyn_len = user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
0, &colon);
if (!arg_name || !field_name)
len = user_field_set_string(field, field_name, 0, colon);
if (dyn_len != len)
return false;
dyn_field_name = kmalloc(dyn_len, GFP_KERNEL);
field_name = kmalloc(len, GFP_KERNEL);
if (!dyn_field_name || !field_name)
goto out;
pos = 0;
user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
dyn_len, &colon);
for (; i < argc; ++i) {
if (i != *iout)
pos += snprintf(arg_name + pos, len - pos, " ");
user_field_set_string(field, field_name, len, colon);
pos += snprintf(arg_name + pos, len - pos, argv[i]);
if (strchr(argv[i], ';')) {
++i;
colon = true;
break;
}
}
pos = 0;
pos += snprintf(field_name + pos, len - pos, field->type);
pos += snprintf(field_name + pos, len - pos, " ");
pos += snprintf(field_name + pos, len - pos, field->name);
if (colon)
pos += snprintf(field_name + pos, len - pos, ";");
*iout = i;
match = strcmp(arg_name, field_name) == 0;
match = strcmp(dyn_field_name, field_name) == 0;
out:
kfree(arg_name);
kfree(dyn_field_name);
kfree(field_name);
return match;
@@ -1036,7 +1264,8 @@ static int user_event_trace_register(struct user_event *user)
* The name buffer lifetime is owned by this method for success cases only.
* Upon success the returned user_event has its ref count increased by 1.
*/
static int user_event_parse(char *name, char *args, char *flags,
static int user_event_parse(struct user_event_group *group, char *name,
char *args, char *flags,
struct user_event **newuser)
{
int ret;
@@ -1046,7 +1275,7 @@ static int user_event_parse(char *name, char *args, char *flags,
/* Prevent dyn_event from racing */
mutex_lock(&event_mutex);
user = find_user_event(name, &key);
user = find_user_event(group, name, &key);
mutex_unlock(&event_mutex);
if (user) {
@@ -1059,7 +1288,7 @@ static int user_event_parse(char *name, char *args, char *flags,
return 0;
}
index = find_first_zero_bit(page_bitmap, MAX_EVENTS);
index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS);
if (index == MAX_EVENTS)
return -EMFILE;
@@ -1073,6 +1302,7 @@ static int user_event_parse(char *name, char *args, char *flags,
INIT_LIST_HEAD(&user->fields);
INIT_LIST_HEAD(&user->validators);
user->group = group;
user->tracepoint.name = name;
ret = user_event_parse_fields(user, args);
@@ -1091,8 +1321,8 @@ static int user_event_parse(char *name, char *args, char *flags,
user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
user->call.tp = &user->tracepoint;
user->call.event.funcs = &user_event_funcs;
user->class.system = group->system_name;
user->class.system = USER_EVENTS_SYSTEM;
user->class.fields_array = user_event_fields_array;
user->class.get_fields = user_event_get_fields;
user->class.reg = user_event_reg;
@@ -1110,13 +1340,13 @@ static int user_event_parse(char *name, char *args, char *flags,
user->index = index;
/* Ensure we track ref */
atomic_inc(&user->refcnt);
/* Ensure we track self ref and caller ref (2) */
refcount_set(&user->refcnt, 2);
dyn_event_init(&user->devent, &user_event_dops);
dyn_event_add(&user->devent, &user->call);
set_bit(user->index, page_bitmap);
hash_add(register_table, &user->node, key);
set_bit(user->index, group->page_bitmap);
hash_add(group->register_table, &user->node, key);
mutex_unlock(&event_mutex);
@@ -1134,32 +1364,20 @@ static int user_event_parse(char *name, char *args, char *flags,
/*
* Deletes a previously created event if it is no longer being used.
*/
static int delete_user_event(char *name)
static int delete_user_event(struct user_event_group *group, char *name)
{
u32 key;
int ret;
struct user_event *user = find_user_event(name, &key);
struct user_event *user = find_user_event(group, name, &key);
if (!user)
return -ENOENT;
/* Ensure we are the last ref */
if (atomic_read(&user->refcnt) != 1) {
ret = -EBUSY;
goto put_ref;
}
refcount_dec(&user->refcnt);
ret = destroy_user_event(user);
if (!user_event_last_ref(user))
return -EBUSY;
if (ret)
goto put_ref;
return ret;
put_ref:
/* No longer have this ref */
atomic_dec(&user->refcnt);
return ret;
return destroy_user_event(user);
}
/*
@@ -1167,6 +1385,7 @@ static int delete_user_event(char *name)
*/
static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
{
struct user_event_file_info *info = file->private_data;
struct user_event_refs *refs;
struct user_event *user = NULL;
struct tracepoint *tp;
@@ -1178,7 +1397,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
rcu_read_lock_sched();
refs = rcu_dereference_sched(file->private_data);
refs = rcu_dereference_sched(info->refs);
/*
* The refs->events array is protected by RCU, and new items may be
@@ -1236,6 +1455,28 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
return ret;
}
static int user_events_open(struct inode *node, struct file *file)
{
struct user_event_group *group;
struct user_event_file_info *info;
group = current_user_event_group();
if (!group)
return -ENOENT;
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info)
return -ENOMEM;
info->group = group;
file->private_data = info;
return 0;
}
static ssize_t user_events_write(struct file *file, const char __user *ubuf,
size_t count, loff_t *ppos)
{
@@ -1245,7 +1486,8 @@ static ssize_t user_events_write(struct file *file, const char __user *ubuf,
if (unlikely(*ppos != 0))
return -EFAULT;
if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i)))
if (unlikely(import_single_range(WRITE, (char __user *)ubuf,
count, &iov, &i)))
return -EFAULT;
return user_events_write_core(file, &i);
@@ -1256,13 +1498,15 @@ static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
return user_events_write_core(kp->ki_filp, i);
}
static int user_events_ref_add(struct file *file, struct user_event *user)
static int user_events_ref_add(struct user_event_file_info *info,
struct user_event *user)
{
struct user_event_group *group = info->group;
struct user_event_refs *refs, *new_refs;
int i, size, count = 0;
refs = rcu_dereference_protected(file->private_data,
lockdep_is_held(&reg_mutex));
refs = rcu_dereference_protected(info->refs,
lockdep_is_held(&group->reg_mutex));
if (refs) {
count = refs->count;
@@ -1286,9 +1530,9 @@ static int user_events_ref_add(struct file *file, struct user_event *user)
new_refs->events[i] = user;
atomic_inc(&user->refcnt);
refcount_inc(&user->refcnt);
rcu_assign_pointer(file->private_data, new_refs);
rcu_assign_pointer(info->refs, new_refs);
if (refs)
kfree_rcu(refs, rcu);
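user_events_ref_add() is a textbook RCU copy-update-publish sequence: readers walk info->refs under rcu_read_lock_sched(), while the writer, serialized by reg_mutex, builds a larger array, publishes it with rcu_assign_pointer(), and frees the old one with kfree_rcu() after a grace period. A condensed sketch of the shape, with the copy step elided:

/* Hedged sketch of the publish pattern above (copy/error paths elided). */
static int ex_grow_refs(struct user_event_file_info *info, size_t size)
{
	struct user_event_refs *old, *new;

	old = rcu_dereference_protected(info->refs,
			lockdep_is_held(&info->group->reg_mutex));
	new = kzalloc(size, GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	/* ... copy old->events[] and append the new entry here ... */

	rcu_assign_pointer(info->refs, new);	/* visible to readers now */
	if (old)
		kfree_rcu(old, rcu);		/* freed after a grace period */
	return 0;
}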
@@ -1309,13 +1553,24 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
if (size > PAGE_SIZE)
return -E2BIG;
return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
if (size < offsetofend(struct user_reg, write_index))
return -EINVAL;
ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
if (ret)
return ret;
kreg->size = size;
return 0;
}
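copy_struct_from_user() is what makes the struct extensible in both directions: a shorter (older) userspace struct has its kernel-side tail zero-filled, while a longer (newer) one fails with -E2BIG unless the extra bytes are all zero. The added offsetofend() check pins a floor on top of that. A hedged sketch of the same pattern for a hypothetical struct:

/* Illustrative only; 'struct ex_reg' and its fields are invented. */
struct ex_reg {
	u32 size;		/* userspace sets this to sizeof() its view */
	u32 write_index;	/* base ABI */
	u32 newer_field;	/* added in a later revision */
};

static long ex_reg_get(struct ex_reg __user *ureg, struct ex_reg *kreg)
{
	u32 size;

	if (get_user(size, &ureg->size))
		return -EFAULT;
	if (size < offsetofend(struct ex_reg, write_index))
		return -EINVAL;	/* refuse structs older than the base ABI */

	/* Short input: tail zero-filled. Long input: -E2BIG unless zeroed. */
	return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
}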
/*
* Registers a user_event on behalf of a user process.
*/
static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
static long user_events_ioctl_reg(struct user_event_file_info *info,
unsigned long uarg)
{
struct user_reg __user *ureg = (struct user_reg __user *)uarg;
struct user_reg reg;
@@ -1336,24 +1591,24 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
return ret;
}
ret = user_event_parse_cmd(name, &user);
ret = user_event_parse_cmd(info->group, name, &user);
if (ret) {
kfree(name);
return ret;
}
ret = user_events_ref_add(file, user);
ret = user_events_ref_add(info, user);
/* No longer need parse ref, ref_add either worked or not */
atomic_dec(&user->refcnt);
refcount_dec(&user->refcnt);
/* Positive number is index and valid */
if (ret < 0)
return ret;
put_user((u32)ret, &ureg->write_index);
put_user(user->index, &ureg->status_index);
put_user(user->index, &ureg->status_bit);
return 0;
}
@@ -1361,7 +1616,8 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
/*
* Deletes a user_event on behalf of a user process.
*/
static long user_events_ioctl_del(struct file *file, unsigned long uarg)
static long user_events_ioctl_del(struct user_event_file_info *info,
unsigned long uarg)
{
void __user *ubuf = (void __user *)uarg;
char *name;
@@ -1374,7 +1630,7 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg)
/* event_mutex prevents dyn_event from racing */
mutex_lock(&event_mutex);
ret = delete_user_event(name);
ret = delete_user_event(info->group, name);
mutex_unlock(&event_mutex);
kfree(name);
@@ -1388,19 +1644,21 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg)
static long user_events_ioctl(struct file *file, unsigned int cmd,
unsigned long uarg)
{
struct user_event_file_info *info = file->private_data;
struct user_event_group *group = info->group;
long ret = -ENOTTY;
switch (cmd) {
case DIAG_IOCSREG:
mutex_lock(&reg_mutex);
ret = user_events_ioctl_reg(file, uarg);
mutex_unlock(&reg_mutex);
mutex_lock(&group->reg_mutex);
ret = user_events_ioctl_reg(info, uarg);
mutex_unlock(&group->reg_mutex);
break;
case DIAG_IOCSDEL:
mutex_lock(&reg_mutex);
ret = user_events_ioctl_del(file, uarg);
mutex_unlock(&reg_mutex);
mutex_lock(&group->reg_mutex);
ret = user_events_ioctl_del(info, uarg);
mutex_unlock(&group->reg_mutex);
break;
}
@@ -1412,17 +1670,24 @@ static long user_events_ioctl(struct file *file, unsigned int cmd,
*/
static int user_events_release(struct inode *node, struct file *file)
{
struct user_event_file_info *info = file->private_data;
struct user_event_group *group;
struct user_event_refs *refs;
struct user_event *user;
int i;
if (!info)
return -EINVAL;
group = info->group;
/*
* Ensure refs cannot change under any situation by taking the
* register mutex during the final freeing of the references.
*/
mutex_lock(&reg_mutex);
mutex_lock(&group->reg_mutex);
refs = file->private_data;
refs = info->refs;
if (!refs)
goto out;
@@ -1436,37 +1701,56 @@ static int user_events_release(struct inode *node, struct file *file)
user = refs->events[i];
if (user)
atomic_dec(&user->refcnt);
refcount_dec(&user->refcnt);
}
out:
file->private_data = NULL;
mutex_unlock(&reg_mutex);
mutex_unlock(&group->reg_mutex);
kfree(refs);
kfree(info);
return 0;
}
static const struct file_operations user_data_fops = {
.open = user_events_open,
.write = user_events_write,
.write_iter = user_events_write_iter,
.unlocked_ioctl = user_events_ioctl,
.release = user_events_release,
};
static struct user_event_group *user_status_group(struct file *file)
{
struct seq_file *m = file->private_data;
if (!m)
return NULL;
return m->private;
}
/*
* Maps the shared page into the user process for checking if an event is enabled.
*/
static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
{
char *pages;
struct user_event_group *group = user_status_group(file);
unsigned long size = vma->vm_end - vma->vm_start;
if (size != MAX_EVENTS)
if (size != MAX_BYTES)
return -EINVAL;
if (!group)
return -EINVAL;
pages = group->register_page_data;
return remap_pfn_range(vma, vma->vm_start,
virt_to_phys(register_page_data) >> PAGE_SHIFT,
virt_to_phys(pages) >> PAGE_SHIFT,
size, vm_get_page_prot(VM_READ));
}
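From the user side the page is consumed read-only: a process mmap()s exactly MAX_BYTES of the status file and tests the byte/bit pair derived from the status_bit it received at registration. A hedged userspace sketch; the tracefs path and the 4096-byte size are assumptions:

/* Userspace sketch, mirroring MAP_STATUS_BYTE()/MAP_STATUS_MASK(). */
#include <fcntl.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <unistd.h>

#define EX_MAX_BYTES 4096	/* must match the kernel's MAX_BYTES */

static bool ex_event_enabled(int status_bit)
{
	int fd = open("/sys/kernel/tracing/user_events_status", O_RDONLY);
	unsigned char *status;
	bool set;

	if (fd < 0)
		return false;

	status = mmap(NULL, EX_MAX_BYTES, PROT_READ, MAP_SHARED, fd, 0);
	close(fd);	/* the mapping outlives the descriptor */
	if (status == MAP_FAILED)
		return false;

	set = status[status_bit >> 3] & (1 << (status_bit & 7));
	munmap(status, EX_MAX_BYTES);
	return set;
}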
@@ -1490,14 +1774,18 @@ static void user_seq_stop(struct seq_file *m, void *p)
static int user_seq_show(struct seq_file *m, void *p)
{
struct user_event_group *group = m->private;
struct user_event *user;
char status;
int i, active = 0, busy = 0, flags;
mutex_lock(&reg_mutex);
if (!group)
return -EINVAL;
hash_for_each(register_table, i, user, node) {
status = register_page_data[user->index];
mutex_lock(&group->reg_mutex);
hash_for_each(group->register_table, i, user, node) {
status = user->status;
flags = user->flags;
seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
@@ -1520,7 +1808,7 @@ static int user_seq_show(struct seq_file *m, void *p)
active++;
}
mutex_unlock(&reg_mutex);
mutex_unlock(&group->reg_mutex);
seq_puts(m, "\n");
seq_printf(m, "Active: %d\n", active);
@@ -1539,7 +1827,24 @@ static const struct seq_operations user_seq_ops = {
static int user_status_open(struct inode *node, struct file *file)
{
return seq_open(file, &user_seq_ops);
struct user_event_group *group;
int ret;
group = current_user_event_group();
if (!group)
return -ENOENT;
ret = seq_open(file, &user_seq_ops);
if (!ret) {
/* Chain group to seq_file */
struct seq_file *m = file->private_data;
m->private = group;
}
return ret;
}
static const struct file_operations user_status_fops = {
@@ -1580,42 +1885,21 @@ static int create_user_tracefs(void)
return -ENODEV;
}
static void set_page_reservations(bool set)
{
int page;
for (page = 0; page < MAX_PAGES; ++page) {
void *addr = register_page_data + (PAGE_SIZE * page);
if (set)
SetPageReserved(virt_to_page(addr));
else
ClearPageReserved(virt_to_page(addr));
}
}
static int __init trace_events_user_init(void)
{
struct page *pages;
int ret;
/* Zero all bits besides bit 0, which is reserved for failures */
bitmap_zero(page_bitmap, MAX_EVENTS);
set_bit(0, page_bitmap);
init_group = user_event_group_create(&init_user_ns);
pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
if (!pages)
if (!init_group)
return -ENOMEM;
register_page_data = page_address(pages);
set_page_reservations(true);
ret = create_user_tracefs();
if (ret) {
pr_warn("user_events could not register with tracefs\n");
set_page_reservations(false);
__free_pages(pages, MAX_PAGE_ORDER);
user_event_group_destroy(init_group);
init_group = NULL;
return ret;
}


@@ -20,6 +20,7 @@
#include "trace_kprobe_selftest.h"
#include "trace_probe.h"
#include "trace_probe_tmpl.h"
#include "trace_probe_kernel.h"
#define KPROBE_EVENT_SYSTEM "kprobes"
#define KRETPROBE_MAXACTIVE_MAX 4096
@@ -1223,29 +1224,14 @@ static const struct file_operations kprobe_profile_ops = {
static nokprobe_inline int
fetch_store_strlen_user(unsigned long addr)
{
const void __user *uaddr = (__force const void __user *)addr;
return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
return kern_fetch_store_strlen_user(addr);
}
/* Return the length of the string -- including the null terminating byte */
static nokprobe_inline int
fetch_store_strlen(unsigned long addr)
{
int ret, len = 0;
u8 c;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
if (addr < TASK_SIZE)
return fetch_store_strlen_user(addr);
#endif
do {
ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
len++;
} while (c && ret == 0 && len < MAX_STRING_SIZE);
return (ret < 0) ? ret : len;
return kern_fetch_store_strlen(addr);
}
/*
@@ -1255,21 +1241,7 @@ fetch_store_strlen(unsigned long addr)
static nokprobe_inline int
fetch_store_string_user(unsigned long addr, void *dest, void *base)
{
const void __user *uaddr = (__force const void __user *)addr;
int maxlen = get_loc_len(*(u32 *)dest);
void *__dest;
long ret;
if (unlikely(!maxlen))
return -ENOMEM;
__dest = get_loc_data(dest, base);
ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
if (ret >= 0)
*(u32 *)dest = make_data_loc(ret, __dest - base);
return ret;
return kern_fetch_store_string_user(addr, dest, base);
}
/*
@@ -1279,29 +1251,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base)
static nokprobe_inline int
fetch_store_string(unsigned long addr, void *dest, void *base)
{
int maxlen = get_loc_len(*(u32 *)dest);
void *__dest;
long ret;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
if ((unsigned long)addr < TASK_SIZE)
return fetch_store_string_user(addr, dest, base);
#endif
if (unlikely(!maxlen))
return -ENOMEM;
__dest = get_loc_data(dest, base);
/*
* Try to get string again, since the string can be changed while
* probing.
*/
ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
if (ret >= 0)
*(u32 *)dest = make_data_loc(ret, __dest - base);
return ret;
return kern_fetch_store_string(addr, dest, base);
}
static nokprobe_inline int


@@ -1786,8 +1786,9 @@ static int start_per_cpu_kthreads(void)
for_each_cpu(cpu, current_mask) {
retval = start_kthread(cpu);
if (retval) {
cpus_read_unlock();
stop_per_cpu_kthreads();
break;
return retval;
}
}


@@ -445,7 +445,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(SAME_PROBE, "There is already the exact same probe event"),\
C(NO_EVENT_INFO, "This requires both group and event name to attach"),\
C(BAD_ATTACH_EVENT, "Attached event does not exist"),\
C(BAD_ATTACH_ARG, "Attached event does not have this field"),
C(BAD_ATTACH_ARG, "Attached event does not have this field"),\
C(NO_EP_FILTER, "No filter rule after 'if'"),
#undef C
#define C(a, b) TP_ERR_##a


@@ -0,0 +1,115 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __TRACE_PROBE_KERNEL_H_
#define __TRACE_PROBE_KERNEL_H_
#define FAULT_STRING "(fault)"
/*
* This depends on trace_probe.h, but cannot include it due to the way
* trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c. Any
* other user must therefore include trace_probe.h before including this
* file.
*/
/* Return the length of the string -- including the null terminating byte */
static nokprobe_inline int
kern_fetch_store_strlen_user(unsigned long addr)
{
const void __user *uaddr = (__force const void __user *)addr;
int ret;
ret = strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
/*
* strnlen_user_nofault returns zero on fault, insert the
* FAULT_STRING when that occurs.
*/
if (ret <= 0)
return strlen(FAULT_STRING) + 1;
return ret;
}
/* Return the length of the string -- including the null terminating byte */
static nokprobe_inline int
kern_fetch_store_strlen(unsigned long addr)
{
int ret, len = 0;
u8 c;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
if (addr < TASK_SIZE)
return kern_fetch_store_strlen_user(addr);
#endif
do {
ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
len++;
} while (c && ret == 0 && len < MAX_STRING_SIZE);
/* For faults, return enough to hold the FAULT_STRING */
return (ret < 0) ? strlen(FAULT_STRING) + 1 : len;
}
static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base, int len)
{
if (ret >= 0) {
*(u32 *)dest = make_data_loc(ret, __dest - base);
} else {
strscpy(__dest, FAULT_STRING, len);
ret = strlen(__dest) + 1;
}
}
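The u32 "data loc" these helpers manipulate packs two 16-bit halves: the string length in the upper half and the offset of the data relative to base in the lower half (the same layout the synth-event hunk earlier builds by hand with data_offset |= len << 16). A hedged restatement of the accessors, assuming trace_probe.h's definitions:

/* Restated layout only; the real macros live in trace_probe.h. */
#define ex_make_data_loc(len, offs)	(((u32)(len) << 16) | ((u32)(offs) & 0xffff))
#define ex_get_loc_len(dl)		((u32)(dl) >> 16)
#define ex_get_loc_offs(dl)		((u32)(dl) & 0xffff)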
/*
* Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
* with max length and relative data location.
*/
static nokprobe_inline int
kern_fetch_store_string_user(unsigned long addr, void *dest, void *base)
{
const void __user *uaddr = (__force const void __user *)addr;
int maxlen = get_loc_len(*(u32 *)dest);
void *__dest;
long ret;
if (unlikely(!maxlen))
return -ENOMEM;
__dest = get_loc_data(dest, base);
ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
set_data_loc(ret, dest, __dest, base, maxlen);
return ret;
}
/*
* Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
* length and relative data location.
*/
static nokprobe_inline int
kern_fetch_store_string(unsigned long addr, void *dest, void *base)
{
int maxlen = get_loc_len(*(u32 *)dest);
void *__dest;
long ret;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
if ((unsigned long)addr < TASK_SIZE)
return kern_fetch_store_string_user(addr, dest, base);
#endif
if (unlikely(!maxlen))
return -ENOMEM;
__dest = get_loc_data(dest, base);
/*
* Try to get string again, since the string can be changed while
* probing.
*/
ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
set_data_loc(ret, dest, __dest, base, maxlen);
return ret;
}
#endif /* __TRACE_PROBE_KERNEL_H_ */


@@ -961,7 +961,7 @@ create_sort_entry(void *key, struct tracing_map_elt *elt)
static void detect_dups(struct tracing_map_sort_entry **sort_entries,
int n_entries, unsigned int key_size)
{
unsigned int dups = 0, total_dups = 0;
unsigned int total_dups = 0;
int i;
void *key;
@@ -974,11 +974,10 @@ static void detect_dups(struct tracing_map_sort_entry **sort_entries,
key = sort_entries[0]->key;
for (i = 1; i < n_entries; i++) {
if (!memcmp(sort_entries[i]->key, key, key_size)) {
dups++; total_dups++;
total_dups++;
continue;
}
key = sort_entries[i]->key;
dups = 0;
}
WARN_ONCE(total_dups > 0,


@@ -640,7 +640,6 @@ static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv)
static int tracepoint_module_coming(struct module *mod)
{
struct tp_module *tp_mod;
int ret = 0;
if (!mod->num_tracepoints)
return 0;
@@ -652,19 +651,18 @@ static int tracepoint_module_coming(struct module *mod)
*/
if (trace_module_has_bad_taint(mod))
return 0;
mutex_lock(&tracepoint_module_list_mutex);
tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
if (!tp_mod) {
ret = -ENOMEM;
goto end;
}
if (!tp_mod)
return -ENOMEM;
tp_mod->mod = mod;
mutex_lock(&tracepoint_module_list_mutex);
list_add_tail(&tp_mod->list, &tracepoint_module_list);
blocking_notifier_call_chain(&tracepoint_notify_list,
MODULE_STATE_COMING, tp_mod);
end:
mutex_unlock(&tracepoint_module_list_mutex);
return ret;
return 0;
}
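The rework above is the standard narrowing of a critical section: the fallible kmalloc() moves in front of the mutex, so no error path ever needs to unlock and the ret/goto plumbing disappears. In sketch form, with invented names (ex_thing, ex_list, ex_list_mutex):

struct ex_thing {
	struct list_head list;
};

static LIST_HEAD(ex_list);
static DEFINE_MUTEX(ex_list_mutex);

static int ex_register_thing(void)
{
	/* Do the fallible allocation before taking the lock... */
	struct ex_thing *t = kmalloc(sizeof(*t), GFP_KERNEL);

	if (!t)
		return -ENOMEM;	/* ...so failure needs no unlock or goto */

	mutex_lock(&ex_list_mutex);
	list_add_tail(&t->list, &ex_list);	/* only the list op is locked */
	mutex_unlock(&ex_list_mutex);
	return 0;
}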
static void tracepoint_module_going(struct module *mod)


@@ -75,6 +75,13 @@ static DEFINE_CTL_TABLE_POLL(hostname_poll);
static DEFINE_CTL_TABLE_POLL(domainname_poll);
static struct ctl_table uts_kern_table[] = {
{
.procname = "arch",
.data = init_uts_ns.name.machine,
.maxlen = sizeof(init_uts_ns.name.machine),
.mode = 0444,
.proc_handler = proc_do_uts_string,
},
{
.procname = "ostype",
.data = init_uts_ns.name.sysname,