mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-05-13 04:48:21 -04:00
Merge branch 'rework/console-list-lock' into for-linus
This commit is contained in:
@@ -38,6 +38,7 @@ KCOV_INSTRUMENT_kcov.o := n
|
||||
KASAN_SANITIZE_kcov.o := n
|
||||
KCSAN_SANITIZE_kcov.o := n
|
||||
UBSAN_SANITIZE_kcov.o := n
|
||||
KMSAN_SANITIZE_kcov.o := n
|
||||
CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
|
||||
|
||||
# Don't instrument error handlers
|
||||
|
||||
@@ -555,15 +555,14 @@ void acct_collect(long exitcode, int group_dead)
|
||||
unsigned long vsize = 0;
|
||||
|
||||
if (group_dead && current->mm) {
|
||||
struct mm_struct *mm = current->mm;
|
||||
VMA_ITERATOR(vmi, mm, 0);
|
||||
struct vm_area_struct *vma;
|
||||
|
||||
mmap_read_lock(current->mm);
|
||||
vma = current->mm->mmap;
|
||||
while (vma) {
|
||||
mmap_read_lock(mm);
|
||||
for_each_vma(vmi, vma)
|
||||
vsize += vma->vm_end - vma->vm_start;
|
||||
vma = vma->vm_next;
|
||||
}
|
||||
mmap_read_unlock(current->mm);
|
||||
mmap_read_unlock(mm);
|
||||
}
|
||||
|
||||
spin_lock_irq(&current->sighand->siglock);
|
||||
|
||||
@@ -22,6 +22,13 @@ int main(void)
|
||||
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
|
||||
#endif
|
||||
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
|
||||
#ifdef CONFIG_LRU_GEN
|
||||
DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
|
||||
DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
|
||||
#else
|
||||
DEFINE(LRU_GEN_WIDTH, 0);
|
||||
DEFINE(__LRU_REFS_WIDTH, 0);
|
||||
#endif
|
||||
/* End of constants */
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -158,7 +158,7 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
|
||||
attr->value_size / sizeof(u32);
|
||||
|
||||
if (!(attr->map_flags & BPF_F_ZERO_SEED))
|
||||
bloom->hash_seed = get_random_int();
|
||||
bloom->hash_seed = get_random_u32();
|
||||
|
||||
return &bloom->map;
|
||||
}
|
||||
|
||||
@@ -1032,7 +1032,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
|
||||
hdr->size = size;
|
||||
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
|
||||
PAGE_SIZE - sizeof(*hdr));
|
||||
start = (get_random_int() % hole) & ~(alignment - 1);
|
||||
start = prandom_u32_max(hole) & ~(alignment - 1);
|
||||
|
||||
/* Leave a random number of instructions before BPF code. */
|
||||
*image_ptr = &hdr->image[start];
|
||||
@@ -1094,7 +1094,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
|
||||
|
||||
hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
|
||||
BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
|
||||
start = (get_random_int() % hole) & ~(alignment - 1);
|
||||
start = prandom_u32_max(hole) & ~(alignment - 1);
|
||||
|
||||
*image_ptr = &ro_header->image[start];
|
||||
*rw_image = &(*rw_header)->image[start];
|
||||
@@ -1216,7 +1216,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
|
||||
bool emit_zext)
|
||||
{
|
||||
struct bpf_insn *to = to_buff;
|
||||
u32 imm_rnd = get_random_int();
|
||||
u32 imm_rnd = get_random_u32();
|
||||
s16 off;
|
||||
|
||||
BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
|
||||
@@ -2007,7 +2007,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
|
||||
static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
|
||||
{ \
|
||||
u64 stack[stack_size / sizeof(u64)]; \
|
||||
u64 regs[MAX_BPF_EXT_REG]; \
|
||||
u64 regs[MAX_BPF_EXT_REG] = {}; \
|
||||
\
|
||||
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
|
||||
ARG1 = (u64) (unsigned long) ctx; \
|
||||
|
||||
@@ -527,7 +527,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
|
||||
if (htab->map.map_flags & BPF_F_ZERO_SEED)
|
||||
htab->hashrnd = 0;
|
||||
else
|
||||
htab->hashrnd = get_random_int();
|
||||
htab->hashrnd = get_random_u32();
|
||||
|
||||
htab_init_buckets(htab);
|
||||
|
||||
|
||||
@@ -445,8 +445,8 @@ struct bpf_iter_seq_task_vma_info {
|
||||
};
|
||||
|
||||
enum bpf_task_vma_iter_find_op {
|
||||
task_vma_iter_first_vma, /* use mm->mmap */
|
||||
task_vma_iter_next_vma, /* use curr_vma->vm_next */
|
||||
task_vma_iter_first_vma, /* use find_vma() with addr 0 */
|
||||
task_vma_iter_next_vma, /* use vma_next() with curr_vma */
|
||||
task_vma_iter_find_vma, /* use find_vma() to find next vma */
|
||||
};
|
||||
|
||||
@@ -544,10 +544,10 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
|
||||
|
||||
switch (op) {
|
||||
case task_vma_iter_first_vma:
|
||||
curr_vma = curr_task->mm->mmap;
|
||||
curr_vma = find_vma(curr_task->mm, 0);
|
||||
break;
|
||||
case task_vma_iter_next_vma:
|
||||
curr_vma = curr_vma->vm_next;
|
||||
curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
|
||||
break;
|
||||
case task_vma_iter_find_vma:
|
||||
/* We dropped mmap_lock so it is necessary to use find_vma
|
||||
@@ -561,7 +561,7 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
|
||||
if (curr_vma &&
|
||||
curr_vma->vm_start == info->prev_vm_start &&
|
||||
curr_vma->vm_end == info->prev_vm_end)
|
||||
curr_vma = curr_vma->vm_next;
|
||||
curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
|
||||
break;
|
||||
}
|
||||
if (!curr_vma) {
|
||||
|
||||
@@ -13350,7 +13350,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
|
||||
aux[adj_idx].ptr_type == PTR_TO_CTX)
|
||||
continue;
|
||||
|
||||
imm_rnd = get_random_int();
|
||||
imm_rnd = get_random_u32();
|
||||
rnd_hi32_patch[0] = insn;
|
||||
rnd_hi32_patch[1].imm = imm_rnd;
|
||||
rnd_hi32_patch[3].dst_reg = load_reg;
|
||||
|
||||
@@ -164,7 +164,6 @@ struct cgroup_mgctx {
|
||||
#define DEFINE_CGROUP_MGCTX(name) \
|
||||
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
|
||||
|
||||
extern struct mutex cgroup_mutex;
|
||||
extern spinlock_t css_set_lock;
|
||||
extern struct cgroup_subsys *cgroup_subsys[];
|
||||
extern struct list_head cgroup_roots;
|
||||
|
||||
@@ -3698,27 +3698,27 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
|
||||
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
|
||||
{
|
||||
struct cgroup *cgrp = seq_css(seq)->cgroup;
|
||||
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
|
||||
struct psi_group *psi = cgroup_psi(cgrp);
|
||||
|
||||
return psi_show(seq, psi, PSI_IO);
|
||||
}
|
||||
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
|
||||
{
|
||||
struct cgroup *cgrp = seq_css(seq)->cgroup;
|
||||
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
|
||||
struct psi_group *psi = cgroup_psi(cgrp);
|
||||
|
||||
return psi_show(seq, psi, PSI_MEM);
|
||||
}
|
||||
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
|
||||
{
|
||||
struct cgroup *cgrp = seq_css(seq)->cgroup;
|
||||
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
|
||||
struct psi_group *psi = cgroup_psi(cgrp);
|
||||
|
||||
return psi_show(seq, psi, PSI_CPU);
|
||||
}
|
||||
|
||||
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
|
||||
size_t nbytes, enum psi_res res)
|
||||
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
|
||||
size_t nbytes, enum psi_res res)
|
||||
{
|
||||
struct cgroup_file_ctx *ctx = of->priv;
|
||||
struct psi_trigger *new;
|
||||
@@ -3738,7 +3738,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
|
||||
psi = cgroup_psi(cgrp);
|
||||
new = psi_trigger_create(psi, buf, res);
|
||||
if (IS_ERR(new)) {
|
||||
cgroup_put(cgrp);
|
||||
@@ -3755,21 +3755,86 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
|
||||
char *buf, size_t nbytes,
|
||||
loff_t off)
|
||||
{
|
||||
return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
|
||||
return pressure_write(of, buf, nbytes, PSI_IO);
|
||||
}
|
||||
|
||||
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
|
||||
char *buf, size_t nbytes,
|
||||
loff_t off)
|
||||
{
|
||||
return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
|
||||
return pressure_write(of, buf, nbytes, PSI_MEM);
|
||||
}
|
||||
|
||||
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
|
||||
char *buf, size_t nbytes,
|
||||
loff_t off)
|
||||
{
|
||||
return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
|
||||
return pressure_write(of, buf, nbytes, PSI_CPU);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
|
||||
{
|
||||
struct cgroup *cgrp = seq_css(seq)->cgroup;
|
||||
struct psi_group *psi = cgroup_psi(cgrp);
|
||||
|
||||
return psi_show(seq, psi, PSI_IRQ);
|
||||
}
|
||||
|
||||
static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
|
||||
char *buf, size_t nbytes,
|
||||
loff_t off)
|
||||
{
|
||||
return pressure_write(of, buf, nbytes, PSI_IRQ);
|
||||
}
|
||||
#endif
|
||||
|
||||
static int cgroup_pressure_show(struct seq_file *seq, void *v)
|
||||
{
|
||||
struct cgroup *cgrp = seq_css(seq)->cgroup;
|
||||
struct psi_group *psi = cgroup_psi(cgrp);
|
||||
|
||||
seq_printf(seq, "%d\n", psi->enabled);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
|
||||
char *buf, size_t nbytes,
|
||||
loff_t off)
|
||||
{
|
||||
ssize_t ret;
|
||||
int enable;
|
||||
struct cgroup *cgrp;
|
||||
struct psi_group *psi;
|
||||
|
||||
ret = kstrtoint(strstrip(buf), 0, &enable);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (enable < 0 || enable > 1)
|
||||
return -ERANGE;
|
||||
|
||||
cgrp = cgroup_kn_lock_live(of->kn, false);
|
||||
if (!cgrp)
|
||||
return -ENOENT;
|
||||
|
||||
psi = cgroup_psi(cgrp);
|
||||
if (psi->enabled != enable) {
|
||||
int i;
|
||||
|
||||
/* show or hide {cpu,memory,io,irq}.pressure files */
|
||||
for (i = 0; i < NR_PSI_RESOURCES; i++)
|
||||
cgroup_file_show(&cgrp->psi_files[i], enable);
|
||||
|
||||
psi->enabled = enable;
|
||||
if (enable)
|
||||
psi_cgroup_restart(psi);
|
||||
}
|
||||
|
||||
cgroup_kn_unlock(of->kn);
|
||||
|
||||
return nbytes;
|
||||
}
|
||||
|
||||
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
|
||||
@@ -3789,6 +3854,9 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
|
||||
|
||||
bool cgroup_psi_enabled(void)
|
||||
{
|
||||
if (static_branch_likely(&psi_disabled))
|
||||
return false;
|
||||
|
||||
return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
|
||||
}
|
||||
|
||||
@@ -5175,6 +5243,7 @@ static struct cftype cgroup_psi_files[] = {
|
||||
#ifdef CONFIG_PSI
|
||||
{
|
||||
.name = "io.pressure",
|
||||
.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
|
||||
.seq_show = cgroup_io_pressure_show,
|
||||
.write = cgroup_io_pressure_write,
|
||||
.poll = cgroup_pressure_poll,
|
||||
@@ -5182,6 +5251,7 @@ static struct cftype cgroup_psi_files[] = {
|
||||
},
|
||||
{
|
||||
.name = "memory.pressure",
|
||||
.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
|
||||
.seq_show = cgroup_memory_pressure_show,
|
||||
.write = cgroup_memory_pressure_write,
|
||||
.poll = cgroup_pressure_poll,
|
||||
@@ -5189,11 +5259,27 @@ static struct cftype cgroup_psi_files[] = {
|
||||
},
|
||||
{
|
||||
.name = "cpu.pressure",
|
||||
.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
|
||||
.seq_show = cgroup_cpu_pressure_show,
|
||||
.write = cgroup_cpu_pressure_write,
|
||||
.poll = cgroup_pressure_poll,
|
||||
.release = cgroup_pressure_release,
|
||||
},
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
{
|
||||
.name = "irq.pressure",
|
||||
.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
|
||||
.seq_show = cgroup_irq_pressure_show,
|
||||
.write = cgroup_irq_pressure_write,
|
||||
.poll = cgroup_pressure_poll,
|
||||
.release = cgroup_pressure_release,
|
||||
},
|
||||
#endif
|
||||
{
|
||||
.name = "cgroup.pressure",
|
||||
.seq_show = cgroup_pressure_show,
|
||||
.write = cgroup_pressure_write,
|
||||
},
|
||||
#endif /* CONFIG_PSI */
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
@@ -50,7 +50,6 @@
|
||||
#include <linux/pid.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/vmacache.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/irq.h>
|
||||
#include <linux/security.h>
|
||||
@@ -283,17 +282,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
|
||||
if (!CACHE_FLUSH_IS_SAFE)
|
||||
return;
|
||||
|
||||
if (current->mm) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < VMACACHE_SIZE; i++) {
|
||||
if (!current->vmacache.vmas[i])
|
||||
continue;
|
||||
flush_cache_range(current->vmacache.vmas[i],
|
||||
addr, addr + BREAK_INSTR_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
/* Force flush instruction cache if it was outside the mm */
|
||||
flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
|
||||
}
|
||||
|
||||
@@ -545,6 +545,7 @@ static void kdb_msg_write(const char *msg, int msg_len)
|
||||
{
|
||||
struct console *c;
|
||||
const char *cp;
|
||||
int cookie;
|
||||
int len;
|
||||
|
||||
if (msg_len == 0)
|
||||
@@ -558,8 +559,20 @@ static void kdb_msg_write(const char *msg, int msg_len)
|
||||
cp++;
|
||||
}
|
||||
|
||||
for_each_console(c) {
|
||||
if (!(c->flags & CON_ENABLED))
|
||||
/*
|
||||
* The console_srcu_read_lock() only provides safe console list
|
||||
* traversal. The use of the ->write() callback relies on all other
|
||||
* CPUs being stopped at the moment and console drivers being able to
|
||||
* handle reentrance when @oops_in_progress is set.
|
||||
*
|
||||
* There is no guarantee that every console driver can handle
|
||||
* reentrance in this way; the developer deploying the debugger
|
||||
* is responsible for ensuring that the console drivers they
|
||||
* have selected handle reentrance appropriately.
|
||||
*/
|
||||
cookie = console_srcu_read_lock();
|
||||
for_each_console_srcu(c) {
|
||||
if (!(console_srcu_read_flags(c) & CON_ENABLED))
|
||||
continue;
|
||||
if (c == dbg_io_ops->cons)
|
||||
continue;
|
||||
@@ -577,6 +590,7 @@ static void kdb_msg_write(const char *msg, int msg_len)
|
||||
--oops_in_progress;
|
||||
touch_nmi_watchdog();
|
||||
}
|
||||
console_srcu_read_unlock(cookie);
|
||||
}
|
||||
|
||||
int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
|
||||
|
||||
@@ -214,13 +214,22 @@ void __delayacct_freepages_end(void)
|
||||
&current->delays->freepages_count);
|
||||
}
|
||||
|
||||
void __delayacct_thrashing_start(void)
|
||||
void __delayacct_thrashing_start(bool *in_thrashing)
|
||||
{
|
||||
*in_thrashing = !!current->in_thrashing;
|
||||
if (*in_thrashing)
|
||||
return;
|
||||
|
||||
current->in_thrashing = 1;
|
||||
current->delays->thrashing_start = local_clock();
|
||||
}
|
||||
|
||||
void __delayacct_thrashing_end(void)
|
||||
void __delayacct_thrashing_end(bool *in_thrashing)
|
||||
{
|
||||
if (*in_thrashing)
|
||||
return;
|
||||
|
||||
current->in_thrashing = 0;
|
||||
delayacct_end(&current->delays->lock,
|
||||
&current->delays->thrashing_start,
|
||||
&current->delays->thrashing_delay,
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <linux/dma-map-ops.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/kmsan.h>
|
||||
#include <linux/of_device.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/vmalloc.h>
|
||||
@@ -156,6 +157,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
|
||||
addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
|
||||
else
|
||||
addr = ops->map_page(dev, page, offset, size, dir, attrs);
|
||||
kmsan_handle_dma(page, offset, size, dir);
|
||||
debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
|
||||
|
||||
return addr;
|
||||
@@ -194,11 +196,13 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
|
||||
else
|
||||
ents = ops->map_sg(dev, sg, nents, dir, attrs);
|
||||
|
||||
if (ents > 0)
|
||||
if (ents > 0) {
|
||||
kmsan_handle_dma_sg(sg, nents, dir);
|
||||
debug_dma_map_sg(dev, sg, nents, ents, dir, attrs);
|
||||
else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
|
||||
ents != -EIO && ents != -EREMOTEIO))
|
||||
} else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
|
||||
ents != -EIO && ents != -EREMOTEIO)) {
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
return ents;
|
||||
}
|
||||
|
||||
@@ -346,22 +346,27 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
|
||||
memblock_free(tlb, PAGE_ALIGN(bytes));
|
||||
|
||||
nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
|
||||
if (nslabs < IO_TLB_MIN_SLABS)
|
||||
panic("%s: Failed to remap %zu bytes\n",
|
||||
__func__, bytes);
|
||||
goto retry;
|
||||
if (nslabs >= IO_TLB_MIN_SLABS)
|
||||
goto retry;
|
||||
|
||||
pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes);
|
||||
return;
|
||||
}
|
||||
|
||||
alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
|
||||
mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
|
||||
if (!mem->slots)
|
||||
panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
|
||||
__func__, alloc_size, PAGE_SIZE);
|
||||
if (!mem->slots) {
|
||||
pr_warn("%s: Failed to allocate %zu bytes align=0x%lx\n",
|
||||
__func__, alloc_size, PAGE_SIZE);
|
||||
return;
|
||||
}
|
||||
|
||||
mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area),
|
||||
default_nareas), SMP_CACHE_BYTES);
|
||||
if (!mem->areas)
|
||||
panic("%s: Failed to allocate mem->areas.\n", __func__);
|
||||
if (!mem->areas) {
|
||||
pr_warn("%s: Failed to allocate mem->areas.\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false,
|
||||
default_nareas);
|
||||
@@ -545,9 +550,8 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
|
||||
}
|
||||
|
||||
if (PageHighMem(pfn_to_page(pfn))) {
|
||||
/* The buffer does not have a mapping. Map it in and copy */
|
||||
unsigned int offset = orig_addr & ~PAGE_MASK;
|
||||
char *buffer;
|
||||
struct page *page;
|
||||
unsigned int sz = 0;
|
||||
unsigned long flags;
|
||||
|
||||
@@ -555,12 +559,11 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
|
||||
sz = min_t(size_t, PAGE_SIZE - offset, size);
|
||||
|
||||
local_irq_save(flags);
|
||||
buffer = kmap_atomic(pfn_to_page(pfn));
|
||||
page = pfn_to_page(pfn);
|
||||
if (dir == DMA_TO_DEVICE)
|
||||
memcpy(vaddr, buffer + offset, sz);
|
||||
memcpy_from_page(vaddr, page, offset, sz);
|
||||
else
|
||||
memcpy(buffer + offset, vaddr, sz);
|
||||
kunmap_atomic(buffer);
|
||||
memcpy_to_page(page, offset, vaddr, sz);
|
||||
local_irq_restore(flags);
|
||||
|
||||
size -= sz;
|
||||
@@ -731,8 +734,11 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
|
||||
int index;
|
||||
phys_addr_t tlb_addr;
|
||||
|
||||
if (!mem || !mem->nslabs)
|
||||
panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
|
||||
if (!mem || !mem->nslabs) {
|
||||
dev_warn_ratelimited(dev,
|
||||
"Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
|
||||
return (phys_addr_t)DMA_MAPPING_ERROR;
|
||||
}
|
||||
|
||||
if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
|
||||
pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
#include <linux/resume_user_mode.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/jump_label.h>
|
||||
#include <linux/kmsan.h>
|
||||
#include <linux/livepatch.h>
|
||||
#include <linux/audit.h>
|
||||
#include <linux/tick.h>
|
||||
@@ -24,6 +25,7 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
|
||||
user_exit_irqoff();
|
||||
|
||||
instrumentation_begin();
|
||||
kmsan_unpoison_entry_regs(regs);
|
||||
trace_hardirqs_off_finish();
|
||||
instrumentation_end();
|
||||
}
|
||||
@@ -352,6 +354,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
|
||||
lockdep_hardirqs_off(CALLER_ADDR0);
|
||||
ct_irq_enter();
|
||||
instrumentation_begin();
|
||||
kmsan_unpoison_entry_regs(regs);
|
||||
trace_hardirqs_off_finish();
|
||||
instrumentation_end();
|
||||
|
||||
@@ -367,6 +370,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
|
||||
*/
|
||||
lockdep_hardirqs_off(CALLER_ADDR0);
|
||||
instrumentation_begin();
|
||||
kmsan_unpoison_entry_regs(regs);
|
||||
rcu_irq_enter_check_tick();
|
||||
trace_hardirqs_off_finish();
|
||||
instrumentation_end();
|
||||
@@ -452,6 +456,7 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
|
||||
ct_nmi_enter();
|
||||
|
||||
instrumentation_begin();
|
||||
kmsan_unpoison_entry_regs(regs);
|
||||
trace_hardirqs_off_finish();
|
||||
ftrace_nmi_enter();
|
||||
instrumentation_end();
|
||||
|
||||
@@ -10270,8 +10270,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter,
|
||||
struct perf_addr_filter_range *fr)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
VMA_ITERATOR(vmi, mm, 0);
|
||||
|
||||
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
||||
for_each_vma(vmi, vma) {
|
||||
if (!vma->vm_file)
|
||||
continue;
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@
|
||||
#include <linux/export.h>
|
||||
#include <linux/rmap.h> /* anon_vma_prepare */
|
||||
#include <linux/mmu_notifier.h> /* set_pte_at_notify */
|
||||
#include <linux/swap.h> /* try_to_free_swap */
|
||||
#include <linux/swap.h> /* folio_free_swap */
|
||||
#include <linux/ptrace.h> /* user_enable_single_step */
|
||||
#include <linux/kdebug.h> /* notifier mechanism */
|
||||
#include "../../mm/internal.h" /* munlock_vma_page */
|
||||
@@ -154,8 +154,10 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
|
||||
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
struct page *old_page, struct page *new_page)
|
||||
{
|
||||
struct folio *old_folio = page_folio(old_page);
|
||||
struct folio *new_folio;
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0);
|
||||
DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
|
||||
int err;
|
||||
struct mmu_notifier_range range;
|
||||
|
||||
@@ -163,14 +165,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
addr + PAGE_SIZE);
|
||||
|
||||
if (new_page) {
|
||||
err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm,
|
||||
GFP_KERNEL);
|
||||
new_folio = page_folio(new_page);
|
||||
err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
/* For try_to_free_swap() below */
|
||||
lock_page(old_page);
|
||||
/* For folio_free_swap() below */
|
||||
folio_lock(old_folio);
|
||||
|
||||
mmu_notifier_invalidate_range_start(&range);
|
||||
err = -EAGAIN;
|
||||
@@ -179,14 +181,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
|
||||
|
||||
if (new_page) {
|
||||
get_page(new_page);
|
||||
folio_get(new_folio);
|
||||
page_add_new_anon_rmap(new_page, vma, addr);
|
||||
lru_cache_add_inactive_or_unevictable(new_page, vma);
|
||||
folio_add_lru_vma(new_folio, vma);
|
||||
} else
|
||||
/* no new page, just dec_mm_counter for old_page */
|
||||
dec_mm_counter(mm, MM_ANONPAGES);
|
||||
|
||||
if (!PageAnon(old_page)) {
|
||||
if (!folio_test_anon(old_folio)) {
|
||||
dec_mm_counter(mm, mm_counter_file(old_page));
|
||||
inc_mm_counter(mm, MM_ANONPAGES);
|
||||
}
|
||||
@@ -198,15 +200,15 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
mk_pte(new_page, vma->vm_page_prot));
|
||||
|
||||
page_remove_rmap(old_page, vma, false);
|
||||
if (!page_mapped(old_page))
|
||||
try_to_free_swap(old_page);
|
||||
if (!folio_mapped(old_folio))
|
||||
folio_free_swap(old_folio);
|
||||
page_vma_mapped_walk_done(&pvmw);
|
||||
put_page(old_page);
|
||||
folio_put(old_folio);
|
||||
|
||||
err = 0;
|
||||
unlock:
|
||||
mmu_notifier_invalidate_range_end(&range);
|
||||
unlock_page(old_page);
|
||||
folio_unlock(old_folio);
|
||||
return err;
|
||||
}
|
||||
|
||||
@@ -349,9 +351,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe,
|
||||
static struct vm_area_struct *
|
||||
find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
|
||||
{
|
||||
VMA_ITERATOR(vmi, mm, 0);
|
||||
struct vm_area_struct *tmp;
|
||||
|
||||
for (tmp = mm->mmap; tmp; tmp = tmp->vm_next)
|
||||
for_each_vma(vmi, tmp)
|
||||
if (valid_ref_ctr_vma(uprobe, tmp))
|
||||
return tmp;
|
||||
|
||||
@@ -552,7 +555,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
|
||||
|
||||
/* try collapse pmd for compound page */
|
||||
if (!ret && orig_page_huge)
|
||||
collapse_pte_mapped_thp(mm, vaddr);
|
||||
collapse_pte_mapped_thp(mm, vaddr, false);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@@ -1231,11 +1234,12 @@ int uprobe_apply(struct inode *inode, loff_t offset,
|
||||
|
||||
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
|
||||
{
|
||||
VMA_ITERATOR(vmi, mm, 0);
|
||||
struct vm_area_struct *vma;
|
||||
int err = 0;
|
||||
|
||||
mmap_read_lock(mm);
|
||||
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
||||
for_each_vma(vmi, vma) {
|
||||
unsigned long vaddr;
|
||||
loff_t offset;
|
||||
|
||||
@@ -1983,9 +1987,10 @@ bool uprobe_deny_signal(void)
|
||||
|
||||
static void mmf_recalc_uprobes(struct mm_struct *mm)
|
||||
{
|
||||
VMA_ITERATOR(vmi, mm, 0);
|
||||
struct vm_area_struct *vma;
|
||||
|
||||
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
||||
for_each_vma(vmi, vma) {
|
||||
if (!valid_vma(vma, false))
|
||||
continue;
|
||||
/*
|
||||
|
||||
@@ -60,6 +60,7 @@
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/shm.h>
|
||||
#include <linux/kcov.h>
|
||||
#include <linux/kmsan.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/rcuwait.h>
|
||||
#include <linux/compat.h>
|
||||
@@ -183,6 +184,10 @@ void put_task_struct_rcu_user(struct task_struct *task)
|
||||
call_rcu(&task->rcu, delayed_put_task_struct);
|
||||
}
|
||||
|
||||
void __weak release_thread(struct task_struct *dead_task)
|
||||
{
|
||||
}
|
||||
|
||||
void release_task(struct task_struct *p)
|
||||
{
|
||||
struct task_struct *leader;
|
||||
@@ -466,6 +471,7 @@ void mm_update_next_owner(struct mm_struct *mm)
|
||||
goto retry;
|
||||
}
|
||||
WRITE_ONCE(mm->owner, c);
|
||||
lru_gen_migrate_mm(mm);
|
||||
task_unlock(c);
|
||||
put_task_struct(c);
|
||||
}
|
||||
@@ -759,6 +765,7 @@ void __noreturn do_exit(long code)
|
||||
WARN_ON(tsk->plug);
|
||||
|
||||
kcov_task_exit(tsk);
|
||||
kmsan_task_exit(tsk);
|
||||
|
||||
coredump_task_exit(tsk);
|
||||
ptrace_event(PTRACE_EVENT_EXIT, code);
|
||||
|
||||
@@ -247,15 +247,11 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
|
||||
/* cut off if it is too long */
|
||||
if (count > KSYM_NAME_LEN)
|
||||
count = KSYM_NAME_LEN;
|
||||
buf = kmalloc(count + 1, GFP_KERNEL);
|
||||
if (!buf)
|
||||
return -ENOMEM;
|
||||
|
||||
if (copy_from_user(buf, buffer, count)) {
|
||||
ret = -EFAULT;
|
||||
goto out_free;
|
||||
}
|
||||
buf[count] = '\0';
|
||||
buf = memdup_user_nul(buffer, count);
|
||||
if (IS_ERR(buf))
|
||||
return PTR_ERR(buf);
|
||||
|
||||
sym = strstrip(buf);
|
||||
|
||||
mutex_lock(&fei_lock);
|
||||
@@ -298,17 +294,15 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
|
||||
}
|
||||
|
||||
ret = register_kprobe(&attr->kp);
|
||||
if (!ret)
|
||||
fei_debugfs_add_attr(attr);
|
||||
if (ret < 0)
|
||||
fei_attr_remove(attr);
|
||||
else {
|
||||
list_add_tail(&attr->list, &fei_attr_list);
|
||||
ret = count;
|
||||
if (ret) {
|
||||
fei_attr_free(attr);
|
||||
goto out;
|
||||
}
|
||||
fei_debugfs_add_attr(attr);
|
||||
list_add_tail(&attr->list, &fei_attr_list);
|
||||
ret = count;
|
||||
out:
|
||||
mutex_unlock(&fei_lock);
|
||||
out_free:
|
||||
kfree(buf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -37,13 +37,13 @@
|
||||
#include <linux/fdtable.h>
|
||||
#include <linux/iocontext.h>
|
||||
#include <linux/key.h>
|
||||
#include <linux/kmsan.h>
|
||||
#include <linux/binfmts.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/mm_inline.h>
|
||||
#include <linux/vmacache.h>
|
||||
#include <linux/nsproxy.h>
|
||||
#include <linux/capability.h>
|
||||
#include <linux/cpu.h>
|
||||
@@ -97,7 +97,6 @@
|
||||
#include <linux/scs.h>
|
||||
#include <linux/io_uring.h>
|
||||
#include <linux/bpf.h>
|
||||
#include <linux/sched/mm.h>
|
||||
|
||||
#include <asm/pgalloc.h>
|
||||
#include <linux/uaccess.h>
|
||||
@@ -475,7 +474,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
|
||||
*/
|
||||
*new = data_race(*orig);
|
||||
INIT_LIST_HEAD(&new->anon_vma_chain);
|
||||
new->vm_next = new->vm_prev = NULL;
|
||||
dup_anon_vma_name(orig, new);
|
||||
}
|
||||
return new;
|
||||
@@ -580,11 +578,12 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
|
||||
static __latent_entropy int dup_mmap(struct mm_struct *mm,
|
||||
struct mm_struct *oldmm)
|
||||
{
|
||||
struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
|
||||
struct rb_node **rb_link, *rb_parent;
|
||||
struct vm_area_struct *mpnt, *tmp;
|
||||
int retval;
|
||||
unsigned long charge;
|
||||
unsigned long charge = 0;
|
||||
LIST_HEAD(uf);
|
||||
MA_STATE(old_mas, &oldmm->mm_mt, 0, 0);
|
||||
MA_STATE(mas, &mm->mm_mt, 0, 0);
|
||||
|
||||
uprobe_start_dup_mmap();
|
||||
if (mmap_write_lock_killable(oldmm)) {
|
||||
@@ -606,16 +605,16 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
|
||||
mm->exec_vm = oldmm->exec_vm;
|
||||
mm->stack_vm = oldmm->stack_vm;
|
||||
|
||||
rb_link = &mm->mm_rb.rb_node;
|
||||
rb_parent = NULL;
|
||||
pprev = &mm->mmap;
|
||||
retval = ksm_fork(mm, oldmm);
|
||||
if (retval)
|
||||
goto out;
|
||||
khugepaged_fork(mm, oldmm);
|
||||
|
||||
prev = NULL;
|
||||
for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
|
||||
retval = mas_expected_entries(&mas, oldmm->map_count);
|
||||
if (retval)
|
||||
goto out;
|
||||
|
||||
mas_for_each(&old_mas, mpnt, ULONG_MAX) {
|
||||
struct file *file;
|
||||
|
||||
if (mpnt->vm_flags & VM_DONTCOPY) {
|
||||
@@ -629,7 +628,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
|
||||
*/
|
||||
if (fatal_signal_pending(current)) {
|
||||
retval = -EINTR;
|
||||
goto out;
|
||||
goto loop_out;
|
||||
}
|
||||
if (mpnt->vm_flags & VM_ACCOUNT) {
|
||||
unsigned long len = vma_pages(mpnt);
|
||||
@@ -675,24 +674,17 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
|
||||
}
|
||||
|
||||
/*
|
||||
* Clear hugetlb-related page reserves for children. This only
|
||||
* affects MAP_PRIVATE mappings. Faults generated by the child
|
||||
* are not guaranteed to succeed, even if read-only
|
||||
* Copy/update hugetlb private vma information.
|
||||
*/
|
||||
if (is_vm_hugetlb_page(tmp))
|
||||
reset_vma_resv_huge_pages(tmp);
|
||||
hugetlb_dup_vma_private(tmp);
|
||||
|
||||
/*
|
||||
* Link in the new vma and copy the page table entries.
|
||||
*/
|
||||
*pprev = tmp;
|
||||
pprev = &tmp->vm_next;
|
||||
tmp->vm_prev = prev;
|
||||
prev = tmp;
|
||||
|
||||
__vma_link_rb(mm, tmp, rb_link, rb_parent);
|
||||
rb_link = &tmp->vm_rb.rb_right;
|
||||
rb_parent = &tmp->vm_rb;
|
||||
/* Link the vma into the MT */
|
||||
mas.index = tmp->vm_start;
|
||||
mas.last = tmp->vm_end - 1;
|
||||
mas_store(&mas, tmp);
|
||||
if (mas_is_err(&mas))
|
||||
goto fail_nomem_mas_store;
|
||||
|
||||
mm->map_count++;
|
||||
if (!(tmp->vm_flags & VM_WIPEONFORK))
|
||||
@@ -702,10 +694,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
|
||||
tmp->vm_ops->open(tmp);
|
||||
|
||||
if (retval)
|
||||
goto out;
|
||||
goto loop_out;
|
||||
}
|
||||
/* a new mm has just been created */
|
||||
retval = arch_dup_mmap(oldmm, mm);
|
||||
loop_out:
|
||||
mas_destroy(&mas);
|
||||
out:
|
||||
mmap_write_unlock(mm);
|
||||
flush_tlb_mm(oldmm);
|
||||
@@ -714,6 +708,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
|
||||
fail_uprobe_end:
|
||||
uprobe_end_dup_mmap();
|
||||
return retval;
|
||||
|
||||
fail_nomem_mas_store:
|
||||
unlink_anon_vmas(tmp);
|
||||
fail_nomem_anon_vma_fork:
|
||||
mpol_put(vma_policy(tmp));
|
||||
fail_nomem_policy:
|
||||
@@ -721,7 +718,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
|
||||
fail_nomem:
|
||||
retval = -ENOMEM;
|
||||
vm_unacct_memory(charge);
|
||||
goto out;
|
||||
goto loop_out;
|
||||
}
|
||||
|
||||
static inline int mm_alloc_pgd(struct mm_struct *mm)
|
||||
@@ -1026,6 +1023,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
|
||||
tsk->worker_private = NULL;
|
||||
|
||||
kcov_task_init(tsk);
|
||||
kmsan_task_create(tsk);
|
||||
kmap_local_fork(tsk);
|
||||
|
||||
#ifdef CONFIG_FAULT_INJECTION
|
||||
@@ -1109,9 +1107,8 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
|
||||
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
|
||||
struct user_namespace *user_ns)
|
||||
{
|
||||
mm->mmap = NULL;
|
||||
mm->mm_rb = RB_ROOT;
|
||||
mm->vmacache_seqnum = 0;
|
||||
mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
|
||||
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
|
||||
atomic_set(&mm->mm_users, 1);
|
||||
atomic_set(&mm->mm_count, 1);
|
||||
seqcount_init(&mm->write_protect_seq);
|
||||
@@ -1152,6 +1149,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
|
||||
goto fail_nocontext;
|
||||
|
||||
mm->user_ns = get_user_ns(user_ns);
|
||||
lru_gen_init_mm(mm);
|
||||
return mm;
|
||||
|
||||
fail_nocontext:
|
||||
@@ -1194,6 +1192,7 @@ static inline void __mmput(struct mm_struct *mm)
|
||||
}
|
||||
if (mm->binfmt)
|
||||
module_put(mm->binfmt->module);
|
||||
lru_gen_del_mm(mm);
|
||||
mmdrop(mm);
|
||||
}
|
||||
|
||||
@@ -1285,13 +1284,16 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
|
||||
/* Forbid mm->exe_file change if old file still mapped. */
|
||||
old_exe_file = get_mm_exe_file(mm);
|
||||
if (old_exe_file) {
|
||||
VMA_ITERATOR(vmi, mm, 0);
|
||||
mmap_read_lock(mm);
|
||||
for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) {
|
||||
for_each_vma(vmi, vma) {
|
||||
if (!vma->vm_file)
|
||||
continue;
|
||||
if (path_equal(&vma->vm_file->f_path,
|
||||
&old_exe_file->f_path))
|
||||
&old_exe_file->f_path)) {
|
||||
ret = -EBUSY;
|
||||
break;
|
||||
}
|
||||
}
|
||||
mmap_read_unlock(mm);
|
||||
fput(old_exe_file);
|
||||
@@ -1566,9 +1568,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
|
||||
if (!oldmm)
|
||||
return 0;
|
||||
|
||||
/* initialize the new vmacache entries */
|
||||
vmacache_flush(tsk);
|
||||
|
||||
if (clone_flags & CLONE_VM) {
|
||||
mmget(oldmm);
|
||||
mm = oldmm;
|
||||
@@ -2693,6 +2692,13 @@ pid_t kernel_clone(struct kernel_clone_args *args)
|
||||
get_task_struct(p);
|
||||
}
|
||||
|
||||
if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
|
||||
/* lock the task to synchronize with memcg migration */
|
||||
task_lock(p);
|
||||
lru_gen_add_mm(p->mm);
|
||||
task_unlock(p);
|
||||
}
|
||||
|
||||
wake_up_new_task(p);
|
||||
|
||||
/* forking complete and child started to run, tell ptracer */
|
||||
|
||||
@@ -31,8 +31,8 @@ if [ "$building_out_of_srctree" ]; then
|
||||
fi
|
||||
all_dirs="$all_dirs $dir_list"
|
||||
|
||||
# include/generated/compile.h is ignored because it is touched even when none
|
||||
# of the source files changed.
|
||||
# include/generated/utsversion.h is ignored because it is generated after this
|
||||
# script is executed. (utsversion.h is unneeded for kheaders)
|
||||
#
|
||||
# When Kconfig regenerates include/generated/autoconf.h, its timestamp is
|
||||
# updated, but the contents might be still the same. When any CONFIG option is
|
||||
@@ -42,7 +42,7 @@ all_dirs="$all_dirs $dir_list"
|
||||
#
|
||||
# Ignore them for md5 calculation to avoid pointless regeneration.
|
||||
headers_md5="$(find $all_dirs -name "*.h" |
|
||||
grep -v "include/generated/compile.h" |
|
||||
grep -v "include/generated/utsversion.h" |
|
||||
grep -v "include/generated/autoconf.h" |
|
||||
xargs ls -l | md5sum | cut -d ' ' -f1)"
|
||||
|
||||
|
||||
@@ -705,6 +705,30 @@ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
|
||||
|
||||
/**
|
||||
* generic_handle_domain_irq_safe - Invoke the handler for a HW irq belonging
|
||||
* to a domain from any context.
|
||||
* @domain: The domain where to perform the lookup
|
||||
* @hwirq: The HW irq number to convert to a logical one
|
||||
*
|
||||
* Returns: 0 on success, a negative value on error.
|
||||
*
|
||||
* This function can be called from any context (IRQ or process
|
||||
* context). If the interrupt is marked as 'enforce IRQ-context only' then
|
||||
* the function must be invoked from hard interrupt context.
|
||||
*/
|
||||
int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
int ret;
|
||||
|
||||
/*
 * Disable local interrupts around the handler call and restore the
 * saved interrupt state afterwards.
 */
local_irq_save(flags);
|
||||
ret = handle_irq_desc(irq_resolve_mapping(domain, hwirq));
|
||||
local_irq_restore(flags);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe);
|
||||
|
||||
/**
|
||||
* generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging
|
||||
* to a domain.
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include <linux/fs.h>
|
||||
#include <linux/hashtable.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/kmsan-checks.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/preempt.h>
|
||||
#include <linux/printk.h>
|
||||
@@ -152,6 +153,12 @@ static void kcov_remote_area_put(struct kcov_remote_area *area,
|
||||
INIT_LIST_HEAD(&area->list);
|
||||
area->size = size;
|
||||
list_add(&area->list, &kcov_remote_areas);
|
||||
/*
|
||||
* KMSAN doesn't instrument this file, so it may not know area->list
|
||||
* is initialized. Unpoison it explicitly to avoid reports in
|
||||
* kcov_remote_area_get().
|
||||
*/
|
||||
kmsan_unpoison_memory(&area->list, sizeof(area->list));
|
||||
}
|
||||
|
||||
static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
|
||||
|
||||
@@ -26,7 +26,7 @@
|
||||
static bool __init test_requires(void)
|
||||
{
|
||||
/* random should be initialized for the below tests */
|
||||
return prandom_u32() + prandom_u32() != 0;
|
||||
return get_random_u32() + get_random_u32() != 0;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -46,7 +46,7 @@ static bool __init test_encode_decode(void)
|
||||
unsigned long addr;
|
||||
size_t verif_size;
|
||||
|
||||
prandom_bytes(&addr, sizeof(addr));
|
||||
get_random_bytes(&addr, sizeof(addr));
|
||||
if (addr < PAGE_SIZE)
|
||||
addr = PAGE_SIZE;
|
||||
|
||||
|
||||
@@ -93,13 +93,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
|
||||
|
||||
/*
|
||||
* Because we write directly to the reserved memory region when loading
|
||||
* crash kernels we need a mutex here to prevent multiple crash kernels
|
||||
* from attempting to load simultaneously, and to prevent a crash kernel
|
||||
* from loading over the top of a in use crash kernel.
|
||||
*
|
||||
* KISS: always take the mutex.
|
||||
* crash kernels we need a serialization here to prevent multiple crash
|
||||
* kernels from attempting to load simultaneously.
|
||||
*/
|
||||
if (!mutex_trylock(&kexec_mutex))
|
||||
if (!kexec_trylock())
|
||||
return -EBUSY;
|
||||
|
||||
if (flags & KEXEC_ON_CRASH) {
|
||||
@@ -165,7 +162,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
|
||||
|
||||
kimage_free(image);
|
||||
out_unlock:
|
||||
mutex_unlock(&kexec_mutex);
|
||||
kexec_unlock();
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@
|
||||
#include <crypto/hash.h>
|
||||
#include "kexec_internal.h"
|
||||
|
||||
DEFINE_MUTEX(kexec_mutex);
|
||||
atomic_t __kexec_lock = ATOMIC_INIT(0);
|
||||
|
||||
/* Per cpu memory for storing cpu states in case of system crash. */
|
||||
note_buf_t __percpu *crash_notes;
|
||||
@@ -809,7 +809,7 @@ static int kimage_load_normal_segment(struct kimage *image,
|
||||
if (result < 0)
|
||||
goto out;
|
||||
|
||||
ptr = kmap(page);
|
||||
ptr = kmap_local_page(page);
|
||||
/* Start with a clear page */
|
||||
clear_page(ptr);
|
||||
ptr += maddr & ~PAGE_MASK;
|
||||
@@ -822,7 +822,7 @@ static int kimage_load_normal_segment(struct kimage *image,
|
||||
memcpy(ptr, kbuf, uchunk);
|
||||
else
|
||||
result = copy_from_user(ptr, buf, uchunk);
|
||||
kunmap(page);
|
||||
kunmap_local(ptr);
|
||||
if (result) {
|
||||
result = -EFAULT;
|
||||
goto out;
|
||||
@@ -873,7 +873,7 @@ static int kimage_load_crash_segment(struct kimage *image,
|
||||
goto out;
|
||||
}
|
||||
arch_kexec_post_alloc_pages(page_address(page), 1, 0);
|
||||
ptr = kmap(page);
|
||||
ptr = kmap_local_page(page);
|
||||
ptr += maddr & ~PAGE_MASK;
|
||||
mchunk = min_t(size_t, mbytes,
|
||||
PAGE_SIZE - (maddr & ~PAGE_MASK));
|
||||
@@ -889,7 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image,
|
||||
else
|
||||
result = copy_from_user(ptr, buf, uchunk);
|
||||
kexec_flush_icache_page(page);
|
||||
kunmap(page);
|
||||
kunmap_local(ptr);
|
||||
arch_kexec_pre_free_pages(page_address(page), 1);
|
||||
if (result) {
|
||||
result = -EFAULT;
|
||||
@@ -959,7 +959,7 @@ late_initcall(kexec_core_sysctl_init);
|
||||
*/
|
||||
void __noclone __crash_kexec(struct pt_regs *regs)
|
||||
{
|
||||
/* Take the kexec_mutex here to prevent sys_kexec_load
|
||||
/* Take the kexec_lock here to prevent sys_kexec_load
|
||||
* running on one cpu from replacing the crash kernel
|
||||
* we are using after a panic on a different cpu.
|
||||
*
|
||||
@@ -967,7 +967,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
|
||||
* of memory the xchg(&kexec_crash_image) would be
|
||||
* sufficient. But since I reuse the memory...
|
||||
*/
|
||||
if (mutex_trylock(&kexec_mutex)) {
|
||||
if (kexec_trylock()) {
|
||||
if (kexec_crash_image) {
|
||||
struct pt_regs fixed_regs;
|
||||
|
||||
@@ -976,7 +976,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
|
||||
machine_crash_shutdown(&fixed_regs);
|
||||
machine_kexec(kexec_crash_image);
|
||||
}
|
||||
mutex_unlock(&kexec_mutex);
|
||||
kexec_unlock();
|
||||
}
|
||||
}
|
||||
STACK_FRAME_NON_STANDARD(__crash_kexec);
|
||||
@@ -1004,14 +1004,17 @@ void crash_kexec(struct pt_regs *regs)
|
||||
}
|
||||
}
|
||||
|
||||
size_t crash_get_memory_size(void)
|
||||
ssize_t crash_get_memory_size(void)
|
||||
{
|
||||
size_t size = 0;
|
||||
ssize_t size = 0;
|
||||
|
||||
if (!kexec_trylock())
|
||||
return -EBUSY;
|
||||
|
||||
mutex_lock(&kexec_mutex);
|
||||
if (crashk_res.end != crashk_res.start)
|
||||
size = resource_size(&crashk_res);
|
||||
mutex_unlock(&kexec_mutex);
|
||||
|
||||
kexec_unlock();
|
||||
return size;
|
||||
}
|
||||
|
||||
@@ -1022,7 +1025,8 @@ int crash_shrink_memory(unsigned long new_size)
|
||||
unsigned long old_size;
|
||||
struct resource *ram_res;
|
||||
|
||||
mutex_lock(&kexec_mutex);
|
||||
if (!kexec_trylock())
|
||||
return -EBUSY;
|
||||
|
||||
if (kexec_crash_image) {
|
||||
ret = -ENOENT;
|
||||
@@ -1060,7 +1064,7 @@ int crash_shrink_memory(unsigned long new_size)
|
||||
insert_resource(&iomem_resource, ram_res);
|
||||
|
||||
unlock:
|
||||
mutex_unlock(&kexec_mutex);
|
||||
kexec_unlock();
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -1132,7 +1136,7 @@ int kernel_kexec(void)
|
||||
{
|
||||
int error = 0;
|
||||
|
||||
if (!mutex_trylock(&kexec_mutex))
|
||||
if (!kexec_trylock())
|
||||
return -EBUSY;
|
||||
if (!kexec_image) {
|
||||
error = -EINVAL;
|
||||
@@ -1208,6 +1212,6 @@ int kernel_kexec(void)
|
||||
#endif
|
||||
|
||||
Unlock:
|
||||
mutex_unlock(&kexec_mutex);
|
||||
kexec_unlock();
|
||||
return error;
|
||||
}
|
||||
|
||||
@@ -339,7 +339,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
|
||||
|
||||
image = NULL;
|
||||
|
||||
if (!mutex_trylock(&kexec_mutex))
|
||||
if (!kexec_trylock())
|
||||
return -EBUSY;
|
||||
|
||||
dest_image = &kexec_image;
|
||||
@@ -411,7 +411,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
|
||||
if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
|
||||
arch_kexec_protect_crashkres();
|
||||
|
||||
mutex_unlock(&kexec_mutex);
|
||||
kexec_unlock();
|
||||
kimage_free(image);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -13,7 +13,20 @@ void kimage_terminate(struct kimage *image);
|
||||
int kimage_is_destination_range(struct kimage *image,
|
||||
unsigned long start, unsigned long end);
|
||||
|
||||
extern struct mutex kexec_mutex;
|
||||
/*
|
||||
* Whatever is used to serialize accesses to the kexec_crash_image needs to be
|
||||
* NMI safe, as __crash_kexec() can happen during nmi_panic(), so here we use a
|
||||
* "simple" atomic variable that is acquired with a cmpxchg().
|
||||
*/
|
||||
extern atomic_t __kexec_lock;
|
||||
/*
 * Try to take the kexec lock with a single acquire-ordered cmpxchg of
 * __kexec_lock from 0 to 1. This never waits: it returns true if the lock
 * was taken and false if the value was not 0 (lock already held).
 */
static inline bool kexec_trylock(void)
|
||||
{
|
||||
return atomic_cmpxchg_acquire(&__kexec_lock, 0, 1) == 0;
|
||||
}
|
||||
/*
 * Release the kexec lock by storing 0 to __kexec_lock with release
 * ordering. No ownership check is done here; it is meant to follow a
 * successful kexec_trylock().
 */
static inline void kexec_unlock(void)
|
||||
{
|
||||
atomic_set_release(&__kexec_lock, 0);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_KEXEC_FILE
|
||||
#include <linux/purgatory.h>
|
||||
|
||||
@@ -105,7 +105,12 @@ KERNEL_ATTR_RO(kexec_crash_loaded);
|
||||
static ssize_t kexec_crash_size_show(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
return sprintf(buf, "%zu\n", crash_get_memory_size());
|
||||
ssize_t size = crash_get_memory_size();
|
||||
|
||||
if (size < 0)
|
||||
return size;
|
||||
|
||||
return sprintf(buf, "%zd\n", size);
|
||||
}
|
||||
static ssize_t kexec_crash_size_store(struct kobject *kobj,
|
||||
struct kobj_attribute *attr,
|
||||
|
||||
@@ -112,7 +112,7 @@ static void __sched
|
||||
account_global_scheduler_latency(struct task_struct *tsk,
|
||||
struct latency_record *lat)
|
||||
{
|
||||
int firstnonnull = MAXLR + 1;
|
||||
int firstnonnull = MAXLR;
|
||||
int i;
|
||||
|
||||
/* skip kernel threads for now */
|
||||
@@ -150,7 +150,7 @@ account_global_scheduler_latency(struct task_struct *tsk,
|
||||
}
|
||||
|
||||
i = firstnonnull;
|
||||
if (i >= MAXLR - 1)
|
||||
if (i >= MAXLR)
|
||||
return;
|
||||
|
||||
/* Allocated a new one: */
|
||||
|
||||
@@ -325,6 +325,7 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
|
||||
* /sys/kernel/livepatch/<patch>/transition
|
||||
* /sys/kernel/livepatch/<patch>/force
|
||||
* /sys/kernel/livepatch/<patch>/<object>
|
||||
* /sys/kernel/livepatch/<patch>/<object>/patched
|
||||
* /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
|
||||
*/
|
||||
static int __klp_disable_patch(struct klp_patch *patch);
|
||||
@@ -431,6 +432,22 @@ static struct attribute *klp_patch_attrs[] = {
|
||||
};
|
||||
ATTRIBUTE_GROUPS(klp_patch);
|
||||
|
||||
/*
 * sysfs show callback for the per-object "patched" attribute.
 * Recovers the klp_object that embeds @kobj and writes its 'patched'
 * field to @buf as a decimal number followed by a newline.
 * Returns whatever sysfs_emit() returns.
 */
static ssize_t patched_show(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct klp_object *obj;
|
||||
|
||||
obj = container_of(kobj, struct klp_object, kobj);
|
||||
return sysfs_emit(buf, "%d\n", obj->patched);
|
||||
}
|
||||
|
||||
/*
 * Read-only "patched" attribute (served by patched_show()) and the
 * NULL-terminated attribute list for klp_object kobjects.
 * ATTRIBUTE_GROUPS() presumably generates klp_object_groups from this
 * list - confirm against the macro definition.
 */
static struct kobj_attribute patched_kobj_attr = __ATTR_RO(patched);
|
||||
static struct attribute *klp_object_attrs[] = {
|
||||
&patched_kobj_attr.attr,
|
||||
NULL,
|
||||
};
|
||||
ATTRIBUTE_GROUPS(klp_object);
|
||||
|
||||
static void klp_free_object_dynamic(struct klp_object *obj)
|
||||
{
|
||||
kfree(obj->name);
|
||||
@@ -576,6 +593,7 @@ static void klp_kobj_release_object(struct kobject *kobj)
|
||||
/*
 * kobject type for klp_object: release callback, the generic kobject
 * sysfs operations, and klp_object_groups as the default attribute groups.
 */
static struct kobj_type klp_ktype_object = {
|
||||
.release = klp_kobj_release_object,
|
||||
.sysfs_ops = &kobj_sysfs_ops,
|
||||
.default_groups = klp_object_groups,
|
||||
};
|
||||
|
||||
static void klp_kobj_release_func(struct kobject *kobj)
|
||||
@@ -1171,7 +1189,7 @@ int klp_module_coming(struct module *mod)
|
||||
return -EINVAL;
|
||||
|
||||
if (!strcmp(mod->name, "vmlinux")) {
|
||||
pr_err("vmlinux.ko: invalid module name");
|
||||
pr_err("vmlinux.ko: invalid module name\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
|
||||
@@ -610,9 +610,23 @@ void klp_reverse_transition(void)
|
||||
/* Called from copy_process() during fork */
|
||||
void klp_copy_process(struct task_struct *child)
|
||||
{
|
||||
child->patch_state = current->patch_state;
|
||||
|
||||
/* TIF_PATCH_PENDING gets copied in setup_thread_stack() */
|
||||
/*
|
||||
* The parent process may have gone through a KLP transition since
|
||||
* the thread flag was copied in setup_thread_stack earlier. Bring
|
||||
* the task flag up to date with the parent here.
|
||||
*
|
||||
* The operation is serialized against all klp_*_transition()
|
||||
* operations by the tasklist_lock. The only exception is
|
||||
* klp_update_patch_state(current), but we cannot race with
|
||||
* that because we are current.
|
||||
*/
|
||||
if (test_tsk_thread_flag(current, TIF_PATCH_PENDING))
|
||||
set_tsk_thread_flag(child, TIF_PATCH_PENDING);
|
||||
else
|
||||
clear_tsk_thread_flag(child, TIF_PATCH_PENDING);
|
||||
|
||||
child->patch_state = current->patch_state;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -5,8 +5,9 @@ KCOV_INSTRUMENT := n
|
||||
|
||||
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
|
||||
|
||||
# Avoid recursion lockdep -> KCSAN -> ... -> lockdep.
|
||||
# Avoid recursion lockdep -> sanitizer -> ... -> lockdep.
|
||||
KCSAN_SANITIZE_lockdep.o := n
|
||||
KMSAN_SANITIZE_lockdep.o := n
|
||||
|
||||
ifdef CONFIG_FUNCTION_TRACER
|
||||
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
|
||||
|
||||
@@ -399,7 +399,7 @@ static int *get_random_order(int count)
|
||||
order[n] = n;
|
||||
|
||||
for (n = count - 1; n > 1; n--) {
|
||||
r = get_random_int() % (n + 1);
|
||||
r = prandom_u32_max(n + 1);
|
||||
if (r != n) {
|
||||
tmp = order[n];
|
||||
order[n] = order[r];
|
||||
@@ -538,7 +538,7 @@ static void stress_one_work(struct work_struct *work)
|
||||
{
|
||||
struct stress *stress = container_of(work, typeof(*stress), work);
|
||||
const int nlocks = stress->nlocks;
|
||||
struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks);
|
||||
struct ww_mutex *lock = stress->locks + prandom_u32_max(nlocks);
|
||||
int err;
|
||||
|
||||
do {
|
||||
|
||||
@@ -256,7 +256,7 @@ void module_decompress_cleanup(struct load_info *info)
|
||||
static ssize_t compression_show(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION));
|
||||
return sysfs_emit(buf, __stringify(MODULE_COMPRESSION) "\n");
|
||||
}
|
||||
|
||||
static struct kobj_attribute module_compression_attr = __ATTR_RO(compression);
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <linux/printk.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/rculist.h>
|
||||
#include "internal.h"
|
||||
|
||||
@@ -21,6 +22,9 @@ int try_add_tainted_module(struct module *mod)
|
||||
|
||||
module_assert_mutex_or_preempt();
|
||||
|
||||
if (!mod->taints)
|
||||
goto out;
|
||||
|
||||
list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules, list,
|
||||
lockdep_is_held(&module_mutex)) {
|
||||
if (!strcmp(mod_taint->name, mod->name) &&
|
||||
@@ -59,3 +63,70 @@ void print_unloaded_tainted_modules(void)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_DEBUG_FS
|
||||
/*
 * seq_file ->start callback: take the RCU read lock and look up the
 * entry at position *pos in the unloaded_tainted_modules list.
 * The RCU read lock is dropped in unloaded_tainted_modules_seq_stop().
 */
static void *unloaded_tainted_modules_seq_start(struct seq_file *m, loff_t *pos)
|
||||
__acquires(rcu)
|
||||
{
|
||||
rcu_read_lock();
|
||||
return seq_list_start_rcu(&unloaded_tainted_modules, *pos);
|
||||
}
|
||||
|
||||
/*
 * seq_file ->next callback: advance from entry @p to the following entry
 * of the unloaded_tainted_modules list via seq_list_next_rcu(), which is
 * also handed @pos.
 */
static void *unloaded_tainted_modules_seq_next(struct seq_file *m, void *p, loff_t *pos)
|
||||
{
|
||||
return seq_list_next_rcu(p, &unloaded_tainted_modules, pos);
|
||||
}
|
||||
|
||||
/*
 * seq_file ->stop callback: drop the RCU read lock taken in
 * unloaded_tainted_modules_seq_start().
 */
static void unloaded_tainted_modules_seq_stop(struct seq_file *m, void *p)
|
||||
__releases(rcu)
|
||||
{
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/*
 * seq_file ->show callback: print one list entry as
 * "<name> (<taint flags string>) <count>" followed by a newline.
 * @p is the list node of a struct mod_unload_taint. Always returns 0.
 */
static int unloaded_tainted_modules_seq_show(struct seq_file *m, void *p)
|
||||
{
|
||||
struct mod_unload_taint *mod_taint;
|
||||
char buf[MODULE_FLAGS_BUF_SIZE];
|
||||
size_t l;
|
||||
|
||||
mod_taint = list_entry(p, struct mod_unload_taint, list);
|
||||
l = module_flags_taint(mod_taint->taints, buf);
|
||||
/*
 * NUL-terminate buf at offset l; assumes module_flags_taint() returns
 * the number of characters it wrote - TODO confirm.
 */
buf[l++] = '\0';
|
||||
|
||||
seq_printf(m, "%s (%s) %llu", mod_taint->name, buf, mod_taint->count);
|
||||
seq_puts(m, "\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* seq_file iterator callbacks for walking the unloaded_tainted_modules list. */
static const struct seq_operations unloaded_tainted_modules_seq_ops = {
|
||||
.start = unloaded_tainted_modules_seq_start,
|
||||
.next = unloaded_tainted_modules_seq_next,
|
||||
.stop = unloaded_tainted_modules_seq_stop,
|
||||
.show = unloaded_tainted_modules_seq_show,
|
||||
};
|
||||
|
||||
/*
 * ->open callback: set up a seq_file on @file that uses
 * unloaded_tainted_modules_seq_ops. Returns the seq_open() result.
 */
static int unloaded_tainted_modules_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return seq_open(file, &unloaded_tainted_modules_seq_ops);
|
||||
}
|
||||
|
||||
/*
 * File operations for the debugfs file: a custom open that installs the
 * seq_file iterator, with the stock seq_file read/llseek/release helpers.
 */
static const struct file_operations unloaded_tainted_modules_fops = {
|
||||
.open = unloaded_tainted_modules_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release,
|
||||
};
|
||||
|
||||
/*
 * Init-time setup: create a debugfs directory named "modules" (parent
 * NULL) and, inside it, a read-only (0444) file "unloaded_tainted" backed
 * by unloaded_tainted_modules_fops. The results of the debugfs calls are
 * not checked; this always returns 0.
 */
static int __init unloaded_tainted_modules_init(void)
|
||||
{
|
||||
struct dentry *dir;
|
||||
|
||||
dir = debugfs_create_dir("modules", NULL);
|
||||
debugfs_create_file("unloaded_tainted", 0444, dir, NULL,
|
||||
&unloaded_tainted_modules_fops);
|
||||
|
||||
return 0;
|
||||
}
|
||||
module_init(unloaded_tainted_modules_init);
|
||||
#endif /* CONFIG_DEBUG_FS */
|
||||
|
||||
@@ -519,6 +519,7 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
|
||||
{
|
||||
return idr_get_next(&ns->idr, &nr);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(find_ge_pid);
|
||||
|
||||
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
|
||||
{
|
||||
|
||||
@@ -79,13 +79,20 @@ int oops_in_progress;
|
||||
EXPORT_SYMBOL(oops_in_progress);
|
||||
|
||||
/*
|
||||
* console_sem protects the console_drivers list, and also
|
||||
* provides serialisation for access to the entire console
|
||||
* driver system.
|
||||
* console_mutex protects console_list updates and console->flags updates.
|
||||
* The flags are synchronized only for consoles that are registered, i.e.
|
||||
* accessible via the console list.
|
||||
*/
|
||||
static DEFINE_MUTEX(console_mutex);
|
||||
|
||||
/*
|
||||
* console_sem protects updates to console->seq and console_suspended,
|
||||
* and also provides serialization for console printing.
|
||||
*/
|
||||
static DEFINE_SEMAPHORE(console_sem);
|
||||
struct console *console_drivers;
|
||||
EXPORT_SYMBOL_GPL(console_drivers);
|
||||
HLIST_HEAD(console_list);
|
||||
EXPORT_SYMBOL_GPL(console_list);
|
||||
DEFINE_STATIC_SRCU(console_srcu);
|
||||
|
||||
/*
|
||||
* System may need to suppress printk message under certain
|
||||
@@ -103,6 +110,19 @@ static int __read_mostly suppress_panic_printk;
|
||||
static struct lockdep_map console_lock_dep_map = {
|
||||
.name = "console_lock"
|
||||
};
|
||||
|
||||
/*
 * Lockdep assertion that console_mutex (the lock taken by
 * console_list_lock()) is held. Exported so code outside this file can
 * make the same check without access to the static mutex.
 */
void lockdep_assert_console_list_lock_held(void)
|
||||
{
|
||||
lockdep_assert_held(&console_mutex);
|
||||
}
|
||||
EXPORT_SYMBOL(lockdep_assert_console_list_lock_held);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
/*
 * Report whether srcu_read_lock_held() considers the console_srcu read
 * lock to be held. Only built when CONFIG_DEBUG_LOCK_ALLOC is set.
 */
bool console_srcu_read_lock_is_held(void)
|
||||
{
|
||||
return srcu_read_lock_held(&console_srcu);
|
||||
}
|
||||
#endif
|
||||
|
||||
enum devkmsg_log_bits {
|
||||
@@ -220,6 +240,69 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
|
||||
}
|
||||
#endif /* CONFIG_PRINTK && CONFIG_SYSCTL */
|
||||
|
||||
/**
|
||||
* console_list_lock - Lock the console list
|
||||
*
|
||||
* For console list or console->flags updates
|
||||
*
* Takes console_mutex, so the caller may sleep. Must not be called while
* holding the console SRCU read lock; a one-time warning is issued when
* that can be detected. Release with console_list_unlock().
*/
|
||||
void console_list_lock(void)
|
||||
{
|
||||
/*
|
||||
* In unregister_console() and console_force_preferred_locked(),
|
||||
* synchronize_srcu() is called with the console_list_lock held.
|
||||
* Therefore it is not allowed that the console_list_lock is taken
|
||||
* with the srcu_lock held.
|
||||
*
|
||||
* Detecting if this context is really in the read-side critical
|
||||
* section is only possible if the appropriate debug options are
|
||||
* enabled.
|
||||
*/
|
||||
WARN_ON_ONCE(debug_lockdep_rcu_enabled() &&
|
||||
srcu_read_lock_held(&console_srcu));
|
||||
|
||||
mutex_lock(&console_mutex);
|
||||
}
|
||||
EXPORT_SYMBOL(console_list_lock);
|
||||
|
||||
/**
|
||||
* console_list_unlock - Unlock the console list
|
||||
*
|
||||
* Counterpart to console_list_lock(): releases console_mutex.
|
||||
*/
|
||||
void console_list_unlock(void)
|
||||
{
|
||||
mutex_unlock(&console_mutex);
|
||||
}
|
||||
EXPORT_SYMBOL(console_list_unlock);
|
||||
|
||||
/**
|
||||
* console_srcu_read_lock - Register a new reader for the
|
||||
* SRCU-protected console list
|
||||
*
|
||||
* Use for_each_console_srcu() to iterate the console list
|
||||
*
|
||||
* Implemented with the NMI-safe SRCU read-lock variant on console_srcu.
*
* Context: Any context.
|
||||
* Return: A cookie to pass to console_srcu_read_unlock().
|
||||
*/
|
||||
int console_srcu_read_lock(void)
|
||||
{
|
||||
return srcu_read_lock_nmisafe(&console_srcu);
|
||||
}
|
||||
EXPORT_SYMBOL(console_srcu_read_lock);
|
||||
|
||||
/**
|
||||
* console_srcu_read_unlock - Unregister an old reader from
|
||||
* the SRCU-protected console list
|
||||
* @cookie: cookie returned from console_srcu_read_lock()
|
||||
*
|
||||
* Counterpart to console_srcu_read_lock(); uses the matching NMI-safe
* SRCU read-unlock variant on console_srcu.
|
||||
*/
|
||||
void console_srcu_read_unlock(int cookie)
|
||||
{
|
||||
srcu_read_unlock_nmisafe(&console_srcu, cookie);
|
||||
}
|
||||
EXPORT_SYMBOL(console_srcu_read_unlock);
|
||||
|
||||
/*
|
||||
* Helper macros to handle lockdep when locking/unlocking console_sem. We use
|
||||
* macros instead of functions so that _RET_IP_ contains useful information.
|
||||
@@ -1814,13 +1897,13 @@ static void console_lock_spinning_enable(void)
|
||||
* safe to start busy waiting for the lock. Second, it checks if
|
||||
* there is a busy waiter and passes the lock rights to her.
|
||||
*
|
||||
* Important: Callers lose the lock if there was a busy waiter.
|
||||
* They must not touch items synchronized by console_lock
|
||||
* in this case.
|
||||
* Important: Callers lose both the console_lock and the SRCU read lock if
|
||||
* there was a busy waiter. They must not touch items synchronized by
|
||||
* console_lock or SRCU read lock in this case.
|
||||
*
|
||||
* Return: 1 if the lock rights were passed, 0 otherwise.
|
||||
*/
|
||||
static int console_lock_spinning_disable_and_check(void)
|
||||
static int console_lock_spinning_disable_and_check(int cookie)
|
||||
{
|
||||
int waiter;
|
||||
|
||||
@@ -1839,6 +1922,12 @@ static int console_lock_spinning_disable_and_check(void)
|
||||
|
||||
spin_release(&console_owner_dep_map, _THIS_IP_);
|
||||
|
||||
/*
|
||||
* Preserve lockdep lock ordering. Release the SRCU read lock before
|
||||
* releasing the console_lock.
|
||||
*/
|
||||
console_srcu_read_unlock(cookie);
|
||||
|
||||
/*
|
||||
* Hand off console_lock to waiter. The waiter will perform
|
||||
* the up(). After this, the waiter is the console_lock owner.
|
||||
@@ -2322,7 +2411,7 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
|
||||
char *text, size_t text_len,
|
||||
struct dev_printk_info *dev_info) { return 0; }
|
||||
static void console_lock_spinning_enable(void) { }
|
||||
static int console_lock_spinning_disable_and_check(void) { return 0; }
|
||||
static int console_lock_spinning_disable_and_check(int cookie) { return 0; }
|
||||
static void call_console_driver(struct console *con, const char *text, size_t len,
|
||||
char *dropped_text)
|
||||
{
|
||||
@@ -2553,10 +2642,10 @@ static int console_cpu_notify(unsigned int cpu)
|
||||
}
|
||||
|
||||
/**
|
||||
* console_lock - lock the console system for exclusive use.
|
||||
* console_lock - block the console subsystem from printing
|
||||
*
|
||||
* Acquires a lock which guarantees that the caller has
|
||||
* exclusive access to the console system and the console_drivers list.
|
||||
* Acquires a lock which guarantees that no consoles will
|
||||
* be in or enter their write() callback.
|
||||
*
|
||||
* Can sleep, returns nothing.
|
||||
*/
|
||||
@@ -2573,10 +2662,10 @@ void console_lock(void)
|
||||
EXPORT_SYMBOL(console_lock);
|
||||
|
||||
/**
|
||||
* console_trylock - try to lock the console system for exclusive use.
|
||||
* console_trylock - try to block the console subsystem from printing
|
||||
*
|
||||
* Try to acquire a lock which guarantees that the caller has exclusive
|
||||
* access to the console system and the console_drivers list.
|
||||
* Try to acquire a lock which guarantees that no consoles will
|
||||
* be in or enter their write() callback.
|
||||
*
|
||||
* returns 1 on success, and 0 on failure to acquire the lock.
|
||||
*/
|
||||
@@ -2623,11 +2712,13 @@ static bool abandon_console_lock_in_panic(void)
|
||||
* Check if the given console is currently capable and allowed to print
|
||||
* records.
|
||||
*
|
||||
* Requires the console_lock.
|
||||
* Requires the console_srcu_read_lock.
|
||||
*/
|
||||
static inline bool console_is_usable(struct console *con)
|
||||
{
|
||||
if (!(con->flags & CON_ENABLED))
|
||||
short flags = console_srcu_read_flags(con);
|
||||
|
||||
if (!(flags & CON_ENABLED))
|
||||
return false;
|
||||
|
||||
if (!con->write)
|
||||
@@ -2638,8 +2729,7 @@ static inline bool console_is_usable(struct console *con)
|
||||
* allocated. So unless they're explicitly marked as being able to
|
||||
* cope (CON_ANYTIME) don't call them until this CPU is officially up.
|
||||
*/
|
||||
if (!cpu_online(raw_smp_processor_id()) &&
|
||||
!(con->flags & CON_ANYTIME))
|
||||
if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
@@ -2664,16 +2754,18 @@ static void __console_unlock(void)
|
||||
* DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL.
|
||||
*
|
||||
* @handover will be set to true if a printk waiter has taken over the
|
||||
* console_lock, in which case the caller is no longer holding the
|
||||
* console_lock. Otherwise it is set to false.
|
||||
* console_lock, in which case the caller is no longer holding both the
|
||||
* console_lock and the SRCU read lock. Otherwise it is set to false.
|
||||
*
|
||||
* @cookie is the cookie from the SRCU read lock.
|
||||
*
|
||||
* Returns false if the given console has no next record to print, otherwise
|
||||
* true.
|
||||
*
|
||||
* Requires the console_lock.
|
||||
* Requires the console_lock and the SRCU read lock.
|
||||
*/
|
||||
static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
|
||||
char *dropped_text, bool *handover)
|
||||
char *dropped_text, bool *handover, int cookie)
|
||||
{
|
||||
static int panic_console_dropped;
|
||||
struct printk_info info;
|
||||
@@ -2733,7 +2825,7 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_
|
||||
|
||||
con->seq++;
|
||||
|
||||
*handover = console_lock_spinning_disable_and_check();
|
||||
*handover = console_lock_spinning_disable_and_check(cookie);
|
||||
printk_safe_exit_irqrestore(flags);
|
||||
skip:
|
||||
return true;
|
||||
@@ -2770,6 +2862,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
|
||||
bool any_usable = false;
|
||||
struct console *con;
|
||||
bool any_progress;
|
||||
int cookie;
|
||||
|
||||
*next_seq = 0;
|
||||
*handover = false;
|
||||
@@ -2777,23 +2870,29 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
|
||||
do {
|
||||
any_progress = false;
|
||||
|
||||
for_each_console(con) {
|
||||
cookie = console_srcu_read_lock();
|
||||
for_each_console_srcu(con) {
|
||||
bool progress;
|
||||
|
||||
if (!console_is_usable(con))
|
||||
continue;
|
||||
any_usable = true;
|
||||
|
||||
if (con->flags & CON_EXTENDED) {
|
||||
if (console_srcu_read_flags(con) & CON_EXTENDED) {
|
||||
/* Extended consoles do not print "dropped messages". */
|
||||
progress = console_emit_next_record(con, &text[0],
|
||||
&ext_text[0], NULL,
|
||||
handover);
|
||||
handover, cookie);
|
||||
} else {
|
||||
progress = console_emit_next_record(con, &text[0],
|
||||
NULL, &dropped_text[0],
|
||||
handover);
|
||||
handover, cookie);
|
||||
}
|
||||
|
||||
/*
|
||||
* If a handover has occurred, the SRCU read lock
|
||||
* is already released.
|
||||
*/
|
||||
if (*handover)
|
||||
return false;
|
||||
|
||||
@@ -2807,21 +2906,26 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
|
||||
|
||||
/* Allow panic_cpu to take over the consoles safely. */
|
||||
if (abandon_console_lock_in_panic())
|
||||
return false;
|
||||
goto abandon;
|
||||
|
||||
if (do_cond_resched)
|
||||
cond_resched();
|
||||
}
|
||||
console_srcu_read_unlock(cookie);
|
||||
} while (any_progress);
|
||||
|
||||
return any_usable;
|
||||
|
||||
abandon:
|
||||
console_srcu_read_unlock(cookie);
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* console_unlock - unlock the console system
|
||||
* console_unlock - unblock the console subsystem from printing
|
||||
*
|
||||
* Releases the console_lock which the caller holds on the console system
|
||||
* and the console driver list.
|
||||
* Releases the console_lock which the caller holds to block printing of
|
||||
* the console subsystem.
|
||||
*
|
||||
* While the console_lock was held, console output may have been buffered
|
||||
* by printk(). If this is the case, console_unlock(); emits
|
||||
@@ -2899,10 +3003,14 @@ EXPORT_SYMBOL(console_conditional_schedule);
|
||||
void console_unblank(void)
|
||||
{
|
||||
struct console *c;
|
||||
int cookie;
|
||||
|
||||
/*
|
||||
* console_unblank can no longer be called in interrupt context unless
|
||||
* oops_in_progress is set to 1..
|
||||
* Stop console printing because the unblank() callback may
|
||||
* assume the console is not within its write() callback.
|
||||
*
|
||||
* If @oops_in_progress is set, this may be an atomic context.
|
||||
* In that case, attempt a trylock as best-effort.
|
||||
*/
|
||||
if (oops_in_progress) {
|
||||
if (down_trylock_console_sem() != 0)
|
||||
@@ -2912,9 +3020,14 @@ void console_unblank(void)
|
||||
|
||||
console_locked = 1;
|
||||
console_may_schedule = 0;
|
||||
for_each_console(c)
|
||||
if ((c->flags & CON_ENABLED) && c->unblank)
|
||||
|
||||
cookie = console_srcu_read_lock();
|
||||
for_each_console_srcu(c) {
|
||||
if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank)
|
||||
c->unblank();
|
||||
}
|
||||
console_srcu_read_unlock(cookie);
|
||||
|
||||
console_unlock();
|
||||
|
||||
if (!oops_in_progress)
|
||||
@@ -2941,11 +3054,21 @@ void console_flush_on_panic(enum con_flush_mode mode)
|
||||
|
||||
if (mode == CONSOLE_REPLAY_ALL) {
|
||||
struct console *c;
|
||||
int cookie;
|
||||
u64 seq;
|
||||
|
||||
seq = prb_first_valid_seq(prb);
|
||||
for_each_console(c)
|
||||
|
||||
cookie = console_srcu_read_lock();
|
||||
for_each_console_srcu(c) {
|
||||
/*
|
||||
* If the above console_trylock() failed, this is an
|
||||
* unsynchronized assignment. But in that case, the
|
||||
* kernel is in "hope and pray" mode anyway.
|
||||
*/
|
||||
c->seq = seq;
|
||||
}
|
||||
console_srcu_read_unlock(cookie);
|
||||
}
|
||||
console_unlock();
|
||||
}
|
||||
@@ -2957,15 +3080,25 @@ struct tty_driver *console_device(int *index)
|
||||
{
|
||||
struct console *c;
|
||||
struct tty_driver *driver = NULL;
|
||||
int cookie;
|
||||
|
||||
/*
|
||||
* Take console_lock to serialize device() callback with
|
||||
* other console operations. For example, fg_console is
|
||||
* modified under console_lock when switching vt.
|
||||
*/
|
||||
console_lock();
|
||||
for_each_console(c) {
|
||||
|
||||
cookie = console_srcu_read_lock();
|
||||
for_each_console_srcu(c) {
|
||||
if (!c->device)
|
||||
continue;
|
||||
driver = c->device(c, index);
|
||||
if (driver)
|
||||
break;
|
||||
}
|
||||
console_srcu_read_unlock(cookie);
|
||||
|
||||
console_unlock();
|
||||
return driver;
|
||||
}
|
||||
@@ -2978,17 +3111,25 @@ struct tty_driver *console_device(int *index)
|
||||
void console_stop(struct console *console)
|
||||
{
|
||||
__pr_flush(console, 1000, true);
|
||||
console_lock();
|
||||
console->flags &= ~CON_ENABLED;
|
||||
console_unlock();
|
||||
console_list_lock();
|
||||
console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
|
||||
console_list_unlock();
|
||||
|
||||
/*
|
||||
* Ensure that all SRCU list walks have completed. All contexts must
|
||||
* be able to see that this console is disabled so that (for example)
|
||||
* the caller can suspend the port without risk of another context
|
||||
* using the port.
|
||||
*/
|
||||
synchronize_srcu(&console_srcu);
|
||||
}
|
||||
EXPORT_SYMBOL(console_stop);
|
||||
|
||||
void console_start(struct console *console)
|
||||
{
|
||||
console_lock();
|
||||
console->flags |= CON_ENABLED;
|
||||
console_unlock();
|
||||
console_list_lock();
|
||||
console_srcu_write_flags(console, console->flags | CON_ENABLED);
|
||||
console_list_unlock();
|
||||
__pr_flush(console, 1000, true);
|
||||
}
|
||||
EXPORT_SYMBOL(console_start);
|
||||
@@ -3081,6 +3222,72 @@ static void try_enable_default_console(struct console *newcon)
|
||||
(con->flags & CON_BOOT) ? "boot" : "", \
|
||||
con->name, con->index, ##__VA_ARGS__)
|
||||
|
||||
static void console_init_seq(struct console *newcon, bool bootcon_registered)
|
||||
{
|
||||
struct console *con;
|
||||
bool handover;
|
||||
|
||||
if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) {
|
||||
/* Get a consistent copy of @syslog_seq. */
|
||||
mutex_lock(&syslog_lock);
|
||||
newcon->seq = syslog_seq;
|
||||
mutex_unlock(&syslog_lock);
|
||||
} else {
|
||||
/* Begin with next message added to ringbuffer. */
|
||||
newcon->seq = prb_next_seq(prb);
|
||||
|
||||
/*
|
||||
* If any enabled boot consoles are due to be unregistered
|
||||
* shortly, some may not be caught up and may be the same
|
||||
* device as @newcon. Since it is not known which boot console
|
||||
* is the same device, flush all consoles and, if necessary,
|
||||
* start with the message of the enabled boot console that is
|
||||
* the furthest behind.
|
||||
*/
|
||||
if (bootcon_registered && !keep_bootcon) {
|
||||
/*
|
||||
* Hold the console_lock to stop console printing and
|
||||
* guarantee safe access to console->seq.
|
||||
*/
|
||||
console_lock();
|
||||
|
||||
/*
|
||||
* Flush all consoles and set the console to start at
|
||||
* the next unprinted sequence number.
|
||||
*/
|
||||
if (!console_flush_all(true, &newcon->seq, &handover)) {
|
||||
/*
|
||||
* Flushing failed. Just choose the lowest
|
||||
* sequence of the enabled boot consoles.
|
||||
*/
|
||||
|
||||
/*
|
||||
* If there was a handover, this context no
|
||||
* longer holds the console_lock.
|
||||
*/
|
||||
if (handover)
|
||||
console_lock();
|
||||
|
||||
newcon->seq = prb_next_seq(prb);
|
||||
for_each_console(con) {
|
||||
if ((con->flags & CON_BOOT) &&
|
||||
(con->flags & CON_ENABLED) &&
|
||||
con->seq < newcon->seq) {
|
||||
newcon->seq = con->seq;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console_unlock();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#define console_first() \
|
||||
hlist_entry(console_list.first, struct console, node)
|
||||
|
||||
static int unregister_console_locked(struct console *console);
|
||||
|
||||
/*
|
||||
* The console driver calls this routine during kernel initialization
|
||||
* to register the console printing procedure with printk() and to
|
||||
@@ -3103,28 +3310,29 @@ static void try_enable_default_console(struct console *newcon)
|
||||
void register_console(struct console *newcon)
|
||||
{
|
||||
struct console *con;
|
||||
bool bootcon_enabled = false;
|
||||
bool realcon_enabled = false;
|
||||
bool bootcon_registered = false;
|
||||
bool realcon_registered = false;
|
||||
int err;
|
||||
|
||||
console_list_lock();
|
||||
|
||||
for_each_console(con) {
|
||||
if (WARN(con == newcon, "console '%s%d' already registered\n",
|
||||
con->name, con->index))
|
||||
return;
|
||||
}
|
||||
con->name, con->index)) {
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
for_each_console(con) {
|
||||
if (con->flags & CON_BOOT)
|
||||
bootcon_enabled = true;
|
||||
bootcon_registered = true;
|
||||
else
|
||||
realcon_enabled = true;
|
||||
realcon_registered = true;
|
||||
}
|
||||
|
||||
/* Do not register boot consoles when there already is a real one. */
|
||||
if (newcon->flags & CON_BOOT && realcon_enabled) {
|
||||
if ((newcon->flags & CON_BOOT) && realcon_registered) {
|
||||
pr_info("Too late to register bootconsole %s%d\n",
|
||||
newcon->name, newcon->index);
|
||||
return;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -3140,8 +3348,8 @@ void register_console(struct console *newcon)
|
||||
* flag set and will be first in the list.
|
||||
*/
|
||||
if (preferred_console < 0) {
|
||||
if (!console_drivers || !console_drivers->device ||
|
||||
console_drivers->flags & CON_BOOT) {
|
||||
if (hlist_empty(&console_list) || !console_first()->device ||
|
||||
console_first()->flags & CON_BOOT) {
|
||||
try_enable_default_console(newcon);
|
||||
}
|
||||
}
|
||||
@@ -3155,7 +3363,7 @@ void register_console(struct console *newcon)
|
||||
|
||||
/* printk() messages are not printed to the Braille console. */
|
||||
if (err || newcon->flags & CON_BRL)
|
||||
return;
|
||||
goto unlock;
|
||||
|
||||
/*
|
||||
* If we have a bootconsole, and are switching to a real console,
|
||||
@@ -3163,39 +3371,38 @@ void register_console(struct console *newcon)
|
||||
* the real console are the same physical device, it's annoying to
|
||||
* see the beginning boot messages twice
|
||||
*/
|
||||
if (bootcon_enabled &&
|
||||
if (bootcon_registered &&
|
||||
((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
|
||||
newcon->flags &= ~CON_PRINTBUFFER;
|
||||
}
|
||||
|
||||
newcon->dropped = 0;
|
||||
console_init_seq(newcon, bootcon_registered);
|
||||
|
||||
/*
|
||||
* Put this console in the list - keep the
|
||||
* preferred driver at the head of the list.
|
||||
* Put this console in the list - keep the
|
||||
* preferred driver at the head of the list.
|
||||
*/
|
||||
console_lock();
|
||||
if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
|
||||
newcon->next = console_drivers;
|
||||
console_drivers = newcon;
|
||||
if (newcon->next)
|
||||
newcon->next->flags &= ~CON_CONSDEV;
|
||||
/* Ensure this flag is always set for the head of the list */
|
||||
if (hlist_empty(&console_list)) {
|
||||
/* Ensure CON_CONSDEV is always set for the head. */
|
||||
newcon->flags |= CON_CONSDEV;
|
||||
hlist_add_head_rcu(&newcon->node, &console_list);
|
||||
|
||||
} else if (newcon->flags & CON_CONSDEV) {
|
||||
/* Only the new head can have CON_CONSDEV set. */
|
||||
console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV);
|
||||
hlist_add_head_rcu(&newcon->node, &console_list);
|
||||
|
||||
} else {
|
||||
newcon->next = console_drivers->next;
|
||||
console_drivers->next = newcon;
|
||||
hlist_add_behind_rcu(&newcon->node, console_list.first);
|
||||
}
|
||||
|
||||
newcon->dropped = 0;
|
||||
if (newcon->flags & CON_PRINTBUFFER) {
|
||||
/* Get a consistent copy of @syslog_seq. */
|
||||
mutex_lock(&syslog_lock);
|
||||
newcon->seq = syslog_seq;
|
||||
mutex_unlock(&syslog_lock);
|
||||
} else {
|
||||
/* Begin with next message. */
|
||||
newcon->seq = prb_next_seq(prb);
|
||||
}
|
||||
console_unlock();
|
||||
/*
|
||||
* No need to synchronize SRCU here! The caller does not rely
|
||||
* on all contexts being able to see the new console before
|
||||
* register_console() completes.
|
||||
*/
|
||||
|
||||
console_sysfs_notify();
|
||||
|
||||
/*
|
||||
@@ -3206,21 +3413,28 @@ void register_console(struct console *newcon)
|
||||
* went to the bootconsole (that they do not see on the real console)
|
||||
*/
|
||||
con_printk(KERN_INFO, newcon, "enabled\n");
|
||||
if (bootcon_enabled &&
|
||||
if (bootcon_registered &&
|
||||
((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
|
||||
!keep_bootcon) {
|
||||
for_each_console(con)
|
||||
struct hlist_node *tmp;
|
||||
|
||||
hlist_for_each_entry_safe(con, tmp, &console_list, node) {
|
||||
if (con->flags & CON_BOOT)
|
||||
unregister_console(con);
|
||||
unregister_console_locked(con);
|
||||
}
|
||||
}
|
||||
unlock:
|
||||
console_list_unlock();
|
||||
}
|
||||
EXPORT_SYMBOL(register_console);
|
||||
|
||||
int unregister_console(struct console *console)
|
||||
/* Must be called under console_list_lock(). */
|
||||
static int unregister_console_locked(struct console *console)
|
||||
{
|
||||
struct console *con;
|
||||
int res;
|
||||
|
||||
lockdep_assert_console_list_lock_held();
|
||||
|
||||
con_printk(KERN_INFO, console, "disabled\n");
|
||||
|
||||
res = _braille_unregister_console(console);
|
||||
@@ -3229,48 +3443,94 @@ int unregister_console(struct console *console)
|
||||
if (res > 0)
|
||||
return 0;
|
||||
|
||||
res = -ENODEV;
|
||||
console_lock();
|
||||
if (console_drivers == console) {
|
||||
console_drivers=console->next;
|
||||
res = 0;
|
||||
} else {
|
||||
for_each_console(con) {
|
||||
if (con->next == console) {
|
||||
con->next = console->next;
|
||||
res = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Disable it unconditionally */
|
||||
console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
|
||||
|
||||
if (res)
|
||||
goto out_disable_unlock;
|
||||
if (!console_is_registered_locked(console))
|
||||
return -ENODEV;
|
||||
|
||||
hlist_del_init_rcu(&console->node);
|
||||
|
||||
/*
|
||||
* <HISTORICAL>
|
||||
* If this isn't the last console and it has CON_CONSDEV set, we
|
||||
* need to set it on the next preferred console.
|
||||
* </HISTORICAL>
|
||||
*
|
||||
* The above makes no sense as there is no guarantee that the next
|
||||
* console has any device attached. Oh well....
|
||||
*/
|
||||
if (console_drivers != NULL && console->flags & CON_CONSDEV)
|
||||
console_drivers->flags |= CON_CONSDEV;
|
||||
if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV)
|
||||
console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV);
|
||||
|
||||
/*
|
||||
* Ensure that all SRCU list walks have completed. All contexts
|
||||
* must not be able to see this console in the list so that any
|
||||
* exit/cleanup routines can be performed safely.
|
||||
*/
|
||||
synchronize_srcu(&console_srcu);
|
||||
|
||||
console->flags &= ~CON_ENABLED;
|
||||
console_unlock();
|
||||
console_sysfs_notify();
|
||||
|
||||
if (console->exit)
|
||||
res = console->exit(console);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
out_disable_unlock:
|
||||
console->flags &= ~CON_ENABLED;
|
||||
console_unlock();
|
||||
int unregister_console(struct console *console)
|
||||
{
|
||||
int res;
|
||||
|
||||
console_list_lock();
|
||||
res = unregister_console_locked(console);
|
||||
console_list_unlock();
|
||||
return res;
|
||||
}
|
||||
EXPORT_SYMBOL(unregister_console);
|
||||
|
||||
/**
|
||||
* console_force_preferred_locked - force a registered console preferred
|
||||
* @con: The registered console to force preferred.
|
||||
*
|
||||
* Must be called under console_list_lock().
|
||||
*/
|
||||
void console_force_preferred_locked(struct console *con)
|
||||
{
|
||||
struct console *cur_pref_con;
|
||||
|
||||
if (!console_is_registered_locked(con))
|
||||
return;
|
||||
|
||||
cur_pref_con = console_first();
|
||||
|
||||
/* Already preferred? */
|
||||
if (cur_pref_con == con)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Delete, but do not re-initialize the entry. This allows the console
|
||||
* to continue to appear registered (via any hlist_unhashed_lockless()
|
||||
* checks), even though it was briefly removed from the console list.
|
||||
*/
|
||||
hlist_del_rcu(&con->node);
|
||||
|
||||
/*
|
||||
* Ensure that all SRCU list walks have completed so that the console
|
||||
* can be added to the beginning of the console list and its forward
|
||||
* list pointer can be re-initialized.
|
||||
*/
|
||||
synchronize_srcu(&console_srcu);
|
||||
|
||||
con->flags |= CON_CONSDEV;
|
||||
WARN_ON(!con->device);
|
||||
|
||||
/* Only the new head can have CON_CONSDEV set. */
|
||||
console_srcu_write_flags(cur_pref_con, cur_pref_con->flags & ~CON_CONSDEV);
|
||||
hlist_add_head_rcu(&con->node, &console_list);
|
||||
}
|
||||
EXPORT_SYMBOL(console_force_preferred_locked);
|
||||
|
||||
/*
|
||||
* Initialize the console device. This is called *early*, so
|
||||
* we can't necessarily depend on lots of kernel help here.
|
||||
@@ -3317,10 +3577,12 @@ void __init console_init(void)
|
||||
*/
|
||||
static int __init printk_late_init(void)
|
||||
{
|
||||
struct hlist_node *tmp;
|
||||
struct console *con;
|
||||
int ret;
|
||||
|
||||
for_each_console(con) {
|
||||
console_list_lock();
|
||||
hlist_for_each_entry_safe(con, tmp, &console_list, node) {
|
||||
if (!(con->flags & CON_BOOT))
|
||||
continue;
|
||||
|
||||
@@ -3337,9 +3599,11 @@ static int __init printk_late_init(void)
|
||||
*/
|
||||
pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n",
|
||||
con->name, con->index);
|
||||
unregister_console(con);
|
||||
unregister_console_locked(con);
|
||||
}
|
||||
}
|
||||
console_list_unlock();
|
||||
|
||||
ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
|
||||
console_cpu_notify);
|
||||
WARN_ON(ret < 0);
|
||||
@@ -3359,6 +3623,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
|
||||
struct console *c;
|
||||
u64 last_diff = 0;
|
||||
u64 printk_seq;
|
||||
int cookie;
|
||||
u64 diff;
|
||||
u64 seq;
|
||||
|
||||
@@ -3369,9 +3634,15 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
|
||||
for (;;) {
|
||||
diff = 0;
|
||||
|
||||
/*
|
||||
* Hold the console_lock to guarantee safe access to
|
||||
* console->seq and to prevent changes to @console_suspended
|
||||
* until all consoles have been processed.
|
||||
*/
|
||||
console_lock();
|
||||
|
||||
for_each_console(c) {
|
||||
cookie = console_srcu_read_lock();
|
||||
for_each_console_srcu(c) {
|
||||
if (con && con != c)
|
||||
continue;
|
||||
if (!console_is_usable(c))
|
||||
@@ -3380,6 +3651,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
|
||||
if (printk_seq < seq)
|
||||
diff += seq - printk_seq;
|
||||
}
|
||||
console_srcu_read_unlock(cookie);
|
||||
|
||||
/*
|
||||
* If consoles are suspended, it cannot be expected that they
|
||||
|
||||
@@ -59,43 +59,39 @@ int profile_setup(char *str)
|
||||
static const char schedstr[] = "schedule";
|
||||
static const char sleepstr[] = "sleep";
|
||||
static const char kvmstr[] = "kvm";
|
||||
const char *select = NULL;
|
||||
int par;
|
||||
|
||||
if (!strncmp(str, sleepstr, strlen(sleepstr))) {
|
||||
#ifdef CONFIG_SCHEDSTATS
|
||||
force_schedstat_enabled();
|
||||
prof_on = SLEEP_PROFILING;
|
||||
if (str[strlen(sleepstr)] == ',')
|
||||
str += strlen(sleepstr) + 1;
|
||||
if (get_option(&str, &par))
|
||||
prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
|
||||
pr_info("kernel sleep profiling enabled (shift: %u)\n",
|
||||
prof_shift);
|
||||
select = sleepstr;
|
||||
#else
|
||||
pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
|
||||
#endif /* CONFIG_SCHEDSTATS */
|
||||
} else if (!strncmp(str, schedstr, strlen(schedstr))) {
|
||||
prof_on = SCHED_PROFILING;
|
||||
if (str[strlen(schedstr)] == ',')
|
||||
str += strlen(schedstr) + 1;
|
||||
if (get_option(&str, &par))
|
||||
prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
|
||||
pr_info("kernel schedule profiling enabled (shift: %u)\n",
|
||||
prof_shift);
|
||||
select = schedstr;
|
||||
} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
|
||||
prof_on = KVM_PROFILING;
|
||||
if (str[strlen(kvmstr)] == ',')
|
||||
str += strlen(kvmstr) + 1;
|
||||
if (get_option(&str, &par))
|
||||
prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
|
||||
pr_info("kernel KVM profiling enabled (shift: %u)\n",
|
||||
prof_shift);
|
||||
select = kvmstr;
|
||||
} else if (get_option(&str, &par)) {
|
||||
prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
|
||||
prof_on = CPU_PROFILING;
|
||||
pr_info("kernel profiling enabled (shift: %u)\n",
|
||||
prof_shift);
|
||||
}
|
||||
|
||||
if (select) {
|
||||
if (str[strlen(select)] == ',')
|
||||
str += strlen(select) + 1;
|
||||
if (get_option(&str, &par))
|
||||
prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
|
||||
pr_info("kernel %s profiling enabled (shift: %u)\n",
|
||||
select, prof_shift);
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
__setup("profile=", profile_setup);
|
||||
|
||||
@@ -54,27 +54,25 @@ config RCU_EXPERT
|
||||
Say N if you are unsure.
|
||||
|
||||
config SRCU
|
||||
bool
|
||||
help
|
||||
This option selects the sleepable version of RCU. This version
|
||||
permits arbitrary sleeping or blocking within RCU read-side critical
|
||||
sections.
|
||||
def_bool y
|
||||
|
||||
config TINY_SRCU
|
||||
bool
|
||||
default y if SRCU && TINY_RCU
|
||||
default y if TINY_RCU
|
||||
help
|
||||
This option selects the single-CPU non-preemptible version of SRCU.
|
||||
|
||||
config TREE_SRCU
|
||||
bool
|
||||
default y if SRCU && !TINY_RCU
|
||||
default y if !TINY_RCU
|
||||
help
|
||||
This option selects the full-fledged version of SRCU.
|
||||
|
||||
config NEED_SRCU_NMI_SAFE
|
||||
def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU
|
||||
|
||||
config TASKS_RCU_GENERIC
|
||||
def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU
|
||||
select SRCU
|
||||
help
|
||||
This option enables generic infrastructure code supporting
|
||||
task-based RCU implementations. Not for manual selection.
|
||||
|
||||
@@ -27,7 +27,6 @@ config RCU_SCALE_TEST
|
||||
tristate "performance tests for RCU"
|
||||
depends on DEBUG_KERNEL
|
||||
select TORTURE_TEST
|
||||
select SRCU
|
||||
default n
|
||||
help
|
||||
This option provides a kernel module that runs performance
|
||||
@@ -43,7 +42,6 @@ config RCU_TORTURE_TEST
|
||||
tristate "torture tests for RCU"
|
||||
depends on DEBUG_KERNEL
|
||||
select TORTURE_TEST
|
||||
select SRCU
|
||||
default n
|
||||
help
|
||||
This option provides a kernel module that runs torture tests
|
||||
@@ -59,7 +57,6 @@ config RCU_REF_SCALE_TEST
|
||||
tristate "Scalability tests for read-side synchronization (RCU and others)"
|
||||
depends on DEBUG_KERNEL
|
||||
select TORTURE_TEST
|
||||
select SRCU
|
||||
default n
|
||||
help
|
||||
This option provides a kernel module that runs performance tests
|
||||
|
||||
@@ -286,7 +286,7 @@ void rcu_test_sync_prims(void);
|
||||
*/
|
||||
extern void resched_cpu(int cpu);
|
||||
|
||||
#if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU)
|
||||
#if !defined(CONFIG_TINY_RCU)
|
||||
|
||||
#include <linux/rcu_node_tree.h>
|
||||
|
||||
@@ -375,6 +375,10 @@ extern void rcu_init_geometry(void);
|
||||
(cpu) <= rnp->grphi; \
|
||||
(cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask)))
|
||||
|
||||
#endif /* !defined(CONFIG_TINY_RCU) */
|
||||
|
||||
#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC)
|
||||
|
||||
/*
|
||||
* Wrappers for the rcu_node::lock acquire and release.
|
||||
*
|
||||
@@ -437,7 +441,7 @@ do { \
|
||||
#define raw_lockdep_assert_held_rcu_node(p) \
|
||||
lockdep_assert_held(&ACCESS_PRIVATE(p, lock))
|
||||
|
||||
#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */
|
||||
#endif // #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC)
|
||||
|
||||
#ifdef CONFIG_TINY_RCU
|
||||
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
|
||||
|
||||
@@ -615,10 +615,14 @@ static struct rcu_torture_ops rcu_busted_ops = {
|
||||
DEFINE_STATIC_SRCU(srcu_ctl);
|
||||
static struct srcu_struct srcu_ctld;
|
||||
static struct srcu_struct *srcu_ctlp = &srcu_ctl;
|
||||
static struct rcu_torture_ops srcud_ops;
|
||||
|
||||
static int srcu_torture_read_lock(void) __acquires(srcu_ctlp)
|
||||
{
|
||||
return srcu_read_lock(srcu_ctlp);
|
||||
if (cur_ops == &srcud_ops)
|
||||
return srcu_read_lock_nmisafe(srcu_ctlp);
|
||||
else
|
||||
return srcu_read_lock(srcu_ctlp);
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -642,7 +646,10 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
|
||||
|
||||
static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
|
||||
{
|
||||
srcu_read_unlock(srcu_ctlp, idx);
|
||||
if (cur_ops == &srcud_ops)
|
||||
srcu_read_unlock_nmisafe(srcu_ctlp, idx);
|
||||
else
|
||||
srcu_read_unlock(srcu_ctlp, idx);
|
||||
}
|
||||
|
||||
static int torture_srcu_read_lock_held(void)
|
||||
|
||||
@@ -197,6 +197,16 @@ void synchronize_srcu(struct srcu_struct *ssp)
|
||||
{
|
||||
struct rcu_synchronize rs;
|
||||
|
||||
RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
|
||||
lock_is_held(&rcu_bh_lock_map) ||
|
||||
lock_is_held(&rcu_lock_map) ||
|
||||
lock_is_held(&rcu_sched_lock_map),
|
||||
"Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
|
||||
|
||||
if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
|
||||
return;
|
||||
|
||||
might_sleep();
|
||||
init_rcu_head_on_stack(&rs.head);
|
||||
init_completion(&rs.completion);
|
||||
call_srcu(ssp, &rs.head, wakeme_after_rcu);
|
||||
|
||||
@@ -417,7 +417,7 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
|
||||
|
||||
sum += READ_ONCE(cpuc->srcu_lock_count[idx]);
|
||||
sum += atomic_long_read(&cpuc->srcu_lock_count[idx]);
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
@@ -429,13 +429,18 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
|
||||
static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx)
|
||||
{
|
||||
int cpu;
|
||||
unsigned long mask = 0;
|
||||
unsigned long sum = 0;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
|
||||
|
||||
sum += READ_ONCE(cpuc->srcu_unlock_count[idx]);
|
||||
sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]);
|
||||
if (IS_ENABLED(CONFIG_PROVE_RCU))
|
||||
mask = mask | READ_ONCE(cpuc->srcu_nmi_safety);
|
||||
}
|
||||
WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)),
|
||||
"Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp);
|
||||
return sum;
|
||||
}
|
||||
|
||||
@@ -503,10 +508,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
|
||||
|
||||
sum += READ_ONCE(cpuc->srcu_lock_count[0]);
|
||||
sum += READ_ONCE(cpuc->srcu_lock_count[1]);
|
||||
sum -= READ_ONCE(cpuc->srcu_unlock_count[0]);
|
||||
sum -= READ_ONCE(cpuc->srcu_unlock_count[1]);
|
||||
sum += atomic_long_read(&cpuc->srcu_lock_count[0]);
|
||||
sum += atomic_long_read(&cpuc->srcu_lock_count[1]);
|
||||
sum -= atomic_long_read(&cpuc->srcu_unlock_count[0]);
|
||||
sum -= atomic_long_read(&cpuc->srcu_unlock_count[1]);
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
@@ -626,6 +631,29 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
|
||||
|
||||
#ifdef CONFIG_PROVE_RCU
|
||||
/*
|
||||
* Check for consistent NMI safety.
|
||||
*/
|
||||
void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe)
|
||||
{
|
||||
int nmi_safe_mask = 1 << nmi_safe;
|
||||
int old_nmi_safe_mask;
|
||||
struct srcu_data *sdp;
|
||||
|
||||
/* NMI-unsafe use in NMI is a bad sign */
|
||||
WARN_ON_ONCE(!nmi_safe && in_nmi());
|
||||
sdp = raw_cpu_ptr(ssp->sda);
|
||||
old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety);
|
||||
if (!old_nmi_safe_mask) {
|
||||
WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask);
|
||||
return;
|
||||
}
|
||||
WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(srcu_check_nmi_safety);
|
||||
#endif /* CONFIG_PROVE_RCU */
|
||||
|
||||
/*
|
||||
* Counts the new reader in the appropriate per-CPU element of the
|
||||
* srcu_struct.
|
||||
@@ -636,7 +664,7 @@ int __srcu_read_lock(struct srcu_struct *ssp)
|
||||
int idx;
|
||||
|
||||
idx = READ_ONCE(ssp->srcu_idx) & 0x1;
|
||||
this_cpu_inc(ssp->sda->srcu_lock_count[idx]);
|
||||
this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter);
|
||||
smp_mb(); /* B */ /* Avoid leaking the critical section. */
|
||||
return idx;
|
||||
}
|
||||
@@ -650,10 +678,45 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
|
||||
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
|
||||
{
|
||||
smp_mb(); /* C */ /* Avoid leaking the critical section. */
|
||||
this_cpu_inc(ssp->sda->srcu_unlock_count[idx]);
|
||||
this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
|
||||
|
||||
#ifdef CONFIG_NEED_SRCU_NMI_SAFE
|
||||
|
||||
/*
|
||||
* Counts the new reader in the appropriate per-CPU element of the
|
||||
* srcu_struct, but in an NMI-safe manner using RMW atomics.
|
||||
* Returns an index that must be passed to the matching srcu_read_unlock().
|
||||
*/
|
||||
int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
|
||||
{
|
||||
int idx;
|
||||
struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
|
||||
|
||||
idx = READ_ONCE(ssp->srcu_idx) & 0x1;
|
||||
atomic_long_inc(&sdp->srcu_lock_count[idx]);
|
||||
smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */
|
||||
return idx;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);
|
||||
|
||||
/*
|
||||
* Removes the count for the old reader from the appropriate per-CPU
|
||||
* element of the srcu_struct. Note that this may well be a different
|
||||
* CPU than that which was incremented by the corresponding srcu_read_lock().
|
||||
*/
|
||||
void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
|
||||
{
|
||||
struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
|
||||
|
||||
smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */
|
||||
atomic_long_inc(&sdp->srcu_unlock_count[idx]);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe);
|
||||
|
||||
#endif // CONFIG_NEED_SRCU_NMI_SAFE
|
||||
|
||||
/*
|
||||
* Start an SRCU grace period.
|
||||
*/
|
||||
@@ -1090,7 +1153,12 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
|
||||
int ss_state;
|
||||
|
||||
check_init_srcu_struct(ssp);
|
||||
idx = srcu_read_lock(ssp);
|
||||
/*
|
||||
* While starting a new grace period, make sure we are in an
|
||||
* SRCU read-side critical section so that the grace-period
|
||||
* sequence number cannot wrap around in the meantime.
|
||||
*/
|
||||
idx = __srcu_read_lock_nmisafe(ssp);
|
||||
ss_state = smp_load_acquire(&ssp->srcu_size_state);
|
||||
if (ss_state < SRCU_SIZE_WAIT_CALL)
|
||||
sdp = per_cpu_ptr(ssp->sda, 0);
|
||||
@@ -1123,7 +1191,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
|
||||
srcu_funnel_gp_start(ssp, sdp, s, do_norm);
|
||||
else if (needexp)
|
||||
srcu_funnel_exp_start(ssp, sdp_mynode, s);
|
||||
srcu_read_unlock(ssp, idx);
|
||||
__srcu_read_unlock_nmisafe(ssp, idx);
|
||||
return s;
|
||||
}
|
||||
|
||||
@@ -1427,13 +1495,13 @@ void srcu_barrier(struct srcu_struct *ssp)
|
||||
/* Initial count prevents reaching zero until all CBs are posted. */
|
||||
atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);
|
||||
|
||||
idx = srcu_read_lock(ssp);
|
||||
idx = __srcu_read_lock_nmisafe(ssp);
|
||||
if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
|
||||
srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
|
||||
else
|
||||
for_each_possible_cpu(cpu)
|
||||
srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
|
||||
srcu_read_unlock(ssp, idx);
|
||||
__srcu_read_unlock_nmisafe(ssp, idx);
|
||||
|
||||
/* Remove the initial count, at which point reaching zero can happen. */
|
||||
if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
|
||||
@@ -1687,8 +1755,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
|
||||
struct srcu_data *sdp;
|
||||
|
||||
sdp = per_cpu_ptr(ssp->sda, cpu);
|
||||
u0 = data_race(sdp->srcu_unlock_count[!idx]);
|
||||
u1 = data_race(sdp->srcu_unlock_count[idx]);
|
||||
u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx]));
|
||||
u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx]));
|
||||
|
||||
/*
|
||||
* Make sure that a lock is always counted if the corresponding
|
||||
@@ -1696,8 +1764,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
|
||||
*/
|
||||
smp_rmb();
|
||||
|
||||
l0 = data_race(sdp->srcu_lock_count[!idx]);
|
||||
l1 = data_race(sdp->srcu_lock_count[idx]);
|
||||
l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx]));
|
||||
l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx]));
|
||||
|
||||
c0 = l0 - u0;
|
||||
c1 = l1 - u1;
|
||||
|
||||
@@ -224,7 +224,7 @@ void rcu_test_sync_prims(void)
|
||||
synchronize_rcu_expedited();
|
||||
}
|
||||
|
||||
#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU)
|
||||
#if !defined(CONFIG_TINY_RCU)
|
||||
|
||||
/*
|
||||
* Switch to run-time mode once RCU has fully initialized.
|
||||
@@ -239,7 +239,7 @@ static int __init rcu_set_runtime_mode(void)
|
||||
}
|
||||
core_initcall(rcu_set_runtime_mode);
|
||||
|
||||
#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
|
||||
#endif /* #if !defined(CONFIG_TINY_RCU) */
|
||||
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
static struct lock_class_key rcu_lock_key;
|
||||
@@ -559,10 +559,8 @@ static void early_boot_test_call_rcu(void)
|
||||
struct early_boot_kfree_rcu *rhp;
|
||||
|
||||
call_rcu(&head, test_callback);
|
||||
if (IS_ENABLED(CONFIG_SRCU)) {
|
||||
early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
|
||||
call_srcu(&early_srcu, &shead, test_callback);
|
||||
}
|
||||
early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
|
||||
call_srcu(&early_srcu, &shead, test_callback);
|
||||
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
|
||||
if (!WARN_ON_ONCE(!rhp))
|
||||
kfree_rcu(rhp, rh);
|
||||
@@ -585,11 +583,9 @@ static int rcu_verify_early_boot_tests(void)
|
||||
if (rcu_self_test) {
|
||||
early_boot_test_counter++;
|
||||
rcu_barrier();
|
||||
if (IS_ENABLED(CONFIG_SRCU)) {
|
||||
early_boot_test_counter++;
|
||||
srcu_barrier(&early_srcu);
|
||||
WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie));
|
||||
}
|
||||
early_boot_test_counter++;
|
||||
srcu_barrier(&early_srcu);
|
||||
WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie));
|
||||
}
|
||||
if (rcu_self_test_counter != early_boot_test_counter) {
|
||||
WARN_ON(1);
|
||||
|
||||
@@ -243,6 +243,17 @@ void migrate_to_reboot_cpu(void)
|
||||
set_cpus_allowed_ptr(current, cpumask_of(cpu));
|
||||
}
|
||||
|
||||
/*
|
||||
* Notifier list for kernel code which wants to be called
|
||||
* to prepare system for restart.
|
||||
*/
|
||||
static BLOCKING_NOTIFIER_HEAD(restart_prep_handler_list);
|
||||
|
||||
static void do_kernel_restart_prepare(void)
|
||||
{
|
||||
blocking_notifier_call_chain(&restart_prep_handler_list, 0, NULL);
|
||||
}
|
||||
|
||||
/**
|
||||
* kernel_restart - reboot the system
|
||||
* @cmd: pointer to buffer containing command to execute for restart
|
||||
@@ -254,6 +265,7 @@ void migrate_to_reboot_cpu(void)
|
||||
void kernel_restart(char *cmd)
|
||||
{
|
||||
kernel_restart_prepare(cmd);
|
||||
do_kernel_restart_prepare();
|
||||
migrate_to_reboot_cpu();
|
||||
syscore_shutdown();
|
||||
if (!cmd)
|
||||
@@ -396,6 +408,11 @@ register_sys_off_handler(enum sys_off_mode mode,
|
||||
handler->list = &power_off_handler_list;
|
||||
break;
|
||||
|
||||
case SYS_OFF_MODE_RESTART_PREPARE:
|
||||
handler->list = &restart_prep_handler_list;
|
||||
handler->blocking = true;
|
||||
break;
|
||||
|
||||
case SYS_OFF_MODE_RESTART:
|
||||
handler->list = &restart_handler_list;
|
||||
break;
|
||||
|
||||
@@ -60,10 +60,7 @@ static const struct vm_operations_struct relay_file_mmap_ops = {
|
||||
*/
|
||||
static struct page **relay_alloc_page_array(unsigned int n_pages)
|
||||
{
|
||||
const size_t pa_size = n_pages * sizeof(struct page *);
|
||||
if (pa_size > PAGE_SIZE)
|
||||
return vzalloc(pa_size);
|
||||
return kzalloc(pa_size, GFP_KERNEL);
|
||||
return kvcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -357,10 +357,7 @@ static void __sched_core_flip(bool enabled)
|
||||
/*
|
||||
* Toggle the offline CPUs.
|
||||
*/
|
||||
cpumask_copy(&sched_core_mask, cpu_possible_mask);
|
||||
cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
|
||||
|
||||
for_each_cpu(cpu, &sched_core_mask)
|
||||
for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask)
|
||||
cpu_rq(cpu)->core_enabled = enabled;
|
||||
|
||||
cpus_read_unlock();
|
||||
@@ -704,6 +701,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
|
||||
|
||||
rq->prev_irq_time += irq_delta;
|
||||
delta -= irq_delta;
|
||||
psi_account_irqtime(rq->curr, irq_delta);
|
||||
#endif
|
||||
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
|
||||
if (static_key_false((&paravirt_steal_rq_enabled))) {
|
||||
@@ -4392,6 +4390,17 @@ void set_numabalancing_state(bool enabled)
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PROC_SYSCTL
|
||||
static void reset_memory_tiering(void)
|
||||
{
|
||||
struct pglist_data *pgdat;
|
||||
|
||||
for_each_online_pgdat(pgdat) {
|
||||
pgdat->nbp_threshold = 0;
|
||||
pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
|
||||
pgdat->nbp_th_start = jiffies_to_msecs(jiffies);
|
||||
}
|
||||
}
|
||||
|
||||
int sysctl_numa_balancing(struct ctl_table *table, int write,
|
||||
void *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
@@ -4408,6 +4417,9 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
|
||||
if (err < 0)
|
||||
return err;
|
||||
if (write) {
|
||||
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
|
||||
(state & NUMA_BALANCING_MEMORY_TIERING))
|
||||
reset_memory_tiering();
|
||||
sysctl_numa_balancing_mode = state;
|
||||
__set_numabalancing_state(state);
|
||||
}
|
||||
@@ -5162,6 +5174,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
|
||||
* finish_task_switch()'s mmdrop().
|
||||
*/
|
||||
switch_mm_irqs_off(prev->active_mm, next->mm, next);
|
||||
lru_gen_use_mm(next->mm);
|
||||
|
||||
if (!prev->mm) { // from kernel
|
||||
/* will mmdrop() in finish_task_switch(). */
|
||||
|
||||
@@ -333,6 +333,7 @@ static __init int sched_init_debug(void)
|
||||
debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min);
|
||||
debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
|
||||
debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
|
||||
debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
|
||||
#endif
|
||||
|
||||
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
|
||||
|
||||
@@ -40,6 +40,7 @@
|
||||
|
||||
#include <linux/cpuidle.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/memory-tiers.h>
|
||||
#include <linux/mempolicy.h>
|
||||
#include <linux/mutex_api.h>
|
||||
#include <linux/profile.h>
|
||||
@@ -1090,6 +1091,12 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
|
||||
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
|
||||
unsigned int sysctl_numa_balancing_scan_delay = 1000;
|
||||
|
||||
/* The page with hint page fault latency < threshold in ms is considered hot */
|
||||
unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
|
||||
|
||||
/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
|
||||
unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
|
||||
|
||||
struct numa_group {
|
||||
refcount_t refcount;
|
||||
|
||||
@@ -1432,6 +1439,120 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
|
||||
return 1000 * faults / total_faults;
|
||||
}
|
||||
|
||||
/*
|
||||
* If memory tiering mode is enabled, cpupid of slow memory page is
|
||||
* used to record scan time instead of CPU and PID. When tiering mode
|
||||
* is disabled at run time, the scan time (in cpupid) will be
|
||||
* interpreted as CPU and PID. So CPU needs to be checked to avoid to
|
||||
* access out of array bound.
|
||||
*/
|
||||
static inline bool cpupid_valid(int cpupid)
|
||||
{
|
||||
return cpupid_to_cpu(cpupid) < nr_cpu_ids;
|
||||
}
|
||||
|
||||
/*
|
||||
* For memory tiering mode, if there are enough free pages (more than
|
||||
* enough watermark defined here) in fast memory node, to take full
|
||||
* advantage of fast memory capacity, all recently accessed slow
|
||||
* memory pages will be migrated to fast memory node without
|
||||
* considering hot threshold.
|
||||
*/
|
||||
static bool pgdat_free_space_enough(struct pglist_data *pgdat)
|
||||
{
|
||||
int z;
|
||||
unsigned long enough_wmark;
|
||||
|
||||
enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
|
||||
pgdat->node_present_pages >> 4);
|
||||
for (z = pgdat->nr_zones - 1; z >= 0; z--) {
|
||||
struct zone *zone = pgdat->node_zones + z;
|
||||
|
||||
if (!populated_zone(zone))
|
||||
continue;
|
||||
|
||||
if (zone_watermark_ok(zone, 0,
|
||||
wmark_pages(zone, WMARK_PROMO) + enough_wmark,
|
||||
ZONE_MOVABLE, 0))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* For memory tiering mode, when page tables are scanned, the scan
|
||||
* time will be recorded in struct page in addition to make page
|
||||
* PROT_NONE for slow memory page. So when the page is accessed, in
|
||||
* hint page fault handler, the hint page fault latency is calculated
|
||||
* via,
|
||||
*
|
||||
* hint page fault latency = hint page fault time - scan time
|
||||
*
|
||||
* The smaller the hint page fault latency, the higher the possibility
|
||||
* for the page to be hot.
|
||||
*/
|
||||
static int numa_hint_fault_latency(struct page *page)
|
||||
{
|
||||
int last_time, time;
|
||||
|
||||
time = jiffies_to_msecs(jiffies);
|
||||
last_time = xchg_page_access_time(page, time);
|
||||
|
||||
return (time - last_time) & PAGE_ACCESS_TIME_MASK;
|
||||
}
|
||||
|
||||
/*
|
||||
* For memory tiering mode, too high promotion/demotion throughput may
|
||||
* hurt application latency. So we provide a mechanism to rate limit
|
||||
* the number of pages that are tried to be promoted.
|
||||
*/
|
||||
static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
|
||||
unsigned long rate_limit, int nr)
|
||||
{
|
||||
unsigned long nr_cand;
|
||||
unsigned int now, start;
|
||||
|
||||
now = jiffies_to_msecs(jiffies);
|
||||
mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
|
||||
nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
|
||||
start = pgdat->nbp_rl_start;
|
||||
if (now - start > MSEC_PER_SEC &&
|
||||
cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
|
||||
pgdat->nbp_rl_nr_cand = nr_cand;
|
||||
if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
#define NUMA_MIGRATION_ADJUST_STEPS 16
|
||||
|
||||
static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
|
||||
unsigned long rate_limit,
|
||||
unsigned int ref_th)
|
||||
{
|
||||
unsigned int now, start, th_period, unit_th, th;
|
||||
unsigned long nr_cand, ref_cand, diff_cand;
|
||||
|
||||
now = jiffies_to_msecs(jiffies);
|
||||
th_period = sysctl_numa_balancing_scan_period_max;
|
||||
start = pgdat->nbp_th_start;
|
||||
if (now - start > th_period &&
|
||||
cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
|
||||
ref_cand = rate_limit *
|
||||
sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
|
||||
nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
|
||||
diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
|
||||
unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
|
||||
th = pgdat->nbp_threshold ? : ref_th;
|
||||
if (diff_cand > ref_cand * 11 / 10)
|
||||
th = max(th - unit_th, unit_th);
|
||||
else if (diff_cand < ref_cand * 9 / 10)
|
||||
th = min(th + unit_th, ref_th * 2);
|
||||
pgdat->nbp_th_nr_cand = nr_cand;
|
||||
pgdat->nbp_threshold = th;
|
||||
}
|
||||
}
|
||||
|
||||
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
|
||||
int src_nid, int dst_cpu)
|
||||
{
|
||||
@@ -1439,9 +1560,44 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
|
||||
int dst_nid = cpu_to_node(dst_cpu);
|
||||
int last_cpupid, this_cpupid;
|
||||
|
||||
/*
|
||||
* The pages in slow memory node should be migrated according
|
||||
* to hot/cold instead of private/shared.
|
||||
*/
|
||||
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
|
||||
!node_is_toptier(src_nid)) {
|
||||
struct pglist_data *pgdat;
|
||||
unsigned long rate_limit;
|
||||
unsigned int latency, th, def_th;
|
||||
|
||||
pgdat = NODE_DATA(dst_nid);
|
||||
if (pgdat_free_space_enough(pgdat)) {
|
||||
/* workload changed, reset hot threshold */
|
||||
pgdat->nbp_threshold = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
def_th = sysctl_numa_balancing_hot_threshold;
|
||||
rate_limit = sysctl_numa_balancing_promote_rate_limit << \
|
||||
(20 - PAGE_SHIFT);
|
||||
numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
|
||||
|
||||
th = pgdat->nbp_threshold ? : def_th;
|
||||
latency = numa_hint_fault_latency(page);
|
||||
if (latency >= th)
|
||||
return false;
|
||||
|
||||
return !numa_promotion_rate_limit(pgdat, rate_limit,
|
||||
thp_nr_pages(page));
|
||||
}
|
||||
|
||||
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
|
||||
last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
|
||||
|
||||
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
|
||||
!node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
|
||||
return false;
|
||||
|
||||
/*
|
||||
* Allow first faults or private faults to migrate immediately early in
|
||||
* the lifetime of a task. The magic number 4 is based on waiting for
|
||||
@@ -2681,6 +2837,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
|
||||
if (!p->mm)
|
||||
return;
|
||||
|
||||
/*
|
||||
* NUMA faults statistics are unnecessary for the slow memory
|
||||
* node for memory tiering mode.
|
||||
*/
|
||||
if (!node_is_toptier(mem_node) &&
|
||||
(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
|
||||
!cpupid_valid(last_cpupid)))
|
||||
return;
|
||||
|
||||
/* Allocate buffer to track faults on a per-node basis */
|
||||
if (unlikely(!p->numa_faults)) {
|
||||
int size = sizeof(*p->numa_faults) *
|
||||
@@ -2761,6 +2926,7 @@ static void task_numa_work(struct callback_head *work)
|
||||
struct task_struct *p = current;
|
||||
struct mm_struct *mm = p->mm;
|
||||
u64 runtime = p->se.sum_exec_runtime;
|
||||
MA_STATE(mas, &mm->mm_mt, 0, 0);
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long start, end;
|
||||
unsigned long nr_pte_updates = 0;
|
||||
@@ -2817,13 +2983,16 @@ static void task_numa_work(struct callback_head *work)
|
||||
|
||||
if (!mmap_read_trylock(mm))
|
||||
return;
|
||||
vma = find_vma(mm, start);
|
||||
mas_set(&mas, start);
|
||||
vma = mas_find(&mas, ULONG_MAX);
|
||||
if (!vma) {
|
||||
reset_ptenuma_scan(p);
|
||||
start = 0;
|
||||
vma = mm->mmap;
|
||||
mas_set(&mas, start);
|
||||
vma = mas_find(&mas, ULONG_MAX);
|
||||
}
|
||||
for (; vma; vma = vma->vm_next) {
|
||||
|
||||
for (; vma; vma = mas_find(&mas, ULONG_MAX)) {
|
||||
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
|
||||
is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
|
||||
continue;
|
||||
|
||||
@@ -181,6 +181,7 @@ static void group_init(struct psi_group *group)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
group->enabled = true;
|
||||
for_each_possible_cpu(cpu)
|
||||
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
|
||||
group->avg_last_update = sched_clock();
|
||||
@@ -201,6 +202,7 @@ void __init psi_init(void)
|
||||
{
|
||||
if (!psi_enable) {
|
||||
static_branch_enable(&psi_disabled);
|
||||
static_branch_disable(&psi_cgroups_enabled);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -211,7 +213,7 @@ void __init psi_init(void)
|
||||
group_init(&psi_system);
|
||||
}
|
||||
|
||||
static bool test_state(unsigned int *tasks, enum psi_states state)
|
||||
static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
|
||||
{
|
||||
switch (state) {
|
||||
case PSI_IO_SOME:
|
||||
@@ -224,9 +226,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
|
||||
return unlikely(tasks[NR_MEMSTALL] &&
|
||||
tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
|
||||
case PSI_CPU_SOME:
|
||||
return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
|
||||
return unlikely(tasks[NR_RUNNING] > oncpu);
|
||||
case PSI_CPU_FULL:
|
||||
return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
|
||||
return unlikely(tasks[NR_RUNNING] && !oncpu);
|
||||
case PSI_NONIDLE:
|
||||
return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
|
||||
tasks[NR_RUNNING];
|
||||
@@ -688,35 +690,53 @@ static void psi_group_change(struct psi_group *group, int cpu,
|
||||
bool wake_clock)
|
||||
{
|
||||
struct psi_group_cpu *groupc;
|
||||
u32 state_mask = 0;
|
||||
unsigned int t, m;
|
||||
enum psi_states s;
|
||||
u32 state_mask;
|
||||
|
||||
groupc = per_cpu_ptr(group->pcpu, cpu);
|
||||
|
||||
/*
|
||||
* First we assess the aggregate resource states this CPU's
|
||||
* tasks have been in since the last change, and account any
|
||||
* SOME and FULL time these may have resulted in.
|
||||
*
|
||||
* Then we update the task counts according to the state
|
||||
* First we update the task counts according to the state
|
||||
* change requested through the @clear and @set bits.
|
||||
*
|
||||
* Then if the cgroup PSI stats accounting enabled, we
|
||||
* assess the aggregate resource states this CPU's tasks
|
||||
* have been in since the last change, and account any
|
||||
* SOME and FULL time these may have resulted in.
|
||||
*/
|
||||
write_seqcount_begin(&groupc->seq);
|
||||
|
||||
record_times(groupc, now);
|
||||
/*
|
||||
* Start with TSK_ONCPU, which doesn't have a corresponding
|
||||
* task count - it's just a boolean flag directly encoded in
|
||||
* the state mask. Clear, set, or carry the current state if
|
||||
* no changes are requested.
|
||||
*/
|
||||
if (unlikely(clear & TSK_ONCPU)) {
|
||||
state_mask = 0;
|
||||
clear &= ~TSK_ONCPU;
|
||||
} else if (unlikely(set & TSK_ONCPU)) {
|
||||
state_mask = PSI_ONCPU;
|
||||
set &= ~TSK_ONCPU;
|
||||
} else {
|
||||
state_mask = groupc->state_mask & PSI_ONCPU;
|
||||
}
|
||||
|
||||
/*
|
||||
* The rest of the state mask is calculated based on the task
|
||||
* counts. Update those first, then construct the mask.
|
||||
*/
|
||||
for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
|
||||
if (!(m & (1 << t)))
|
||||
continue;
|
||||
if (groupc->tasks[t]) {
|
||||
groupc->tasks[t]--;
|
||||
} else if (!psi_bug) {
|
||||
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
|
||||
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
|
||||
cpu, t, groupc->tasks[0],
|
||||
groupc->tasks[1], groupc->tasks[2],
|
||||
groupc->tasks[3], groupc->tasks[4],
|
||||
clear, set);
|
||||
groupc->tasks[3], clear, set);
|
||||
psi_bug = 1;
|
||||
}
|
||||
}
|
||||
@@ -725,9 +745,25 @@ static void psi_group_change(struct psi_group *group, int cpu,
|
||||
if (set & (1 << t))
|
||||
groupc->tasks[t]++;
|
||||
|
||||
/* Calculate state mask representing active states */
|
||||
if (!group->enabled) {
|
||||
/*
|
||||
* On the first group change after disabling PSI, conclude
|
||||
* the current state and flush its time. This is unlikely
|
||||
* to matter to the user, but aggregation (get_recent_times)
|
||||
* may have already incorporated the live state into times_prev;
|
||||
* avoid a delta sample underflow when PSI is later re-enabled.
|
||||
*/
|
||||
if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
|
||||
record_times(groupc, now);
|
||||
|
||||
groupc->state_mask = state_mask;
|
||||
|
||||
write_seqcount_end(&groupc->seq);
|
||||
return;
|
||||
}
|
||||
|
||||
for (s = 0; s < NR_PSI_STATES; s++) {
|
||||
if (test_state(groupc->tasks, s))
|
||||
if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
|
||||
state_mask |= (1 << s);
|
||||
}
|
||||
|
||||
@@ -739,9 +775,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
|
||||
* task in a cgroup is in_memstall, the corresponding groupc
|
||||
* on that cpu is in PSI_MEM_FULL state.
|
||||
*/
|
||||
if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
|
||||
if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
|
||||
state_mask |= (1 << PSI_MEM_FULL);
|
||||
|
||||
record_times(groupc, now);
|
||||
|
||||
groupc->state_mask = state_mask;
|
||||
|
||||
write_seqcount_end(&groupc->seq);
|
||||
@@ -753,27 +791,12 @@ static void psi_group_change(struct psi_group *group, int cpu,
|
||||
schedule_delayed_work(&group->avgs_work, PSI_FREQ);
|
||||
}
|
||||
|
||||
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
|
||||
static inline struct psi_group *task_psi_group(struct task_struct *task)
|
||||
{
|
||||
if (*iter == &psi_system)
|
||||
return NULL;
|
||||
|
||||
#ifdef CONFIG_CGROUPS
|
||||
if (static_branch_likely(&psi_cgroups_enabled)) {
|
||||
struct cgroup *cgroup = NULL;
|
||||
|
||||
if (!*iter)
|
||||
cgroup = task->cgroups->dfl_cgrp;
|
||||
else
|
||||
cgroup = cgroup_parent(*iter);
|
||||
|
||||
if (cgroup && cgroup_parent(cgroup)) {
|
||||
*iter = cgroup;
|
||||
return cgroup_psi(cgroup);
|
||||
}
|
||||
}
|
||||
if (static_branch_likely(&psi_cgroups_enabled))
|
||||
return cgroup_psi(task_dfl_cgroup(task));
|
||||
#endif
|
||||
*iter = &psi_system;
|
||||
return &psi_system;
|
||||
}
|
||||
|
||||
@@ -796,8 +819,6 @@ void psi_task_change(struct task_struct *task, int clear, int set)
|
||||
{
|
||||
int cpu = task_cpu(task);
|
||||
struct psi_group *group;
|
||||
bool wake_clock = true;
|
||||
void *iter = NULL;
|
||||
u64 now;
|
||||
|
||||
if (!task->pid)
|
||||
@@ -806,19 +827,11 @@ void psi_task_change(struct task_struct *task, int clear, int set)
|
||||
psi_flags_change(task, clear, set);
|
||||
|
||||
now = cpu_clock(cpu);
|
||||
/*
|
||||
* Periodic aggregation shuts off if there is a period of no
|
||||
* task changes, so we wake it back up if necessary. However,
|
||||
* don't do this if the task change is the aggregation worker
|
||||
* itself going to sleep, or we'll ping-pong forever.
|
||||
*/
|
||||
if (unlikely((clear & TSK_RUNNING) &&
|
||||
(task->flags & PF_WQ_WORKER) &&
|
||||
wq_worker_last_func(task) == psi_avgs_work))
|
||||
wake_clock = false;
|
||||
|
||||
while ((group = iterate_groups(task, &iter)))
|
||||
psi_group_change(group, cpu, clear, set, now, wake_clock);
|
||||
group = task_psi_group(task);
|
||||
do {
|
||||
psi_group_change(group, cpu, clear, set, now, true);
|
||||
} while ((group = group->parent));
|
||||
}
|
||||
|
||||
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
|
||||
@@ -826,34 +839,30 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
|
||||
{
|
||||
struct psi_group *group, *common = NULL;
|
||||
int cpu = task_cpu(prev);
|
||||
void *iter;
|
||||
u64 now = cpu_clock(cpu);
|
||||
|
||||
if (next->pid) {
|
||||
bool identical_state;
|
||||
|
||||
psi_flags_change(next, 0, TSK_ONCPU);
|
||||
/*
|
||||
* When switching between tasks that have an identical
|
||||
* runtime state, the cgroup that contains both tasks
|
||||
* we reach the first common ancestor. Iterate @next's
|
||||
* ancestors only until we encounter @prev's ONCPU.
|
||||
* Set TSK_ONCPU on @next's cgroups. If @next shares any
|
||||
* ancestors with @prev, those will already have @prev's
|
||||
* TSK_ONCPU bit set, and we can stop the iteration there.
|
||||
*/
|
||||
identical_state = prev->psi_flags == next->psi_flags;
|
||||
iter = NULL;
|
||||
while ((group = iterate_groups(next, &iter))) {
|
||||
if (identical_state &&
|
||||
per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
|
||||
group = task_psi_group(next);
|
||||
do {
|
||||
if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
|
||||
PSI_ONCPU) {
|
||||
common = group;
|
||||
break;
|
||||
}
|
||||
|
||||
psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
|
||||
}
|
||||
} while ((group = group->parent));
|
||||
}
|
||||
|
||||
if (prev->pid) {
|
||||
int clear = TSK_ONCPU, set = 0;
|
||||
bool wake_clock = true;
|
||||
|
||||
/*
|
||||
* When we're going to sleep, psi_dequeue() lets us
|
||||
@@ -867,26 +876,74 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
|
||||
clear |= TSK_MEMSTALL_RUNNING;
|
||||
if (prev->in_iowait)
|
||||
set |= TSK_IOWAIT;
|
||||
|
||||
/*
|
||||
* Periodic aggregation shuts off if there is a period of no
|
||||
* task changes, so we wake it back up if necessary. However,
|
||||
* don't do this if the task change is the aggregation worker
|
||||
* itself going to sleep, or we'll ping-pong forever.
|
||||
*/
|
||||
if (unlikely((prev->flags & PF_WQ_WORKER) &&
|
||||
wq_worker_last_func(prev) == psi_avgs_work))
|
||||
wake_clock = false;
|
||||
}
|
||||
|
||||
psi_flags_change(prev, clear, set);
|
||||
|
||||
iter = NULL;
|
||||
while ((group = iterate_groups(prev, &iter)) && group != common)
|
||||
psi_group_change(group, cpu, clear, set, now, true);
|
||||
group = task_psi_group(prev);
|
||||
do {
|
||||
if (group == common)
|
||||
break;
|
||||
psi_group_change(group, cpu, clear, set, now, wake_clock);
|
||||
} while ((group = group->parent));
|
||||
|
||||
/*
|
||||
* TSK_ONCPU is handled up to the common ancestor. If we're tasked
|
||||
* with dequeuing too, finish that for the rest of the hierarchy.
|
||||
* TSK_ONCPU is handled up to the common ancestor. If there are
|
||||
* any other differences between the two tasks (e.g. prev goes
|
||||
* to sleep, or only one task is memstall), finish propagating
|
||||
* those differences all the way up to the root.
|
||||
*/
|
||||
if (sleep) {
|
||||
if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
|
||||
clear &= ~TSK_ONCPU;
|
||||
for (; group; group = iterate_groups(prev, &iter))
|
||||
psi_group_change(group, cpu, clear, set, now, true);
|
||||
for (; group; group = group->parent)
|
||||
psi_group_change(group, cpu, clear, set, now, wake_clock);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
void psi_account_irqtime(struct task_struct *task, u32 delta)
|
||||
{
|
||||
int cpu = task_cpu(task);
|
||||
struct psi_group *group;
|
||||
struct psi_group_cpu *groupc;
|
||||
u64 now;
|
||||
|
||||
if (!task->pid)
|
||||
return;
|
||||
|
||||
now = cpu_clock(cpu);
|
||||
|
||||
group = task_psi_group(task);
|
||||
do {
|
||||
if (!group->enabled)
|
||||
continue;
|
||||
|
||||
groupc = per_cpu_ptr(group->pcpu, cpu);
|
||||
|
||||
write_seqcount_begin(&groupc->seq);
|
||||
|
||||
record_times(groupc, now);
|
||||
groupc->times[PSI_IRQ_FULL] += delta;
|
||||
|
||||
write_seqcount_end(&groupc->seq);
|
||||
|
||||
if (group->poll_states & (1 << PSI_IRQ_FULL))
|
||||
psi_schedule_poll_work(group, 1);
|
||||
} while ((group = group->parent));
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
* psi_memstall_enter - mark the beginning of a memory stall section
|
||||
* @flags: flags to handle nested sections
|
||||
@@ -952,7 +1009,7 @@ EXPORT_SYMBOL_GPL(psi_memstall_leave);
|
||||
#ifdef CONFIG_CGROUPS
|
||||
int psi_cgroup_alloc(struct cgroup *cgroup)
|
||||
{
|
||||
if (static_branch_likely(&psi_disabled))
|
||||
if (!static_branch_likely(&psi_cgroups_enabled))
|
||||
return 0;
|
||||
|
||||
cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
|
||||
@@ -965,12 +1022,13 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
|
||||
return -ENOMEM;
|
||||
}
|
||||
group_init(cgroup->psi);
|
||||
cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
|
||||
return 0;
|
||||
}
|
||||
|
||||
void psi_cgroup_free(struct cgroup *cgroup)
|
||||
{
|
||||
if (static_branch_likely(&psi_disabled))
|
||||
if (!static_branch_likely(&psi_cgroups_enabled))
|
||||
return;
|
||||
|
||||
cancel_delayed_work_sync(&cgroup->psi->avgs_work);
|
||||
@@ -998,7 +1056,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
|
||||
struct rq_flags rf;
|
||||
struct rq *rq;
|
||||
|
||||
if (static_branch_likely(&psi_disabled)) {
|
||||
if (!static_branch_likely(&psi_cgroups_enabled)) {
|
||||
/*
|
||||
* Lame to do this here, but the scheduler cannot be locked
|
||||
* from the outside, so we move cgroups from inside sched/.
|
||||
@@ -1046,10 +1104,45 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
|
||||
|
||||
task_rq_unlock(rq, task, &rf);
|
||||
}
|
||||
|
||||
void psi_cgroup_restart(struct psi_group *group)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
/*
|
||||
* After we disable psi_group->enabled, we don't actually
|
||||
* stop percpu tasks accounting in each psi_group_cpu,
|
||||
* instead only stop test_state() loop, record_times()
|
||||
* and averaging worker, see psi_group_change() for details.
|
||||
*
|
||||
* When disable cgroup PSI, this function has nothing to sync
|
||||
* since cgroup pressure files are hidden and percpu psi_group_cpu
|
||||
* would see !psi_group->enabled and only do task accounting.
|
||||
*
|
||||
* When re-enable cgroup PSI, this function use psi_group_change()
|
||||
* to get correct state mask from test_state() loop on tasks[],
|
||||
* and restart groupc->state_start from now, use .clear = .set = 0
|
||||
* here since no task status really changed.
|
||||
*/
|
||||
if (!group->enabled)
|
||||
return;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
struct rq_flags rf;
|
||||
u64 now;
|
||||
|
||||
rq_lock_irq(rq, &rf);
|
||||
now = cpu_clock(cpu);
|
||||
psi_group_change(group, cpu, 0, 0, now, true);
|
||||
rq_unlock_irq(rq, &rf);
|
||||
}
|
||||
}
|
||||
#endif /* CONFIG_CGROUPS */
|
||||
|
||||
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
|
||||
{
|
||||
bool only_full = false;
|
||||
int full;
|
||||
u64 now;
|
||||
|
||||
@@ -1064,7 +1157,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
|
||||
group->avg_next_update = update_averages(group, now);
|
||||
mutex_unlock(&group->avgs_lock);
|
||||
|
||||
for (full = 0; full < 2; full++) {
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
only_full = res == PSI_IRQ;
|
||||
#endif
|
||||
|
||||
for (full = 0; full < 2 - only_full; full++) {
|
||||
unsigned long avg[3] = { 0, };
|
||||
u64 total = 0;
|
||||
int w;
|
||||
@@ -1078,7 +1175,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
|
||||
}
|
||||
|
||||
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
|
||||
full ? "full" : "some",
|
||||
full || only_full ? "full" : "some",
|
||||
LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
|
||||
LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
|
||||
LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
|
||||
@@ -1106,6 +1203,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
|
||||
else
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
|
||||
return ERR_PTR(-EINVAL);
|
||||
#endif
|
||||
|
||||
if (state >= PSI_NONIDLE)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
@@ -1390,6 +1492,33 @@ static const struct proc_ops psi_cpu_proc_ops = {
|
||||
.proc_release = psi_fop_release,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
static int psi_irq_show(struct seq_file *m, void *v)
|
||||
{
|
||||
return psi_show(m, &psi_system, PSI_IRQ);
|
||||
}
|
||||
|
||||
static int psi_irq_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return psi_open(file, psi_irq_show);
|
||||
}
|
||||
|
||||
/*
 * Write handler for the IRQ pressure file: forwards the user buffer to
 * psi_write() tagged with the PSI_IRQ resource. @ppos is not used.
 */
static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
			     size_t nbytes, loff_t *ppos)
{
	ssize_t ret = psi_write(file, user_buf, nbytes, PSI_IRQ);

	return ret;
}
|
||||
|
||||
static const struct proc_ops psi_irq_proc_ops = {
|
||||
.proc_open = psi_irq_open,
|
||||
.proc_read = seq_read,
|
||||
.proc_lseek = seq_lseek,
|
||||
.proc_write = psi_irq_write,
|
||||
.proc_poll = psi_fop_poll,
|
||||
.proc_release = psi_fop_release,
|
||||
};
|
||||
#endif
|
||||
|
||||
static int __init psi_proc_init(void)
|
||||
{
|
||||
if (psi_enable) {
|
||||
@@ -1397,6 +1526,9 @@ static int __init psi_proc_init(void)
|
||||
proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
|
||||
proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
|
||||
proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
|
||||
#endif
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -2446,6 +2446,7 @@ extern unsigned int sysctl_numa_balancing_scan_delay;
|
||||
extern unsigned int sysctl_numa_balancing_scan_period_min;
|
||||
extern unsigned int sysctl_numa_balancing_scan_period_max;
|
||||
extern unsigned int sysctl_numa_balancing_scan_size;
|
||||
extern unsigned int sysctl_numa_balancing_hot_threshold;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SCHED_HRTICK
|
||||
|
||||
@@ -107,6 +107,11 @@ __schedstats_from_se(struct sched_entity *se)
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PSI
|
||||
void psi_task_change(struct task_struct *task, int clear, int set);
|
||||
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
|
||||
bool sleep);
|
||||
void psi_account_irqtime(struct task_struct *task, u32 delta);
|
||||
|
||||
/*
|
||||
* PSI tracks state that persists across sleeps, such as iowaits and
|
||||
* memory stalls. As a result, it has to distinguish between sleeps,
|
||||
@@ -201,6 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
|
||||
static inline void psi_sched_switch(struct task_struct *prev,
|
||||
struct task_struct *next,
|
||||
bool sleep) {}
|
||||
static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
|
||||
#endif /* CONFIG_PSI */
|
||||
|
||||
#ifdef CONFIG_SCHED_INFO
|
||||
|
||||
@@ -1069,7 +1069,7 @@ static int __init nrcpus(char *str)
|
||||
int nr_cpus;
|
||||
|
||||
if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids)
|
||||
nr_cpu_ids = nr_cpus;
|
||||
set_nr_cpu_ids(nr_cpus);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1087,14 +1087,16 @@ static int __init maxcpus(char *str)
|
||||
|
||||
early_param("maxcpus", maxcpus);
|
||||
|
||||
#if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS)
|
||||
/* Setup number of possible processor ids */
|
||||
unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
|
||||
EXPORT_SYMBOL(nr_cpu_ids);
|
||||
#endif
|
||||
|
||||
/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
|
||||
void __init setup_nr_cpu_ids(void)
|
||||
{
|
||||
nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
|
||||
set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1);
|
||||
}
|
||||
|
||||
/* Called by boot processor to activate the rest. */
|
||||
|
||||
@@ -433,7 +433,7 @@ bool cpu_wait_death(unsigned int cpu, int seconds)
|
||||
|
||||
/* The outgoing CPU will normally get done quite quickly. */
|
||||
if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
|
||||
goto update_state;
|
||||
goto update_state_early;
|
||||
udelay(5);
|
||||
|
||||
/* But if the outgoing CPU dawdles, wait increasingly long times. */
|
||||
@@ -444,16 +444,17 @@ bool cpu_wait_death(unsigned int cpu, int seconds)
|
||||
break;
|
||||
sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
|
||||
}
|
||||
update_state:
|
||||
update_state_early:
|
||||
oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
|
||||
update_state:
|
||||
if (oldstate == CPU_DEAD) {
|
||||
/* Outgoing CPU died normally, update state. */
|
||||
smp_mb(); /* atomic_read() before update. */
|
||||
atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
|
||||
} else {
|
||||
/* Outgoing CPU still hasn't died, set state accordingly. */
|
||||
if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
|
||||
oldstate, CPU_BROKEN) != oldstate)
|
||||
if (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
|
||||
&oldstate, CPU_BROKEN))
|
||||
goto update_state;
|
||||
ret = false;
|
||||
}
|
||||
@@ -475,14 +476,14 @@ bool cpu_report_death(void)
|
||||
int newstate;
|
||||
int cpu = smp_processor_id();
|
||||
|
||||
oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
|
||||
do {
|
||||
oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
|
||||
if (oldstate != CPU_BROKEN)
|
||||
newstate = CPU_DEAD;
|
||||
else
|
||||
newstate = CPU_DEAD_FROZEN;
|
||||
} while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
|
||||
oldstate, newstate) != oldstate);
|
||||
} while (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
|
||||
&oldstate, newstate));
|
||||
return newstate == CPU_DEAD;
|
||||
}
|
||||
|
||||
|
||||
@@ -9,9 +9,6 @@
|
||||
#define KUNIT_PROC_READ 0
|
||||
#define KUNIT_PROC_WRITE 1
|
||||
|
||||
static int i_zero;
|
||||
static int i_one_hundred = 100;
|
||||
|
||||
/*
|
||||
* Test that proc_dointvec will not try to use a NULL .data field even when the
|
||||
* length is non-zero.
|
||||
@@ -29,8 +26,8 @@ static void sysctl_test_api_dointvec_null_tbl_data(struct kunit *test)
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
.extra1 = &i_zero,
|
||||
.extra2 = &i_one_hundred,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE_HUNDRED,
|
||||
};
|
||||
/*
|
||||
* proc_dointvec expects a buffer in user space, so we allocate one. We
|
||||
@@ -79,8 +76,8 @@ static void sysctl_test_api_dointvec_table_maxlen_unset(struct kunit *test)
|
||||
.maxlen = 0,
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
.extra1 = &i_zero,
|
||||
.extra2 = &i_one_hundred,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE_HUNDRED,
|
||||
};
|
||||
void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int),
|
||||
GFP_USER);
|
||||
@@ -122,8 +119,8 @@ static void sysctl_test_api_dointvec_table_len_is_zero(struct kunit *test)
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
.extra1 = &i_zero,
|
||||
.extra2 = &i_one_hundred,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE_HUNDRED,
|
||||
};
|
||||
void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int),
|
||||
GFP_USER);
|
||||
@@ -156,8 +153,8 @@ static void sysctl_test_api_dointvec_table_read_but_position_set(
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
.extra1 = &i_zero,
|
||||
.extra2 = &i_one_hundred,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE_HUNDRED,
|
||||
};
|
||||
void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int),
|
||||
GFP_USER);
|
||||
@@ -191,8 +188,8 @@ static void sysctl_test_dointvec_read_happy_single_positive(struct kunit *test)
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
.extra1 = &i_zero,
|
||||
.extra2 = &i_one_hundred,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE_HUNDRED,
|
||||
};
|
||||
size_t len = 4;
|
||||
loff_t pos = 0;
|
||||
@@ -222,8 +219,8 @@ static void sysctl_test_dointvec_read_happy_single_negative(struct kunit *test)
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
.extra1 = &i_zero,
|
||||
.extra2 = &i_one_hundred,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE_HUNDRED,
|
||||
};
|
||||
size_t len = 5;
|
||||
loff_t pos = 0;
|
||||
@@ -251,8 +248,8 @@ static void sysctl_test_dointvec_write_happy_single_positive(struct kunit *test)
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
.extra1 = &i_zero,
|
||||
.extra2 = &i_one_hundred,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE_HUNDRED,
|
||||
};
|
||||
char input[] = "9";
|
||||
size_t len = sizeof(input) - 1;
|
||||
@@ -281,8 +278,8 @@ static void sysctl_test_dointvec_write_happy_single_negative(struct kunit *test)
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
.extra1 = &i_zero,
|
||||
.extra2 = &i_one_hundred,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE_HUNDRED,
|
||||
};
|
||||
char input[] = "-9";
|
||||
size_t len = sizeof(input) - 1;
|
||||
@@ -313,8 +310,8 @@ static void sysctl_test_api_dointvec_write_single_less_int_min(
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
.extra1 = &i_zero,
|
||||
.extra2 = &i_one_hundred,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE_HUNDRED,
|
||||
};
|
||||
size_t max_len = 32, len = max_len;
|
||||
loff_t pos = 0;
|
||||
@@ -351,8 +348,8 @@ static void sysctl_test_api_dointvec_write_single_greater_int_max(
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
.extra1 = &i_zero,
|
||||
.extra2 = &i_one_hundred,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE_HUNDRED,
|
||||
};
|
||||
size_t max_len = 32, len = max_len;
|
||||
loff_t pos = 0;
|
||||
|
||||
@@ -82,9 +82,16 @@
|
||||
#include <linux/rtmutex.h>
|
||||
#endif
|
||||
|
||||
/* shared constants to be used in various sysctls */
|
||||
const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 };
|
||||
EXPORT_SYMBOL(sysctl_vals);
|
||||
|
||||
const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX };
|
||||
EXPORT_SYMBOL_GPL(sysctl_long_vals);
|
||||
|
||||
#if defined(CONFIG_SYSCTL)
|
||||
|
||||
/* Constants used for minimum and maximum */
|
||||
/* Constants used for minimum and maximum */
|
||||
|
||||
#ifdef CONFIG_PERF_EVENTS
|
||||
static const int six_hundred_forty_kb = 640 * 1024;
|
||||
@@ -129,11 +136,6 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
|
||||
int sysctl_legacy_va_layout;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_COMPACTION
|
||||
/* min_extfrag_threshold is SYSCTL_ZERO */;
|
||||
static const int max_extfrag_threshold = 1000;
|
||||
#endif
|
||||
|
||||
#endif /* CONFIG_SYSCTL */
|
||||
|
||||
/*
|
||||
@@ -1052,9 +1054,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table,
|
||||
return 0;
|
||||
}
|
||||
|
||||
i = (unsigned long *) data;
|
||||
min = (unsigned long *) table->extra1;
|
||||
max = (unsigned long *) table->extra2;
|
||||
i = data;
|
||||
min = table->extra1;
|
||||
max = table->extra2;
|
||||
vleft = table->maxlen / sizeof(unsigned long);
|
||||
left = *lenp;
|
||||
|
||||
@@ -1641,6 +1643,14 @@ static struct ctl_table kern_table[] = {
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_FOUR,
|
||||
},
|
||||
{
|
||||
.procname = "numa_balancing_promote_rate_limit_MBps",
|
||||
.data = &sysctl_numa_balancing_promote_rate_limit,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
},
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
{
|
||||
.procname = "panic",
|
||||
@@ -2216,7 +2226,7 @@ static struct ctl_table vm_table[] = {
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = (void *)&max_extfrag_threshold,
|
||||
.extra2 = SYSCTL_ONE_THOUSAND,
|
||||
},
|
||||
{
|
||||
.procname = "compact_unevictable_allowed",
|
||||
|
||||
@@ -47,12 +47,12 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
|
||||
/* record the work call stack in order to print it in KASAN reports */
|
||||
kasan_record_aux_stack(work);
|
||||
|
||||
head = READ_ONCE(task->task_works);
|
||||
do {
|
||||
head = READ_ONCE(task->task_works);
|
||||
if (unlikely(head == &work_exited))
|
||||
return -ESRCH;
|
||||
work->next = head;
|
||||
} while (cmpxchg(&task->task_works, head, work) != head);
|
||||
} while (!try_cmpxchg(&task->task_works, &head, work));
|
||||
|
||||
switch (notify) {
|
||||
case TWA_NONE:
|
||||
@@ -100,10 +100,12 @@ task_work_cancel_match(struct task_struct *task,
|
||||
* we raced with task_work_run(), *pprev == NULL/exited.
|
||||
*/
|
||||
raw_spin_lock_irqsave(&task->pi_lock, flags);
|
||||
while ((work = READ_ONCE(*pprev))) {
|
||||
if (!match(work, data))
|
||||
work = READ_ONCE(*pprev);
|
||||
while (work) {
|
||||
if (!match(work, data)) {
|
||||
pprev = &work->next;
|
||||
else if (cmpxchg(pprev, work, work->next) == work)
|
||||
work = READ_ONCE(*pprev);
|
||||
} else if (try_cmpxchg(pprev, &work, work->next))
|
||||
break;
|
||||
}
|
||||
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
|
||||
@@ -151,16 +153,16 @@ void task_work_run(void)
|
||||
* work->func() can do task_work_add(), do not set
|
||||
* work_exited unless the list is empty.
|
||||
*/
|
||||
work = READ_ONCE(task->task_works);
|
||||
do {
|
||||
head = NULL;
|
||||
work = READ_ONCE(task->task_works);
|
||||
if (!work) {
|
||||
if (task->flags & PF_EXITING)
|
||||
head = &work_exited;
|
||||
else
|
||||
break;
|
||||
}
|
||||
} while (cmpxchg(&task->task_works, work, head) != work);
|
||||
} while (!try_cmpxchg(&task->task_works, &work, head));
|
||||
|
||||
if (!work)
|
||||
break;
|
||||
|
||||
@@ -310,7 +310,7 @@ static void clocksource_verify_choose_cpus(void)
|
||||
* CPUs that are currently online.
|
||||
*/
|
||||
for (i = 1; i < n; i++) {
|
||||
cpu = prandom_u32() % nr_cpu_ids;
|
||||
cpu = prandom_u32_max(nr_cpu_ids);
|
||||
cpu = cpumask_next(cpu - 1, cpu_online_mask);
|
||||
if (cpu >= nr_cpu_ids)
|
||||
cpu = cpumask_first(cpu_online_mask);
|
||||
|
||||
@@ -1644,6 +1644,18 @@ ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_ex
|
||||
static struct ftrace_ops *
|
||||
ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops);
|
||||
|
||||
static bool skip_record(struct dyn_ftrace *rec)
|
||||
{
|
||||
/*
|
||||
* At boot up, weak functions are set to disable. Function tracing
|
||||
* can be enabled before they are, and they still need to be disabled now.
|
||||
* If the record is disabled, still continue if it is marked as already
|
||||
* enabled (this is needed to keep the accounting working).
|
||||
*/
|
||||
return rec->flags & FTRACE_FL_DISABLED &&
|
||||
!(rec->flags & FTRACE_FL_ENABLED);
|
||||
}
|
||||
|
||||
static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
|
||||
int filter_hash,
|
||||
bool inc)
|
||||
@@ -1693,7 +1705,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
|
||||
int in_hash = 0;
|
||||
int match = 0;
|
||||
|
||||
if (rec->flags & FTRACE_FL_DISABLED)
|
||||
if (skip_record(rec))
|
||||
continue;
|
||||
|
||||
if (all) {
|
||||
@@ -2016,7 +2028,6 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
|
||||
static void print_ip_ins(const char *fmt, const unsigned char *p)
|
||||
{
|
||||
char ins[MCOUNT_INSN_SIZE];
|
||||
int i;
|
||||
|
||||
if (copy_from_kernel_nofault(ins, p, MCOUNT_INSN_SIZE)) {
|
||||
printk(KERN_CONT "%s[FAULT] %px\n", fmt, p);
|
||||
@@ -2024,9 +2035,7 @@ static void print_ip_ins(const char *fmt, const unsigned char *p)
|
||||
}
|
||||
|
||||
printk(KERN_CONT "%s", fmt);
|
||||
|
||||
for (i = 0; i < MCOUNT_INSN_SIZE; i++)
|
||||
printk(KERN_CONT "%s%02x", i ? ":" : "", ins[i]);
|
||||
pr_cont("%*phC", MCOUNT_INSN_SIZE, ins);
|
||||
}
|
||||
|
||||
enum ftrace_bug_type ftrace_bug_type;
|
||||
@@ -2126,7 +2135,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
|
||||
|
||||
ftrace_bug_type = FTRACE_BUG_UNKNOWN;
|
||||
|
||||
if (rec->flags & FTRACE_FL_DISABLED)
|
||||
if (skip_record(rec))
|
||||
return FTRACE_UPDATE_IGNORE;
|
||||
|
||||
/*
|
||||
@@ -2241,7 +2250,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
|
||||
if (update) {
|
||||
/* If there's no more users, clear all flags */
|
||||
if (!ftrace_rec_count(rec))
|
||||
rec->flags = 0;
|
||||
rec->flags &= FTRACE_FL_DISABLED;
|
||||
else
|
||||
/*
|
||||
* Just disable the record, but keep the ops TRAMP
|
||||
@@ -2634,7 +2643,7 @@ void __weak ftrace_replace_code(int mod_flags)
|
||||
|
||||
do_for_each_ftrace_rec(pg, rec) {
|
||||
|
||||
if (rec->flags & FTRACE_FL_DISABLED)
|
||||
if (skip_record(rec))
|
||||
continue;
|
||||
|
||||
failed = __ftrace_replace_code(rec, enable);
|
||||
@@ -5427,6 +5436,8 @@ static struct ftrace_ops stub_ops = {
|
||||
* it is safe to modify the ftrace record, where it should be
|
||||
* currently calling @old_addr directly, to call @new_addr.
|
||||
*
|
||||
* This is called with direct_mutex locked.
|
||||
*
|
||||
* Safety checks should be made to make sure that the code at
|
||||
* @rec->ip is currently calling @old_addr. And this must
|
||||
* also update entry->direct to @new_addr.
|
||||
@@ -5439,6 +5450,8 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
|
||||
unsigned long ip = rec->ip;
|
||||
int ret;
|
||||
|
||||
lockdep_assert_held(&direct_mutex);
|
||||
|
||||
/*
|
||||
* The ftrace_lock was used to determine if the record
|
||||
* had more than one registered user to it. If it did,
|
||||
@@ -5461,7 +5474,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
|
||||
if (ret)
|
||||
goto out_lock;
|
||||
|
||||
ret = register_ftrace_function(&stub_ops);
|
||||
ret = register_ftrace_function_nolock(&stub_ops);
|
||||
if (ret) {
|
||||
ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
|
||||
goto out_lock;
|
||||
@@ -6081,8 +6094,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
|
||||
|
||||
if (filter_hash) {
|
||||
orig_hash = &iter->ops->func_hash->filter_hash;
|
||||
if (iter->tr && !list_empty(&iter->tr->mod_trace))
|
||||
iter->hash->flags |= FTRACE_HASH_FL_MOD;
|
||||
if (iter->tr) {
|
||||
if (list_empty(&iter->tr->mod_trace))
|
||||
iter->hash->flags &= ~FTRACE_HASH_FL_MOD;
|
||||
else
|
||||
iter->hash->flags |= FTRACE_HASH_FL_MOD;
|
||||
}
|
||||
} else
|
||||
orig_hash = &iter->ops->func_hash->notrace_hash;
|
||||
|
||||
|
||||
@@ -35,6 +35,45 @@
|
||||
static struct trace_event_file *gen_kprobe_test;
|
||||
static struct trace_event_file *gen_kretprobe_test;
|
||||
|
||||
#define KPROBE_GEN_TEST_FUNC "do_sys_open"
|
||||
|
||||
/* X86 */
|
||||
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_32)
|
||||
#define KPROBE_GEN_TEST_ARG0 "dfd=%ax"
|
||||
#define KPROBE_GEN_TEST_ARG1 "filename=%dx"
|
||||
#define KPROBE_GEN_TEST_ARG2 "flags=%cx"
|
||||
#define KPROBE_GEN_TEST_ARG3 "mode=+4($stack)"
|
||||
|
||||
/* ARM64 */
|
||||
#elif defined(CONFIG_ARM64)
|
||||
#define KPROBE_GEN_TEST_ARG0 "dfd=%x0"
|
||||
#define KPROBE_GEN_TEST_ARG1 "filename=%x1"
|
||||
#define KPROBE_GEN_TEST_ARG2 "flags=%x2"
|
||||
#define KPROBE_GEN_TEST_ARG3 "mode=%x3"
|
||||
|
||||
/* ARM */
|
||||
#elif defined(CONFIG_ARM)
|
||||
#define KPROBE_GEN_TEST_ARG0 "dfd=%r0"
|
||||
#define KPROBE_GEN_TEST_ARG1 "filename=%r1"
|
||||
#define KPROBE_GEN_TEST_ARG2 "flags=%r2"
|
||||
#define KPROBE_GEN_TEST_ARG3 "mode=%r3"
|
||||
|
||||
/* RISCV */
|
||||
#elif defined(CONFIG_RISCV)
|
||||
#define KPROBE_GEN_TEST_ARG0 "dfd=%a0"
|
||||
#define KPROBE_GEN_TEST_ARG1 "filename=%a1"
|
||||
#define KPROBE_GEN_TEST_ARG2 "flags=%a2"
|
||||
#define KPROBE_GEN_TEST_ARG3 "mode=%a3"
|
||||
|
||||
/* others */
|
||||
#else
|
||||
#define KPROBE_GEN_TEST_ARG0 NULL
|
||||
#define KPROBE_GEN_TEST_ARG1 NULL
|
||||
#define KPROBE_GEN_TEST_ARG2 NULL
|
||||
#define KPROBE_GEN_TEST_ARG3 NULL
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
* Test to make sure we can create a kprobe event, then add more
|
||||
* fields.
|
||||
@@ -58,14 +97,14 @@ static int __init test_gen_kprobe_cmd(void)
|
||||
* fields.
|
||||
*/
|
||||
ret = kprobe_event_gen_cmd_start(&cmd, "gen_kprobe_test",
|
||||
"do_sys_open",
|
||||
"dfd=%ax", "filename=%dx");
|
||||
KPROBE_GEN_TEST_FUNC,
|
||||
KPROBE_GEN_TEST_ARG0, KPROBE_GEN_TEST_ARG1);
|
||||
if (ret)
|
||||
goto free;
|
||||
|
||||
/* Use kprobe_event_add_fields to add the rest of the fields */
|
||||
|
||||
ret = kprobe_event_add_fields(&cmd, "flags=%cx", "mode=+4($stack)");
|
||||
ret = kprobe_event_add_fields(&cmd, KPROBE_GEN_TEST_ARG2, KPROBE_GEN_TEST_ARG3);
|
||||
if (ret)
|
||||
goto free;
|
||||
|
||||
@@ -128,7 +167,7 @@ static int __init test_gen_kretprobe_cmd(void)
|
||||
* Define the kretprobe event.
|
||||
*/
|
||||
ret = kretprobe_event_gen_cmd_start(&cmd, "gen_kretprobe_test",
|
||||
"do_sys_open",
|
||||
KPROBE_GEN_TEST_FUNC,
|
||||
"$retval");
|
||||
if (ret)
|
||||
goto free;
|
||||
@@ -206,7 +245,7 @@ static void __exit kprobe_event_gen_test_exit(void)
|
||||
WARN_ON(kprobe_event_delete("gen_kprobe_test"));
|
||||
|
||||
/* Disable the event or you can't remove it */
|
||||
WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr,
|
||||
WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr,
|
||||
"kprobes",
|
||||
"gen_kretprobe_test", false));
|
||||
|
||||
|
||||
@@ -413,6 +413,7 @@ struct rb_irq_work {
|
||||
struct irq_work work;
|
||||
wait_queue_head_t waiters;
|
||||
wait_queue_head_t full_waiters;
|
||||
long wait_index;
|
||||
bool waiters_pending;
|
||||
bool full_waiters_pending;
|
||||
bool wakeup_full;
|
||||
@@ -884,7 +885,7 @@ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
|
||||
}
|
||||
|
||||
/**
|
||||
* ring_buffer_nr_pages_dirty - get the number of used pages in the ring buffer
|
||||
* ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer
|
||||
* @buffer: The ring_buffer to get the number of pages from
|
||||
* @cpu: The cpu of the ring_buffer to get the number of pages from
|
||||
*
|
||||
@@ -917,12 +918,44 @@ static void rb_wake_up_waiters(struct irq_work *work)
|
||||
struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
|
||||
|
||||
wake_up_all(&rbwork->waiters);
|
||||
if (rbwork->wakeup_full) {
|
||||
if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
|
||||
rbwork->wakeup_full = false;
|
||||
rbwork->full_waiters_pending = false;
|
||||
wake_up_all(&rbwork->full_waiters);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* ring_buffer_wake_waiters - wake up any waiters on this ring buffer
|
||||
* @buffer: The ring buffer to wake waiters on
|
||||
*
|
||||
* In the case of a file that represents a ring buffer is closing,
|
||||
* it is prudent to wake up any waiters that are on this.
|
||||
*/
|
||||
void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
|
||||
{
|
||||
struct ring_buffer_per_cpu *cpu_buffer;
|
||||
struct rb_irq_work *rbwork;
|
||||
|
||||
if (cpu == RING_BUFFER_ALL_CPUS) {
|
||||
|
||||
/* Wake up individual ones too. One level recursion */
|
||||
for_each_buffer_cpu(buffer, cpu)
|
||||
ring_buffer_wake_waiters(buffer, cpu);
|
||||
|
||||
rbwork = &buffer->irq_work;
|
||||
} else {
|
||||
cpu_buffer = buffer->buffers[cpu];
|
||||
rbwork = &cpu_buffer->irq_work;
|
||||
}
|
||||
|
||||
rbwork->wait_index++;
|
||||
/* make sure the waiters see the new index */
|
||||
smp_wmb();
|
||||
|
||||
rb_wake_up_waiters(&rbwork->work);
|
||||
}
|
||||
|
||||
/**
|
||||
* ring_buffer_wait - wait for input to the ring buffer
|
||||
* @buffer: buffer to wait on
|
||||
@@ -938,6 +971,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
|
||||
struct ring_buffer_per_cpu *cpu_buffer;
|
||||
DEFINE_WAIT(wait);
|
||||
struct rb_irq_work *work;
|
||||
long wait_index;
|
||||
int ret = 0;
|
||||
|
||||
/*
|
||||
@@ -956,6 +990,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
|
||||
work = &cpu_buffer->irq_work;
|
||||
}
|
||||
|
||||
wait_index = READ_ONCE(work->wait_index);
|
||||
|
||||
while (true) {
|
||||
if (full)
|
||||
@@ -1011,7 +1046,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
|
||||
nr_pages = cpu_buffer->nr_pages;
|
||||
dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
|
||||
if (!cpu_buffer->shortest_full ||
|
||||
cpu_buffer->shortest_full < full)
|
||||
cpu_buffer->shortest_full > full)
|
||||
cpu_buffer->shortest_full = full;
|
||||
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
|
||||
if (!pagebusy &&
|
||||
@@ -1020,6 +1055,11 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
|
||||
}
|
||||
|
||||
schedule();
|
||||
|
||||
/* Make sure to see the new wait index */
|
||||
smp_rmb();
|
||||
if (wait_index != work->wait_index)
|
||||
break;
|
||||
}
|
||||
|
||||
if (full)
|
||||
@@ -2608,6 +2648,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
|
||||
/* Mark the rest of the page with padding */
|
||||
rb_event_set_padding(event);
|
||||
|
||||
/* Make sure the padding is visible before the write update */
|
||||
smp_wmb();
|
||||
|
||||
/* Set the write back to the previous setting */
|
||||
local_sub(length, &tail_page->write);
|
||||
return;
|
||||
@@ -2619,6 +2662,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
|
||||
/* time delta must be non zero */
|
||||
event->time_delta = 1;
|
||||
|
||||
/* Make sure the padding is visible before the tail_page->write update */
|
||||
smp_wmb();
|
||||
|
||||
/* Set write to end of buffer */
|
||||
length = (tail + length) - BUF_PAGE_SIZE;
|
||||
local_sub(length, &tail_page->write);
|
||||
@@ -4587,6 +4633,33 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
|
||||
arch_spin_unlock(&cpu_buffer->lock);
|
||||
local_irq_restore(flags);
|
||||
|
||||
/*
|
||||
* The writer has preempt disable, wait for it. But not forever
|
||||
* Although, 1 second is pretty much "forever"
|
||||
*/
|
||||
#define USECS_WAIT 1000000
|
||||
for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
|
||||
/* If the write is past the end of page, a writer is still updating it */
|
||||
if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
|
||||
break;
|
||||
|
||||
udelay(1);
|
||||
|
||||
/* Get the latest version of the reader write value */
|
||||
smp_rmb();
|
||||
}
|
||||
|
||||
/* The writer is not moving forward? Something is wrong */
|
||||
if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT))
|
||||
reader = NULL;
|
||||
|
||||
/*
|
||||
* Make sure we see any padding after the write update
|
||||
* (see rb_reset_tail())
|
||||
*/
|
||||
smp_rmb();
|
||||
|
||||
|
||||
return reader;
|
||||
}
|
||||
|
||||
@@ -5232,7 +5305,7 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
|
||||
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
|
||||
|
||||
/**
|
||||
* ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
|
||||
* ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer
|
||||
* @buffer: The ring buffer to reset a per cpu buffer of
|
||||
* @cpu: The CPU buffer to be reset
|
||||
*/
|
||||
@@ -5302,7 +5375,7 @@ void ring_buffer_reset(struct trace_buffer *buffer)
|
||||
EXPORT_SYMBOL_GPL(ring_buffer_reset);
|
||||
|
||||
/**
|
||||
* rind_buffer_empty - is the ring buffer empty?
|
||||
* ring_buffer_empty - is the ring buffer empty?
|
||||
* @buffer: The ring buffer to test
|
||||
*/
|
||||
bool ring_buffer_empty(struct trace_buffer *buffer)
|
||||
@@ -5616,7 +5689,15 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
|
||||
unsigned int pos = 0;
|
||||
unsigned int size;
|
||||
|
||||
if (full)
|
||||
/*
|
||||
* If a full page is expected, this can still be returned
|
||||
* if there's been a previous partial read and the
|
||||
* rest of the page can be read and the commit page is off
|
||||
* the reader page.
|
||||
*/
|
||||
if (full &&
|
||||
(!read || (len < (commit - read)) ||
|
||||
cpu_buffer->reader_page == cpu_buffer->commit_page))
|
||||
goto out_unlock;
|
||||
|
||||
if (len > (commit - read))
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
|
||||
#include "wip.h"
|
||||
|
||||
struct rv_monitor rv_wip;
|
||||
static struct rv_monitor rv_wip;
|
||||
DECLARE_DA_MON_PER_CPU(wip, unsigned char);
|
||||
|
||||
static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
|
||||
@@ -60,7 +60,7 @@ static void disable_wip(void)
|
||||
da_monitor_destroy_wip();
|
||||
}
|
||||
|
||||
struct rv_monitor rv_wip = {
|
||||
static struct rv_monitor rv_wip = {
|
||||
.name = "wip",
|
||||
.description = "wakeup in preemptive per-cpu testing monitor.",
|
||||
.enable = enable_wip,
|
||||
@@ -69,13 +69,13 @@ struct rv_monitor rv_wip = {
|
||||
.enabled = 0,
|
||||
};
|
||||
|
||||
static int register_wip(void)
|
||||
static int __init register_wip(void)
|
||||
{
|
||||
rv_register_monitor(&rv_wip);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void unregister_wip(void)
|
||||
static void __exit unregister_wip(void)
|
||||
{
|
||||
rv_unregister_monitor(&rv_wip);
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
|
||||
#include "wwnr.h"
|
||||
|
||||
struct rv_monitor rv_wwnr;
|
||||
static struct rv_monitor rv_wwnr;
|
||||
DECLARE_DA_MON_PER_TASK(wwnr, unsigned char);
|
||||
|
||||
static void handle_switch(void *data, bool preempt, struct task_struct *p,
|
||||
@@ -59,7 +59,7 @@ static void disable_wwnr(void)
|
||||
da_monitor_destroy_wwnr();
|
||||
}
|
||||
|
||||
struct rv_monitor rv_wwnr = {
|
||||
static struct rv_monitor rv_wwnr = {
|
||||
.name = "wwnr",
|
||||
.description = "wakeup while not running per-task testing model.",
|
||||
.enable = enable_wwnr,
|
||||
@@ -68,13 +68,13 @@ struct rv_monitor rv_wwnr = {
|
||||
.enabled = 0,
|
||||
};
|
||||
|
||||
static int register_wwnr(void)
|
||||
static int __init register_wwnr(void)
|
||||
{
|
||||
rv_register_monitor(&rv_wwnr);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void unregister_wwnr(void)
|
||||
static void __exit unregister_wwnr(void)
|
||||
{
|
||||
rv_unregister_monitor(&rv_wwnr);
|
||||
}
|
||||
|
||||
@@ -1193,12 +1193,14 @@ void *tracing_cond_snapshot_data(struct trace_array *tr)
|
||||
{
|
||||
void *cond_data = NULL;
|
||||
|
||||
local_irq_disable();
|
||||
arch_spin_lock(&tr->max_lock);
|
||||
|
||||
if (tr->cond_snapshot)
|
||||
cond_data = tr->cond_snapshot->cond_data;
|
||||
|
||||
arch_spin_unlock(&tr->max_lock);
|
||||
local_irq_enable();
|
||||
|
||||
return cond_data;
|
||||
}
|
||||
@@ -1334,9 +1336,11 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
|
||||
goto fail_unlock;
|
||||
}
|
||||
|
||||
local_irq_disable();
|
||||
arch_spin_lock(&tr->max_lock);
|
||||
tr->cond_snapshot = cond_snapshot;
|
||||
arch_spin_unlock(&tr->max_lock);
|
||||
local_irq_enable();
|
||||
|
||||
mutex_unlock(&trace_types_lock);
|
||||
|
||||
@@ -1363,6 +1367,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
local_irq_disable();
|
||||
arch_spin_lock(&tr->max_lock);
|
||||
|
||||
if (!tr->cond_snapshot)
|
||||
@@ -1373,6 +1378,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
|
||||
}
|
||||
|
||||
arch_spin_unlock(&tr->max_lock);
|
||||
local_irq_enable();
|
||||
|
||||
return ret;
|
||||
}
|
||||
@@ -2200,6 +2206,11 @@ static size_t tgid_map_max;
|
||||
|
||||
#define SAVED_CMDLINES_DEFAULT 128
|
||||
#define NO_CMDLINE_MAP UINT_MAX
|
||||
/*
|
||||
* Preemption must be disabled before acquiring trace_cmdline_lock.
|
||||
* The various trace_arrays' max_lock must be acquired in a context
|
||||
* where interrupts are disabled.
|
||||
*/
|
||||
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
|
||||
struct saved_cmdlines_buffer {
|
||||
unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
|
||||
@@ -2412,7 +2423,11 @@ static int trace_save_cmdline(struct task_struct *tsk)
|
||||
* the lock, but we also don't want to spin
|
||||
* nor do we want to disable interrupts,
|
||||
* so if we miss here, then better luck next time.
|
||||
*
|
||||
* This is called within the scheduler and wake up, so interrupts
|
||||
* had better be disabled and the run queue lock held.
|
||||
*/
|
||||
lockdep_assert_preemption_disabled();
|
||||
if (!arch_spin_trylock(&trace_cmdline_lock))
|
||||
return 0;
|
||||
|
||||
@@ -5890,9 +5905,11 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
|
||||
char buf[64];
|
||||
int r;
|
||||
|
||||
preempt_disable();
|
||||
arch_spin_lock(&trace_cmdline_lock);
|
||||
r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
|
||||
arch_spin_unlock(&trace_cmdline_lock);
|
||||
preempt_enable();
|
||||
|
||||
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
|
||||
}
|
||||
@@ -5917,10 +5934,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val)
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
preempt_disable();
|
||||
arch_spin_lock(&trace_cmdline_lock);
|
||||
savedcmd_temp = savedcmd;
|
||||
savedcmd = s;
|
||||
arch_spin_unlock(&trace_cmdline_lock);
|
||||
preempt_enable();
|
||||
free_saved_cmdlines_buffer(savedcmd_temp);
|
||||
|
||||
return 0;
|
||||
@@ -6373,10 +6392,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
|
||||
|
||||
#ifdef CONFIG_TRACER_SNAPSHOT
|
||||
if (t->use_max_tr) {
|
||||
local_irq_disable();
|
||||
arch_spin_lock(&tr->max_lock);
|
||||
if (tr->cond_snapshot)
|
||||
ret = -EBUSY;
|
||||
arch_spin_unlock(&tr->max_lock);
|
||||
local_irq_enable();
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
@@ -6407,12 +6428,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
|
||||
if (tr->current_trace->reset)
|
||||
tr->current_trace->reset(tr);
|
||||
|
||||
#ifdef CONFIG_TRACER_MAX_TRACE
|
||||
had_max_tr = tr->current_trace->use_max_tr;
|
||||
|
||||
/* Current trace needs to be nop_trace before synchronize_rcu */
|
||||
tr->current_trace = &nop_trace;
|
||||
|
||||
#ifdef CONFIG_TRACER_MAX_TRACE
|
||||
had_max_tr = tr->allocated_snapshot;
|
||||
|
||||
if (had_max_tr && !t->use_max_tr) {
|
||||
/*
|
||||
* We need to make sure that the update_max_tr sees that
|
||||
@@ -6425,11 +6446,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
|
||||
free_snapshot(tr);
|
||||
}
|
||||
|
||||
if (t->use_max_tr && !had_max_tr) {
|
||||
if (t->use_max_tr && !tr->allocated_snapshot) {
|
||||
ret = tracing_alloc_snapshot_instance(tr);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
#else
|
||||
tr->current_trace = &nop_trace;
|
||||
#endif
|
||||
|
||||
if (t->init) {
|
||||
@@ -7436,10 +7459,12 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
|
||||
goto out;
|
||||
}
|
||||
|
||||
local_irq_disable();
|
||||
arch_spin_lock(&tr->max_lock);
|
||||
if (tr->cond_snapshot)
|
||||
ret = -EBUSY;
|
||||
arch_spin_unlock(&tr->max_lock);
|
||||
local_irq_enable();
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
@@ -8137,6 +8162,12 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
|
||||
|
||||
__trace_array_put(iter->tr);
|
||||
|
||||
iter->wait_index++;
|
||||
/* Make sure the waiters see the new wait_index */
|
||||
smp_wmb();
|
||||
|
||||
ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
|
||||
|
||||
if (info->spare)
|
||||
ring_buffer_free_read_page(iter->array_buffer->buffer,
|
||||
info->spare_cpu, info->spare);
|
||||
@@ -8290,6 +8321,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
|
||||
|
||||
/* did we read anything? */
|
||||
if (!spd.nr_pages) {
|
||||
long wait_index;
|
||||
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
@@ -8297,10 +8330,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
|
||||
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
|
||||
goto out;
|
||||
|
||||
wait_index = READ_ONCE(iter->wait_index);
|
||||
|
||||
ret = wait_on_pipe(iter, iter->tr->buffer_percent);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
/* No need to wait after waking up when tracing is off */
|
||||
if (!tracer_tracing_is_on(iter->tr))
|
||||
goto out;
|
||||
|
||||
/* Make sure we see the new wait_index */
|
||||
smp_rmb();
|
||||
if (wait_index != iter->wait_index)
|
||||
goto out;
|
||||
|
||||
goto again;
|
||||
}
|
||||
|
||||
@@ -8311,12 +8355,34 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
|
||||
static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
{
|
||||
struct ftrace_buffer_info *info = file->private_data;
|
||||
struct trace_iterator *iter = &info->iter;
|
||||
|
||||
if (cmd)
|
||||
return -ENOIOCTLCMD;
|
||||
|
||||
mutex_lock(&trace_types_lock);
|
||||
|
||||
iter->wait_index++;
|
||||
/* Make sure the waiters see the new wait_index */
|
||||
smp_wmb();
|
||||
|
||||
ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
|
||||
|
||||
mutex_unlock(&trace_types_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct file_operations tracing_buffers_fops = {
|
||||
.open = tracing_buffers_open,
|
||||
.read = tracing_buffers_read,
|
||||
.poll = tracing_buffers_poll,
|
||||
.release = tracing_buffers_release,
|
||||
.splice_read = tracing_buffers_splice_read,
|
||||
.unlocked_ioctl = tracing_buffers_ioctl,
|
||||
.llseek = no_llseek,
|
||||
};
|
||||
|
||||
@@ -9005,6 +9071,8 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
|
||||
tracer_tracing_off(tr);
|
||||
if (tr->current_trace->stop)
|
||||
tr->current_trace->stop(tr);
|
||||
/* Wake up any waiters */
|
||||
ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS);
|
||||
}
|
||||
mutex_unlock(&trace_types_lock);
|
||||
}
|
||||
@@ -10091,7 +10159,7 @@ __init static int tracer_alloc_buffers(void)
|
||||
* buffer. The memory will be removed once the "instance" is removed.
|
||||
*/
|
||||
ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE,
|
||||
"trace/RB:preapre", trace_rb_cpu_prepare,
|
||||
"trace/RB:prepare", trace_rb_cpu_prepare,
|
||||
NULL);
|
||||
if (ret < 0)
|
||||
goto out_free_cpumask;
|
||||
|
||||
@@ -1435,8 +1435,6 @@ event_trigger_unlock_commit(struct trace_event_file *file,
|
||||
struct filter_pred;
|
||||
struct regex;
|
||||
|
||||
typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
|
||||
|
||||
typedef int (*regex_match_func)(char *str, struct regex *r, int len);
|
||||
|
||||
enum regex_type {
|
||||
@@ -1455,17 +1453,6 @@ struct regex {
|
||||
regex_match_func match;
|
||||
};
|
||||
|
||||
struct filter_pred {
|
||||
filter_pred_fn_t fn;
|
||||
u64 val;
|
||||
struct regex regex;
|
||||
unsigned short *ops;
|
||||
struct ftrace_event_field *field;
|
||||
int offset;
|
||||
int not;
|
||||
int op;
|
||||
};
|
||||
|
||||
static inline bool is_string_field(struct ftrace_event_field *field)
|
||||
{
|
||||
return field->filter_type == FILTER_DYN_STRING ||
|
||||
|
||||
@@ -51,7 +51,7 @@ static void trace_do_benchmark(void)
|
||||
|
||||
local_irq_disable();
|
||||
start = trace_clock_local();
|
||||
trace_benchmark_event(bm_str);
|
||||
trace_benchmark_event(bm_str, bm_last);
|
||||
stop = trace_clock_local();
|
||||
local_irq_enable();
|
||||
|
||||
|
||||
@@ -14,19 +14,21 @@ extern void trace_benchmark_unreg(void);
|
||||
|
||||
TRACE_EVENT_FN(benchmark_event,
|
||||
|
||||
TP_PROTO(const char *str),
|
||||
TP_PROTO(const char *str, u64 delta),
|
||||
|
||||
TP_ARGS(str),
|
||||
TP_ARGS(str, delta),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__array( char, str, BENCHMARK_EVENT_STRLEN )
|
||||
__field( u64, delta)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN);
|
||||
__entry->delta = delta;
|
||||
),
|
||||
|
||||
TP_printk("%s", __entry->str),
|
||||
TP_printk("%s delta=%llu", __entry->str, __entry->delta),
|
||||
|
||||
trace_benchmark_reg, trace_benchmark_unreg
|
||||
);
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "trace_dynevent.h"
|
||||
#include "trace_probe.h"
|
||||
#include "trace_probe_tmpl.h"
|
||||
#include "trace_probe_kernel.h"
|
||||
|
||||
#define EPROBE_EVENT_SYSTEM "eprobes"
|
||||
|
||||
@@ -26,6 +27,9 @@ struct trace_eprobe {
|
||||
/* tracepoint event */
|
||||
const char *event_name;
|
||||
|
||||
/* filter string for the tracepoint */
|
||||
char *filter_str;
|
||||
|
||||
struct trace_event_call *event;
|
||||
|
||||
struct dyn_event devent;
|
||||
@@ -453,29 +457,14 @@ NOKPROBE_SYMBOL(process_fetch_insn)
|
||||
static nokprobe_inline int
|
||||
fetch_store_strlen_user(unsigned long addr)
|
||||
{
|
||||
const void __user *uaddr = (__force const void __user *)addr;
|
||||
|
||||
return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
|
||||
return kern_fetch_store_strlen_user(addr);
|
||||
}
|
||||
|
||||
/* Return the length of string -- including null terminal byte */
|
||||
static nokprobe_inline int
|
||||
fetch_store_strlen(unsigned long addr)
|
||||
{
|
||||
int ret, len = 0;
|
||||
u8 c;
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
|
||||
if (addr < TASK_SIZE)
|
||||
return fetch_store_strlen_user(addr);
|
||||
#endif
|
||||
|
||||
do {
|
||||
ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
|
||||
len++;
|
||||
} while (c && ret == 0 && len < MAX_STRING_SIZE);
|
||||
|
||||
return (ret < 0) ? ret : len;
|
||||
return kern_fetch_store_strlen(addr);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -485,21 +474,7 @@ fetch_store_strlen(unsigned long addr)
|
||||
static nokprobe_inline int
|
||||
fetch_store_string_user(unsigned long addr, void *dest, void *base)
|
||||
{
|
||||
const void __user *uaddr = (__force const void __user *)addr;
|
||||
int maxlen = get_loc_len(*(u32 *)dest);
|
||||
void *__dest;
|
||||
long ret;
|
||||
|
||||
if (unlikely(!maxlen))
|
||||
return -ENOMEM;
|
||||
|
||||
__dest = get_loc_data(dest, base);
|
||||
|
||||
ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
|
||||
if (ret >= 0)
|
||||
*(u32 *)dest = make_data_loc(ret, __dest - base);
|
||||
|
||||
return ret;
|
||||
return kern_fetch_store_string_user(addr, dest, base);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -509,29 +484,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base)
|
||||
static nokprobe_inline int
|
||||
fetch_store_string(unsigned long addr, void *dest, void *base)
|
||||
{
|
||||
int maxlen = get_loc_len(*(u32 *)dest);
|
||||
void *__dest;
|
||||
long ret;
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
|
||||
if ((unsigned long)addr < TASK_SIZE)
|
||||
return fetch_store_string_user(addr, dest, base);
|
||||
#endif
|
||||
|
||||
if (unlikely(!maxlen))
|
||||
return -ENOMEM;
|
||||
|
||||
__dest = get_loc_data(dest, base);
|
||||
|
||||
/*
|
||||
* Try to get string again, since the string can be changed while
|
||||
* probing.
|
||||
*/
|
||||
ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
|
||||
if (ret >= 0)
|
||||
*(u32 *)dest = make_data_loc(ret, __dest - base);
|
||||
|
||||
return ret;
|
||||
return kern_fetch_store_string(addr, dest, base);
|
||||
}
|
||||
|
||||
static nokprobe_inline int
|
||||
@@ -664,14 +617,15 @@ static struct event_trigger_data *
|
||||
new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
|
||||
{
|
||||
struct event_trigger_data *trigger;
|
||||
struct event_filter *filter = NULL;
|
||||
struct eprobe_data *edata;
|
||||
int ret;
|
||||
|
||||
edata = kzalloc(sizeof(*edata), GFP_KERNEL);
|
||||
trigger = kzalloc(sizeof(*trigger), GFP_KERNEL);
|
||||
if (!trigger || !edata) {
|
||||
kfree(edata);
|
||||
kfree(trigger);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
ret = -ENOMEM;
|
||||
goto error;
|
||||
}
|
||||
|
||||
trigger->flags = EVENT_TRIGGER_FL_PROBE;
|
||||
@@ -686,13 +640,25 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
|
||||
trigger->cmd_ops = &event_trigger_cmd;
|
||||
|
||||
INIT_LIST_HEAD(&trigger->list);
|
||||
RCU_INIT_POINTER(trigger->filter, NULL);
|
||||
|
||||
if (ep->filter_str) {
|
||||
ret = create_event_filter(file->tr, file->event_call,
|
||||
ep->filter_str, false, &filter);
|
||||
if (ret)
|
||||
goto error;
|
||||
}
|
||||
RCU_INIT_POINTER(trigger->filter, filter);
|
||||
|
||||
edata->file = file;
|
||||
edata->ep = ep;
|
||||
trigger->private_data = edata;
|
||||
|
||||
return trigger;
|
||||
error:
|
||||
free_event_filter(filter);
|
||||
kfree(edata);
|
||||
kfree(trigger);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
static int enable_eprobe(struct trace_eprobe *ep,
|
||||
@@ -726,6 +692,7 @@ static int disable_eprobe(struct trace_eprobe *ep,
|
||||
{
|
||||
struct event_trigger_data *trigger = NULL, *iter;
|
||||
struct trace_event_file *file;
|
||||
struct event_filter *filter;
|
||||
struct eprobe_data *edata;
|
||||
|
||||
file = find_event_file(tr, ep->event_system, ep->event_name);
|
||||
@@ -752,6 +719,10 @@ static int disable_eprobe(struct trace_eprobe *ep,
|
||||
/* Make sure nothing is using the edata or trigger */
|
||||
tracepoint_synchronize_unregister();
|
||||
|
||||
filter = rcu_access_pointer(trigger->filter);
|
||||
|
||||
if (filter)
|
||||
free_event_filter(filter);
|
||||
kfree(edata);
|
||||
kfree(trigger);
|
||||
|
||||
@@ -927,12 +898,62 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[])
|
||||
{
|
||||
struct event_filter *dummy;
|
||||
int i, ret, len = 0;
|
||||
char *p;
|
||||
|
||||
if (argc == 0) {
|
||||
trace_probe_log_err(0, NO_EP_FILTER);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Recover the filter string */
|
||||
for (i = 0; i < argc; i++)
|
||||
len += strlen(argv[i]) + 1;
|
||||
|
||||
ep->filter_str = kzalloc(len, GFP_KERNEL);
|
||||
if (!ep->filter_str)
|
||||
return -ENOMEM;
|
||||
|
||||
p = ep->filter_str;
|
||||
for (i = 0; i < argc; i++) {
|
||||
ret = snprintf(p, len, "%s ", argv[i]);
|
||||
if (ret < 0)
|
||||
goto error;
|
||||
if (ret > len) {
|
||||
ret = -E2BIG;
|
||||
goto error;
|
||||
}
|
||||
p += ret;
|
||||
len -= ret;
|
||||
}
|
||||
p[-1] = '\0';
|
||||
|
||||
/*
|
||||
* Ensure the filter string can be parsed correctly. Note, this
|
||||
* filter string is for the original event, not for the eprobe.
|
||||
*/
|
||||
ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str,
|
||||
true, &dummy);
|
||||
free_event_filter(dummy);
|
||||
if (ret)
|
||||
goto error;
|
||||
|
||||
return 0;
|
||||
error:
|
||||
kfree(ep->filter_str);
|
||||
ep->filter_str = NULL;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __trace_eprobe_create(int argc, const char *argv[])
|
||||
{
|
||||
/*
|
||||
* Argument syntax:
|
||||
* e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS]
|
||||
* Fetch args:
|
||||
* e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] [if FILTER]
|
||||
* Fetch args (no space):
|
||||
* <name>=$<field>[:TYPE]
|
||||
*/
|
||||
const char *event = NULL, *group = EPROBE_EVENT_SYSTEM;
|
||||
@@ -942,8 +963,8 @@ static int __trace_eprobe_create(int argc, const char *argv[])
|
||||
char buf1[MAX_EVENT_NAME_LEN];
|
||||
char buf2[MAX_EVENT_NAME_LEN];
|
||||
char gbuf[MAX_EVENT_NAME_LEN];
|
||||
int ret = 0;
|
||||
int i;
|
||||
int ret = 0, filter_idx = 0;
|
||||
int i, filter_cnt;
|
||||
|
||||
if (argc < 2 || argv[0][0] != 'e')
|
||||
return -ECANCELED;
|
||||
@@ -968,11 +989,19 @@ static int __trace_eprobe_create(int argc, const char *argv[])
|
||||
}
|
||||
|
||||
if (!event) {
|
||||
strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN);
|
||||
sanitize_event_name(buf1);
|
||||
strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN);
|
||||
event = buf1;
|
||||
}
|
||||
|
||||
for (i = 2; i < argc; i++) {
|
||||
if (!strcmp(argv[i], "if")) {
|
||||
filter_idx = i + 1;
|
||||
filter_cnt = argc - filter_idx;
|
||||
argc = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
mutex_lock(&event_mutex);
|
||||
event_call = find_and_get_event(sys_name, sys_event);
|
||||
ep = alloc_event_probe(group, event, event_call, argc - 2);
|
||||
@@ -988,6 +1017,14 @@ static int __trace_eprobe_create(int argc, const char *argv[])
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (filter_idx) {
|
||||
trace_probe_log_set_index(filter_idx);
|
||||
ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx);
|
||||
if (ret)
|
||||
goto parse_error;
|
||||
} else
|
||||
ep->filter_str = NULL;
|
||||
|
||||
argc -= 2; argv += 2;
|
||||
/* parse arguments */
|
||||
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
|
||||
|
||||
@@ -43,6 +43,42 @@ enum filter_op_ids { OPS };
|
||||
|
||||
static const char * ops[] = { OPS };
|
||||
|
||||
/*
 * Identifies which predicate evaluator filter_pred_fn_call() runs for a
 * filter_pred (stored in filter_pred::fn_num).
 */
enum filter_pred_fn {
	/* no evaluator selected */
	FILTER_PRED_FN_NOP,

	/* numeric fields: plain size, signed and unsigned variants */
	FILTER_PRED_FN_64,
	FILTER_PRED_FN_S64,
	FILTER_PRED_FN_U64,
	FILTER_PRED_FN_32,
	FILTER_PRED_FN_S32,
	FILTER_PRED_FN_U32,
	FILTER_PRED_FN_16,
	FILTER_PRED_FN_S16,
	FILTER_PRED_FN_U16,
	FILTER_PRED_FN_8,
	FILTER_PRED_FN_S8,
	FILTER_PRED_FN_U8,

	/* string-like fields */
	FILTER_PRED_FN_COMM,
	FILTER_PRED_FN_STRING,
	FILTER_PRED_FN_STRLOC,
	FILTER_PRED_FN_STRRELLOC,
	FILTER_PRED_FN_PCHAR_USER,
	FILTER_PRED_FN_PCHAR,

	FILTER_PRED_FN_CPU,

	/*
	 * NOTE(review): filter_pred_fn_call() has no case for this value, so
	 * it falls to the default (0). Looks like a leftover; confirm before
	 * removing.
	 */
	FILTER_PRED_FN_,

	/* dispatches to test_pred_visited_fn() */
	FILTER_PRED_TEST_VISITED,
};
|
||||
|
||||
struct filter_pred {
|
||||
enum filter_pred_fn fn_num;
|
||||
u64 val;
|
||||
struct regex regex;
|
||||
unsigned short *ops;
|
||||
struct ftrace_event_field *field;
|
||||
int offset;
|
||||
int not;
|
||||
int op;
|
||||
};
|
||||
|
||||
/*
|
||||
* pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND
|
||||
* pred_funcs_##type below must match the order of them above.
|
||||
@@ -590,45 +626,49 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
#define DEFINE_COMPARISON_PRED(type) \
|
||||
static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \
|
||||
{ \
|
||||
type *addr = (type *)(event + pred->offset); \
|
||||
type val = (type)pred->val; \
|
||||
return *addr < val; \
|
||||
} \
|
||||
static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \
|
||||
{ \
|
||||
type *addr = (type *)(event + pred->offset); \
|
||||
type val = (type)pred->val; \
|
||||
return *addr <= val; \
|
||||
} \
|
||||
static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \
|
||||
{ \
|
||||
type *addr = (type *)(event + pred->offset); \
|
||||
type val = (type)pred->val; \
|
||||
return *addr > val; \
|
||||
} \
|
||||
static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \
|
||||
{ \
|
||||
type *addr = (type *)(event + pred->offset); \
|
||||
type val = (type)pred->val; \
|
||||
return *addr >= val; \
|
||||
} \
|
||||
static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \
|
||||
{ \
|
||||
type *addr = (type *)(event + pred->offset); \
|
||||
type val = (type)pred->val; \
|
||||
return !!(*addr & val); \
|
||||
} \
|
||||
static const filter_pred_fn_t pred_funcs_##type[] = { \
|
||||
filter_pred_LE_##type, \
|
||||
filter_pred_LT_##type, \
|
||||
filter_pred_GE_##type, \
|
||||
filter_pred_GT_##type, \
|
||||
filter_pred_BAND_##type, \
|
||||
/*
 * Comparison kinds: none, <, <=, >, >= and bitwise AND.
 * NOTE(review): no user of this enum is visible nearby - confirm it is
 * still needed.
 */
enum pred_cmp_types {
	PRED_CMP_TYPE_NOP,
	PRED_CMP_TYPE_LT,
	PRED_CMP_TYPE_LE,
	PRED_CMP_TYPE_GT,
	PRED_CMP_TYPE_GE,
	PRED_CMP_TYPE_BAND,
};
|
||||
|
||||
/*
 * Generate filter_pred_<type>(): compare the <type> value located at
 * pred->offset inside the event against pred->val (cast to <type>),
 * using the operator selected by pred->op. Operators other than
 * LT/LE/GT/GE/BAND yield 0.
 */
#define DEFINE_COMPARISON_PRED(type)					\
static int filter_pred_##type(struct filter_pred *pred, void *event)	\
{									\
	type *field_ptr = (type *)(event + pred->offset);		\
	type rhs = (type)pred->val;					\
									\
	switch (pred->op) {						\
	case OP_LT:							\
		return *field_ptr < rhs;				\
	case OP_LE:							\
		return *field_ptr <= rhs;				\
	case OP_GT:							\
		return *field_ptr > rhs;				\
	case OP_GE:							\
		return *field_ptr >= rhs;				\
	case OP_BAND:							\
		return !!(*field_ptr & rhs);				\
	default:							\
		return 0;						\
	}								\
}
|
||||
|
||||
#define DEFINE_EQUALITY_PRED(size) \
|
||||
static int filter_pred_##size(struct filter_pred *pred, void *event) \
|
||||
{ \
|
||||
@@ -836,11 +876,6 @@ static int filter_pred_comm(struct filter_pred *pred, void *event)
|
||||
return cmp ^ pred->not;
|
||||
}
|
||||
|
||||
static int filter_pred_none(struct filter_pred *pred, void *event)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* regex_match_foo - Basic regex callbacks
|
||||
*
|
||||
@@ -986,6 +1021,19 @@ static void filter_build_regex(struct filter_pred *pred)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#ifndef CONFIG_FTRACE_STARTUP_TEST
/* Startup tests are not built: stub that always returns 0 */
static int test_pred_visited_fn(struct filter_pred *pred, void *event)
{
	return 0;
}
#else
/* Forward declaration only; being static, the definition must follow in this file */
static int test_pred_visited_fn(struct filter_pred *pred, void *event);
#endif
|
||||
|
||||
|
||||
static int filter_pred_fn_call(struct filter_pred *pred, void *event);
|
||||
|
||||
/* return 1 if event matches, 0 otherwise (discard) */
|
||||
int filter_match_preds(struct event_filter *filter, void *rec)
|
||||
{
|
||||
@@ -1003,7 +1051,7 @@ int filter_match_preds(struct event_filter *filter, void *rec)
|
||||
|
||||
for (i = 0; prog[i].pred; i++) {
|
||||
struct filter_pred *pred = prog[i].pred;
|
||||
int match = pred->fn(pred, rec);
|
||||
int match = filter_pred_fn_call(pred, rec);
|
||||
if (match == prog[i].when_to_branch)
|
||||
i = prog[i].target;
|
||||
}
|
||||
@@ -1189,10 +1237,10 @@ int filter_assign_type(const char *type)
|
||||
return FILTER_OTHER;
|
||||
}
|
||||
|
||||
static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
|
||||
int field_size, int field_is_signed)
|
||||
static enum filter_pred_fn select_comparison_fn(enum filter_op_ids op,
|
||||
int field_size, int field_is_signed)
|
||||
{
|
||||
filter_pred_fn_t fn = NULL;
|
||||
enum filter_pred_fn fn = FILTER_PRED_FN_NOP;
|
||||
int pred_func_index = -1;
|
||||
|
||||
switch (op) {
|
||||
@@ -1201,50 +1249,99 @@ static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
|
||||
break;
|
||||
default:
|
||||
if (WARN_ON_ONCE(op < PRED_FUNC_START))
|
||||
return NULL;
|
||||
return fn;
|
||||
pred_func_index = op - PRED_FUNC_START;
|
||||
if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX))
|
||||
return NULL;
|
||||
return fn;
|
||||
}
|
||||
|
||||
switch (field_size) {
|
||||
case 8:
|
||||
if (pred_func_index < 0)
|
||||
fn = filter_pred_64;
|
||||
fn = FILTER_PRED_FN_64;
|
||||
else if (field_is_signed)
|
||||
fn = pred_funcs_s64[pred_func_index];
|
||||
fn = FILTER_PRED_FN_S64;
|
||||
else
|
||||
fn = pred_funcs_u64[pred_func_index];
|
||||
fn = FILTER_PRED_FN_U64;
|
||||
break;
|
||||
case 4:
|
||||
if (pred_func_index < 0)
|
||||
fn = filter_pred_32;
|
||||
fn = FILTER_PRED_FN_32;
|
||||
else if (field_is_signed)
|
||||
fn = pred_funcs_s32[pred_func_index];
|
||||
fn = FILTER_PRED_FN_S32;
|
||||
else
|
||||
fn = pred_funcs_u32[pred_func_index];
|
||||
fn = FILTER_PRED_FN_U32;
|
||||
break;
|
||||
case 2:
|
||||
if (pred_func_index < 0)
|
||||
fn = filter_pred_16;
|
||||
fn = FILTER_PRED_FN_16;
|
||||
else if (field_is_signed)
|
||||
fn = pred_funcs_s16[pred_func_index];
|
||||
fn = FILTER_PRED_FN_S16;
|
||||
else
|
||||
fn = pred_funcs_u16[pred_func_index];
|
||||
fn = FILTER_PRED_FN_U16;
|
||||
break;
|
||||
case 1:
|
||||
if (pred_func_index < 0)
|
||||
fn = filter_pred_8;
|
||||
fn = FILTER_PRED_FN_8;
|
||||
else if (field_is_signed)
|
||||
fn = pred_funcs_s8[pred_func_index];
|
||||
fn = FILTER_PRED_FN_S8;
|
||||
else
|
||||
fn = pred_funcs_u8[pred_func_index];
|
||||
fn = FILTER_PRED_FN_U8;
|
||||
break;
|
||||
}
|
||||
|
||||
return fn;
|
||||
}
|
||||
|
||||
|
||||
static int filter_pred_fn_call(struct filter_pred *pred, void *event)
|
||||
{
|
||||
switch (pred->fn_num) {
|
||||
case FILTER_PRED_FN_64:
|
||||
return filter_pred_64(pred, event);
|
||||
case FILTER_PRED_FN_S64:
|
||||
return filter_pred_s64(pred, event);
|
||||
case FILTER_PRED_FN_U64:
|
||||
return filter_pred_u64(pred, event);
|
||||
case FILTER_PRED_FN_32:
|
||||
return filter_pred_32(pred, event);
|
||||
case FILTER_PRED_FN_S32:
|
||||
return filter_pred_s32(pred, event);
|
||||
case FILTER_PRED_FN_U32:
|
||||
return filter_pred_u32(pred, event);
|
||||
case FILTER_PRED_FN_16:
|
||||
return filter_pred_16(pred, event);
|
||||
case FILTER_PRED_FN_S16:
|
||||
return filter_pred_s16(pred, event);
|
||||
case FILTER_PRED_FN_U16:
|
||||
return filter_pred_u16(pred, event);
|
||||
case FILTER_PRED_FN_8:
|
||||
return filter_pred_8(pred, event);
|
||||
case FILTER_PRED_FN_S8:
|
||||
return filter_pred_s8(pred, event);
|
||||
case FILTER_PRED_FN_U8:
|
||||
return filter_pred_u8(pred, event);
|
||||
case FILTER_PRED_FN_COMM:
|
||||
return filter_pred_comm(pred, event);
|
||||
case FILTER_PRED_FN_STRING:
|
||||
return filter_pred_string(pred, event);
|
||||
case FILTER_PRED_FN_STRLOC:
|
||||
return filter_pred_strloc(pred, event);
|
||||
case FILTER_PRED_FN_STRRELLOC:
|
||||
return filter_pred_strrelloc(pred, event);
|
||||
case FILTER_PRED_FN_PCHAR_USER:
|
||||
return filter_pred_pchar_user(pred, event);
|
||||
case FILTER_PRED_FN_PCHAR:
|
||||
return filter_pred_pchar(pred, event);
|
||||
case FILTER_PRED_FN_CPU:
|
||||
return filter_pred_cpu(pred, event);
|
||||
case FILTER_PRED_TEST_VISITED:
|
||||
return test_pred_visited_fn(pred, event);
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* Called when a predicate is encountered by predicate_parse() */
|
||||
static int parse_pred(const char *str, void *data,
|
||||
int pos, struct filter_parse_error *pe,
|
||||
@@ -1338,7 +1435,7 @@ static int parse_pred(const char *str, void *data,
|
||||
parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i);
|
||||
goto err_free;
|
||||
}
|
||||
pred->fn = filter_pred_none;
|
||||
pred->fn_num = FILTER_PRED_FN_NOP;
|
||||
|
||||
/*
|
||||
* Quotes are not required, but if they exist then we need
|
||||
@@ -1416,16 +1513,16 @@ static int parse_pred(const char *str, void *data,
|
||||
filter_build_regex(pred);
|
||||
|
||||
if (field->filter_type == FILTER_COMM) {
|
||||
pred->fn = filter_pred_comm;
|
||||
pred->fn_num = FILTER_PRED_FN_COMM;
|
||||
|
||||
} else if (field->filter_type == FILTER_STATIC_STRING) {
|
||||
pred->fn = filter_pred_string;
|
||||
pred->fn_num = FILTER_PRED_FN_STRING;
|
||||
pred->regex.field_len = field->size;
|
||||
|
||||
} else if (field->filter_type == FILTER_DYN_STRING) {
|
||||
pred->fn = filter_pred_strloc;
|
||||
pred->fn_num = FILTER_PRED_FN_STRLOC;
|
||||
} else if (field->filter_type == FILTER_RDYN_STRING)
|
||||
pred->fn = filter_pred_strrelloc;
|
||||
pred->fn_num = FILTER_PRED_FN_STRRELLOC;
|
||||
else {
|
||||
|
||||
if (!ustring_per_cpu) {
|
||||
@@ -1436,9 +1533,9 @@ static int parse_pred(const char *str, void *data,
|
||||
}
|
||||
|
||||
if (ustring)
|
||||
pred->fn = filter_pred_pchar_user;
|
||||
pred->fn_num = FILTER_PRED_FN_PCHAR_USER;
|
||||
else
|
||||
pred->fn = filter_pred_pchar;
|
||||
pred->fn_num = FILTER_PRED_FN_PCHAR;
|
||||
}
|
||||
/* go past the last quote */
|
||||
i++;
|
||||
@@ -1486,10 +1583,10 @@ static int parse_pred(const char *str, void *data,
|
||||
pred->val = val;
|
||||
|
||||
if (field->filter_type == FILTER_CPU)
|
||||
pred->fn = filter_pred_cpu;
|
||||
pred->fn_num = FILTER_PRED_FN_CPU;
|
||||
else {
|
||||
pred->fn = select_comparison_fn(pred->op, field->size,
|
||||
field->is_signed);
|
||||
pred->fn_num = select_comparison_fn(pred->op, field->size,
|
||||
field->is_signed);
|
||||
if (pred->op == OP_NE)
|
||||
pred->not = 1;
|
||||
}
|
||||
@@ -2296,7 +2393,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields)
|
||||
struct filter_pred *pred = prog[i].pred;
|
||||
struct ftrace_event_field *field = pred->field;
|
||||
|
||||
WARN_ON_ONCE(!pred->fn);
|
||||
WARN_ON_ONCE(pred->fn_num == FILTER_PRED_FN_NOP);
|
||||
|
||||
if (!field) {
|
||||
WARN_ONCE(1, "all leafs should have field defined %d", i);
|
||||
@@ -2306,7 +2403,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields)
|
||||
if (!strchr(fields, *field->name))
|
||||
continue;
|
||||
|
||||
pred->fn = test_pred_visited_fn;
|
||||
pred->fn_num = FILTER_PRED_TEST_VISITED;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -104,6 +104,38 @@ enum field_op_id {
|
||||
FIELD_OP_MULT,
|
||||
};
|
||||
|
||||
/* Selector kept in hist_field::fn_num in place of a function pointer */
enum hist_field_fn {
	HIST_FIELD_FN_NOP,
	HIST_FIELD_FN_VAR_REF,
	HIST_FIELD_FN_COUNTER,
	HIST_FIELD_FN_CONST,
	HIST_FIELD_FN_LOG2,
	HIST_FIELD_FN_BUCKET,
	HIST_FIELD_FN_TIMESTAMP,
	HIST_FIELD_FN_CPU,

	/* string flavours */
	HIST_FIELD_FN_STRING,
	HIST_FIELD_FN_DYNSTRING,
	HIST_FIELD_FN_RELDYNSTRING,
	HIST_FIELD_FN_PSTRING,

	/* sized integer flavours */
	HIST_FIELD_FN_S64,
	HIST_FIELD_FN_U64,
	HIST_FIELD_FN_S32,
	HIST_FIELD_FN_U32,
	HIST_FIELD_FN_S16,
	HIST_FIELD_FN_U16,
	HIST_FIELD_FN_S8,
	HIST_FIELD_FN_U8,

	/* arithmetic */
	HIST_FIELD_FN_UMINUS,
	HIST_FIELD_FN_MINUS,
	HIST_FIELD_FN_PLUS,
	HIST_FIELD_FN_DIV,
	HIST_FIELD_FN_MULT,
	HIST_FIELD_FN_DIV_POWER2,
	HIST_FIELD_FN_DIV_NOT_POWER2,
	HIST_FIELD_FN_DIV_MULT_SHIFT,

	HIST_FIELD_FN_EXECNAME,
};
|
||||
|
||||
/*
|
||||
* A hist_var (histogram variable) contains variable information for
|
||||
* hist_fields having the HIST_FIELD_FL_VAR or HIST_FIELD_FL_VAR_REF
|
||||
@@ -123,15 +155,15 @@ struct hist_var {
|
||||
struct hist_field {
|
||||
struct ftrace_event_field *field;
|
||||
unsigned long flags;
|
||||
hist_field_fn_t fn;
|
||||
unsigned int ref;
|
||||
unsigned int size;
|
||||
unsigned int offset;
|
||||
unsigned int is_signed;
|
||||
unsigned long buckets;
|
||||
const char *type;
|
||||
struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
|
||||
struct hist_trigger_data *hist_data;
|
||||
enum hist_field_fn fn_num;
|
||||
unsigned int ref;
|
||||
unsigned int size;
|
||||
unsigned int offset;
|
||||
unsigned int is_signed;
|
||||
|
||||
/*
|
||||
* Variable fields contain variable-specific info in var.
|
||||
@@ -166,14 +198,11 @@ struct hist_field {
|
||||
u64 div_multiplier;
|
||||
};
|
||||
|
||||
static u64 hist_field_none(struct hist_field *field,
|
||||
struct tracing_map_elt *elt,
|
||||
struct trace_buffer *buffer,
|
||||
struct ring_buffer_event *rbe,
|
||||
void *event)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static u64 hist_fn_call(struct hist_field *hist_field,
|
||||
struct tracing_map_elt *elt,
|
||||
struct trace_buffer *buffer,
|
||||
struct ring_buffer_event *rbe,
|
||||
void *event);
|
||||
|
||||
static u64 hist_field_const(struct hist_field *field,
|
||||
struct tracing_map_elt *elt,
|
||||
@@ -250,7 +279,7 @@ static u64 hist_field_log2(struct hist_field *hist_field,
|
||||
{
|
||||
struct hist_field *operand = hist_field->operands[0];
|
||||
|
||||
u64 val = operand->fn(operand, elt, buffer, rbe, event);
|
||||
u64 val = hist_fn_call(operand, elt, buffer, rbe, event);
|
||||
|
||||
return (u64) ilog2(roundup_pow_of_two(val));
|
||||
}
|
||||
@@ -264,7 +293,7 @@ static u64 hist_field_bucket(struct hist_field *hist_field,
|
||||
struct hist_field *operand = hist_field->operands[0];
|
||||
unsigned long buckets = hist_field->buckets;
|
||||
|
||||
u64 val = operand->fn(operand, elt, buffer, rbe, event);
|
||||
u64 val = hist_fn_call(operand, elt, buffer, rbe, event);
|
||||
|
||||
if (WARN_ON_ONCE(!buckets))
|
||||
return val;
|
||||
@@ -285,8 +314,8 @@ static u64 hist_field_plus(struct hist_field *hist_field,
|
||||
struct hist_field *operand1 = hist_field->operands[0];
|
||||
struct hist_field *operand2 = hist_field->operands[1];
|
||||
|
||||
u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
|
||||
u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
|
||||
u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
|
||||
u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
|
||||
|
||||
return val1 + val2;
|
||||
}
|
||||
@@ -300,8 +329,8 @@ static u64 hist_field_minus(struct hist_field *hist_field,
|
||||
struct hist_field *operand1 = hist_field->operands[0];
|
||||
struct hist_field *operand2 = hist_field->operands[1];
|
||||
|
||||
u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
|
||||
u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
|
||||
u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
|
||||
u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
|
||||
|
||||
return val1 - val2;
|
||||
}
|
||||
@@ -315,8 +344,8 @@ static u64 hist_field_div(struct hist_field *hist_field,
|
||||
struct hist_field *operand1 = hist_field->operands[0];
|
||||
struct hist_field *operand2 = hist_field->operands[1];
|
||||
|
||||
u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
|
||||
u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
|
||||
u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
|
||||
u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
|
||||
|
||||
/* Return -1 for the undefined case */
|
||||
if (!val2)
|
||||
@@ -338,7 +367,7 @@ static u64 div_by_power_of_two(struct hist_field *hist_field,
|
||||
struct hist_field *operand1 = hist_field->operands[0];
|
||||
struct hist_field *operand2 = hist_field->operands[1];
|
||||
|
||||
u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
|
||||
u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
|
||||
|
||||
return val1 >> __ffs64(operand2->constant);
|
||||
}
|
||||
@@ -352,7 +381,7 @@ static u64 div_by_not_power_of_two(struct hist_field *hist_field,
|
||||
struct hist_field *operand1 = hist_field->operands[0];
|
||||
struct hist_field *operand2 = hist_field->operands[1];
|
||||
|
||||
u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
|
||||
u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
|
||||
|
||||
return div64_u64(val1, operand2->constant);
|
||||
}
|
||||
@@ -366,7 +395,7 @@ static u64 div_by_mult_and_shift(struct hist_field *hist_field,
|
||||
struct hist_field *operand1 = hist_field->operands[0];
|
||||
struct hist_field *operand2 = hist_field->operands[1];
|
||||
|
||||
u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
|
||||
u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
|
||||
|
||||
/*
|
||||
* If the divisor is a constant, do a multiplication and shift instead.
|
||||
@@ -400,8 +429,8 @@ static u64 hist_field_mult(struct hist_field *hist_field,
|
||||
struct hist_field *operand1 = hist_field->operands[0];
|
||||
struct hist_field *operand2 = hist_field->operands[1];
|
||||
|
||||
u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
|
||||
u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
|
||||
u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
|
||||
u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
|
||||
|
||||
return val1 * val2;
|
||||
}
|
||||
@@ -414,7 +443,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field,
|
||||
{
|
||||
struct hist_field *operand = hist_field->operands[0];
|
||||
|
||||
s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event);
|
||||
s64 sval = (s64)hist_fn_call(operand, elt, buffer, rbe, event);
|
||||
u64 val = (u64)-sval;
|
||||
|
||||
return val;
|
||||
@@ -657,19 +686,19 @@ struct snapshot_context {
|
||||
* Returns the specific division function to use if the divisor
|
||||
* is constant. This avoids extra branches when the trigger is hit.
|
||||
*/
|
||||
static hist_field_fn_t hist_field_get_div_fn(struct hist_field *divisor)
|
||||
static enum hist_field_fn hist_field_get_div_fn(struct hist_field *divisor)
|
||||
{
|
||||
u64 div = divisor->constant;
|
||||
|
||||
if (!(div & (div - 1)))
|
||||
return div_by_power_of_two;
|
||||
return HIST_FIELD_FN_DIV_POWER2;
|
||||
|
||||
/* If the divisor is too large, do a regular division */
|
||||
if (div > (1 << HIST_DIV_SHIFT))
|
||||
return div_by_not_power_of_two;
|
||||
return HIST_FIELD_FN_DIV_NOT_POWER2;
|
||||
|
||||
divisor->div_multiplier = div64_u64((u64)(1 << HIST_DIV_SHIFT), div);
|
||||
return div_by_mult_and_shift;
|
||||
return HIST_FIELD_FN_DIV_MULT_SHIFT;
|
||||
}
|
||||
|
||||
static void track_data_free(struct track_data *track_data)
|
||||
@@ -1334,38 +1363,32 @@ static const char *hist_field_name(struct hist_field *field,
|
||||
return field_name;
|
||||
}
|
||||
|
||||
static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
|
||||
static enum hist_field_fn select_value_fn(int field_size, int field_is_signed)
|
||||
{
|
||||
hist_field_fn_t fn = NULL;
|
||||
|
||||
switch (field_size) {
|
||||
case 8:
|
||||
if (field_is_signed)
|
||||
fn = hist_field_s64;
|
||||
return HIST_FIELD_FN_S64;
|
||||
else
|
||||
fn = hist_field_u64;
|
||||
break;
|
||||
return HIST_FIELD_FN_U64;
|
||||
case 4:
|
||||
if (field_is_signed)
|
||||
fn = hist_field_s32;
|
||||
return HIST_FIELD_FN_S32;
|
||||
else
|
||||
fn = hist_field_u32;
|
||||
break;
|
||||
return HIST_FIELD_FN_U32;
|
||||
case 2:
|
||||
if (field_is_signed)
|
||||
fn = hist_field_s16;
|
||||
return HIST_FIELD_FN_S16;
|
||||
else
|
||||
fn = hist_field_u16;
|
||||
break;
|
||||
return HIST_FIELD_FN_U16;
|
||||
case 1:
|
||||
if (field_is_signed)
|
||||
fn = hist_field_s8;
|
||||
return HIST_FIELD_FN_S8;
|
||||
else
|
||||
fn = hist_field_u8;
|
||||
break;
|
||||
return HIST_FIELD_FN_U8;
|
||||
}
|
||||
|
||||
return fn;
|
||||
return HIST_FIELD_FN_NOP;
|
||||
}
|
||||
|
||||
static int parse_map_size(char *str)
|
||||
@@ -1922,19 +1945,19 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
|
||||
goto out; /* caller will populate */
|
||||
|
||||
if (flags & HIST_FIELD_FL_VAR_REF) {
|
||||
hist_field->fn = hist_field_var_ref;
|
||||
hist_field->fn_num = HIST_FIELD_FN_VAR_REF;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (flags & HIST_FIELD_FL_HITCOUNT) {
|
||||
hist_field->fn = hist_field_counter;
|
||||
hist_field->fn_num = HIST_FIELD_FN_COUNTER;
|
||||
hist_field->size = sizeof(u64);
|
||||
hist_field->type = "u64";
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (flags & HIST_FIELD_FL_CONST) {
|
||||
hist_field->fn = hist_field_const;
|
||||
hist_field->fn_num = HIST_FIELD_FN_CONST;
|
||||
hist_field->size = sizeof(u64);
|
||||
hist_field->type = kstrdup("u64", GFP_KERNEL);
|
||||
if (!hist_field->type)
|
||||
@@ -1943,14 +1966,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
|
||||
}
|
||||
|
||||
if (flags & HIST_FIELD_FL_STACKTRACE) {
|
||||
hist_field->fn = hist_field_none;
|
||||
hist_field->fn_num = HIST_FIELD_FN_NOP;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) {
|
||||
unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET);
|
||||
hist_field->fn = flags & HIST_FIELD_FL_LOG2 ? hist_field_log2 :
|
||||
hist_field_bucket;
|
||||
hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? HIST_FIELD_FN_LOG2 :
|
||||
HIST_FIELD_FN_BUCKET;
|
||||
hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
|
||||
hist_field->size = hist_field->operands[0]->size;
|
||||
hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL);
|
||||
@@ -1960,14 +1983,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
|
||||
}
|
||||
|
||||
if (flags & HIST_FIELD_FL_TIMESTAMP) {
|
||||
hist_field->fn = hist_field_timestamp;
|
||||
hist_field->fn_num = HIST_FIELD_FN_TIMESTAMP;
|
||||
hist_field->size = sizeof(u64);
|
||||
hist_field->type = "u64";
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (flags & HIST_FIELD_FL_CPU) {
|
||||
hist_field->fn = hist_field_cpu;
|
||||
hist_field->fn_num = HIST_FIELD_FN_CPU;
|
||||
hist_field->size = sizeof(int);
|
||||
hist_field->type = "unsigned int";
|
||||
goto out;
|
||||
@@ -1987,14 +2010,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
|
||||
goto free;
|
||||
|
||||
if (field->filter_type == FILTER_STATIC_STRING) {
|
||||
hist_field->fn = hist_field_string;
|
||||
hist_field->fn_num = HIST_FIELD_FN_STRING;
|
||||
hist_field->size = field->size;
|
||||
} else if (field->filter_type == FILTER_DYN_STRING) {
|
||||
hist_field->fn = hist_field_dynstring;
|
||||
hist_field->fn_num = HIST_FIELD_FN_DYNSTRING;
|
||||
} else if (field->filter_type == FILTER_RDYN_STRING)
|
||||
hist_field->fn = hist_field_reldynstring;
|
||||
hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING;
|
||||
else
|
||||
hist_field->fn = hist_field_pstring;
|
||||
hist_field->fn_num = HIST_FIELD_FN_PSTRING;
|
||||
} else {
|
||||
hist_field->size = field->size;
|
||||
hist_field->is_signed = field->is_signed;
|
||||
@@ -2002,9 +2025,9 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
|
||||
if (!hist_field->type)
|
||||
goto free;
|
||||
|
||||
hist_field->fn = select_value_fn(field->size,
|
||||
field->is_signed);
|
||||
if (!hist_field->fn) {
|
||||
hist_field->fn_num = select_value_fn(field->size,
|
||||
field->is_signed);
|
||||
if (hist_field->fn_num == HIST_FIELD_FN_NOP) {
|
||||
destroy_hist_field(hist_field, 0);
|
||||
return NULL;
|
||||
}
|
||||
@@ -2340,7 +2363,7 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
|
||||
if (!alias)
|
||||
return NULL;
|
||||
|
||||
alias->fn = var_ref->fn;
|
||||
alias->fn_num = var_ref->fn_num;
|
||||
alias->operands[0] = var_ref;
|
||||
|
||||
if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
|
||||
@@ -2523,7 +2546,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
|
||||
|
||||
expr->flags |= operand1->flags &
|
||||
(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
|
||||
expr->fn = hist_field_unary_minus;
|
||||
expr->fn_num = HIST_FIELD_FN_UMINUS;
|
||||
expr->operands[0] = operand1;
|
||||
expr->size = operand1->size;
|
||||
expr->is_signed = operand1->is_signed;
|
||||
@@ -2595,7 +2618,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
|
||||
unsigned long operand_flags, operand2_flags;
|
||||
int field_op, ret = -EINVAL;
|
||||
char *sep, *operand1_str;
|
||||
hist_field_fn_t op_fn;
|
||||
enum hist_field_fn op_fn;
|
||||
bool combine_consts;
|
||||
|
||||
if (*n_subexprs > 3) {
|
||||
@@ -2654,16 +2677,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
|
||||
|
||||
switch (field_op) {
|
||||
case FIELD_OP_MINUS:
|
||||
op_fn = hist_field_minus;
|
||||
op_fn = HIST_FIELD_FN_MINUS;
|
||||
break;
|
||||
case FIELD_OP_PLUS:
|
||||
op_fn = hist_field_plus;
|
||||
op_fn = HIST_FIELD_FN_PLUS;
|
||||
break;
|
||||
case FIELD_OP_DIV:
|
||||
op_fn = hist_field_div;
|
||||
op_fn = HIST_FIELD_FN_DIV;
|
||||
break;
|
||||
case FIELD_OP_MULT:
|
||||
op_fn = hist_field_mult;
|
||||
op_fn = HIST_FIELD_FN_MULT;
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
@@ -2719,13 +2742,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
|
||||
op_fn = hist_field_get_div_fn(operand2);
|
||||
}
|
||||
|
||||
expr->fn_num = op_fn;
|
||||
|
||||
if (combine_consts) {
|
||||
if (var1)
|
||||
expr->operands[0] = var1;
|
||||
if (var2)
|
||||
expr->operands[1] = var2;
|
||||
|
||||
expr->constant = op_fn(expr, NULL, NULL, NULL, NULL);
|
||||
expr->constant = hist_fn_call(expr, NULL, NULL, NULL, NULL);
|
||||
expr->fn_num = HIST_FIELD_FN_CONST;
|
||||
|
||||
expr->operands[0] = NULL;
|
||||
expr->operands[1] = NULL;
|
||||
@@ -2739,8 +2765,6 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
|
||||
|
||||
expr->name = expr_str(expr, 0);
|
||||
} else {
|
||||
expr->fn = op_fn;
|
||||
|
||||
/* The operand sizes should be the same, so just pick one */
|
||||
expr->size = operand1->size;
|
||||
expr->is_signed = operand1->is_signed;
|
||||
@@ -3065,7 +3089,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
|
||||
struct hist_field *var = field_var->var;
|
||||
struct hist_field *val = field_var->val;
|
||||
|
||||
var_val = val->fn(val, elt, buffer, rbe, rec);
|
||||
var_val = hist_fn_call(val, elt, buffer, rbe, rec);
|
||||
var_idx = var->var.idx;
|
||||
|
||||
if (val->flags & HIST_FIELD_FL_STRING) {
|
||||
@@ -4186,6 +4210,74 @@ static u64 hist_field_execname(struct hist_field *hist_field,
|
||||
return (u64)(unsigned long)(elt_data->comm);
|
||||
}
|
||||
|
||||
static u64 hist_fn_call(struct hist_field *hist_field,
|
||||
struct tracing_map_elt *elt,
|
||||
struct trace_buffer *buffer,
|
||||
struct ring_buffer_event *rbe,
|
||||
void *event)
|
||||
{
|
||||
switch (hist_field->fn_num) {
|
||||
case HIST_FIELD_FN_VAR_REF:
|
||||
return hist_field_var_ref(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_COUNTER:
|
||||
return hist_field_counter(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_CONST:
|
||||
return hist_field_const(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_LOG2:
|
||||
return hist_field_log2(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_BUCKET:
|
||||
return hist_field_bucket(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_TIMESTAMP:
|
||||
return hist_field_timestamp(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_CPU:
|
||||
return hist_field_cpu(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_STRING:
|
||||
return hist_field_string(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_DYNSTRING:
|
||||
return hist_field_dynstring(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_RELDYNSTRING:
|
||||
return hist_field_reldynstring(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_PSTRING:
|
||||
return hist_field_pstring(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_S64:
|
||||
return hist_field_s64(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_U64:
|
||||
return hist_field_u64(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_S32:
|
||||
return hist_field_s32(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_U32:
|
||||
return hist_field_u32(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_S16:
|
||||
return hist_field_s16(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_U16:
|
||||
return hist_field_u16(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_S8:
|
||||
return hist_field_s8(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_U8:
|
||||
return hist_field_u8(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_UMINUS:
|
||||
return hist_field_unary_minus(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_MINUS:
|
||||
return hist_field_minus(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_PLUS:
|
||||
return hist_field_plus(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_DIV:
|
||||
return hist_field_div(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_MULT:
|
||||
return hist_field_mult(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_DIV_POWER2:
|
||||
return div_by_power_of_two(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_DIV_NOT_POWER2:
|
||||
return div_by_not_power_of_two(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_DIV_MULT_SHIFT:
|
||||
return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event);
|
||||
case HIST_FIELD_FN_EXECNAME:
|
||||
return hist_field_execname(hist_field, elt, buffer, rbe, event);
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* Convert a var that points to common_pid.execname to a string */
|
||||
static void update_var_execname(struct hist_field *hist_field)
|
||||
{
|
||||
@@ -4197,7 +4289,7 @@ static void update_var_execname(struct hist_field *hist_field)
|
||||
kfree_const(hist_field->type);
|
||||
hist_field->type = "char[]";
|
||||
|
||||
hist_field->fn = hist_field_execname;
|
||||
hist_field->fn_num = HIST_FIELD_FN_EXECNAME;
|
||||
}
|
||||
|
||||
static int create_var_field(struct hist_trigger_data *hist_data,
|
||||
@@ -4956,7 +5048,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
|
||||
|
||||
for_each_hist_val_field(i, hist_data) {
|
||||
hist_field = hist_data->fields[i];
|
||||
hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
|
||||
hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec);
|
||||
if (hist_field->flags & HIST_FIELD_FL_VAR) {
|
||||
var_idx = hist_field->var.idx;
|
||||
|
||||
@@ -4987,7 +5079,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
|
||||
for_each_hist_key_field(i, hist_data) {
|
||||
hist_field = hist_data->fields[i];
|
||||
if (hist_field->flags & HIST_FIELD_FL_VAR) {
|
||||
hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
|
||||
hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec);
|
||||
var_idx = hist_field->var.idx;
|
||||
tracing_map_set_var(elt, var_idx, hist_val);
|
||||
}
|
||||
@@ -5062,7 +5154,7 @@ static void event_hist_trigger(struct event_trigger_data *data,
|
||||
HIST_STACKTRACE_SKIP);
|
||||
key = entries;
|
||||
} else {
|
||||
field_contents = key_field->fn(key_field, elt, buffer, rbe, rec);
|
||||
field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec);
|
||||
if (key_field->flags & HIST_FIELD_FL_STRING) {
|
||||
key = (void *)(unsigned long)field_contents;
|
||||
use_compound_key = true;
|
||||
|
||||
@@ -17,6 +17,8 @@
|
||||
/* for gfp flag names */
|
||||
#include <linux/trace_events.h>
|
||||
#include <trace/events/mmflags.h>
|
||||
#include "trace_probe.h"
|
||||
#include "trace_probe_kernel.h"
|
||||
|
||||
#include "trace_synth.h"
|
||||
|
||||
@@ -409,6 +411,7 @@ static unsigned int trace_string(struct synth_trace_event *entry,
|
||||
{
|
||||
unsigned int len = 0;
|
||||
char *str_field;
|
||||
int ret;
|
||||
|
||||
if (is_dynamic) {
|
||||
u32 data_offset;
|
||||
@@ -417,19 +420,27 @@ static unsigned int trace_string(struct synth_trace_event *entry,
|
||||
data_offset += event->n_u64 * sizeof(u64);
|
||||
data_offset += data_size;
|
||||
|
||||
str_field = (char *)entry + data_offset;
|
||||
|
||||
len = strlen(str_val) + 1;
|
||||
strscpy(str_field, str_val, len);
|
||||
len = kern_fetch_store_strlen((unsigned long)str_val);
|
||||
|
||||
data_offset |= len << 16;
|
||||
*(u32 *)&entry->fields[*n_u64] = data_offset;
|
||||
|
||||
ret = kern_fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry);
|
||||
|
||||
(*n_u64)++;
|
||||
} else {
|
||||
str_field = (char *)&entry->fields[*n_u64];
|
||||
|
||||
strscpy(str_field, str_val, STR_VAR_LEN_MAX);
|
||||
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
|
||||
if ((unsigned long)str_val < TASK_SIZE)
|
||||
ret = strncpy_from_user_nofault(str_field, str_val, STR_VAR_LEN_MAX);
|
||||
else
|
||||
#endif
|
||||
ret = strncpy_from_kernel_nofault(str_field, str_val, STR_VAR_LEN_MAX);
|
||||
|
||||
if (ret < 0)
|
||||
strcpy(str_field, FAULT_STRING);
|
||||
|
||||
(*n_u64) += STR_VAR_LEN_MAX / sizeof(u64);
|
||||
}
|
||||
|
||||
@@ -462,7 +473,7 @@ static notrace void trace_event_raw_event_synth(void *__data,
|
||||
val_idx = var_ref_idx[field_pos];
|
||||
str_val = (char *)(long)var_ref_vals[val_idx];
|
||||
|
||||
len = strlen(str_val) + 1;
|
||||
len = kern_fetch_store_strlen((unsigned long)str_val);
|
||||
|
||||
fields_size += len;
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include <linux/uio.h>
|
||||
#include <linux/ioctl.h>
|
||||
#include <linux/jhash.h>
|
||||
#include <linux/refcount.h>
|
||||
#include <linux/trace_events.h>
|
||||
#include <linux/tracefs.h>
|
||||
#include <linux/types.h>
|
||||
@@ -39,28 +40,69 @@
|
||||
*/
|
||||
#define MAX_PAGE_ORDER 0
|
||||
#define MAX_PAGES (1 << MAX_PAGE_ORDER)
|
||||
#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
|
||||
#define MAX_BYTES (MAX_PAGES * PAGE_SIZE)
|
||||
#define MAX_EVENTS (MAX_BYTES * 8)
|
||||
|
||||
/* Limit how long of an event name plus args within the subsystem. */
|
||||
#define MAX_EVENT_DESC 512
|
||||
#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
|
||||
#define MAX_FIELD_ARRAY_SIZE 1024
|
||||
#define MAX_FIELD_ARG_NAME 256
|
||||
|
||||
static char *register_page_data;
|
||||
/*
|
||||
* The MAP_STATUS_* macros are used for taking an index and determining the
|
||||
* appropriate byte and the bit in the byte to set/reset for an event.
|
||||
*
|
||||
* The lower 3 bits of the index decide which bit to set.
|
||||
* The remaining upper bits of the index decide which byte to use for the bit.
|
||||
*
|
||||
* This is used when an event has a probe attached/removed to reflect live
|
||||
* status of the event wanting tracing or not to user-programs via shared
|
||||
* memory maps.
|
||||
*/
|
||||
#define MAP_STATUS_BYTE(index) ((index) >> 3)
|
||||
#define MAP_STATUS_MASK(index) BIT((index) & 7)
|
||||
|
||||
static DEFINE_MUTEX(reg_mutex);
|
||||
static DEFINE_HASHTABLE(register_table, 4);
|
||||
static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
|
||||
/*
|
||||
* Internal bits (kernel side only) to keep track of connected probes:
|
||||
* These are used when status is requested in text form about an event. These
|
||||
* bits are compared against an internal byte on the event to determine which
|
||||
* probes to print out to the user.
|
||||
*
|
||||
* These do not reflect the mapped bytes between the user and kernel space.
|
||||
*/
|
||||
#define EVENT_STATUS_FTRACE BIT(0)
|
||||
#define EVENT_STATUS_PERF BIT(1)
|
||||
#define EVENT_STATUS_OTHER BIT(7)
|
||||
|
||||
/*
|
||||
* Stores the pages, tables, and locks for a group of events.
|
||||
* Each logical grouping of events has its own group, with a
|
||||
* matching page for status checks within user programs. This
|
||||
* allows for isolation of events to user programs by various
|
||||
* means.
|
||||
*/
|
||||
struct user_event_group {
|
||||
struct page *pages; /* backing pages, obtained with alloc_pages() at group creation */
|
||||
char *register_page_data; /* page_address() of pages; holds one status bit per event index */
|
||||
char *system_name; /* kmalloc'd system name string, kfree'd when the group is destroyed */
|
||||
struct hlist_node node; /* hash-list linkage; NOTE(review): its user is not visible here */
|
||||
struct mutex reg_mutex; /* set up by mutex_init() at creation; presumably serialises registration - confirm */
|
||||
DECLARE_HASHTABLE(register_table, 8); /* user_event entries, looked up by user_event_key(name) */
|
||||
DECLARE_BITMAP(page_bitmap, MAX_EVENTS); /* event indexes in use; bit 0 is reserved for failures */
|
||||
};
|
||||
|
||||
/* Group for init_user_ns mapping, top-most group */
|
||||
static struct user_event_group *init_group;
|
||||
|
||||
/*
|
||||
* Stores per-event properties, as users register events
|
||||
* within a file a user_event might be created if it does not
|
||||
* already exist. These are globally used and their lifetime
|
||||
* is tied to the refcnt member. These cannot go away until the
|
||||
* refcnt reaches zero.
|
||||
* refcnt reaches one.
|
||||
*/
|
||||
struct user_event {
|
||||
struct user_event_group *group;
|
||||
struct tracepoint tracepoint;
|
||||
struct trace_event_call call;
|
||||
struct trace_event_class class;
|
||||
@@ -68,10 +110,11 @@ struct user_event {
|
||||
struct hlist_node node;
|
||||
struct list_head fields;
|
||||
struct list_head validators;
|
||||
atomic_t refcnt;
|
||||
refcount_t refcnt;
|
||||
int index;
|
||||
int flags;
|
||||
int min_size;
|
||||
char status;
|
||||
};
|
||||
|
||||
/*
|
||||
@@ -86,6 +129,11 @@ struct user_event_refs {
|
||||
struct user_event *events[];
|
||||
};
|
||||
|
||||
struct user_event_file_info {
|
||||
struct user_event_group *group; /* group this info is bound to; NOTE(review): where it is assigned is not visible here */
|
||||
struct user_event_refs *refs; /* user_event_refs associated with this info; presumably per open file - confirm */
|
||||
};
|
||||
|
||||
#define VALIDATOR_ENSURE_NULL (1 << 0)
|
||||
#define VALIDATOR_REL (1 << 1)
|
||||
|
||||
@@ -98,7 +146,8 @@ struct user_event_validator {
|
||||
typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
|
||||
void *tpdata, bool *faulted);
|
||||
|
||||
static int user_event_parse(char *name, char *args, char *flags,
|
||||
static int user_event_parse(struct user_event_group *group, char *name,
|
||||
char *args, char *flags,
|
||||
struct user_event **newuser);
|
||||
|
||||
static u32 user_event_key(char *name)
|
||||
@@ -106,6 +155,144 @@ static u32 user_event_key(char *name)
|
||||
return jhash(name, strlen(name), 0);
|
||||
}
|
||||
|
||||
/*
 * Walk the MAX_PAGES pages starting at @pages and either set (@set true)
 * or clear (@set false) the reserved flag on each one.
 */
static void set_page_reservations(char *pages, bool set)
{
	int idx = 0;

	while (idx < MAX_PAGES) {
		void *addr = pages + (PAGE_SIZE * idx);
		struct page *pg = virt_to_page(addr);

		if (!set)
			ClearPageReserved(pg);
		else
			SetPageReserved(pg);

		++idx;
	}
}
|
||||
|
||||
static void user_event_group_destroy(struct user_event_group *group)
|
||||
{
|
||||
if (group->register_page_data)
|
||||
set_page_reservations(group->register_page_data, false);
|
||||
|
||||
if (group->pages)
|
||||
__free_pages(group->pages, MAX_PAGE_ORDER);
|
||||
|
||||
kfree(group->system_name);
|
||||
kfree(group);
|
||||
}
|
||||
|
||||
static char *user_event_group_system_name(struct user_namespace *user_ns)
|
||||
{
|
||||
char *system_name;
|
||||
int len = sizeof(USER_EVENTS_SYSTEM) + 1;
|
||||
|
||||
if (user_ns != &init_user_ns) {
|
||||
/*
|
||||
* Unexpected at this point:
|
||||
* We only currently support init_user_ns.
|
||||
* When we enable more, this will trigger a failure so log.
|
||||
*/
|
||||
pr_warn("user_events: Namespace other than init_user_ns!\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
system_name = kmalloc(len, GFP_KERNEL);
|
||||
|
||||
if (!system_name)
|
||||
return NULL;
|
||||
|
||||
snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM);
|
||||
|
||||
return system_name;
|
||||
}
|
||||
|
||||
static inline struct user_event_group
|
||||
*user_event_group_from_user_ns(struct user_namespace *user_ns)
|
||||
{
|
||||
if (user_ns == &init_user_ns)
|
||||
return init_group;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct user_event_group *current_user_event_group(void)
|
||||
{
|
||||
struct user_namespace *user_ns = current_user_ns();
|
||||
struct user_event_group *group = NULL;
|
||||
|
||||
while (user_ns) {
|
||||
group = user_event_group_from_user_ns(user_ns);
|
||||
|
||||
if (group)
|
||||
break;
|
||||
|
||||
user_ns = user_ns->parent;
|
||||
}
|
||||
|
||||
return group;
|
||||
}
|
||||
|
||||
static struct user_event_group
|
||||
*user_event_group_create(struct user_namespace *user_ns)
|
||||
{
|
||||
struct user_event_group *group;
|
||||
|
||||
group = kzalloc(sizeof(*group), GFP_KERNEL);
|
||||
|
||||
if (!group)
|
||||
return NULL;
|
||||
|
||||
group->system_name = user_event_group_system_name(user_ns);
|
||||
|
||||
if (!group->system_name)
|
||||
goto error;
|
||||
|
||||
group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
|
||||
|
||||
if (!group->pages)
|
||||
goto error;
|
||||
|
||||
group->register_page_data = page_address(group->pages);
|
||||
|
||||
set_page_reservations(group->register_page_data, true);
|
||||
|
||||
/* Zero all bits beside 0 (which is reserved for failures) */
|
||||
bitmap_zero(group->page_bitmap, MAX_EVENTS);
|
||||
set_bit(0, group->page_bitmap);
|
||||
|
||||
mutex_init(&group->reg_mutex);
|
||||
hash_init(group->register_table);
|
||||
|
||||
return group;
|
||||
error:
|
||||
if (group)
|
||||
user_event_group_destroy(group);
|
||||
|
||||
return NULL;
|
||||
};
|
||||
|
||||
static __always_inline
|
||||
void user_event_register_set(struct user_event *user)
|
||||
{
|
||||
int i = user->index;
|
||||
|
||||
user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);
|
||||
}
|
||||
|
||||
static __always_inline
|
||||
void user_event_register_clear(struct user_event *user)
|
||||
{
|
||||
int i = user->index;
|
||||
|
||||
user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i);
|
||||
}
|
||||
|
||||
static __always_inline __must_check
|
||||
bool user_event_last_ref(struct user_event *user)
|
||||
{
|
||||
return refcount_read(&user->refcnt) == 1;
|
||||
}
|
||||
|
||||
static __always_inline __must_check
|
||||
size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i)
|
||||
{
|
||||
@@ -141,7 +328,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call)
|
||||
*
|
||||
* Upon success user_event has its ref count increased by 1.
|
||||
*/
|
||||
static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
|
||||
static int user_event_parse_cmd(struct user_event_group *group,
|
||||
char *raw_command, struct user_event **newuser)
|
||||
{
|
||||
char *name = raw_command;
|
||||
char *args = strpbrk(name, " ");
|
||||
@@ -155,7 +343,7 @@ static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
|
||||
if (flags)
|
||||
*flags++ = '\0';
|
||||
|
||||
return user_event_parse(name, args, flags, newuser);
|
||||
return user_event_parse(group, name, args, flags, newuser);
|
||||
}
|
||||
|
||||
static int user_field_array_size(const char *type)
|
||||
@@ -277,7 +465,7 @@ static int user_event_add_field(struct user_event *user, const char *type,
|
||||
goto add_field;
|
||||
|
||||
add_validator:
|
||||
if (strstr(type, "char") != 0)
|
||||
if (strstr(type, "char") != NULL)
|
||||
validator_flags |= VALIDATOR_ENSURE_NULL;
|
||||
|
||||
validator = kmalloc(sizeof(*validator), GFP_KERNEL);
|
||||
@@ -458,7 +646,7 @@ static const char *user_field_format(const char *type)
|
||||
return "%d";
|
||||
if (strcmp(type, "unsigned char") == 0)
|
||||
return "%u";
|
||||
if (strstr(type, "char[") != 0)
|
||||
if (strstr(type, "char[") != NULL)
|
||||
return "%s";
|
||||
|
||||
/* Unknown, likely struct, allowed treat as 64-bit */
|
||||
@@ -479,10 +667,52 @@ static bool user_field_is_dyn_string(const char *type, const char **str_func)
|
||||
|
||||
return false;
|
||||
check:
|
||||
return strstr(type, "char") != 0;
|
||||
return strstr(type, "char") != NULL;
|
||||
}
|
||||
|
||||
#define LEN_OR_ZERO (len ? len - pos : 0)
|
||||
/*
 * Join argv[*iout..] with single spaces into @buf, stopping after the
 * first argument that contains a ';'.
 *
 * @colon is set to true if such an argument was consumed, false
 * otherwise.  When @len is 0 nothing is stored through LEN_OR_ZERO and
 * *iout is left untouched, so the call only measures; with a non-zero
 * @len, *iout is advanced past the arguments that were consumed.
 *
 * Returns the formatted length plus one.
 */
static int user_dyn_field_set_string(int argc, const char **argv, int *iout,
				     char *buf, int len, bool *colon)
{
	int pos = 0;
	int idx = *iout;

	*colon = false;

	while (idx < argc) {
		const char *arg = argv[idx];

		/* Separator goes before every argument except the first */
		if (idx != *iout)
			pos += snprintf(buf + pos, LEN_OR_ZERO, " ");

		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", arg);

		++idx;

		if (strchr(arg, ';')) {
			*colon = true;
			break;
		}
	}

	/* Actual set, advance i */
	if (len != 0)
		*iout = idx;

	return pos + 1;
}
|
||||
|
||||
/*
 * Format @field as "<type> <name>", followed by ';' when @colon is
 * true, into @buf.  With @len of 0 LEN_OR_ZERO passes a zero size, so
 * the call only measures.
 *
 * Returns the formatted length plus one.
 */
static int user_field_set_string(struct ftrace_event_field *field,
				 char *buf, int len, bool colon)
{
	int pos = 0;

	pos = pos + snprintf(buf + pos, LEN_OR_ZERO, "%s", field->type);
	pos = pos + snprintf(buf + pos, LEN_OR_ZERO, " ");
	pos = pos + snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name);

	if (colon) {
		pos = pos + snprintf(buf + pos, LEN_OR_ZERO, ";");
	}

	return pos + 1;
}
|
||||
|
||||
static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
|
||||
{
|
||||
struct ftrace_event_field *field, *next;
|
||||
@@ -600,8 +830,8 @@ static int destroy_user_event(struct user_event *user)
|
||||
|
||||
dyn_event_remove(&user->devent);
|
||||
|
||||
register_page_data[user->index] = 0;
|
||||
clear_bit(user->index, page_bitmap);
|
||||
user_event_register_clear(user);
|
||||
clear_bit(user->index, user->group->page_bitmap);
|
||||
hash_del(&user->node);
|
||||
|
||||
user_event_destroy_validators(user);
|
||||
@@ -612,16 +842,17 @@ static int destroy_user_event(struct user_event *user)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct user_event *find_user_event(char *name, u32 *outkey)
|
||||
static struct user_event *find_user_event(struct user_event_group *group,
|
||||
char *name, u32 *outkey)
|
||||
{
|
||||
struct user_event *user;
|
||||
u32 key = user_event_key(name);
|
||||
|
||||
*outkey = key;
|
||||
|
||||
hash_for_each_possible(register_table, user, node, key)
|
||||
hash_for_each_possible(group->register_table, user, node, key)
|
||||
if (!strcmp(EVENT_NAME(user), name)) {
|
||||
atomic_inc(&user->refcnt);
|
||||
refcount_inc(&user->refcnt);
|
||||
return user;
|
||||
}
|
||||
|
||||
@@ -779,7 +1010,12 @@ static void update_reg_page_for(struct user_event *user)
|
||||
rcu_read_unlock_sched();
|
||||
}
|
||||
|
||||
register_page_data[user->index] = status;
|
||||
if (status)
|
||||
user_event_register_set(user);
|
||||
else
|
||||
user_event_register_clear(user);
|
||||
|
||||
user->status = status;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -835,17 +1071,18 @@ static int user_event_reg(struct trace_event_call *call,
|
||||
|
||||
return ret;
|
||||
inc:
|
||||
atomic_inc(&user->refcnt);
|
||||
refcount_inc(&user->refcnt);
|
||||
update_reg_page_for(user);
|
||||
return 0;
|
||||
dec:
|
||||
update_reg_page_for(user);
|
||||
atomic_dec(&user->refcnt);
|
||||
refcount_dec(&user->refcnt);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int user_event_create(const char *raw_command)
|
||||
{
|
||||
struct user_event_group *group;
|
||||
struct user_event *user;
|
||||
char *name;
|
||||
int ret;
|
||||
@@ -861,14 +1098,19 @@ static int user_event_create(const char *raw_command)
|
||||
if (!name)
|
||||
return -ENOMEM;
|
||||
|
||||
mutex_lock(®_mutex);
|
||||
group = current_user_event_group();
|
||||
|
||||
ret = user_event_parse_cmd(name, &user);
|
||||
if (!group)
|
||||
return -ENOENT;
|
||||
|
||||
mutex_lock(&group->reg_mutex);
|
||||
|
||||
ret = user_event_parse_cmd(group, name, &user);
|
||||
|
||||
if (!ret)
|
||||
atomic_dec(&user->refcnt);
|
||||
refcount_dec(&user->refcnt);
|
||||
|
||||
mutex_unlock(®_mutex);
|
||||
mutex_unlock(&group->reg_mutex);
|
||||
|
||||
if (ret)
|
||||
kfree(name);
|
||||
@@ -910,14 +1152,14 @@ static bool user_event_is_busy(struct dyn_event *ev)
|
||||
{
|
||||
struct user_event *user = container_of(ev, struct user_event, devent);
|
||||
|
||||
return atomic_read(&user->refcnt) != 0;
|
||||
return !user_event_last_ref(user);
|
||||
}
|
||||
|
||||
static int user_event_free(struct dyn_event *ev)
|
||||
{
|
||||
struct user_event *user = container_of(ev, struct user_event, devent);
|
||||
|
||||
if (atomic_read(&user->refcnt) != 0)
|
||||
if (!user_event_last_ref(user))
|
||||
return -EBUSY;
|
||||
|
||||
return destroy_user_event(user);
|
||||
@@ -926,49 +1168,35 @@ static int user_event_free(struct dyn_event *ev)
|
||||
static bool user_field_match(struct ftrace_event_field *field, int argc,
|
||||
const char **argv, int *iout)
|
||||
{
|
||||
char *field_name, *arg_name;
|
||||
int len, pos, i = *iout;
|
||||
char *field_name = NULL, *dyn_field_name = NULL;
|
||||
bool colon = false, match = false;
|
||||
int dyn_len, len;
|
||||
|
||||
if (i >= argc)
|
||||
if (*iout >= argc)
|
||||
return false;
|
||||
|
||||
len = MAX_FIELD_ARG_NAME;
|
||||
field_name = kmalloc(len, GFP_KERNEL);
|
||||
arg_name = kmalloc(len, GFP_KERNEL);
|
||||
dyn_len = user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
|
||||
0, &colon);
|
||||
|
||||
if (!arg_name || !field_name)
|
||||
len = user_field_set_string(field, field_name, 0, colon);
|
||||
|
||||
if (dyn_len != len)
|
||||
return false;
|
||||
|
||||
dyn_field_name = kmalloc(dyn_len, GFP_KERNEL);
|
||||
field_name = kmalloc(len, GFP_KERNEL);
|
||||
|
||||
if (!dyn_field_name || !field_name)
|
||||
goto out;
|
||||
|
||||
pos = 0;
|
||||
user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
|
||||
dyn_len, &colon);
|
||||
|
||||
for (; i < argc; ++i) {
|
||||
if (i != *iout)
|
||||
pos += snprintf(arg_name + pos, len - pos, " ");
|
||||
user_field_set_string(field, field_name, len, colon);
|
||||
|
||||
pos += snprintf(arg_name + pos, len - pos, argv[i]);
|
||||
|
||||
if (strchr(argv[i], ';')) {
|
||||
++i;
|
||||
colon = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
pos = 0;
|
||||
|
||||
pos += snprintf(field_name + pos, len - pos, field->type);
|
||||
pos += snprintf(field_name + pos, len - pos, " ");
|
||||
pos += snprintf(field_name + pos, len - pos, field->name);
|
||||
|
||||
if (colon)
|
||||
pos += snprintf(field_name + pos, len - pos, ";");
|
||||
|
||||
*iout = i;
|
||||
|
||||
match = strcmp(arg_name, field_name) == 0;
|
||||
match = strcmp(dyn_field_name, field_name) == 0;
|
||||
out:
|
||||
kfree(arg_name);
|
||||
kfree(dyn_field_name);
|
||||
kfree(field_name);
|
||||
|
||||
return match;
|
||||
@@ -1036,7 +1264,8 @@ static int user_event_trace_register(struct user_event *user)
|
||||
* The name buffer lifetime is owned by this method for success cases only.
|
||||
* Upon success the returned user_event has its ref count increased by 1.
|
||||
*/
|
||||
static int user_event_parse(char *name, char *args, char *flags,
|
||||
static int user_event_parse(struct user_event_group *group, char *name,
|
||||
char *args, char *flags,
|
||||
struct user_event **newuser)
|
||||
{
|
||||
int ret;
|
||||
@@ -1046,7 +1275,7 @@ static int user_event_parse(char *name, char *args, char *flags,
|
||||
|
||||
/* Prevent dyn_event from racing */
|
||||
mutex_lock(&event_mutex);
|
||||
user = find_user_event(name, &key);
|
||||
user = find_user_event(group, name, &key);
|
||||
mutex_unlock(&event_mutex);
|
||||
|
||||
if (user) {
|
||||
@@ -1059,7 +1288,7 @@ static int user_event_parse(char *name, char *args, char *flags,
|
||||
return 0;
|
||||
}
|
||||
|
||||
index = find_first_zero_bit(page_bitmap, MAX_EVENTS);
|
||||
index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS);
|
||||
|
||||
if (index == MAX_EVENTS)
|
||||
return -EMFILE;
|
||||
@@ -1073,6 +1302,7 @@ static int user_event_parse(char *name, char *args, char *flags,
|
||||
INIT_LIST_HEAD(&user->fields);
|
||||
INIT_LIST_HEAD(&user->validators);
|
||||
|
||||
user->group = group;
|
||||
user->tracepoint.name = name;
|
||||
|
||||
ret = user_event_parse_fields(user, args);
|
||||
@@ -1091,8 +1321,8 @@ static int user_event_parse(char *name, char *args, char *flags,
|
||||
user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
|
||||
user->call.tp = &user->tracepoint;
|
||||
user->call.event.funcs = &user_event_funcs;
|
||||
user->class.system = group->system_name;
|
||||
|
||||
user->class.system = USER_EVENTS_SYSTEM;
|
||||
user->class.fields_array = user_event_fields_array;
|
||||
user->class.get_fields = user_event_get_fields;
|
||||
user->class.reg = user_event_reg;
|
||||
@@ -1110,13 +1340,13 @@ static int user_event_parse(char *name, char *args, char *flags,
|
||||
|
||||
user->index = index;
|
||||
|
||||
/* Ensure we track ref */
|
||||
atomic_inc(&user->refcnt);
|
||||
/* Ensure we track self ref and caller ref (2) */
|
||||
refcount_set(&user->refcnt, 2);
|
||||
|
||||
dyn_event_init(&user->devent, &user_event_dops);
|
||||
dyn_event_add(&user->devent, &user->call);
|
||||
set_bit(user->index, page_bitmap);
|
||||
hash_add(register_table, &user->node, key);
|
||||
set_bit(user->index, group->page_bitmap);
|
||||
hash_add(group->register_table, &user->node, key);
|
||||
|
||||
mutex_unlock(&event_mutex);
|
||||
|
||||
@@ -1134,32 +1364,20 @@ static int user_event_parse(char *name, char *args, char *flags,
|
||||
/*
|
||||
* Deletes a previously created event if it is no longer being used.
|
||||
*/
|
||||
static int delete_user_event(char *name)
|
||||
static int delete_user_event(struct user_event_group *group, char *name)
|
||||
{
|
||||
u32 key;
|
||||
int ret;
|
||||
struct user_event *user = find_user_event(name, &key);
|
||||
struct user_event *user = find_user_event(group, name, &key);
|
||||
|
||||
if (!user)
|
||||
return -ENOENT;
|
||||
|
||||
/* Ensure we are the last ref */
|
||||
if (atomic_read(&user->refcnt) != 1) {
|
||||
ret = -EBUSY;
|
||||
goto put_ref;
|
||||
}
|
||||
refcount_dec(&user->refcnt);
|
||||
|
||||
ret = destroy_user_event(user);
|
||||
if (!user_event_last_ref(user))
|
||||
return -EBUSY;
|
||||
|
||||
if (ret)
|
||||
goto put_ref;
|
||||
|
||||
return ret;
|
||||
put_ref:
|
||||
/* No longer have this ref */
|
||||
atomic_dec(&user->refcnt);
|
||||
|
||||
return ret;
|
||||
return destroy_user_event(user);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1167,6 +1385,7 @@ static int delete_user_event(char *name)
|
||||
*/
|
||||
static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
|
||||
{
|
||||
struct user_event_file_info *info = file->private_data;
|
||||
struct user_event_refs *refs;
|
||||
struct user_event *user = NULL;
|
||||
struct tracepoint *tp;
|
||||
@@ -1178,7 +1397,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
|
||||
|
||||
rcu_read_lock_sched();
|
||||
|
||||
refs = rcu_dereference_sched(file->private_data);
|
||||
refs = rcu_dereference_sched(info->refs);
|
||||
|
||||
/*
|
||||
* The refs->events array is protected by RCU, and new items may be
|
||||
@@ -1236,6 +1455,28 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int user_events_open(struct inode *node, struct file *file)
|
||||
{
|
||||
struct user_event_group *group;
|
||||
struct user_event_file_info *info;
|
||||
|
||||
group = current_user_event_group();
|
||||
|
||||
if (!group)
|
||||
return -ENOENT;
|
||||
|
||||
info = kzalloc(sizeof(*info), GFP_KERNEL);
|
||||
|
||||
if (!info)
|
||||
return -ENOMEM;
|
||||
|
||||
info->group = group;
|
||||
|
||||
file->private_data = info;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t user_events_write(struct file *file, const char __user *ubuf,
|
||||
size_t count, loff_t *ppos)
|
||||
{
|
||||
@@ -1245,7 +1486,8 @@ static ssize_t user_events_write(struct file *file, const char __user *ubuf,
|
||||
if (unlikely(*ppos != 0))
|
||||
return -EFAULT;
|
||||
|
||||
if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i)))
|
||||
if (unlikely(import_single_range(WRITE, (char __user *)ubuf,
|
||||
count, &iov, &i)))
|
||||
return -EFAULT;
|
||||
|
||||
return user_events_write_core(file, &i);
|
||||
@@ -1256,13 +1498,15 @@ static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
|
||||
return user_events_write_core(kp->ki_filp, i);
|
||||
}
|
||||
|
||||
static int user_events_ref_add(struct file *file, struct user_event *user)
|
||||
static int user_events_ref_add(struct user_event_file_info *info,
|
||||
struct user_event *user)
|
||||
{
|
||||
struct user_event_group *group = info->group;
|
||||
struct user_event_refs *refs, *new_refs;
|
||||
int i, size, count = 0;
|
||||
|
||||
refs = rcu_dereference_protected(file->private_data,
|
||||
lockdep_is_held(®_mutex));
|
||||
refs = rcu_dereference_protected(info->refs,
|
||||
lockdep_is_held(&group->reg_mutex));
|
||||
|
||||
if (refs) {
|
||||
count = refs->count;
|
||||
@@ -1286,9 +1530,9 @@ static int user_events_ref_add(struct file *file, struct user_event *user)
|
||||
|
||||
new_refs->events[i] = user;
|
||||
|
||||
atomic_inc(&user->refcnt);
|
||||
refcount_inc(&user->refcnt);
|
||||
|
||||
rcu_assign_pointer(file->private_data, new_refs);
|
||||
rcu_assign_pointer(info->refs, new_refs);
|
||||
|
||||
if (refs)
|
||||
kfree_rcu(refs, rcu);
|
||||
@@ -1309,13 +1553,24 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
|
||||
if (size > PAGE_SIZE)
|
||||
return -E2BIG;
|
||||
|
||||
return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
|
||||
if (size < offsetofend(struct user_reg, write_index))
|
||||
return -EINVAL;
|
||||
|
||||
ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
kreg->size = size;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Registers a user_event on behalf of a user process.
|
||||
*/
|
||||
static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
|
||||
static long user_events_ioctl_reg(struct user_event_file_info *info,
|
||||
unsigned long uarg)
|
||||
{
|
||||
struct user_reg __user *ureg = (struct user_reg __user *)uarg;
|
||||
struct user_reg reg;
|
||||
@@ -1336,24 +1591,24 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = user_event_parse_cmd(name, &user);
|
||||
ret = user_event_parse_cmd(info->group, name, &user);
|
||||
|
||||
if (ret) {
|
||||
kfree(name);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = user_events_ref_add(file, user);
|
||||
ret = user_events_ref_add(info, user);
|
||||
|
||||
/* No longer need parse ref, ref_add either worked or not */
|
||||
atomic_dec(&user->refcnt);
|
||||
refcount_dec(&user->refcnt);
|
||||
|
||||
/* Positive number is index and valid */
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
put_user((u32)ret, &ureg->write_index);
|
||||
put_user(user->index, &ureg->status_index);
|
||||
put_user(user->index, &ureg->status_bit);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1361,7 +1616,8 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
|
||||
/*
|
||||
* Deletes a user_event on behalf of a user process.
|
||||
*/
|
||||
static long user_events_ioctl_del(struct file *file, unsigned long uarg)
|
||||
static long user_events_ioctl_del(struct user_event_file_info *info,
|
||||
unsigned long uarg)
|
||||
{
|
||||
void __user *ubuf = (void __user *)uarg;
|
||||
char *name;
|
||||
@@ -1374,7 +1630,7 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg)
|
||||
|
||||
/* event_mutex prevents dyn_event from racing */
|
||||
mutex_lock(&event_mutex);
|
||||
ret = delete_user_event(name);
|
||||
ret = delete_user_event(info->group, name);
|
||||
mutex_unlock(&event_mutex);
|
||||
|
||||
kfree(name);
|
||||
@@ -1388,19 +1644,21 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg)
|
||||
static long user_events_ioctl(struct file *file, unsigned int cmd,
|
||||
unsigned long uarg)
|
||||
{
|
||||
struct user_event_file_info *info = file->private_data;
|
||||
struct user_event_group *group = info->group;
|
||||
long ret = -ENOTTY;
|
||||
|
||||
switch (cmd) {
|
||||
case DIAG_IOCSREG:
|
||||
mutex_lock(®_mutex);
|
||||
ret = user_events_ioctl_reg(file, uarg);
|
||||
mutex_unlock(®_mutex);
|
||||
mutex_lock(&group->reg_mutex);
|
||||
ret = user_events_ioctl_reg(info, uarg);
|
||||
mutex_unlock(&group->reg_mutex);
|
||||
break;
|
||||
|
||||
case DIAG_IOCSDEL:
|
||||
mutex_lock(®_mutex);
|
||||
ret = user_events_ioctl_del(file, uarg);
|
||||
mutex_unlock(®_mutex);
|
||||
mutex_lock(&group->reg_mutex);
|
||||
ret = user_events_ioctl_del(info, uarg);
|
||||
mutex_unlock(&group->reg_mutex);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -1412,17 +1670,24 @@ static long user_events_ioctl(struct file *file, unsigned int cmd,
|
||||
*/
|
||||
static int user_events_release(struct inode *node, struct file *file)
|
||||
{
|
||||
struct user_event_file_info *info = file->private_data;
|
||||
struct user_event_group *group;
|
||||
struct user_event_refs *refs;
|
||||
struct user_event *user;
|
||||
int i;
|
||||
|
||||
if (!info)
|
||||
return -EINVAL;
|
||||
|
||||
group = info->group;
|
||||
|
||||
/*
|
||||
* Ensure refs cannot change under any situation by taking the
|
||||
* register mutex during the final freeing of the references.
|
||||
*/
|
||||
mutex_lock(®_mutex);
|
||||
mutex_lock(&group->reg_mutex);
|
||||
|
||||
refs = file->private_data;
|
||||
refs = info->refs;
|
||||
|
||||
if (!refs)
|
||||
goto out;
|
||||
@@ -1436,37 +1701,56 @@ static int user_events_release(struct inode *node, struct file *file)
|
||||
user = refs->events[i];
|
||||
|
||||
if (user)
|
||||
atomic_dec(&user->refcnt);
|
||||
refcount_dec(&user->refcnt);
|
||||
}
|
||||
out:
|
||||
file->private_data = NULL;
|
||||
|
||||
mutex_unlock(®_mutex);
|
||||
mutex_unlock(&group->reg_mutex);
|
||||
|
||||
kfree(refs);
|
||||
kfree(info);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct file_operations user_data_fops = {
|
||||
.open = user_events_open,
|
||||
.write = user_events_write,
|
||||
.write_iter = user_events_write_iter,
|
||||
.unlocked_ioctl = user_events_ioctl,
|
||||
.release = user_events_release,
|
||||
};
|
||||
|
||||
static struct user_event_group *user_status_group(struct file *file)
|
||||
{
|
||||
struct seq_file *m = file->private_data;
|
||||
|
||||
if (!m)
|
||||
return NULL;
|
||||
|
||||
return m->private;
|
||||
}
|
||||
|
||||
/*
|
||||
* Maps the shared page into the user process for checking if event is enabled.
|
||||
*/
|
||||
static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
{
|
||||
char *pages;
|
||||
struct user_event_group *group = user_status_group(file);
|
||||
unsigned long size = vma->vm_end - vma->vm_start;
|
||||
|
||||
if (size != MAX_EVENTS)
|
||||
if (size != MAX_BYTES)
|
||||
return -EINVAL;
|
||||
|
||||
if (!group)
|
||||
return -EINVAL;
|
||||
|
||||
pages = group->register_page_data;
|
||||
|
||||
return remap_pfn_range(vma, vma->vm_start,
|
||||
virt_to_phys(register_page_data) >> PAGE_SHIFT,
|
||||
virt_to_phys(pages) >> PAGE_SHIFT,
|
||||
size, vm_get_page_prot(VM_READ));
|
||||
}
|
||||
|
||||
@@ -1490,14 +1774,18 @@ static void user_seq_stop(struct seq_file *m, void *p)
|
||||
|
||||
static int user_seq_show(struct seq_file *m, void *p)
|
||||
{
|
||||
struct user_event_group *group = m->private;
|
||||
struct user_event *user;
|
||||
char status;
|
||||
int i, active = 0, busy = 0, flags;
|
||||
|
||||
mutex_lock(®_mutex);
|
||||
if (!group)
|
||||
return -EINVAL;
|
||||
|
||||
hash_for_each(register_table, i, user, node) {
|
||||
status = register_page_data[user->index];
|
||||
mutex_lock(&group->reg_mutex);
|
||||
|
||||
hash_for_each(group->register_table, i, user, node) {
|
||||
status = user->status;
|
||||
flags = user->flags;
|
||||
|
||||
seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
|
||||
@@ -1520,7 +1808,7 @@ static int user_seq_show(struct seq_file *m, void *p)
|
||||
active++;
|
||||
}
|
||||
|
||||
mutex_unlock(®_mutex);
|
||||
mutex_unlock(&group->reg_mutex);
|
||||
|
||||
seq_puts(m, "\n");
|
||||
seq_printf(m, "Active: %d\n", active);
|
||||
@@ -1539,7 +1827,24 @@ static const struct seq_operations user_seq_ops = {
|
||||
|
||||
static int user_status_open(struct inode *node, struct file *file)
|
||||
{
|
||||
return seq_open(file, &user_seq_ops);
|
||||
struct user_event_group *group;
|
||||
int ret;
|
||||
|
||||
group = current_user_event_group();
|
||||
|
||||
if (!group)
|
||||
return -ENOENT;
|
||||
|
||||
ret = seq_open(file, &user_seq_ops);
|
||||
|
||||
if (!ret) {
|
||||
/* Chain group to seq_file */
|
||||
struct seq_file *m = file->private_data;
|
||||
|
||||
m->private = group;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const struct file_operations user_status_fops = {
|
||||
@@ -1580,42 +1885,21 @@ static int create_user_tracefs(void)
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
static void set_page_reservations(bool set)
|
||||
{
|
||||
int page;
|
||||
|
||||
for (page = 0; page < MAX_PAGES; ++page) {
|
||||
void *addr = register_page_data + (PAGE_SIZE * page);
|
||||
|
||||
if (set)
|
||||
SetPageReserved(virt_to_page(addr));
|
||||
else
|
||||
ClearPageReserved(virt_to_page(addr));
|
||||
}
|
||||
}
|
||||
|
||||
static int __init trace_events_user_init(void)
|
||||
{
|
||||
struct page *pages;
|
||||
int ret;
|
||||
|
||||
/* Zero all bits beside 0 (which is reserved for failures) */
|
||||
bitmap_zero(page_bitmap, MAX_EVENTS);
|
||||
set_bit(0, page_bitmap);
|
||||
init_group = user_event_group_create(&init_user_ns);
|
||||
|
||||
pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
|
||||
if (!pages)
|
||||
if (!init_group)
|
||||
return -ENOMEM;
|
||||
register_page_data = page_address(pages);
|
||||
|
||||
set_page_reservations(true);
|
||||
|
||||
ret = create_user_tracefs();
|
||||
|
||||
if (ret) {
|
||||
pr_warn("user_events could not register with tracefs\n");
|
||||
set_page_reservations(false);
|
||||
__free_pages(pages, MAX_PAGE_ORDER);
|
||||
user_event_group_destroy(init_group);
|
||||
init_group = NULL;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
#include "trace_kprobe_selftest.h"
|
||||
#include "trace_probe.h"
|
||||
#include "trace_probe_tmpl.h"
|
||||
#include "trace_probe_kernel.h"
|
||||
|
||||
#define KPROBE_EVENT_SYSTEM "kprobes"
|
||||
#define KRETPROBE_MAXACTIVE_MAX 4096
|
||||
@@ -1223,29 +1224,14 @@ static const struct file_operations kprobe_profile_ops = {
|
||||
static nokprobe_inline int
|
||||
fetch_store_strlen_user(unsigned long addr)
|
||||
{
|
||||
const void __user *uaddr = (__force const void __user *)addr;
|
||||
|
||||
return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
|
||||
return kern_fetch_store_strlen_user(addr);
|
||||
}
|
||||
|
||||
/* Return the length of string -- including null terminal byte */
|
||||
static nokprobe_inline int
|
||||
fetch_store_strlen(unsigned long addr)
|
||||
{
|
||||
int ret, len = 0;
|
||||
u8 c;
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
|
||||
if (addr < TASK_SIZE)
|
||||
return fetch_store_strlen_user(addr);
|
||||
#endif
|
||||
|
||||
do {
|
||||
ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
|
||||
len++;
|
||||
} while (c && ret == 0 && len < MAX_STRING_SIZE);
|
||||
|
||||
return (ret < 0) ? ret : len;
|
||||
return kern_fetch_store_strlen(addr);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1255,21 +1241,7 @@ fetch_store_strlen(unsigned long addr)
|
||||
static nokprobe_inline int
|
||||
fetch_store_string_user(unsigned long addr, void *dest, void *base)
|
||||
{
|
||||
const void __user *uaddr = (__force const void __user *)addr;
|
||||
int maxlen = get_loc_len(*(u32 *)dest);
|
||||
void *__dest;
|
||||
long ret;
|
||||
|
||||
if (unlikely(!maxlen))
|
||||
return -ENOMEM;
|
||||
|
||||
__dest = get_loc_data(dest, base);
|
||||
|
||||
ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
|
||||
if (ret >= 0)
|
||||
*(u32 *)dest = make_data_loc(ret, __dest - base);
|
||||
|
||||
return ret;
|
||||
return kern_fetch_store_string_user(addr, dest, base);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1279,29 +1251,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base)
|
||||
static nokprobe_inline int
|
||||
fetch_store_string(unsigned long addr, void *dest, void *base)
|
||||
{
|
||||
int maxlen = get_loc_len(*(u32 *)dest);
|
||||
void *__dest;
|
||||
long ret;
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
|
||||
if ((unsigned long)addr < TASK_SIZE)
|
||||
return fetch_store_string_user(addr, dest, base);
|
||||
#endif
|
||||
|
||||
if (unlikely(!maxlen))
|
||||
return -ENOMEM;
|
||||
|
||||
__dest = get_loc_data(dest, base);
|
||||
|
||||
/*
|
||||
* Try to get string again, since the string can be changed while
|
||||
* probing.
|
||||
*/
|
||||
ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
|
||||
if (ret >= 0)
|
||||
*(u32 *)dest = make_data_loc(ret, __dest - base);
|
||||
|
||||
return ret;
|
||||
return kern_fetch_store_string(addr, dest, base);
|
||||
}
|
||||
|
||||
static nokprobe_inline int
|
||||
|
||||
@@ -1786,8 +1786,9 @@ static int start_per_cpu_kthreads(void)
|
||||
for_each_cpu(cpu, current_mask) {
|
||||
retval = start_kthread(cpu);
|
||||
if (retval) {
|
||||
cpus_read_unlock();
|
||||
stop_per_cpu_kthreads();
|
||||
break;
|
||||
return retval;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -445,7 +445,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
|
||||
C(SAME_PROBE, "There is already the exact same probe event"),\
|
||||
C(NO_EVENT_INFO, "This requires both group and event name to attach"),\
|
||||
C(BAD_ATTACH_EVENT, "Attached event does not exist"),\
|
||||
C(BAD_ATTACH_ARG, "Attached event does not have this field"),
|
||||
C(BAD_ATTACH_ARG, "Attached event does not have this field"),\
|
||||
C(NO_EP_FILTER, "No filter rule after 'if'"),
|
||||
|
||||
#undef C
|
||||
#define C(a, b) TP_ERR_##a
|
||||
|
||||
115
kernel/trace/trace_probe_kernel.h
Normal file
115
kernel/trace/trace_probe_kernel.h
Normal file
@@ -0,0 +1,115 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef __TRACE_PROBE_KERNEL_H_
|
||||
#define __TRACE_PROBE_KERNEL_H_
|
||||
|
||||
#define FAULT_STRING "(fault)"
|
||||
|
||||
/*
|
||||
* This depends on trace_probe.h, but can not include it due to
|
||||
* the way trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c.
|
||||
* Which means that any other user must include trace_probe.h before including
|
||||
* this file.
|
||||
*/
|
||||
/* Return the length of string -- including null terminal byte */
|
||||
static nokprobe_inline int
|
||||
kern_fetch_store_strlen_user(unsigned long addr)
|
||||
{
|
||||
const void __user *uaddr = (__force const void __user *)addr;
|
||||
int ret;
|
||||
|
||||
ret = strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
|
||||
/*
|
||||
* strnlen_user_nofault returns zero on fault, insert the
|
||||
* FAULT_STRING when that occurs.
|
||||
*/
|
||||
if (ret <= 0)
|
||||
return strlen(FAULT_STRING) + 1;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Return the length of string -- including null terminal byte */
|
||||
static nokprobe_inline int
|
||||
kern_fetch_store_strlen(unsigned long addr)
|
||||
{
|
||||
int ret, len = 0;
|
||||
u8 c;
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
|
||||
if (addr < TASK_SIZE)
|
||||
return kern_fetch_store_strlen_user(addr);
|
||||
#endif
|
||||
|
||||
do {
|
||||
ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
|
||||
len++;
|
||||
} while (c && ret == 0 && len < MAX_STRING_SIZE);
|
||||
|
||||
/* For faults, return enough to hold the FAULT_STRING */
|
||||
return (ret < 0) ? strlen(FAULT_STRING) + 1 : len;
|
||||
}
|
||||
|
||||
static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base, int len)
|
||||
{
|
||||
if (ret >= 0) {
|
||||
*(u32 *)dest = make_data_loc(ret, __dest - base);
|
||||
} else {
|
||||
strscpy(__dest, FAULT_STRING, len);
|
||||
ret = strlen(__dest) + 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
|
||||
* with max length and relative data location.
|
||||
*/
|
||||
static nokprobe_inline int
|
||||
kern_fetch_store_string_user(unsigned long addr, void *dest, void *base)
|
||||
{
|
||||
const void __user *uaddr = (__force const void __user *)addr;
|
||||
int maxlen = get_loc_len(*(u32 *)dest);
|
||||
void *__dest;
|
||||
long ret;
|
||||
|
||||
if (unlikely(!maxlen))
|
||||
return -ENOMEM;
|
||||
|
||||
__dest = get_loc_data(dest, base);
|
||||
|
||||
ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
|
||||
set_data_loc(ret, dest, __dest, base, maxlen);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
|
||||
* length and relative data location.
|
||||
*/
|
||||
static nokprobe_inline int
|
||||
kern_fetch_store_string(unsigned long addr, void *dest, void *base)
|
||||
{
|
||||
int maxlen = get_loc_len(*(u32 *)dest);
|
||||
void *__dest;
|
||||
long ret;
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
|
||||
if ((unsigned long)addr < TASK_SIZE)
|
||||
return kern_fetch_store_string_user(addr, dest, base);
|
||||
#endif
|
||||
|
||||
if (unlikely(!maxlen))
|
||||
return -ENOMEM;
|
||||
|
||||
__dest = get_loc_data(dest, base);
|
||||
|
||||
/*
|
||||
* Try to get string again, since the string can be changed while
|
||||
* probing.
|
||||
*/
|
||||
ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
|
||||
set_data_loc(ret, dest, __dest, base, maxlen);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif /* __TRACE_PROBE_KERNEL_H_ */
|
||||
@@ -961,7 +961,7 @@ create_sort_entry(void *key, struct tracing_map_elt *elt)
|
||||
static void detect_dups(struct tracing_map_sort_entry **sort_entries,
|
||||
int n_entries, unsigned int key_size)
|
||||
{
|
||||
unsigned int dups = 0, total_dups = 0;
|
||||
unsigned int total_dups = 0;
|
||||
int i;
|
||||
void *key;
|
||||
|
||||
@@ -974,11 +974,10 @@ static void detect_dups(struct tracing_map_sort_entry **sort_entries,
|
||||
key = sort_entries[0]->key;
|
||||
for (i = 1; i < n_entries; i++) {
|
||||
if (!memcmp(sort_entries[i]->key, key, key_size)) {
|
||||
dups++; total_dups++;
|
||||
total_dups++;
|
||||
continue;
|
||||
}
|
||||
key = sort_entries[i]->key;
|
||||
dups = 0;
|
||||
}
|
||||
|
||||
WARN_ONCE(total_dups > 0,
|
||||
|
||||
@@ -640,7 +640,6 @@ static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv)
|
||||
static int tracepoint_module_coming(struct module *mod)
|
||||
{
|
||||
struct tp_module *tp_mod;
|
||||
int ret = 0;
|
||||
|
||||
if (!mod->num_tracepoints)
|
||||
return 0;
|
||||
@@ -652,19 +651,18 @@ static int tracepoint_module_coming(struct module *mod)
|
||||
*/
|
||||
if (trace_module_has_bad_taint(mod))
|
||||
return 0;
|
||||
mutex_lock(&tracepoint_module_list_mutex);
|
||||
|
||||
tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
|
||||
if (!tp_mod) {
|
||||
ret = -ENOMEM;
|
||||
goto end;
|
||||
}
|
||||
if (!tp_mod)
|
||||
return -ENOMEM;
|
||||
tp_mod->mod = mod;
|
||||
|
||||
mutex_lock(&tracepoint_module_list_mutex);
|
||||
list_add_tail(&tp_mod->list, &tracepoint_module_list);
|
||||
blocking_notifier_call_chain(&tracepoint_notify_list,
|
||||
MODULE_STATE_COMING, tp_mod);
|
||||
end:
|
||||
mutex_unlock(&tracepoint_module_list_mutex);
|
||||
return ret;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void tracepoint_module_going(struct module *mod)
|
||||
|
||||
@@ -75,6 +75,13 @@ static DEFINE_CTL_TABLE_POLL(hostname_poll);
|
||||
static DEFINE_CTL_TABLE_POLL(domainname_poll);
|
||||
|
||||
static struct ctl_table uts_kern_table[] = {
|
||||
{
|
||||
.procname = "arch",
|
||||
.data = init_uts_ns.name.machine,
|
||||
.maxlen = sizeof(init_uts_ns.name.machine),
|
||||
.mode = 0444,
|
||||
.proc_handler = proc_do_uts_string,
|
||||
},
|
||||
{
|
||||
.procname = "ostype",
|
||||
.data = init_uts_ns.name.sysname,
|
||||
|
||||
Reference in New Issue
Block a user