Merge tag 'trace-v6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull tracing updates from Steven Rostedt:

 - Use READ_ONCE() and WRITE_ONCE() instead of RCU for syscall
   tracepoints

   Individual system call trace events are pseudo events attached to the
   raw_syscall trace events, which trace the entry and exit of all
   system calls. When one of these individual system call trace events
   is enabled, an element in an array indexed by system call number is
   assigned the trace file that defines how to trace it. When the trace
   event triggers, it reads this array, and if the element is set, it
   uses that trace file to know what to write (the trace file defines
   the output format of the corresponding system call).

   The issue is that it uses rcu_dereference_sched() and marks the
   elements of the array as __rcu. This is incorrect: there is no RCU
   synchronization here. The event file that is pointed to has a
   completely different mechanism to make sure it is freed properly.
   Reading the array during the system call trace event only checks
   whether a value is present. If not, the event does nothing (this
   system call isn't being traced). If it is, the event uses that
   information to record the system call data.

   The RCU usage here can simply be replaced by READ_ONCE() and
   WRITE_ONCE() macros.
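
   The resulting pattern, as a minimal sketch (the array and variable
   names here are hypothetical, not the kernel code itself):

      static struct trace_event_file *files[NR_syscalls];

      /* writer side (event enable/disable), serialized by a mutex */
      WRITE_ONCE(files[syscall_nr], file);  /* publish */
      WRITE_ONCE(files[syscall_nr], NULL);  /* unpublish */

      /* reader side (syscall entry/exit handler) */
      struct trace_event_file *file = READ_ONCE(files[syscall_nr]);

      if (!file)
              return;  /* this system call isn't being traced */
      /* otherwise use @file to format and record the event */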

 - Have the system call trace events use "0x" for hex values

   Some system call trace events display hex values without a leading
   "0x". Seeing "count: 44" suggests 44 decimal when in actuality it is
   0x44 (68 decimal). Display "0x44" instead.
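
   The change boils down to the pattern below (see the syscall trace
   hunk further down; "name" and "val" stand in for the real
   arguments). Values below 10 read the same in decimal and hex, so
   only larger values need the prefix:

      if (val < 10)
              trace_seq_printf(s, "%s: %lu", name, val);
      else
              trace_seq_printf(s, "%s: 0x%lx", name, val);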

 - Use vmalloc_array() in tracing_map_sort_entries()

   The function tracing_map_sort_entries() used array_size() and
   vmalloc() when it could have simply used vmalloc_array().
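
   Both forms perform the same overflow-checked size calculation; the
   helper just states it directly (sketch with a generic count n):

      /* before: open-coded size calculation */
      entries = vmalloc(array_size(sizeof(*entries), n));

      /* after: vmalloc_array() does the overflow-checked multiply itself */
      entries = vmalloc_array(n, sizeof(*entries));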

 - Use for_each_online_cpu() in trace_osnoise.c

   Instead of open coding for_each_cpu(cpu, cpu_online_mask), use
   for_each_online_cpu().
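
   The helper is plain shorthand for iterating the online mask:

      int cpu;

      /* open-coded form */
      for_each_cpu(cpu, cpu_online_mask)
              pr_info("cpu %d is online\n", cpu);

      /* equivalent helper */
      for_each_online_cpu(cpu)
              pr_info("cpu %d is online\n", cpu);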

 - Move the buffer field in struct trace_seq to the end

   The buffer field in struct trace_seq has an architecture-dependent
   size and caused padding for the fields after it. Moving the buffer
   to the end of the structure packs trace_seq more tightly.
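
   A toy illustration of the padding effect (layout assumed for a
   64-bit build; the sizes are made up):

      struct before {
              char buffer[4001];   /* alignment 1, odd size */
              struct seq_buf seq;  /* contains pointers, needs 8-byte
                                    * alignment: padding is inserted
                                    * between buffer and seq */
              size_t readpos;
              int full;
      };

      struct after {
              struct seq_buf seq;
              size_t readpos;
              int full;
              char buffer[4001];   /* a trailing char array needs no
                                    * interior padding before it */
      };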

 - Remove redundant zeroing of the cmdline_idx field in
   allocate_cmdlines_buffer()

   The structure that contains cmdline_idx is zeroed by memset(), so
   there is no need to explicitly zero any of its fields after that.
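
   Reduced to its essentials:

      memset(s, 0, sizeof(*s));  /* clears every field, cmdline_idx included */
      s->cmdline_idx = 0;        /* redundant: removed by this change */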

 - Use system_percpu_wq instead of system_wq in user_event_mm_remove()

   As system_wq is being deprecated, use system_percpu_wq instead.

 - Add cond_resched() in ftrace_module_enable()

   Some modules have a lot of functions (thousands of them), and
   enabling tracing on those functions can take some time. On
   non-preemptible kernels, this was triggering soft-lockup watchdog
   timeouts. Add a cond_resched() to prevent that.
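
   Schematically (the iterator and per-record work here are
   hypothetical, not the real ftrace record loop):

      for_each_mod_record(mod, rec) {  /* thousands of records on big modules */
              enable_record(rec);      /* per-record work */
              cond_resched();          /* yield so the soft-lockup watchdog
                                        * stays quiet on non-preemptible
                                        * kernels */
      }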

 - Add a BUILD_BUG_ON() to make sure PID_MAX_DEFAULT is always a power
   of 2

   There's code that depends on PID_MAX_DEFAULT being a power of 2 and
   will break if that ever changes. Make the build fail in that case so
   the dependent code is sure to be fixed.
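
   The dependency is the classic mask-as-modulo trick, which the
   BUILD_BUG_ON() in the diff below now enforces at build time:

      BUILD_BUG_ON(!is_power_of_2(PID_MAX_DEFAULT));
      tpid = tsk->pid & (PID_MAX_DEFAULT - 1);  /* == pid % PID_MAX_DEFAULT
                                                 * only for powers of 2 */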

 - Grab the event_mutex before ever exiting s_start()

   The s_start() function is a seq_file start routine. s_stop() is
   called even when s_start() fails, and s_stop() expects the
   event_mutex to be held because it always releases it. That mutex
   must therefore be taken in s_start() even when that function fails.
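
   Reduced to the relevant lines (the full change is in the s_start()
   hunk below):

      static void *s_start(struct seq_file *m, loff_t *pos)
      {
              iter = kzalloc(sizeof(*iter), GFP_KERNEL);
              mutex_lock(&event_mutex);  /* taken even on the failure path... */
              if (!iter)
                      return NULL;
              /* ... normal iterator setup ... */
      }

      static void s_stop(struct seq_file *m, void *v)
      {
              mutex_unlock(&event_mutex);  /* ...because s_stop() always
                                            * releases it */
      }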

* tag 'trace-v6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  tracing: Fix lock imbalance in s_start() memory allocation failure path
  tracing: Ensure optimized hashing works
  ftrace: Fix softlockup in ftrace_module_enable
  tracing: replace use of system_wq with system_percpu_wq
  tracing: Remove redundant 0 value initialization
  tracing: Move buffer in trace_seq to end of struct
  tracing/osnoise: Use for_each_online_cpu() instead of for_each_cpu()
  tracing: Use vmalloc_array() to improve code
  tracing: Have syscall trace events show "0x" for values greater than 10
  tracing: Replace syscall RCU pointer assignment with READ/WRITE_ONCE()
Linus Torvalds, 2025-10-05 09:43:36 -07:00

 9 files changed, 27 insertions(+), 21 deletions(-)

--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h

@@ -21,10 +21,10 @@
 	(sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int)))
 
 struct trace_seq {
-	char			buffer[TRACE_SEQ_BUFFER_SIZE];
 	struct seq_buf		seq;
 	size_t			readpos;
 	int			full;
+	char			buffer[TRACE_SEQ_BUFFER_SIZE];
 };
 
 static inline void
static inline void

--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c

@@ -7535,6 +7535,8 @@ void ftrace_module_enable(struct module *mod)
 		if (!within_module(rec->ip, mod))
 			break;
 
+		cond_resched();
+
 		/* Weak functions should still be ignored */
 		if (!test_for_valid_rec(rec)) {
 			/* Clear all other flags. Should not be enabled anyway */

--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h

@@ -380,8 +380,8 @@ struct trace_array {
 #ifdef CONFIG_FTRACE_SYSCALLS
 	int			sys_refcount_enter;
 	int			sys_refcount_exit;
-	struct trace_event_file __rcu *enter_syscall_files[NR_syscalls];
-	struct trace_event_file __rcu *exit_syscall_files[NR_syscalls];
+	struct trace_event_file *enter_syscall_files[NR_syscalls];
+	struct trace_event_file *exit_syscall_files[NR_syscalls];
 #endif
 	int			stop_count;
 	int			clock_id;

--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c

@@ -1629,11 +1629,10 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 	loff_t l;
 
 	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	mutex_lock(&event_mutex);
 	if (!iter)
 		return NULL;
 
-	mutex_lock(&event_mutex);
-
 	iter->type = SET_EVENT_FILE;
 	iter->file = list_entry(&tr->events, struct trace_event_file, list);

--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c

@@ -835,7 +835,7 @@ void user_event_mm_remove(struct task_struct *t)
 	 * so we use a work queue after call_rcu() to run within.
 	 */
 	INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put);
-	queue_rcu_work(system_wq, &mm->put_rwork);
+	queue_rcu_work(system_percpu_wq, &mm->put_rwork);
 }
 
 void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm)

--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c

@@ -271,7 +271,7 @@ static inline void tlat_var_reset(void)
 	 * So far, all the values are initialized as 0, so
 	 * zeroing the structure is perfect.
 	 */
-	for_each_cpu(cpu, cpu_online_mask) {
+	for_each_online_cpu(cpu) {
 		tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
 		if (tlat_var->kthread)
 			hrtimer_cancel(&tlat_var->timer);
@@ -295,7 +295,7 @@ static inline void osn_var_reset(void)
 	 * So far, all the values are initialized as 0, so
 	 * zeroing the structure is perfect.
 	 */
-	for_each_cpu(cpu, cpu_online_mask) {
+	for_each_online_cpu(cpu) {
 		osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
 		memset(osn_var, 0, sizeof(*osn_var));
 	}

--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c

@@ -224,7 +224,6 @@ static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
 	/* Place map_cmdline_to_pid array right after saved_cmdlines */
 	s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN];
-	s->cmdline_idx = 0;
 	memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
 	       sizeof(s->map_pid_to_cmdline));
 	memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
@@ -248,6 +247,8 @@ int trace_save_cmdline(struct task_struct *tsk)
 	if (!tsk->pid)
 		return 1;
 
+	BUILD_BUG_ON(!is_power_of_2(PID_MAX_DEFAULT));
+
 	tpid = tsk->pid & (PID_MAX_DEFAULT - 1);
 
 	/*

--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c

@@ -153,14 +153,20 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
 		if (trace_seq_has_overflowed(s))
 			goto end;
 
+		if (i)
+			trace_seq_puts(s, ", ");
+
 		/* parameter types */
 		if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
 			trace_seq_printf(s, "%s ", entry->types[i]);
 
 		/* parameter values */
-		trace_seq_printf(s, "%s: %lx%s", entry->args[i],
-				 trace->args[i],
-				 i == entry->nb_args - 1 ? "" : ", ");
+		if (trace->args[i] < 10)
+			trace_seq_printf(s, "%s: %lu", entry->args[i],
+					 trace->args[i]);
+		else
+			trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
+					 trace->args[i]);
 	}
 
 	trace_seq_putc(s, ')');
@@ -310,8 +316,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
 		return;
 
-	/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
-	trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
+	trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]);
 	if (!trace_file)
 		return;
@@ -356,8 +361,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
 		return;
 
-	/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
-	trace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
+	trace_file = READ_ONCE(tr->exit_syscall_files[syscall_nr]);
 	if (!trace_file)
 		return;
@@ -393,7 +397,7 @@ static int reg_event_syscall_enter(struct trace_event_file *file,
 	if (!tr->sys_refcount_enter)
 		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
 	if (!ret) {
-		rcu_assign_pointer(tr->enter_syscall_files[num], file);
+		WRITE_ONCE(tr->enter_syscall_files[num], file);
 		tr->sys_refcount_enter++;
 	}
 	mutex_unlock(&syscall_trace_lock);
@@ -411,7 +415,7 @@ static void unreg_event_syscall_enter(struct trace_event_file *file,
 		return;
 	mutex_lock(&syscall_trace_lock);
 	tr->sys_refcount_enter--;
-	RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL);
+	WRITE_ONCE(tr->enter_syscall_files[num], NULL);
 	if (!tr->sys_refcount_enter)
 		unregister_trace_sys_enter(ftrace_syscall_enter, tr);
 	mutex_unlock(&syscall_trace_lock);
@@ -431,7 +435,7 @@ static int reg_event_syscall_exit(struct trace_event_file *file,
 	if (!tr->sys_refcount_exit)
 		ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
 	if (!ret) {
-		rcu_assign_pointer(tr->exit_syscall_files[num], file);
+		WRITE_ONCE(tr->exit_syscall_files[num], file);
 		tr->sys_refcount_exit++;
 	}
 	mutex_unlock(&syscall_trace_lock);
@@ -449,7 +453,7 @@ static void unreg_event_syscall_exit(struct trace_event_file *file,
 		return;
 	mutex_lock(&syscall_trace_lock);
 	tr->sys_refcount_exit--;
-	RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL);
+	WRITE_ONCE(tr->exit_syscall_files[num], NULL);
 	if (!tr->sys_refcount_exit)
 		unregister_trace_sys_exit(ftrace_syscall_exit, tr);
 	mutex_unlock(&syscall_trace_lock);

--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c

@@ -1076,7 +1076,7 @@ int tracing_map_sort_entries(struct tracing_map *map,
 	struct tracing_map_sort_entry *sort_entry, **entries;
 	int i, n_entries, ret;
 
-	entries = vmalloc(array_size(sizeof(sort_entry), map->max_elts));
+	entries = vmalloc_array(map->max_elts, sizeof(sort_entry));
 	if (!entries)
 		return -ENOMEM;