mirror of https://git.proxmox.com/git/mirror_ubuntu-kernels.git (synced 2026-01-02 20:12:30 +00:00)
This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on arm64. This allows each ftrace callsite to provide an ftrace_ops to the common ftrace trampoline, allowing each callsite to invoke distinct tracer functions without the need to fall back to list processing or to allocate custom trampolines for each callsite. This significantly speeds up cases where multiple distinct trace functions are used and callsites are mostly traced by a single tracer.

The main idea is to place a pointer to the ftrace_ops as a literal at a fixed offset from the function entry point, which can be recovered by the common ftrace trampoline. Using a 64-bit literal avoids branch range limitations, and permits the ops to be swapped atomically without special considerations that apply to code-patching.

In future this will also allow for the implementation of DYNAMIC_FTRACE_WITH_DIRECT_CALLS without branch range limitations by using additional fields in struct ftrace_ops.

As noted in the core patch adding support for DYNAMIC_FTRACE_WITH_CALL_OPS, this approach allows for directly invoking ftrace_ops::func even for ftrace_ops which are dynamically-allocated (or part of a module), without going via ftrace_ops_list_func.

Currently, this approach is not compatible with CLANG_CFI, as the presence/absence of pre-function NOPs changes the offset of the pre-function type hash, and there's no existing mechanism to ensure a consistent offset for instrumented and uninstrumented functions. When CLANG_CFI is enabled, the existing scheme with a global ops->func pointer is used, and there should be no functional change. I am currently working with others to allow the two to work together in future (though this will likely require updated compiler support).

I've benchmarked this with the ftrace_ops sample module [1], which is not currently upstream, but available at:

  https://lore.kernel.org/lkml/20230103124912.2948963-1-mark.rutland@arm.com
  git://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git ftrace-ops-sample-20230109

Using that module I measured the total time taken for 100,000 calls to a trivial instrumented function, with a number of tracers enabled with relevant filters (which would apply to the instrumented function) and a number of tracers enabled with irrelevant filters (which would not apply to the instrumented function). I tested on an M1 MacBook Pro, running under an HVF-accelerated QEMU VM (i.e. on real hardware).

Before this patch:

  Number of tracers   || Total time  | Per-call average time (ns)
Relevant | Irrelevant ||    (ns)     |    Total     |  Overhead
=========+============++=============+==============+============
       0 |          0 ||      94,583 |         0.95 |           -
       0 |          1 ||      93,709 |         0.94 |           -
       0 |          2 ||      93,666 |         0.94 |           -
       0 |         10 ||      93,709 |         0.94 |           -
       0 |        100 ||      93,792 |         0.94 |           -
---------+------------++-------------+--------------+------------
       1 |          1 ||   6,467,833 |        64.68 |       63.73
       1 |          2 ||   7,509,708 |        75.10 |       74.15
       1 |         10 ||  23,786,792 |       237.87 |      236.92
       1 |        100 || 106,432,500 |     1,064.43 |     1063.38
---------+------------++-------------+--------------+------------
       1 |          0 ||   1,431,875 |        14.32 |       13.37
       2 |          0 ||   6,456,334 |        64.56 |       63.62
      10 |          0 ||  22,717,000 |       227.17 |      226.22
     100 |          0 || 103,293,667 |      1032.94 |     1031.99
---------+------------++-------------+--------------+------------

Note: per-call overhead is estimated relative to the baseline case with 0 relevant tracers and 0 irrelevant tracers.
After this patch:

  Number of tracers   || Total time  | Per-call average time (ns)
Relevant | Irrelevant ||    (ns)     |    Total     |  Overhead
=========+============++=============+==============+============
       0 |          0 ||      94,541 |         0.95 |           -
       0 |          1 ||      93,666 |         0.94 |           -
       0 |          2 ||      93,709 |         0.94 |           -
       0 |         10 ||      93,667 |         0.94 |           -
       0 |        100 ||      93,792 |         0.94 |           -
---------+------------++-------------+--------------+------------
       1 |          1 ||     281,000 |         2.81 |        1.86
       1 |          2 ||     281,042 |         2.81 |        1.87
       1 |         10 ||     280,958 |         2.81 |        1.86
       1 |        100 ||     281,250 |         2.81 |        1.87
---------+------------++-------------+--------------+------------
       1 |          0 ||     280,959 |         2.81 |        1.86
       2 |          0 ||   6,502,708 |        65.03 |       64.08
      10 |          0 ||  18,681,209 |       186.81 |      185.87
     100 |          0 || 103,550,458 |     1,035.50 |     1034.56
---------+------------++-------------+--------------+------------

Note: per-call overhead is estimated relative to the baseline case with 0 relevant tracers and 0 irrelevant tracers.

As can be seen from the above:

a) Whenever there is a single relevant tracer function associated with a tracee, the overhead of invoking the tracer is constant, and does not scale with the number of tracers which are *not* associated with that tracee.

b) The overhead for a single relevant tracer has dropped to ~1/7 of the overhead prior to this series (from 13.37ns to 1.86ns). This is largely due to permitting calls to dynamically-allocated ftrace_ops without going through ftrace_ops_list_func.

I've run the ftrace selftests from v6.2-rc3, which reports:

| # of passed: 110
| # of failed: 0
| # of unresolved: 3
| # of untested: 0
| # of unsupported: 0
| # of xfailed: 1
| # of undefined(test bug): 0

... where the unresolved entries were the tests for DIRECT functions (which are not supported), and the checkbashisms selftest (which is irrelevant here):

| [8] Test ftrace direct functions against tracers   [UNRESOLVED]
| [9] Test ftrace direct functions against kprobes   [UNRESOLVED]
| [62] Meta-selftest: Checkbashisms                  [UNRESOLVED]

... with all other tests passing (or failing as expected).

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Florent Revest <revest@chromium.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20230123134603.1064407-9-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
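To make the recovery step concrete, the following is a minimal, illustrative C sketch of how a common trampoline could load a per-callsite ftrace_ops pointer stored as a 64-bit literal at a fixed offset before the function entry point, as described in the commit message above. This is not the kernel's implementation (the real recovery happens in the arm64 ftrace trampoline assembly); the offset constant and the helper name are assumptions made purely for illustration.

/* Illustrative sketch only -- not the kernel implementation. */
#include <stdint.h>

struct ftrace_ops;      /* opaque for the purposes of the sketch */

/*
 * Hypothetical offset: assume the 64-bit ops literal sits immediately
 * before the function entry point. The real offset is fixed by the
 * patchable-function-entry layout and may differ.
 */
#define SKETCH_OPS_LITERAL_OFFSET       sizeof(struct ftrace_ops *)

static inline struct ftrace_ops *sketch_recover_ops(uintptr_t func_entry)
{
        /* The literal holds a pointer to this callsite's ftrace_ops. */
        uintptr_t literal = func_entry - SKETCH_OPS_LITERAL_OFFSET;

        /*
         * A single 64-bit load is enough; because the ops is reached via
         * a data load rather than a branch, there are no branch range
         * limitations, and swapping the tracer for a callsite is just a
         * pointer store.
         */
        return *(struct ftrace_ops **)literal;
}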
174 lines
4.3 KiB
C
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * arch/arm64/include/asm/ftrace.h
 *
 * Copyright (C) 2013 Linaro Limited
 * Author: AKASHI Takahiro <takahiro.akashi@linaro.org>
 */
#ifndef __ASM_FTRACE_H
#define __ASM_FTRACE_H

#include <asm/insn.h>

#define HAVE_FUNCTION_GRAPH_FP_TEST

/*
 * HAVE_FUNCTION_GRAPH_RET_ADDR_PTR means that the architecture can provide a
 * "return address pointer" which can be used to uniquely identify a return
 * address which has been overwritten.
 *
 * On arm64 we use the address of the caller's frame record, which remains the
 * same for the lifetime of the instrumented function, unlike the return
 * address in the LR.
 */
#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR

#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
#define ARCH_SUPPORTS_FTRACE_OPS 1
#else
#define MCOUNT_ADDR             ((unsigned long)_mcount)
#endif

/* The BL at the callsite's adjusted rec->ip */
#define MCOUNT_INSN_SIZE        AARCH64_INSN_SIZE

#define FTRACE_PLT_IDX          0
#define NR_FTRACE_PLTS          1

/*
 * Currently, gcc tends to save the link register after the local variables
 * on the stack. This causes the max stack tracer to report the function
 * frame sizes for the wrong functions. By defining
 * ARCH_FTRACE_SHIFT_STACK_TRACER, it will tell the stack tracer to expect
 * to find the return address on the stack after the local variables have
 * been set up.
 *
 * Note, this may change in the future, and we will need to deal with that
 * if it were to happen.
 */
#define ARCH_FTRACE_SHIFT_STACK_TRACER 1

#ifndef __ASSEMBLY__
#include <linux/compat.h>

extern void _mcount(unsigned long);
extern void *return_address(unsigned int);

struct dyn_arch_ftrace {
        /* No extra data needed for arm64 */
};

extern unsigned long ftrace_graph_call;

extern void return_to_handler(void);

unsigned long ftrace_call_adjust(unsigned long addr);

#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
struct dyn_ftrace;
struct ftrace_ops;

#define arch_ftrace_get_regs(regs) NULL

struct ftrace_regs {
        /* x0 - x8 */
        unsigned long regs[9];
        unsigned long __unused;

        unsigned long fp;
        unsigned long lr;

        unsigned long sp;
        unsigned long pc;
};

static __always_inline unsigned long
ftrace_regs_get_instruction_pointer(const struct ftrace_regs *fregs)
{
        return fregs->pc;
}

static __always_inline void
ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs,
                                    unsigned long pc)
{
        fregs->pc = pc;
}

static __always_inline unsigned long
ftrace_regs_get_stack_pointer(const struct ftrace_regs *fregs)
{
        return fregs->sp;
}

static __always_inline unsigned long
ftrace_regs_get_argument(struct ftrace_regs *fregs, unsigned int n)
{
        if (n < 8)
                return fregs->regs[n];
        return 0;
}

static __always_inline unsigned long
ftrace_regs_get_return_value(const struct ftrace_regs *fregs)
{
        return fregs->regs[0];
}

static __always_inline void
ftrace_regs_set_return_value(struct ftrace_regs *fregs,
                             unsigned long ret)
{
        fregs->regs[0] = ret;
}

static __always_inline void
ftrace_override_function_with_return(struct ftrace_regs *fregs)
{
        fregs->pc = fregs->lr;
}

int ftrace_regs_query_register_offset(const char *name);

int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec);
#define ftrace_init_nop ftrace_init_nop

void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
                       struct ftrace_ops *op, struct ftrace_regs *fregs);
#define ftrace_graph_func ftrace_graph_func
#endif

#define ftrace_return_address(n) return_address(n)

/*
 * Because AArch32 mode does not share the same syscall table with AArch64,
 * tracing compat syscalls may result in reporting bogus syscalls or even
 * hang-up, so just do not trace them.
 * See kernel/trace/trace_syscalls.c
 *
 * x86 code says:
 * If the user really wants these, then they should use the
 * raw syscall tracepoints with filtering.
 */
#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
{
        return is_compat_task();
}

#define ARCH_HAS_SYSCALL_MATCH_SYM_NAME

static inline bool arch_syscall_match_sym_name(const char *sym,
                                               const char *name)
{
        /*
         * Since all syscall functions have __arm64_ prefix, we must skip it.
         * However, as we described above, we decided to ignore compat
         * syscalls, so we don't care about __arm64_compat_ prefix here.
         */
        return !strcmp(sym + 8, name);
}
#endif /* ifndef __ASSEMBLY__ */

#endif /* __ASM_FTRACE_H */