Merge tag 'smp-core-2023-06-26' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull SMP updates from Thomas Gleixner:
"A large update for SMP management:
- Parallel CPU bringup
The reason why people are interested in parallel bringup is to
shorten the (kexec) reboot time of cloud servers to reduce the
downtime of the VM tenants.
The current fully serialized bringup does the following per AP:
1) Prepare callbacks (allocate, initialize, create threads)
2) Kick the AP alive (e.g. INIT/SIPI on x86)
3) Wait for the AP to report alive state
4) Let the AP continue through the atomic bringup
5) Let the AP run the threaded bringup to full online state
There are two significant delays:
#3 The time for an AP to report alive state in start_secondary()
on x86 has been measured in the range between 350us and 3.5ms,
depending on vendor and CPU type, BIOS microcode size, etc.
#4 The atomic bringup does the microcode update. This has been
measured to take up to ~8ms on the primary threads depending
on the microcode patch size to apply.
On a two-socket SKL server with 56 cores (112 threads) the boot CPU
on current mainline spends about 800ms busy-waiting for the APs to
come up and apply microcode. That's more than 80% of the actual
onlining procedure.
This can be reduced significantly by splitting the bringup
mechanism into two parts:
1) Run the prepare callbacks and kick the AP alive for each AP
which needs to be brought up.
The APs wake up, do their firmware initialization and run the
low level kernel startup code including microcode loading in
parallel up to the first synchronization point. (#1 and #2
above)
2) Run the rest of the bringup code strictly serialized per CPU
(#3 - #5 above) as it's done today.
Parallelizing that stage of the CPU bringup might be possible
in theory, but it's questionable whether the required surgery
would be justified for a pretty small gain.
If the system is large enough, the first AP is already waiting at
the first synchronization point by the time the boot CPU has
finished waking up the last AP. That reduces the AP bringup time on
that SKL from ~800ms to ~80ms, i.e. by a factor of ~10.
The actual gain varies wildly depending on the system, CPU,
microcode patch size and other factors. There are some
opportunities to reduce the overhead further, but that needs some
deep surgery in the x86 CPU bringup code.
For now this is only enabled on x86, but the core functionality
obviously works for all SMP capable architectures.
- Enhancements for SMP function call tracing so it is possible to
locate the scheduling and the actual execution points. That makes
it possible to measure IPI delivery time precisely"
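
To make the two-phase scheme concrete, here is a minimal C sketch of
the control flow described above. The helper names in the sketch are
hypothetical; in the actual series the split is driven by the hotplug
core (see cpuhp_bringup_mask() and arch_cpuhp_init_parallel_bringup()
in the commit list below).

	/* Sketch only: hypothetical helpers, not the real kernel/cpu.c code. */
	static void bringup_nonboot_cpus_split(void)
	{
		unsigned int cpu;

		/* Phase 1: prepare and kick every AP without waiting for it.
		 * The APs do their firmware init, low level startup and
		 * microcode loading in parallel up to the first
		 * synchronization point. */
		for_each_present_cpu(cpu) {
			if (!cpu_online(cpu))
				kick_ap_alive(cpu);		/* hypothetical */
		}

		/* Phase 2: finish each CPU strictly serialized, as today. */
		for_each_present_cpu(cpu) {
			if (!cpu_online(cpu))
				wait_ap_online(cpu);		/* hypothetical */
		}
	}
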
* tag 'smp-core-2023-06-26' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/tip/tip: (45 commits)
trace,smp: Add tracepoints for scheduling remotelly called functions
trace,smp: Add tracepoints around remotelly called functions
MAINTAINERS: Add CPU HOTPLUG entry
x86/smpboot: Fix the parallel bringup decision
x86/realmode: Make stack lock work in trampoline_compat()
x86/smp: Initialize cpu_primary_thread_mask late
cpu/hotplug: Fix off by one in cpuhp_bringup_mask()
x86/apic: Fix use of X{,2}APIC_ENABLE in asm with older binutils
x86/smpboot/64: Implement arch_cpuhp_init_parallel_bringup() and enable it
x86/smpboot: Support parallel startup of secondary CPUs
x86/smpboot: Implement a bit spinlock to protect the realmode stack
x86/apic: Save the APIC virtual base address
cpu/hotplug: Allow "parallel" bringup up to CPUHP_BP_KICK_AP_STATE
x86/apic: Provide cpu_primary_thread mask
x86/smpboot: Enable split CPU startup
cpu/hotplug: Provide a split up CPUHP_BRINGUP mechanism
cpu/hotplug: Reset task stack state in _cpu_up()
cpu/hotplug: Remove unused state functions
riscv: Switch to hotplug core state synchronization
parisc: Switch to hotplug core state synchronization
...
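
As an illustration of what the new tracepoints bracket, consider a
plain cross-CPU function call: the scheduling (queueing) point and the
actual remote execution point of the called function are exactly what
the two trace,smp commits above make visible, so the IPI delivery time
can be derived from the trace. The snippet below is a sketch and not
part of the series; remote_fn() and measure_example() are made up for
the example.

	#include <linux/printk.h>
	#include <linux/smp.h>

	static void remote_fn(void *info)
	{
		/* Runs on the target CPU in IPI context. */
		pr_info("csd ran on CPU %d\n", smp_processor_id());
	}

	static void measure_example(void)
	{
		/* Queue remote_fn on CPU 1 and wait for completion; with
		 * the new tracepoints enabled, both the queueing and the
		 * remote execution show up in the trace. */
		smp_call_function_single(1, remote_fn, NULL, 1);
	}
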
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *    PARISC Architecture-dependent parts of process handling
 *    based on the work for i386
 *
 *    Copyright (C) 1999-2003 Matthew Wilcox <willy at parisc-linux.org>
 *    Copyright (C) 2000 Martin K Petersen <mkp at mkp.net>
 *    Copyright (C) 2000 John Marvin <jsm at parisc-linux.org>
 *    Copyright (C) 2000 David Huggins-Daines <dhd with pobox.org>
 *    Copyright (C) 2000-2003 Paul Bame <bame at parisc-linux.org>
 *    Copyright (C) 2000 Philipp Rumpf <prumpf with tux.org>
 *    Copyright (C) 2000 David Kennedy <dkennedy with linuxcare.com>
 *    Copyright (C) 2000 Richard Hirst <rhirst with parisc-linux.org>
 *    Copyright (C) 2000 Grant Grundler <grundler with parisc-linux.org>
 *    Copyright (C) 2001 Alan Modra <amodra at parisc-linux.org>
 *    Copyright (C) 2001-2002 Ryan Bradetich <rbrad at parisc-linux.org>
 *    Copyright (C) 2001-2014 Helge Deller <deller@gmx.de>
 *    Copyright (C) 2002 Randolph Chung <tausq with parisc-linux.org>
 */

#include <linux/elf.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/personality.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/slab.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/nmi.h>
#include <linux/sched/hotplug.h>

#include <asm/io.h>
#include <asm/asm-offsets.h>
#include <asm/assembly.h>
#include <asm/pdc.h>
#include <asm/pdc_chassis.h>
#include <asm/unwind.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>

#define COMMAND_GLOBAL	F_EXTEND(0xfffe0030)
#define CMD_RESET	5	/* reset any module */

/*
** The Wright Brothers and Gecko systems have a H/W problem
** (Lasi...'nuf said) that may cause a broadcast reset to lock up
** the system. An HVERSION dependent PDC call was developed
** to perform a "safe", platform specific broadcast reset instead
** of kludging up all the code.
**
** Older machines which do not implement PDC_BROADCAST_RESET will
** return (with an error) and the regular broadcast reset can be
** issued. Obviously, if the PDC does implement PDC_BROADCAST_RESET
** the PDC call will not return (the system will be reset).
*/
void machine_restart(char *cmd)
{
#ifdef FASTBOOT_SELFTEST_SUPPORT
	/*
	 ** If user has modified the Firmware Selftest Bitmap,
	 ** run the tests specified in the bitmap after the
	 ** system is rebooted w/PDC_DO_RESET.
	 **
	 ** ftc_bitmap = 0x1AUL "Skip destructive memory tests"
	 **
	 ** Using "directed resets" at each processor with the MEM_TOC
	 ** vector cleared will also avoid running destructive
	 ** memory self tests. (Not implemented yet)
	 */
	if (ftc_bitmap) {
		pdc_do_firm_test_reset(ftc_bitmap);
	}
#endif
	/* set up a new led state on systems shipped with a LED State panel */
	pdc_chassis_send_status(PDC_CHASSIS_DIRECT_SHUTDOWN);

	/* "Normal" system reset */
	pdc_do_reset();

	/* Nope...box should reset with just CMD_RESET now */
	gsc_writel(CMD_RESET, COMMAND_GLOBAL);

	/* Wait for RESET to lay us to rest. */
	while (1) ;
}

void (*chassis_power_off)(void);

/*
 * This routine is called from sys_reboot to actually turn off the
 * machine
 */
void machine_power_off(void)
{
	/* If there is a registered power off handler, call it. */
	if (chassis_power_off)
		chassis_power_off();

	/* Put the soft power button back under hardware control.
	 * If the user had already pressed the power button, the
	 * following call will immediately power off. */
	pdc_soft_power_button(0);

	pdc_chassis_send_status(PDC_CHASSIS_DIRECT_SHUTDOWN);

	/* ipmi_poweroff may have been installed. */
	do_kernel_power_off();

	/* It seems we have no way to power the system off via
	 * software. The user has to press the button himself. */

	printk("Power off or press RETURN to reboot.\n");

	/* prevent soft lockup/stalled CPU messages for endless loop. */
	rcu_sysrq_start();
	lockup_detector_soft_poweroff();
	while (1) {
		/* reboot if user presses RETURN key */
		if (pdc_iodc_getc() == 13) {
			printk("Rebooting...\n");
			machine_restart(NULL);
		}
	}
}

void (*pm_power_off)(void);
EXPORT_SYMBOL(pm_power_off);

void machine_halt(void)
{
	machine_power_off();
}

void flush_thread(void)
{
	/* Only needs to handle fpu stuff or perf monitors.
	** REVISIT: several arches implement a "lazy fpu state".
	*/
}

/*
 * Idle thread support
 *
 * Detect when running on QEMU with SeaBIOS PDC Firmware and let
 * QEMU idle the host too.
 */

int running_on_qemu __ro_after_init;
EXPORT_SYMBOL(running_on_qemu);

/*
 * Called from the idle thread for the CPU which has been shutdown.
 */
void __noreturn arch_cpu_idle_dead(void)
{
#ifdef CONFIG_HOTPLUG_CPU
	idle_task_exit();

	local_irq_disable();

	/* Tell the core that this CPU is now safe to dispose of. */
	cpuhp_ap_report_dead();

	/* Ensure that the cache lines are written out. */
	flush_cache_all_local();
	flush_tlb_all_local(NULL);

	/* Let PDC firmware put CPU into firmware idle loop. */
	__pdc_cpu_rendezvous();

	pr_warn("PDC does not provide rendezvous function.\n");
#endif
	while (1);
}

void __cpuidle arch_cpu_idle(void)
{
	/* nop on real hardware, qemu will idle sleep. */
	asm volatile("or %%r10,%%r10,%%r10\n":::);
}

static int __init parisc_idle_init(void)
{
	if (!running_on_qemu)
		cpu_idle_poll_ctrl(1);

	return 0;
}
arch_initcall(parisc_idle_init);

/*
 * Copy architecture-specific thread state
 */
int
copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
	unsigned long clone_flags = args->flags;
	unsigned long usp = args->stack;
	unsigned long tls = args->tls;
	struct pt_regs *cregs = &(p->thread.regs);
	void *stack = task_stack_page(p);

	/* We have to use void * instead of a function pointer, because
	 * function pointers aren't a pointer to the function on 64-bit.
	 * Make them const so the compiler knows they live in .text */
	extern void * const ret_from_kernel_thread;
	extern void * const child_return;

	if (unlikely(args->fn)) {
		/* kernel thread */
		memset(cregs, 0, sizeof(struct pt_regs));
		if (args->idle) /* idle thread */
			return 0;
		/* Must exit via ret_from_kernel_thread in order
		 * to call schedule_tail()
		 */
		cregs->ksp = (unsigned long) stack + FRAME_SIZE + PT_SZ_ALGN;
		cregs->kpc = (unsigned long) &ret_from_kernel_thread;
		/*
		 * Copy function and argument to be called from
		 * ret_from_kernel_thread.
		 */
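		/* On 64-bit parisc a function pointer is a pointer to a
		 * function descriptor, not to the code itself; words [2]
		 * and [3] of the descriptor (presumably the entry address
		 * and the global pointer) are what gets copied below. */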
#ifdef CONFIG_64BIT
		cregs->gr[27] = ((unsigned long *)args->fn)[3];
		cregs->gr[26] = ((unsigned long *)args->fn)[2];
#else
		cregs->gr[26] = (unsigned long) args->fn;
#endif
		cregs->gr[25] = (unsigned long) args->fn_arg;
	} else {
		/* user thread */
		/* usp must be word aligned. This also prevents users from
		 * passing in the value 1 (which is the signal for a special
		 * return for a kernel thread) */
		if (usp) {
			usp = ALIGN(usp, 4);
			if (likely(usp))
				cregs->gr[30] = usp;
		}
		cregs->ksp = (unsigned long) stack + FRAME_SIZE;
		cregs->kpc = (unsigned long) &child_return;

		/* Setup thread TLS area */
		if (clone_flags & CLONE_SETTLS)
			cregs->cr27 = tls;
	}

	return 0;
}

unsigned long
__get_wchan(struct task_struct *p)
{
	struct unwind_frame_info info;
	unsigned long ip;
	int count = 0;

	/*
	 * These bracket the sleeping functions..
	 */

	unwind_frame_init_from_blocked_task(&info, p);
	do {
		if (unwind_once(&info) < 0)
			return 0;
		if (task_is_running(p))
			return 0;
		ip = info.ip;
		if (!in_sched_functions(ip))
			return ip;
	} while (count++ < MAX_UNWIND_ENTRIES);
	return 0;
}

static inline unsigned long brk_rnd(void)
{
	return (get_random_u32() & BRK_RND_MASK) << PAGE_SHIFT;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long ret = PAGE_ALIGN(mm->brk + brk_rnd());

	if (ret < mm->brk)
		return mm->brk;
	return ret;
}