Commit 3adee777 authored by Brian Gerst, committed by Thomas Gleixner
Browse files

x86/smpboot: Remove initial_stack on 64-bit



In order to facilitate parallel startup, start to eliminate some of the
global variables passing information to CPUs in the startup path.

However, start by introducing one more: smpboot_control. For now this
merely holds the CPU# of the CPU which is coming up. Each CPU can then
find its own per-cpu data, and everything else it needs can be found
from there, allowing the other global variables to be removed.

First to be removed is initial_stack. Each CPU can load %rsp from its
current_task->thread.sp instead. That is already set up with the correct
idle thread for APs. Set up the .sp field in INIT_THREAD on x86 so that
the BSP also finds a suitable stack pointer in the static per-cpu data
when coming up on first boot.

On resume from S3, the CPU needs a temporary stack because its idle task
is already active. Instead of setting initial_stack, the sleep code can
simply set its own current->thread.sp to point to the temporary stack.
Nobody else cares about ->thread.sp for a thread which is currently on
a CPU, because the true value is actually in the %rsp register, which
is restored with the rest of the CPU context in do_suspend_lowlevel().

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Usama Arif <usama.arif@bytedance.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Usama Arif <usama.arif@bytedance.com>
Tested-by: Guilherme G. Piccoli <gpiccoli@igalia.com>
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Link: https://lore.kernel.org/r/20230316222109.1940300-7-usama.arif@bytedance.com
parent cefad862
Loading
Loading
Loading
Loading
+5 −1
Original line number Diff line number Diff line
@@ -647,7 +647,11 @@ static inline void spin_lock_prefetch(const void *x)
#define KSTK_ESP(task)		(task_pt_regs(task)->sp)

#else
#define INIT_THREAD { }
extern unsigned long __end_init_task[];

#define INIT_THREAD {							    \
	.sp	= (unsigned long)&__end_init_task - sizeof(struct pt_regs), \
}

extern unsigned long KSTK_ESP(struct task_struct *task);

+4 −1
Original line number Diff line number Diff line
@@ -199,5 +199,8 @@ extern void nmi_selftest(void);
#define nmi_selftest() do { } while (0)
#endif

#endif /* __ASSEMBLY__ */
extern unsigned int smpboot_control;

#endif /* !__ASSEMBLY__ */

#endif /* _ASM_X86_SMP_H */
+18 −2
Original line number Diff line number Diff line
@@ -111,10 +111,26 @@ int x86_acpi_suspend_lowlevel(void)
	saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
#ifdef CONFIG_SMP
	initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
	/*
	 * As each CPU starts up, it will find its own stack pointer
	 * from its current_task->thread.sp. Typically that will be
	 * the idle thread for a newly-started AP, or even the boot
	 * CPU which will find it set to &init_task in the static
	 * per-cpu data.
	 *
	 * Make the resuming CPU use the temporary stack at startup
	 * by setting current->thread.sp to point to that. The true
	 * %rsp will be restored with the rest of the CPU context,
	 * by do_suspend_lowlevel(). And unwinders don't care about
	 * the abuse of ->thread.sp because it's a dead variable
	 * while the thread is running on the CPU anyway; the true
	 * value is in the actual %rsp register.
	 */
	current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack);
	early_gdt_descr.address =
			(unsigned long)get_cpu_gdt_rw(smp_processor_id());
	initial_gs = per_cpu_offset(smp_processor_id());
	smpboot_control = smp_processor_id();
#endif
	initial_code = (unsigned long)wakeup_long64;
	saved_magic = 0x123456789abcdef0L;
+1 −0
Original line number Diff line number Diff line
@@ -115,6 +115,7 @@ static void __used common(void)
	OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
	OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
	OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack);
	OFFSET(X86_current_task, pcpu_hot, current_task);
#ifdef CONFIG_CALL_DEPTH_TRACKING
	OFFSET(X86_call_depth, pcpu_hot, call_depth);
#endif
+28 −15
Original line number Diff line number Diff line
@@ -61,8 +61,8 @@ SYM_CODE_START_NOALIGN(startup_64)
	 * tables and then reload them.
	 */

	/* Set up the stack for verify_cpu(), similar to initial_stack below */
	leaq	(__end_init_task - FRAME_SIZE)(%rip), %rsp
	/* Set up the stack for verify_cpu() */
	leaq	(__end_init_task - PTREGS_SIZE)(%rip), %rsp

	leaq	_text(%rip), %rdi

@@ -241,6 +241,24 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
	UNWIND_HINT_EMPTY
	ANNOTATE_NOENDBR // above

#ifdef CONFIG_SMP
	movl	smpboot_control(%rip), %ecx

	/* Get the per cpu offset for the given CPU# which is in ECX */
	movq	__per_cpu_offset(,%rcx,8), %rdx
#else
	xorl	%edx, %edx /* zero-extended to clear all of RDX */
#endif /* CONFIG_SMP */

	/*
	 * Setup a boot time stack - Any secondary CPU will have lost its stack
	 * by now because the cr3-switch above unmaps the real-mode stack.
	 *
	 * RDX contains the per-cpu offset
	 */
	movq	pcpu_hot + X86_current_task(%rdx), %rax
	movq	TASK_threadsp(%rax), %rsp

	/*
	 * We must switch to a new descriptor in kernel space for the GDT
	 * because soon the kernel won't have access anymore to the userspace
@@ -275,12 +293,6 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
	movl	initial_gs+4(%rip),%edx
	wrmsr

	/*
	 * Setup a boot time stack - Any secondary CPU will have lost its stack
	 * by now because the cr3-switch above unmaps the real-mode stack
	 */
	movq initial_stack(%rip), %rsp

	/* Setup and Load IDT */
	pushq	%rsi
	call	early_setup_idt
@@ -372,7 +384,11 @@ SYM_CODE_END(secondary_startup_64)
SYM_CODE_START(start_cpu0)
	ANNOTATE_NOENDBR
	UNWIND_HINT_EMPTY
	movq	initial_stack(%rip), %rsp

	/* Find the idle task stack */
	movq	PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx
	movq	TASK_threadsp(%rcx), %rsp

	jmp	.Ljump_to_C_code
SYM_CODE_END(start_cpu0)
#endif
@@ -420,12 +436,6 @@ SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data))
#ifdef CONFIG_AMD_MEM_ENCRYPT
SYM_DATA(initial_vc_handler,	.quad handle_vc_boot_ghcb)
#endif

/*
 * The FRAME_SIZE gap is a convention which helps the in-kernel unwinder
 * reliably detect the end of the stack.
 */
SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
	__FINITDATA

	__INIT
@@ -660,6 +670,9 @@ SYM_DATA_END(level1_fixmap_pgt)
SYM_DATA(early_gdt_descr,		.word GDT_ENTRIES*8-1)
SYM_DATA_LOCAL(early_gdt_descr_base,	.quad INIT_PER_CPU_VAR(gdt_page))

	.align 16
SYM_DATA(smpboot_control,		.long 0)

	.align 16
/* This must match the first entry in level2_kernel_pgt */
SYM_DATA(phys_base, .quad 0x0)
Loading