Page 1 of 1

SMP: Problems with starting APs

Posted: Sun Aug 07, 2022 8:06 am
by dbstream
Hi,

I am writing a long mode OS kernel and I have implemented quite a few features (memory allocation, parsing ACPI tables, task switching) but I have had a few problems with multiprocessing. I have successfully initialized the local APIC on the BSP and I can send self-IPIs. I have also implemented the IO-APIC for the purposes of calibrating the time-stamp counter (for udelay()). Right now, I am trying to boot application processors.

This is the code I use for starting an AP (apboot.c):

Code: Select all


extern volatile u8 __ap_flag;

void start_ap(struct cpu *ap)
{
	printk("starting CPU%u...\n", ap->cpu_id);

	__ap_flag = 0;

	apic_write_icr(ap->lapic_id, 0xc500); /* send INIT level assert */
	apic_wait_icr();

	apic_write_icr(ap->lapic_id, 0x8500); /* send INIT level de-assert */
	apic_wait_icr();

	udelay(10000);

	u32 apic_error;

	apic_read_esr();

	apic_write_icr(ap->lapic_id, 0x608);

	udelay(200);
	apic_wait_icr();

	apic_error = apic_read_esr();
	if(apic_error)
		panic("APIC error 0x%x when starting CPU%u", apic_error, ap->cpu_id);

	udelay(10000);

	if(!__ap_flag)
		panic("failed to start CPU%u", ap->cpu_id);

	printk("started CPU%u\n", ap->cpu_id);
}

Here are my implementations of APIC functions (if you'd need them, but you probably won't). The implementations for xAPIC and x2APIC reside in different files and I use a struct apic_interface to call these functions.

Code: Select all


static void normal_apic_write_icr(u32 dst, u32 flags)
{
	mmapic_write_reg(MMAPIC_ICR_HIGH, dst << 24);
	mmapic_write_reg(MMAPIC_ICR_LOW, flags);
}

static void normal_apic_wait_icr(void)
{
	while(mmapic_read_reg(MMAPIC_ICR_LOW) & (1 << 12))
		asm volatile("pause" : : : "memory");
}

static void x2apic_write_icr(u32 dst, u32 flags)
{
	write_msr(MSR_X2APIC_ICR, ((u64) dst << 32) | flags);
}

static void x2apic_wait_icr(void) {}

My code panics and prints
failed to start CPU1
This is the code that should run on the AP, shamelessly copied from SMP.
The linker script places it at 0x8000. (apboot.S, AT&T syntax):

Code: Select all


.code16
.section .realmode

__ap_entry:	/* 0x8000 */
	cli
	cld
	ljmp $0, $0x8040

.align 16
__gdt32:	/* 0x8010 */
	.long 0, 0
	.long 0x0000ffff, 0x00cf9a00
	.long 0x0000ffff, 0x008f9200
	.long 0x00000068, 0x00cf8900
__gdt_ptr:	/* 0x8030 */
	.word 0x1f
	.long 0x8010
	.long 0, 0

.align 64
__ap_jmp_target: /* 0x8040 */
	xorw %ax, %ax
	movw %ax, %ds
	lgdtl 0x8030
	movl %cr0, %eax
	orl $1, %eax
	movl %eax, %cr0
	ljmp $8, $8060

.code32
.align 32
__ap_pm: /* 0x8060 */
	movw $16, %ax
	movw %ax, %ds
	movw %ax, %ss

	movb $1, __ap_flag

__hlt_loop:
	cli
	hlt
	jmp __hlt_loop

.align 16
.globl __ap_flag
__ap_flag:
	.byte 0

This part of the linker script causes .realmode to be placed at 0x8000:

Code: Select all


ENTRY(_start)
SECTIONS
{
	/*
	 * AP startup code needs to be placed below 1MB.
	 * Therefore we give it a separate segment.
	 */
	. = 0x8000;
	__realmode_start = .;
	.realmode :
	{
		*(.realmode)
	}

	. = ALIGN(4K);
	__realmode_end = .;

        ... (boot, text, data etc)

}

In bochs, I get the following logs:

Code: Select all


00133390123i[CPU0  ] WRMSR: wrote 00000000:fee00800 to MSR_APICBASE
00133390123i[APIC0 ] allocate APIC id=0 (MMIO enabled) to 0x0000fee00000
00134628551i[APIC1 ] Deliver INIT IPI
00134628551i[CPU1  ] cpu software reset
00134628551i[APIC1 ] allocate APIC id=1 (MMIO enabled) to 0x0000fee00000
00134628551i[CPU1  ] CPU[1] is an application processor. Halting until SIPI.

// I removed some CPUID things, they're just unneeded

00134668576i[APIC1 ] Deliver Start Up IPI
00134668576i[CPU1  ] CPU 1 started up at 0800:00000000 by APIC

When I turn off the machine, I get the following logs:

Code: Select all


00259752000p[XGUI  ] >>PANIC<< POWER button turned off.
========================================================================
Bochs is exiting with the following message:
[XGUI  ] POWER button turned off.
========================================================================
00259752000i[CPU0  ] CPU is in long mode (active)
00259752000i[CPU0  ] CS.mode = 64 bit
00259752000i[CPU0  ] SS.mode = 64 bit
00259752000i[CPU0  ] EFER   = 0x00000500
00259752000i[CPU0  ] | RAX=0000000000000000  RBX=ffff8000007ade28
00259752000i[CPU0  ] | RCX=000000000000000a  RDX=ffffffff81071c20
00259752000i[CPU0  ] | RSP=ffffffff81088eda  RBP=ffffffff81088f2a
00259752000i[CPU0  ] | RSI=0000000000000000  RDI=0000000000000004
00259752000i[CPU0  ] |  R8=0000000000000000   R9=0000000000000000
00259752000i[CPU0  ] | R10=0000000000000000  R11=ffffffff81088d8a
00259752000i[CPU0  ] | R12=ffffffff8106ea51  R13=ffffffff803b3814
00259752000i[CPU0  ] | R14=0000000080000000  R15=00000000003b326a
00259752000i[CPU0  ] | IOPL=0 ID vip vif ac vm rf nt of df if tf sf ZF af PF cf
00259752000i[CPU0  ] | SEG sltr(index|ti|rpl)     base    limit G D
00259752000i[CPU0  ] |  CS:0008( 0001| 0|  0) 00000000 ffffffff 1 0
00259752000i[CPU0  ] |  DS:0010( 0002| 0|  0) 00000000 ffffffff 1 1
00259752000i[CPU0  ] |  SS:0010( 0002| 0|  0) 00000000 ffffffff 1 1
00259752000i[CPU0  ] |  ES:0010( 0002| 0|  0) 00000000 ffffffff 1 1
00259752000i[CPU0  ] |  FS:0010( 0002| 0|  0) 00000000 ffffffff 1 1
00259752000i[CPU0  ] |  GS:0010( 0002| 0|  0) 7f746000 ffffffff 1 1
00259752000i[CPU0  ] |  MSR_FS_BASE:0000000000000000
00259752000i[CPU0  ] |  MSR_GS_BASE:ffff80007f746000
00259752000i[CPU0  ] | RIP=ffffffff8104f100 (ffffffff8104f100)
00259752000i[CPU0  ] | CR0=0xe0000011 CR2=0x0000000000000000
00259752000i[CPU0  ] | CR3=0x0000000001002000 CR4=0x000000a0
00259752000i[CPU1  ] CPU is in real mode (active)
00259752000i[CPU1  ] CS.mode = 16 bit
00259752000i[CPU1  ] SS.mode = 16 bit
00259752000i[CPU1  ] EFER   = 0x00000000
00259752000i[CPU1  ] | EAX=00000000  EBX=00000000  ECX=00000000  EDX=00000000
00259752000i[CPU1  ] | ESP=0000fff6  EBP=00005a8d  ESI=00000000  EDI=000081f4
00259752000i[CPU1  ] | IOPL=0 id vip vif ac vm rf nt of df if tf sf zf af pf cf
00259752000i[CPU1  ] | SEG sltr(index|ti|rpl)     base    limit G D
00259752000i[CPU1  ] |  CS:0000( 1e00| 0|  0) 00000000 0000ffff 0 0
00259752000i[CPU1  ] |  DS:0000( 0000| 0|  0) 00000000 0000ffff 0 0
00259752000i[CPU1  ] |  SS:0000( 0000| 0|  0) 00000000 0000ffff 0 0
00259752000i[CPU1  ] |  ES:0000( 0000| 0|  0) 00000000 0000ffff 0 0
00259752000i[CPU1  ] |  FS:0000( 0000| 0|  0) 00000000 0000ffff 0 0
00259752000i[CPU1  ] |  GS:0000( 0000| 0|  0) 00000000 0000ffff 0 0
00259752000i[CPU1  ] | EIP=0000000b (0000000b)
00259752000i[CPU1  ] | CR0=0x60000010 CR2=0x00000000
00259752000i[CPU1  ] | CR3=0x00000000 CR4=0x00000000
00259752000i[CMOS  ] Last time is 1659879937 (Sun Aug  7 15:45:37 2022)
00259752000i[XGUI  ] Exit
00259752000i[SIM   ] quit_sim called with exit code 1

I genuinely don't know how to debug this, so any help would be appreciated.

This is my first post on these forums, so if I did anything wrong, please say it.

Re: SMP: Problems with starting APs

Posted: Mon Aug 08, 2022 10:45 pm
by Octocontrabass
dbstream wrote:The linker script places it at 0x8000.
Which bootloader are you using? GRUB does not support loading anything below 0x100000.

Re: SMP: Problems with starting APs

Posted: Tue Aug 09, 2022 1:28 am
by nullplan
I see what the problem is: You are not copying the startup code to the correct page. You think your bootloader loads the startup code to 0x8000, but it won't. What you have to do instead is to actively copy the trampoline to the destination.

That is why I have the trampoline as part of ".rodata". All memory references are relative to its start, and it has a header that gets filled out after loading. Also, that code you have there never goes to 64-bit mode. So here's my trampoline:

Code: Select all

/* AMD-64 SMP boot trampoline */
.section ".rodata", "a", @progbits
.align 4
.global trampoline_size
trampoline_size: .long trampoline_end - trampoline
.code16
.global trampoline
trampoline:
/* Assumption: At run time, we start with CS:IP = xx00:0000.
 *
 * We have to:
 *  -load a GDT
 *  -load an IDT
 *  -switch to PM
 *  -enable LM+PAE
 *  -enable paging (switch to LM)
 *
 * In order to do the first, we have to LGDT with an address usable in real
 * mode. We have no guarantee that any of the kernel is loaded into the first
 * 4GB, except everything in this trampoline. so the GDT itself has to be part
 * of the trampoline. Same for the IDT pointer.
 * So let us start with a data section:
 */
 jmp start16

 /* first the stuff to talk to the booting kernel: */
 .align 4
kcr3: .long 0   /* kernel CR3. Must be within the first 4GB */
kcode: .quad 0 /* kernel code */
kstack: .quad 0 /* kernel stack */
kgs: .quad 0 /* kernel GS pointer */
commword: .word 0   /* communication word. We set bit 0 to indicate awakeness, and wait for bit 1 to indicate the booter has seen it. */

/* IDT pointer (misaligned, but 3 words long, so it aligns everything after it)
 * Set IDTR to 0/0, so any exception causes a tripple fault.
 */
idt_ptr: .word 0
    .long 0

/* GDT: pointer embedded in first element */
/* Code 32 segment: Offset 8.
 * Data 32 segment: Offset 16.
 * Code 64 segment: Offset 24.
 */
gdt:
    .word 0, gdt_end - gdt - 1
    .long gdt - trampoline
gdtcode32: .quad 0x00cf9a000000ffff
gdtdata32: .quad 0x00cf92000000ffff
gdtcode64: .quad 0x00af9a000000ffff
gdt_end:

pmptr: .long start32 - trampoline, 0x08
lmptr: .long start64 - trampoline, 0x18

start16:
    xorl %ebx, %ebx
    movw %cs, %bx
    movw %bx, %ds
    lock orw $1, commword - trampoline
1:
    pause
    testw $2, commword - trampoline
    jz 1b

    shll $4, %ebx
    addl %ebx, gdt + 4 - trampoline
    addl %ebx, pmptr - trampoline
    addl %ebx, lmptr - trampoline

    lidtl idt_ptr - trampoline
    lgdtl gdt + 2 - trampoline

    movl %cr0, %eax
    btsl $0, %eax       /* enable PE bit */
    movl %eax, %cr0     /* now we are in protected mode */
    ljmpl *(pmptr - trampoline)  /* and now we are in 32-bit protected mode */

.code32
start32:
    movw $0x10, %ax
    movw %ax, %ds
    movw %ax, %es
    movw %ax, %fs
    movw %ax, %gs
    movw %ax, %ss
    leal 4096(%ebx), %esp       /* load temporary stack pointer */
    pushl $0
    popfl                       /* initialize all flags (including I and D flag) to 0 */
    movl %cr4, %eax
    btsl $5, %eax
    movl %eax, %cr4             /* set PAE bit in CR4 */
    movl $0xc0000080, %ecx
    rdmsr
    btsl $8, %eax               /* set LME bit in EFER */
    wrmsr
    movl kcr3 - trampoline(%ebx), %eax
    movl %eax, %cr3             /* load CR3 */
    movl %cr0, %eax
    btsl $31, %eax              /* set PG bit in CR0 */
    movl %eax, %cr0             /* now we are in Long Compatibility mode */
    ljmpl *(lmptr - trampoline) /* and now we are in Long 64-bit mode */

.code64
start64:
    orl %ebx, %ebx              /* clear upper half of RBX */
    movq kstack - trampoline(%rbx), %rsp
    movq kgs - trampoline(%rbx), %rax
    movq %rax, %rdx
    movl $0xc0000101, %ecx
    shrq $32, %rdx
    wrmsr                       /* set GS.base */
    callq   *(kcode - trampoline)(%ebx)
1:
    cli
    hlt
    jmp 1b                      /* emergency catch loop. Should never run */
trampoline_end:
There, that does it. It is position independent. It is supposed to be used like this:
  1. Allocate a physical page below 1MB
  2. Copy trampoline there
  3. Allocate a physical page below 4GB
  4. Copy PML4 there
  5. Identity map at least the trampoline inside the copied PML4
  6. Fill out kcr3, kcode, kstack, and kgs at the start
  7. Do the IPI dance (setting the SIPI vector such that startup happens at the allocated page)
  8. Wait for commword to get its lowest bit set
  9. Set the second lowest bit. The AP is now on the way
Sure, I could invent a relocation format such that the loading CPU would relocate all the pointers, then I wouldn't have to do it inside the trampoline, but I think doing it this way uncouples trampoline and kernel better. And I like loosely coupled components, it affords flexibility.

The commword is there to detect if the first SIPI worked or if you need a second one. Goes back to an idea by Brendan. The loop until acknowledgement is about memory ownership. The memory for the copied PML4 and the copied trampoline belongs to the booting routine until the AP boots up. Then it is owned by the booting AP. So the booting routine must relinquish ownership, must indicate that it has seen what is going on. Therefore the handshake.

Re: SMP: Problems with starting APs

Posted: Tue Aug 09, 2022 6:29 am
by dbstream
Problem solved!

I debugged in bochs and dumped raw physical memory at 0x8000, turns out grub didn't load the .realmode segment.

I changed my linker script:

Code: Select all

-	/*
-	 * AP startup code needs to be placed below 1MB.
-	 * Therefore we give it a separate segment.
-	 */
-	. = 0x8000;
-	__realmode_start = .;
-	.realmode :
-	{
-		*(.realmode)
-	}
-
-	. = ALIGN(4K);
-	__realmode_end = .;
	.text : AT(ADDR(.text) - 0xffffffff80000000)
	{
+		__realmode_start = .;
+		*(.realmode)
+		. = ALIGN(4K);
+		__realmode_end = .;
... and memcpy'd the realmode page to 0x8000:

Code: Select all

+	void prepare_start_aps(void)
+	{
+		memcpy((void *) 0x8000, __realmode_start, __realmode_end - __realmode_start);
+	}
I also implemented a better algorithm for sending INIT-SIPI-SIPI:

Code: Select all

void start_ap(struct cpu *ap)
{
	__write_ap_flag(0);
	__write_ap_task(get_cpuvar_address_on(ap, idle_task));

	apic_write_icr(ap->lapic_id, 0x4500); /* send INIT level assert */
	apic_wait_icr();

	apic_write_icr(ap->lapic_id, 0x8500); /* send INIT level de-assert */
	apic_wait_icr();

	udelay(500);

	u32 apic_error;

	for(int i = 0; i < 2; i++) {
		apic_write_icr(ap->lapic_id, 0x4608);

		udelay(200);

		apic_wait_icr();

		udelay(300);

		apic_error = apic_read_esr();
		if(apic_error)
			panic("APIC error 0x%x when starting CPU%u", apic_error, ap->cpu_id);

		if(__read_ap_flag() == 1)
			goto done;
	}

	udelay(10000);

	if(__read_ap_flag() != 1)
		panic("failed to start CPU%u", ap->cpu_id);
done:
	...
}
... and actually implemented the switch to 64-bit mode (that I had left out to reduce how much I needed to debug. My original __ap_trampoline did set the __ap_flag before it went into a cli-hlt-loop):

Code: Select all

__ap_trampoline:
	cli
	hlt
	jmp __ap_entry
.size __ap_entry, . - __ap_entry
	ljmp $0, $(__ap_jmp_target - __ap_trampoline + 0x8000)

__ap_init_gdt_table:
	.long 0, 0			/* null segment */
	.long 0x0000ffff, 0x00cf9a00	/* code segment */
	.long 0x0000ffff, 0x008f9200	/* data segment */
__ap_init_gdt_ptr:
	.word __ap_init_gdt_ptr - __ap_init_gdt_table - 1
	.long __ap_init_gdt_table - __ap_trampoline + 0x8000
__ap_jmp_target:
	xorw %ax, %ax
	movw %ax, %ds
	lgdtl __ap_init_gdt_ptr - __ap_trampoline + 0x8000
	movl %cr0, %eax
	orl $1, %eax
	movl %eax, %cr0
	ljmp $8, $(__ap32 - __ap_trampoline + 0x8000)

.code32
__ap32:
	movw $16, %ax
	movw %ax, %ds
	movw %ax, %es
	movw %ax, %fs
	movw %ax, %gs
	movw %ax, %ss

	movb $1, (__ap_flag - __ap_trampoline + 0x8000)

/* switch to 64-bit mode */

	movl %cr4, %eax
	orl $(1 << 5), %eax
	andl $~(1 << 7), %eax
	movl %eax, %cr4

	movl $__boot_pml4, %eax
	movl %eax, %cr3

	movl $0xc0000080, %ecx
	rdmsr
	orl $(1 << 8), %eax
	wrmsr

	movl %cr0, %eax
	orl $((1 << 31) | 1), %eax
	movl %eax, %cr0

	movl %cr4, %eax
	orl $(1 << 7), %eax
	movl %eax, %cr4

	lgdt (__ap_init_gdt64_ptr - __ap_trampoline + 0x8000)(,1)
	jmp $8, $(__ap64 - __ap_trampoline + 0x8000)

.code64
__ap64:
	movl $16, %eax
	movl %eax, %ds
	movl %eax, %es
	movl %eax, %fs
	movl %eax, %gs
	movl %eax, %ss

	movq (__ap_task - __ap_trampoline + 0x8000), %rsi

	movq (%rsi), %rsp

	popq %rdi
	popq %rsi
	popq %r15
	popq %r14
	popq %r13
	popq %r12
	popq %rbx
	popq %rbp
	ret

__ap_init_gdt64:

	.quad 0

/* kernel code segment - 8 */
	.long 0xffff
	.byte 0
	.byte 0b10011010
	.byte 0b10101111
	.byte 0

/* kernel data segment - 16 */
	.long 0xffff
	.byte 0
	.byte 0b10010010
	.byte 0b11001111
	.byte 0

__ap_init_gdt64_ptr:
	.word __ap_init_gdt64_ptr - __ap_init_gdt64
	.quad __ap_init_gdt64 - __ap_trampoline + 0x8000

.align 8
.globl __ap_task
__ap_task:
	.quad 0 /* filled in by C */
.globl __ap_flag
__ap_flag:
	.byte 0
I have a struct task for each CPU, a pointer to which I store in __ap_task. From that structure, I load the stack pointer and pop a bunch of registers before jumping to C code.

Code: Select all

struct task {
	u64 sp;
	... (things like mm, state, LL node for scheduler runqueue)
};

// task->sp points to this
struct initial_stack_frame {
	u64 rdi;
	u64 rsi;
	u64 r15;
	u64 r14;
	u64 r13;
	u64 r12;
	u64 rbx;
	u64 rbp;
	u64 rip;

	// for stack trace to work, these must be 0
	u64 null_rbp;
	u64 null_rip;
};
And now everything works as it should.
:-)