Stack alignment issue with using floating point

Question about which tools to use, bugs, the best way to implement a function, etc should go here. Don't forget to see if your question is answered in the wiki first! When in doubt post here.
Post Reply
User avatar
zhiayang
Member
Member
Posts: 368
Joined: Tue Dec 27, 2011 7:57 am
Libera.chat IRC: zhiayang

Stack alignment issue with using floating point

Post by zhiayang »

So I've got an issue. Apparently, if a memory operand is involved with a floating-point operation, it has to be 16-byte aligned. If not, a GPF gets issued...

This is the problem code:

Code: Select all

movaps dqword ptr ss:[rbp-160], xmm0
Wherein rbp-160 isn't 16-byte aligned.

Code: Select all

{
	// decimal numbers, because why not.
	double mem = K_SystemMemoryInBytes;
	uint8_t inx = 0;

	while(mem > 1024)
	{
		mem /= 1024;
		inx++;
	}
	PrintFormatted("%.2f %s of memory detected.\n\n", mem, K_BinaryUnits[inx]);
}
That's the vicinity of the problem code.

According to helpful individuals on IRC, the compiler is supposed to preserve the alignment of the stack during function calls, whereby RSP+8 is 16-byte aligned at the call instruction.

This seems to work. However, I suspect that my context-switching business is trashing some of that alignment...

This is the context switching code:

Code: Select all


TaskSwitcher:
	push %r15
	push %r14
	push %r13
	push %r12
	push %r11
	push %r10
	push %r9
	push %r8
	push %rdx
	push %rcx
	push %rbx
	push %rax
	push %rbp
	push %rsi
	push %rdi


	// Load the kernel data segment
	movw $0x10, %bp
	movl %ebp, %ds
	movl %ebp, %es
	movl %ebp, %fs
	movl %ebp, %gs


	// Modify the tss to change rsp0 to point here
	// movq %rsp, 0x504
	movq %rsp, %rdi

	call SwitchProcess
	movq %rax, %rsp

	// xchg %bx, %bx

	mov $0x20, %al
	outb %al, $0x20






	cmp $0xFADE, %r11
	jne NoRing3




DoRing3:
	cmpq $0x0, 0x600
	je DoRetR3


	// if not zero, do a CR3 switch.
	movq 0x600, %rax
	mov %rax, %cr3

	// if this is ring3, we need to...
	// do ring3!
	jmp PopRegR3

DoRetR3:
	// reset to the kernel's PML4.
	mov $0x1000, %rax
	mov %rax, %cr3


PopRegR3:
	pop %rdi
	pop %rsi
	pop %rbp
	pop %rax
	pop %rbx
	pop %rcx
	pop %rdx
	pop %r8
	pop %r9
	pop %r10
	pop %r11
	pop %r12
	pop %r13
	pop %r14
	pop %r15


	// screw around with the stack
	// change the data-segment to user-mode
	// also modify the eflags and code-segment values.

	mov $0x23, %ax
	mov %ax, %ds
	mov %ax, %es
	mov %ax, %fs
	mov %ax, %gs

	iretq






NoRing3:
	cmpq $0x0, 0x600
	je DoRetR0


	// if not zero, do a CR3 switch.
	movq 0x600, %rax
	mov %rax, %cr3
	jmp PopReg

DoRetR0:
	// reset to the kernel's PML4.
	mov $0x1000, %rax
	mov %rax, %cr3


PopReg:
	pop %rdi
	pop %rsi
	pop %rbp
	pop %rax
	pop %rbx
	pop %rcx
	pop %rdx
	pop %r8
	pop %r9
	pop %r10
	pop %r11
	pop %r12
	pop %r13
	pop %r14
	pop %r15
	iretq
(don't mind the inefficient code lol)

Here is the process setup code:

Code: Select all

        void CreateTask(const char name[64], uint8_t Flags, void (*Function)())
	{
		Task_type* Task = new ((void*)Kernel::HardwareAbstraction::MemoryManager::DynamicMinimal::Allocate(sizeof(Task_type))) Task_type();

		// Allocate kernel stack
		uint64_t f = Physical::AllocatePage_NoMap();
		for(int i = 0; i < DefaultRing3StackSize / 0x1000; i++)
			Virtual::MapAddr(f + (i * 0x1000), (i == 0) ? f : Physical::AllocatePage_NoMap(), 0x07);


		// allocate user stack.
		uint64_t k = Physical::AllocatePage_NoMap();
		for(int i = 0; i < DefaultRing3StackSize / 0x1000; i++)
			Virtual::MapAddr(k + (i * 0x1000), (i == 0) ? k : Physical::AllocatePage_NoMap(), Flags & 0x1 ? 0x7 : 0x3);





		Task->TopOfStack	= DefaultRing3StackSize + f;
		Task->StackPointer	        = Task->TopOfStack;
		Task->Flags			= Flags;
		Task->Thread		        = Function;
		Task->State			= 1;
		Task->ProcessID		= GetNumberOfTasks();
		Task->CR3			= 0;
		String::Copy(Task->Name, name);

		uint64_t* stack = (uint64_t*)Task->StackPointer;


		if(!(Task->Flags & 0x1))
		{
			// kernel thread
			*--stack = 0x10;							// SS
			*--stack = k + DefaultRing3StackSize;		        // User stack pointer
			*--stack = 0x202;							// RFLAGS
			*--stack = 0x08;							// CS
			*--stack = (uint64_t)Function;				        // RIP
		}
		else
		{
			// user thread
			*--stack = 0x23;							// SS
			*--stack = k + DefaultRing3StackSize;		        // User stack pointer
			*--stack = 0x202;							// RFLAGS
			*--stack = 0x1B;							// CS
			*--stack = (uint64_t)Function;					// RIP
		}



		*--stack = 0x0;								// R15
		*--stack = 0x0;								// R14
		*--stack = 0x0;								// R13
		*--stack = 0x0;								// R12
		*--stack = 0x0;								// R11
		*--stack = 0x0;								// R10
		*--stack = 0x0;								// R9
		*--stack = 0x0;								// R8

		*--stack = 0x0;								// RDX
		*--stack = 0x0;								// RCX
		*--stack = 0x0;								// RBX
		*--stack = 0x0;								// RAX


		*--stack = 0x0;								// RBP
		*--stack = 0x0;								// RSI
		*--stack = 0x0;								// RDI

		Task->StackPointer = (uint64_t)stack;
		TaskList->Add(Task);


		Log("Created new kernel thread: KStack: %x (%d b); UStack: %x(%d b); PID: %d",
			f, DefaultRing3StackSize, k, DefaultRing3StackSize, Task->ProcessID);
	}

And this is the scheduler code itself:

Code: Select all

	extern "C" uint64_t SwitchProcess(uint64_t context)
	{
		TicksSinceInit++;

		if(TaskList->Size() < 1)
			return context;

		if(!IsFirst)
		{
			TaskList->Get(CurrentPID)->StackPointer = context;
		}
		else
			IsFirst = false;


		if(CurrentPID < TaskList->Size() - 1 && NextPIDToSchedule == 0)
			CurrentPID++;

		else if(NextPIDToSchedule != 0)
		{
			CurrentPID = NextPIDToSchedule;
			NextPIDToSchedule = 0;
		}

		else
			CurrentPID = 0;


		// decrease the sleep counters of all threads.

		for(int i = 0; i < TaskList->Size(); i++)
		{
			if(TaskList->Get(i)->Sleep > 0 && i != CurrentPID)
			{
				TaskList->Get(i)->Sleep--;
				if(TaskList->Get(i)->Sleep == 0)
				{
					NextPIDToSchedule = i;
				}
			}
		}


		if(!TaskList->Get(CurrentPID)->State)
		{
			CurrentPID = 0;
			return TaskList->Get(CurrentPID)->StackPointer;
		}

		if(TaskList->Get(CurrentPID)->Sleep > 0)
		{
			TaskList->Get(CurrentPID)->Sleep--;
			CurrentPID = 0;
			return TaskList->Get(CurrentPID)->StackPointer;
		}


		if(TaskList->Get(CurrentPID)->Flags & 0x1)
		{
			// this tells switch.s (on return) that we need to return to user-mode.
			asm volatile("mov $0xFADE, %%r11" ::: "%r11");
		}

		if(TaskList->Get(CurrentPID)->CR3 != 0)
		{
			asm volatile("movq %[page], 0x600" :: [page]"r"(TaskList->Get(CurrentPID)->CR3): "memory");
		}
		else
		{
			asm volatile("movq $0x1000, 0x600" ::: "memory");
		}

		asm volatile("mov %[stacktop], 0x504" :: [stacktop]"r"(TaskList->Get(CurrentPID)->TopOfStack));
		return TaskList->Get(CurrentPID)->StackPointer;			// Return new task's context.
	}
Again don't mind the inefficient scheduler, it's mostly a placeholder for now.




Sorry for the codedump -- I really can't pinpoint the problem (ie. the exact point where the stack gets misaligned), except I have a hunch the scheduler is to blame, because if I execute the problem code before the scheduler starts scheduling, no problem.

Thanks.
User avatar
Combuster
Member
Member
Posts: 9301
Joined: Wed Oct 18, 2006 3:45 am
Libera.chat IRC: [com]buster
Location: On the balcony, where I can actually keep 1½m distance
Contact:

Re: Stack alignment issue with using floating point

Post by Combuster »

It appears that TaskSwitcher is called directly from an interrupt, in which case you get added onto the stack:

Code: Select all

SS  ESP EFL CS
EIP R15 R14 R13
R12 R11 R10 R9
R8  RDX RCX RBX
RAX RBP RSI RDI
CALL
Which is an odd amount of pushes, and therefore unaligns the stack if it was already aligned.

Being called from interrupt has some additional consequences as well. If you get an interrupt in 32-bit mode that doesn't include a stack switch, then the alignment of the stack is undefined, and you should explicitly realign the stack for the C end of the scheduler (typically, create a new stackframe and AND the stackpointer.) Additionally, the use of SSE within the interrupt handler requires that you save and restore that state as well before it gets used.

Edited as per Owen
Last edited by Combuster on Tue Dec 03, 2013 12:43 pm, edited 1 time in total.
"Certainly avoid yourself. He is a newbie and might not realize it. You'll hate his code deeply a few years down the road." - Sortie
[ My OS ] [ VDisk/SFS ]
User avatar
Owen
Member
Member
Posts: 1700
Joined: Fri Jun 13, 2008 3:21 pm
Location: Cambridge, United Kingdom
Contact:

Re: Stack alignment issue with using floating point

Post by Owen »

Combuster wrote:Being called from interrupt has some additional consequences as well. if you get an interrupt from kernel land which doesn't include a stack switch, then the alignment of RSP is undefined, and you should explicitly realign the stack for the C end of the scheduler (typically, create a new stackframe and AND the stackpointer.) Additionally, the use of SSE within the interrupt handler requires that you save and restore that state as well before it gets used.
The processor will align the stack to a 16 byte boundary before pushing the interrupt frame. See AMD74 Vol 2 Scn 8.9.

For interrupts and exceptions without error codes, 40 bytes are pushed (so the code must perform an odd number of following pushes to ensure alignment)
dschatz
Member
Member
Posts: 61
Joined: Wed Nov 10, 2010 10:55 pm

Re: Stack alignment issue with using floating point

Post by dschatz »

What does the entry for the task look like?

You do the following to start a task:

Code: Select all

*--stack = k + DefaultRing3StackSize;              // User stack pointer
 ...
*--stack = (uint64_t)Function;                    // RIP
This means that on entry to the user task, the RSP + 8 will not be 16 byte aligned. This if fine if the code that gets entered is assembly that assumes that to be the case and properly aligns the stack before calling a compiled function. If the entry to the task is a compiled function then this is your bug right here, the user stack pointer must be 8 less to have the correct alignment.
User avatar
zhiayang
Member
Member
Posts: 368
Joined: Tue Dec 27, 2011 7:57 am
Libera.chat IRC: zhiayang

Re: Stack alignment issue with using floating point

Post by zhiayang »

dschatz wrote:What does the entry for the task look like?

You do the following to start a task:

Code: Select all

*--stack = k + DefaultRing3StackSize;              // User stack pointer
 ...
*--stack = (uint64_t)Function;                    // RIP
This means that on entry to the user task, the RSP + 8 will not be 16 byte aligned. This if fine if the code that gets entered is assembly that assumes that to be the case and properly aligns the stack before calling a compiled function. If the entry to the task is a compiled function then this is your bug right here, the user stack pointer must be 8 less to have the correct alignment.

Yup, I figured it out eventually. There's some other bug involving tasks and the scheduler, but that's probably unrelated so I'll have to figure that out myself.
Post Reply