still problems with semaphore code and smp

Question about which tools to use, bugs, the best way to implement a function, etc should go here. Don't forget to see if your question is answered in the wiki first! When in doubt post here.
FlashBurn
Member
Member
Posts: 313
Joined: Fri Oct 20, 2006 10:14 am

still problems with semaphore code and smp

Post by FlashBurn »

I still have the problem that my semaphore code doesn´t work on smp machines (I tested it on qemu and bochs, because at the moment I can´t test it on a real smp machine).

The problem seems to be that 2 threads execute the code which should be saved by the semaphore.

I will give you all the code you need to have a look at, but I think brendan is the only one who will understand my assembly "slang" ;)

semaphore code:

Code: Select all

;----------------------------
PROC semaphore_acquire_smp, sem
;----------------------------
BEGIN
;----------------------------
;	make sure we are the only one to work on the semaphore
	cli

	CALL spinlock_acquire, dword[sem]

	lock sub dword[esi+semaphore_t.count],1
	jz .end
;----------------------------
;	look if we have to wait or if we can go right away
	cmp dword[esi+semaphore_t.count],0
	jg .end
;----------------------------
;	we have to wait
;	APIC_GET_ID eax
;	mov edx,[CPU_PTR+4*eax]
	mov ebx,[fs:thread_t.tid]
	mov edi,[esi+semaphore_t.threads]		;edi= firstThread
;	mov eax,[edx+cpu_t.schdl_act_thread]
	mov eax,[THREAD_TABLE+4*ebx]			;eax= actThread
	
	test eax,eax
	jnz .temp
	
	int 0
;----------------------------
;	look if we are the first thread that need to wait
align 4
.temp:
	test edi,edi
	jz .first

	mov ebx,[edi+thread_t.prev]				;ebx= firstThread.prev= lastThread
	xor ecx,ecx
	mov [eax+thread_t.prev],ebx				;actThread.prev= lastThread
	mov [eax+thread_t.next],ecx				;actThread.next= NULL
	mov [edi+thread_t.prev],eax				;firstThread.prev= actThread
	mov [ebx+thread_t.next],eax				;lastThread.next= actThread

	jmp .scheduler
;----------------------------
;	we are the first thread
align 4
.first:
	mov [esi+semaphore_t.threads],eax

	mov [eax+thread_t.prev],eax				;actThread.prev= firstThread.prev= actThread
	mov [eax+thread_t.next],edi				;actThread.next= firstThread.next= NULL
;----------------------------
;	scheduler have to know that this thread wants to wait
align 4
.scheduler:
	or dword[eax+thread_t.flags],THREAD_WAIT or THREAD_RESCHEDULE

	CALL spinlock_release, esi

	CALLINT scheduler_reschedule_smp

	sti

	RETURN
;----------------------------
align 4
.end:
	CALL spinlock_release, esi

	sti

	RETURN
ENDP
;----------------------------

;----------------------------
PROC semaphore_release, sem
;----------------------------
BEGIN
;----------------------------
;	make sure we are the only one to work on the semaphore
	cli

	CALL spinlock_acquire, dword[sem]

	lock add dword[esi+semaphore_t.count],1
;----------------------------
;	look if we need to awake a thread
	cmp dword[esi+semaphore_t.count],0
	jg .end
;----------------------------
;	we have to awake the thread on the top of the queue
	mov eax,[esi+semaphore_t.threads]		;eax= firstThread
	mov ebx,[eax+thread_t.next]				;ebx= firstThread.next= secondThread
	mov ecx,[eax+thread_t.prev]				;ecx= firstThread.prev= lastThread

	test ebx,ebx
	jz .last
;----------------------------
;	put the 2nd thread onto the top of the queue and put the last thread onto the 2nd threads prev ptr
	mov [ebx+thread_t.prev],ecx				;secondThread.prev= lastThread
	mov [esi+semaphore_t.threads],ebx		;firstThread= secondThread

	jmp .scheduler
;----------------------------
;	there is no more thread on the queue
align 4
.last:
	mov [esi+semaphore_t.threads],ebx
;----------------------------
;	scheduler needs to awaken the thread in eax
align 4
.scheduler:
	CALL spinlock_release, esi

	sti

	CALL scheduler_add_scheduler, eax

	RETURN
;----------------------------
align 4
.end:
	CALL spinlock_release, esi

	sti

	RETURN
ENDP
;----------------------------
FlashBurn
Member
Member
Posts: 313
Joined: Fri Oct 20, 2006 10:14 am

Post by FlashBurn »

scheduler code:

Code: Select all

;----------------------------
PROC scheduler_add_scheduler, ptr2thread
;----------------------------
BEGIN
	cli

;	mov ebx,[ptr2thread]
;----------------------------
;	wait till thread isn´t running anymore
;align 4
;.loop:
;	test dword[ebx+thread_t.flags],THREAD_RUN
;	je .go_on

;	pause

;	jmp .loop
;----------------------------
;	get spinlock for the scheduler queues
;align 4
;.go_on:
	CALL spinlock_acquire, schdl_flags
;----------------------------
;	add thread to ready queue
	mov esi,[ptr2thread]					;esi= actThread
	mov ebx,1
	mov ecx,[esi+thread_t.dyn_prio]
	mov edi,ready_queue
	shl ebx,cl

	or [ready_queue_bitmap],ebx

	mov eax,[edi+4*ecx]						;eax= firstThread
	xor edx,edx

	test eax,eax
	jz .first

	mov ebx,[eax+thread_t.prev]				;ebx= firstThread.prev= lastThread
	mov [esi+thread_t.prev],ebx				;actThread.prev= lastThread
	mov [esi+thread_t.next],edx				;actThread.next= NULL
	mov [eax+thread_t.prev],esi				;firstThread.prev= actThread
	mov [ebx+thread_t.next],esi				;lastThread.next= actThread

	jmp .end
;----------------------------
;	it is the 1st thread in this priority queue
align 4
.first:
	mov [esi+thread_t.prev],esi				;actThread.prev= firstThread.prev= actThread
	mov [esi+thread_t.next],eax				;actThread.next= firstThread.next= NULL
	mov [edi+4*ecx],esi
;----------------------------
align 4
.end:
	and dword[esi+thread_t.flags],not (THREAD_WAIT or THREAD_SLEEP)
	or dword[schdl_flags],SCHEDULER_RESCHEDULE

	CALL spinlock_release, schdl_flags

	sti

	RETURN
ENDP
;----------------------------

;----------------------------
PROC scheduler_dequeue_intel_smp, this_cpu
;----------------------------
BEGIN
;----------------------------
;	get the thread with the highest priority
	mov eax,[ready_queue_bitmap]
	mov esi,ready_queue
;----------------------------
;	get the highest priority thread
.get_thread:
	bsr eax,eax
	jz .idle

	mov ebx,[esi+4*eax]						;ebx= firstThread
	mov edx,[ebx+thread_t.next]				;edx= firstThread.next= secondThread
	mov ecx,[ebx+thread_t.prev]				;ecx= firstThread.prev= lastThread

	test edx,edx
	jz .last

	mov [esi+4*eax],edx						;firstThread= secondThread
	mov [edx+thread_t.prev],ecx				;secondThread.prev= lastThread

	mov eax,ebx
;----------------------------
;	move the needed values from the thread struc in the right regs
.init:
	CALL spinlock_release, schdl_flags

	mov edx,[this_cpu]
	cmp eax,[edx+cpu_t.schdl_act_thread]
	je .set_time

	mov [edx+cpu_t.schdl_act_thread],eax
;----------------------------
;	set base addr for fs and gs regs
	push eax

	CALL gdt_set_base, dword[edx+cpu_t.fs], eax

	pop eax

	add eax,thread_t.free_start
	CALL gdt_set_base, dword[edx+cpu_t.gs], eax
;----------------------------
;	write the esp value for the ring0 code into the msr reg
	mov eax,[fs:thread_t.esp0]
	mov ecx,176h
	xor edx,edx

	wrmsr

	jmp .set_time
;----------------------------
align 4
.idle:
	mov edx,[this_cpu]
	mov eax,[edx+cpu_t.idle_thread]

	jmp .init
;----------------------------
;	this is the last thread in the queue, so del the bit in the bitmap
align 4
.last:
	xor ecx,ecx
	mov edx,1
	mov [esi+4*eax],ecx
	mov ecx,eax
	shl edx,cl

	xor [ready_queue_bitmap],edx

	mov eax,ebx

	jmp .init
;----------------------------
align 4
.set_time:
;----------------------------
;	we removed the thread from the queue and we need to give the thread some time 2 run
	mov edx,[this_cpu]
	xor ebx,ebx
	mov eax,[edx+cpu_t.schdl_act_thread]
	mov ecx,32

	mov [eax+thread_t.prev],ebx
	sub ecx,[eax+thread_t.dyn_prio]
	mov [eax+thread_t.next],ebx

	cmp ecx,32
	jne .end

	mov ecx,1
;----------------------------
.end:
	mov edx,[eax+thread_t.dyn_prio]
	mov edi,APIC_BASE_ADDR
	mov [eax+thread_t.time2run],ecx
	shr edx,1
	xor dword[eax+thread_t.flags],THREAD_RUN or THREAD_READY
	mov [edi+apic_regs_t.tpr],edx

	RETURN
ENDP
;----------------------------

;----------------------------
PROC scheduler_enqueue_smp
;----------------------------
BEGIN
;----------------------------
;	save esp
	mov ebx,cr0
	mov [eax+thread_t.esp3],ecx
;----------------------------
;	test if need to save the fpu
	test ebx,8
	jne .go_on

	mov edi,[eax+thread_t.ptr2fpu]
;----------------------------
;	save fpu env
.fpu_save:
	fxsave [edi]

	or ebx,8
	mov cr0,ebx
;----------------------------
;	look if we just ran the idle thread
align 4
.go_on:
	CALL spinlock_acquire, schdl_flags

	cmp eax,dword[edx+cpu_t.idle_thread]
	je .end
;----------------------------
;	get and check the flags of the thread
	mov ebx,[eax+thread_t.flags]

	test ebx,THREAD_KILL
	jne .kill
	test ebx,THREAD_WAIT
	jne .wait
	test ebx,THREAD_SLEEP
	jne .sleep
;----------------------------
;	calc new priority
	cmp dword[eax+thread_t.time2run],0
	jne .do_it

	mov ebx,[eax+thread_t.dyn_prio]
	sub ebx,1

	cmp ebx,[eax+thread_t.priority]
	jl .do_it

	mov [eax+thread_t.dyn_prio],ebx
;----------------------------
;	enqueue the thread into the ready queue
.do_it:
	mov ecx,[eax+thread_t.dyn_prio]
	mov ebx,1
	mov esi,ready_queue
	shl ebx,cl

	or [ready_queue_bitmap],ebx

	mov edi,[esi+4*ecx]						;edi= firstThread
	xor edx,edx

	test edi,edi
	jz .first

	mov ebx,[edi+thread_t.prev]				;ebx= firstThread.prev= lastThread
	mov [eax+thread_t.prev],ebx				;actThread.prev= lastThread
	mov [eax+thread_t.next],edx				;actThread.next= NULL
	mov [edi+thread_t.prev],eax				;firstThread.prev= actThread
	mov [ebx+thread_t.next],eax				;lastThread.next= actThread

	or dword[eax+thread_t.flags],THREAD_READY

	jmp .end
;----------------------------
;	it is the first thread in this priority
align 4
.first:
	mov [eax+thread_t.prev],eax				;actThread.prev= firstThread.prev= actThread
	mov [eax+thread_t.next],edi				;actThread.next= firstThread.next= NULL
	mov [esi+4*ecx],eax

	jmp .end
;----------------------------
;	the thread waits for something in a wait queue
align 4
.wait:
	cmp dword[eax+thread_t.dyn_prio],31
	je .end

	mov ebx,[eax+thread_t.dyn_prio]
	mov edx,[eax+thread_t.priority]
	add ebx,1
	add edx,5

	cmp ebx,edx
	jg .end

	mov [eax+thread_t.dyn_prio],ebx

	jmp .end
;----------------------------
;	the owner of the thread is going to be killed
align 4
.kill:
	xor ecx,ecx
	mov [eax+thread_t.prev],ecx
	mov [eax+thread_t.next],ecx

	jmp .end
;----------------------------
;	the thread wants to sleep
align 4
.sleep:
	;to-do

	jmp .end
;----------------------------
align 4
.end:
	and dword[eax+thread_t.flags],not (THREAD_RESCHEDULE or THREAD_RUN)
	and dword[schdl_flags],not SCHEDULER_RESCHEDULE

	RETURN
ENDP
;----------------------------

;----------------------------
PROC scheduler_smp
;----------------------------
BEGIN_IRQ
;----------------------------
;	get cpu ptr and tss ptr
	APIC_GET_ID eax
	mov edx,[CPU_PTR+4*eax]
	mov ebx,[TSS_PTR+4*eax]
;----------------------------
align 4
.entry:
;----------------------------
;	inc the timer
	add dword[edx+cpu_t.timer],1
	adc dword[edx+cpu_t.timer+4],0
;----------------------------
;	move the ptr of the act thrad into eax
	mov eax,[edx+cpu_t.schdl_act_thread]
;----------------------------
;	check if we need to reschedule
	test dword[schdl_flags],SCHEDULER_RESCHEDULE
	jne .next_thread
	test dword[eax+thread_t.flags],THREAD_RESCHEDULE
	jne .next_thread
;----------------------------
;	check if the thread has more time 2 run
	sub dword[eax+thread_t.time2run],1
	jnz .end
;----------------------------
;	put thread back on a queue and get a new one
.next_thread:
	mov ecx,esp
	mov esp,[edx+cpu_t.scheduler_esp]

	push ebx edx

	CALL scheduler_enqueue_smp
	CALL dword[scheduler_dequeue_smp]
;----------------------------
;	check if this is a ring0 thread
	mov ecx,[eax+thread_t.esp0]
	mov edx,cr3
	mov ebx,[eax+thread_t.pd]

	test ecx,ecx
	jz .stack_change
;----------------------------
;	check if we need to change cr3
	cmp ebx,edx
	je .stack_change

	mov cr3,ebx
;----------------------------
;	set new esp
.stack_change:
	pop edx ebx

	mov esp,[eax+thread_t.esp3]
	mov [ebx+tss_t.esp0],ecx
;----------------------------
;	test if we need to send an eoi
	btr dword[edx+cpu_t.scheduler_flags],1
	jnc .end

	RETURN_NOAPIC
;----------------------------
align 4
.end:
	RETURN_APIC
ENDP
;----------------------------
User avatar
Combuster
Member
Member
Posts: 9301
Joined: Wed Oct 18, 2006 3:45 am
Libera.chat IRC: [com]buster
Location: On the balcony, where I can actually keep 1½m distance
Contact:

Post by Combuster »

I tried to find the code for spinlock_acquire, to see what it does, but you didnt post that...
"Certainly avoid yourself. He is a newbie and might not realize it. You'll hate his code deeply a few years down the road." - Sortie
[ My OS ] [ VDisk/SFS ]
FlashBurn
Member
Member
Posts: 313
Joined: Fri Oct 20, 2006 10:14 am

Post by FlashBurn »

Here it is, but all my tests show me that this code works!

Code: Select all

;----------------------------
PROC spinlock_acquire, spin
;----------------------------
BEGIN
	mov esi,[spin]
;----------------------------
align 4
.test:
	lock bts dword[esi],0
	jnc .end

	pause

	jmp .test
;----------------------------
align 4
.end:
	RETURN
ENDP
;----------------------------

;----------------------------
PROC spinlock_release, spin
;----------------------------
BEGIN
	mov esi,[spin]

	lock btr dword[esi],0
	jc .end

	int 0
;----------------------------
align 4
.end:
	RETURN
ENDP
;----------------------------
User avatar
Combuster
Member
Member
Posts: 9301
Joined: Wed Oct 18, 2006 3:45 am
Libera.chat IRC: [com]buster
Location: On the balcony, where I can actually keep 1½m distance
Contact:

Post by Combuster »

atm i think you should check your semaphore_t structure to see wether count is the first element in the structure. If so, the semaphore code interferes with the spinlock code...
"Certainly avoid yourself. He is a newbie and might not realize it. You'll hate his code deeply a few years down the road." - Sortie
[ My OS ] [ VDisk/SFS ]
User avatar
Brendan
Member
Member
Posts: 8561
Joined: Sat Jan 15, 2005 12:00 am
Location: At his keyboard!
Contact:

Post by Brendan »

Hi,
FlashBurn wrote:I still have the problem that my semaphore code doesn´t work on smp machines (I tested it on qemu and bochs, because at the moment I can´t test it on a real smp machine).
An interesting thing about (single threaded) emulators is that only one CPU is being emulated at a time. For Bochs, the default setting is to do 5 instructions on one CPU, then 5 instructions on the next CPU, etc. This means that some things (like forgetting to use a "lock" prefix) never cause problems, and it's less likely that race conditions will cause problems too.

Qemu is much much worse. IIRC it does 30 ms on one CPU, then 30 ms on the next CPU, etc. This works out to (very roughly) thousands of emulated instructions on one CPU, then thousands on the next CPU, etc; and the chance of small race conditions causing problems is very very small.

Because you've said you tried it on Qemu and it didn't work right, then I'd assume there's significant problems (e.g. corrupted linked lists) rather than smaller problems (e.g. race conditions). This means it should be possible to stop the emulator at key spots (just after a semaphore is acquired, just after it's released, etc) and check if everything is correct.

I'm wondering if you'd mind posting the macros you're using - I'm having trouble understanding why you're using:

Code: Select all

	CALL spinlock_acquire, dword[sem]
But then using

Code: Select all

	CALL spinlock_release, esi
When the code for "spinlock_acquire" and "spinlock_release" look like any arguments should be the same.

I'm also wondering if you can localise the problem - for example, is the problem definately within the semaphore code (i.e can you guarantee that "scheduler_add_scheduler", "scheduler_dequeue_intel_smp", "scheduler_enqueue_smp" and "scheduler_smp" all work correctly).

BTW, for the spinlock code, if there's lock contention you'd be continually locking the bus. A better approach is to only lock the bus when necessary - for e.g.:

Code: Select all

	lock bts dword[esi],0
	jnc .end

.test:
	pause

	test dword[esi],1
        jne .test

	lock bts dword[esi],0
	jnc .test
.end:

Cheers,

Brendan
For all things; perfection is, and will always remain, impossible to achieve in practice. However; by striving for perfection we create things that are as perfect as practically possible. Let the pursuit of perfection be our guide.
FlashBurn
Member
Member
Posts: 313
Joined: Fri Oct 20, 2006 10:14 am

Post by FlashBurn »

@brendan

To answer your question, why I one time use "dword[sem]" and one time "esi", it is faster, I think so, to push a register as to push "dword[ebp+8]". The "spinlock_acquire" function puts the value in "[sem]" into the esi reg and I can use it and needn´t to load the value into esi myself.

I cannot be sure that the other code is really working, but if I test my putchar function with a spinlock instead of a semaphore, it is working, w/o problems.

I can give you the macros, but please don´t ask how it works in detail, because this is only copy and paste and some changes from me.

Code: Select all

;----------------------------
;	global consts
..OPEN= 0
..LOCALS= 0

;----------------------------
macro CALL proc, [arg]
{
common
		local __ARGC
		__ARGC= 0
	if ~ arg eq
forward
		__ARGC= __ARGC+1
reverse
		push arg
common
	end if
		call proc
	if ~ arg eq
		add esp,4*__ARGC
	end if
}
;----------------------------

;----------------------------
macro CALLINT proc
{
	pushfd
	push cs

	call proc
}
;----------------------------

;----------------------------
macro RETURN val
{
	if ~ val eq
		if val eq 0
			xor eax,eax
		else
			mov eax,val
		end if
	end if
		leave
		ret
}
;----------------------------

;----------------------------
macro PROC name,[arg]
{
common
	if used name
		prog_args equ arg
	if ..OPEN > 0
		display "ERROR: no endp before proc",13,10
		..OPEN= 0
	end if
		..OPEN= ..OPEN+1
	if ~ arg eq
		virtual at ebp+8
forward
		local ..arg
		..arg dd ?
		arg equ ..arg
common
		end virtual
	end if
		..LOCALS= 0
		align 16
		name:
}
;----------------------------

;----------------------------
macro VARS [arg]
{
common
	if ..OPEN <> 1
			display "ERROR: vars without proc",13,10
	end if
	if ~ arg eq
forward
		..LOCALS= ..LOCALS+1
		local ..var
		..var= ..LOCALS
		arg equ ebp-4*..var
common
	end if
}
;----------------------------

;----------------------------
macro BEGIN
{
	if ..OPEN <> 1
		display "ERROR: begin without proc",13,10
	end if
		push ebp
		mov ebp,esp
	if ..LOCALS > 0
		sub esp,4*..LOCALS
	end if
}
;----------------------------

;----------------------------
macro ENDP
{
		..OPEN= ..OPEN-1
	if ..OPEN <> 0
		display "ERROR: endp without proc",13,10
	end if
		match args, proc_args \{ restore args \}
	end if
}
;----------------------------
FlashBurn
Member
Member
Posts: 313
Joined: Fri Oct 20, 2006 10:14 am

Post by FlashBurn »

I have 2 version of my semaphore_acquire_smp code. As you can see there are some instruction commented out. I have the option to get the ptr 2 the actual thread over the APIC ID and the cpu structure and about the thread structure which the fs reg points to!

The version with the APIC ID is working, but the version with the fs reg isn´t working! Any idea where the problem can be?
User avatar
Brendan
Member
Member
Posts: 8561
Joined: Sat Jan 15, 2005 12:00 am
Location: At his keyboard!
Contact:

Post by Brendan »

Hi,
FlashBurn wrote:I can give you the macros, but please don´t ask how it works in detail, because this is only copy and paste and some changes from me.
Hmm - trying to pretend assembly is a high level language is usually a bad idea - it makes code harder to read and debug, and the assembler won't optimise like a high level language compiler would.

For an example, consider "CALL spinlock_release, esi". The macros hide the fact that you're pushing ESI onto the stack, then building a stack frame, then loading the value from the stack back into ESI, then destroying the stack frame. A decent high level language compiler would inline the function and skip all of that, and you'd end up with about 3 instructions to release a spinlock instead of about 20.

You can of course do the same with macros. For example (for NASM):

Code: Select all

%macro SPINLOCK_ACQUIRE 1
   lock bts dword %1,0
   jnc .end

.test:
   pause

   test dword %1,1
    jne .test

   lock bts dword %1,0
   jnc .test
.end:
%endmacro


%macro SPINLOCK_RELEASE 1
   lock btr dword %1,0
   jc .end
   int 0
.end:
%endmacro
If you're using assembly because you want your code to be better than compiled high level language code, then you'd want to stop using high level language calling conventions and pass values in registers. Otherwise you'd be better off using a high level language instead, so the compiler's optimizer can pass parameters in registers where it thinks it can (and inline functions where it think it makes sense, etc).
FlashBurn wrote:I have 2 version of my semaphore_acquire_smp code. As you can see there are some instruction commented out. I have the option to get the ptr 2 the actual thread over the APIC ID and the cpu structure and about the thread structure which the fs reg points to!

The version with the APIC ID is working, but the version with the fs reg isn´t working! Any idea where the problem can be?
Is the base address for your FS segment correct?

More specifically, does the "CALL gdt_set_base, dword[edx+cpu_t.fs], eax" line from your "scheduler_dequeue_intel_smp" function reload the FS register or just change the GDT entry without reloading the FS register?

You can change the GDT as much as you like, and the CPU will continue using the old base address and limit for FS (until FS is reloaded).


Cheers,

Brendan
For all things; perfection is, and will always remain, impossible to achieve in practice. However; by striving for perfection we create things that are as perfect as practically possible. Let the pursuit of perfection be our guide.
User avatar
mystran
Member
Member
Posts: 670
Joined: Thu Mar 08, 2007 11:08 am

Post by mystran »

I think it's these days kinda ignorant to use assembler in order to write code that is better than would be produced by a decent compiler. I can understand why people would want to use assembler, for example to reduce stack use by passing stuff in registers, or allow control structures and calling conventions not easily allowed by any suitable language implementation, but 9 times out of 10, unless you actually optimize every single instruction in your code, it's likely that an optimizing compiler will do a better job in average.

Modern compilers are pretty smart, and have the advantage that once an optimization algorithm has been implemented, it will automatically take care of the optimization in question for a large class of similar code. In fact, compilers that do global flow analysis, can compile to "faster than C" code that is indeed very hard to beat when writing assembler manually.
The real problem with goto is not with the control transfer, but with environments. Properly tail-recursive closures get both right.
User avatar
Brendan
Member
Member
Posts: 8561
Joined: Sat Jan 15, 2005 12:00 am
Location: At his keyboard!
Contact:

Post by Brendan »

Hi,
mystran wrote:I think it's these days kinda ignorant to use assembler in order to write code that is better than would be produced by a decent compiler. I can understand why people would want to use assembler, for example to reduce stack use by passing stuff in registers, or allow control structures and calling conventions not easily allowed by any suitable language implementation, but 9 times out of 10, unless you actually optimize every single instruction in your code, it's likely that an optimizing compiler will do a better job in average.
In general, for applications, it takes a lot of work to make assembly perform better than code optimized by a good compiler (and it usually isn't worth the effort, unless you need to use MMX and/or SSE). This is especially true for readable assembly code designed to work on several generations of CPU. For e.g. code optimized for an 80486 might suck badly on a Pentium 4, and code optimized for a Pentium 4 might suck on an 80486 (or not work at all).

For "core kernel code" it isn't as hard though, as parts of it needs to be in assembly anyway, and C can't even handle something as simple as an interrupt handler without (cycle-wasting) assembly hacks. There's other benefits too - debugging is easier (for e.g. single stepping with Bochs, where the instruction trace matches the source code line for line and you don't need to figure out which register or stack location corresponds to a certain variable) and there's no need to mix languages or fudge linker scripts to suit.

The "core kernel code" is also one of the places where a high level of optimization can be justified (once the algorithms used can't be optimized and the code is stable). The performance of things like memory management, IPC and thread switches effect the speed of everything else that ever runs on that OS.


Cheers,

Brendan
For all things; perfection is, and will always remain, impossible to achieve in practice. However; by striving for perfection we create things that are as perfect as practically possible. Let the pursuit of perfection be our guide.
User avatar
Candy
Member
Member
Posts: 3882
Joined: Tue Oct 17, 2006 11:33 pm
Location: Eindhoven

Post by Candy »

Brendan wrote:In general, for applications, it takes a lot of work to make assembly perform better than code optimized by a good compiler (and it usually isn't worth the effort, unless you need to use MMX and/or SSE). This is especially true for readable assembly code designed to work on several generations of CPU. For e.g. code optimized for an 80486 might suck badly on a Pentium 4, and code optimized for a Pentium 4 might suck on an 80486 (or not work at all).
That's actually amazingly correct. The P4 is very quick at doing multiplications and relatively slow at doing shifts - so instead of using lea + SIB addressing you'd better just multiply. The 486 is very quick at doing shifts & slow at doing multiplications - so you're better off doing lea + SIB addressing. These (allegedly) are both on the order of multiple times performance.
FlashBurn
Member
Member
Posts: 313
Joined: Fri Oct 20, 2006 10:14 am

Post by FlashBurn »

I solved the problem in some way. I rewrote some of my code, so that the gs reg is pointing to a 4kbyte page where the cpu dependent data is stored. So I don´t need extra code for my semaphore code.

But I will try my old code with you tips, because I didn´t reload the fs reg.

And the reason for writing my code in assembly is that I´m not able to write my algorithms in c :(

And I will also use the tip with the macros for the spinlocks!
FlashBurn
Member
Member
Posts: 313
Joined: Fri Oct 20, 2006 10:14 am

Post by FlashBurn »

I finally solved the problem. The code is working as it was before ;)

The problem was that I was using my idle threads for testing the semaphore code and if you have 4 cpus and 2 threads running, one time the idle thread of cpu0 will run on cpu1 and so code is running 2 times and both have the semaphore and then try to free them.

I also found another bug, that all the APs could run code before the BP has finished kernel initialisation.

Thanks to all who had a look an my code!
User avatar
mystran
Member
Member
Posts: 670
Joined: Thu Mar 08, 2007 11:08 am

Post by mystran »

Remember that any userspace code can load the null-descriptor into any segment register if they feel like it, so you can't rely on those in your kernel. Ofcourse that is only an issue when you are coming from userspace to kernel.
The real problem with goto is not with the control transfer, but with environments. Properly tail-recursive closures get both right.
Post Reply