Page 1 of 1

Why is my scheduler locking malfunctioning on SMP systems?

Posted: Sun Apr 17, 2011 12:53 pm
by rdos
Variable description:

Global variables:

Code: Select all

owner_lock   ; this is the spinlock variable. It is initialized to 0, and when it is 1 the section is locked
owner_sel     ; the current core owning the scheduler lock
int_core_count  ; number of cores that have active interrupts pending (that have not been able to get the scheduler lock)
int_core_sel   ; the first core that could not get the scheduler lock
has_signal     ; indicates that a signal is pending
owner_wait     ; number of cores block waiting for scheduler lock
Per core variables:

Code: Select all

ps_sel       ; the aliased core selector
ps_nesting ; interrupt nesting for current core. Initial value is -1.
ps_wait     ; indicates that core is blocked waiting for the scheduler lock. Unblocked (initial) value is 0
Try to lock scheduler (called from ISRs and similar).

Single core version:

Code: Select all

    push ax                     ; preserve caller's ax
; Try-lock, single-core version: no other core can contend, so "taking"
; the lock is just bumping this core's interrupt-nesting counter.
    mov ax,core_data_sel
    mov fs,ax
    mov fs,fs:ps_sel            ; fs = this core's aliased per-core data segment
    pop ax
    add fs:ps_nesting,1         ; ps_nesting starts at -1; the -1 -> 0 carry marks "now locked"
    ret
Multicore version:

Code: Select all

    push ax                     ; Try-lock, multicore version (called from ISRs).
    push dx                     ; Returns with CF set on success, CF clear on failure.

; Test-and-test-and-set spin on the global owner_lock word: read-only spin
; first (no bus locking), then attempt the atomic grab with xchg.
tlmSpinLock:
    sti                         ; allow interrupts to be serviced while spinning
    mov ax,ds:owner_lock
    or ax,ax
    jz tlmGet                   ; lock word is 0 -> try to take it
;
    pause                       ; spin-wait hint for SMT/hyperthreaded cores
    jmp tlmSpinLock

tlmGet:
    cli                         ; interrupts stay off while we hold owner_lock
    inc ax                      ; ax was 0 -> ax = 1 (locked value)
    xchg ax,ds:owner_lock       ; atomic swap (xchg with memory implies LOCK)
    or ax,ax
    jnz tlmSpinLock             ; another core grabbed it between the read and the xchg
;
    mov ax,core_data_sel
    mov fs,ax
    mov fs,fs:ps_sel            ; fs = this core's aliased per-core data segment
;    
    mov ax,ds:owner_sel         ; current scheduler-lock owner (0 = free)
    or ax,ax
    jz tlmTake                  ; unowned -> take it
;
    mov dx,fs
    cmp ax,dx
    je tlmTake                  ; this core already owns it -> recursive take

; Another core owns the scheduler lock: record the miss and fail.
tlmFail:
    add fs:ps_nesting,1         ; ps_nesting starts at -1, so CF is set only on the -1 -> 0 transition
    jnc tlmNested               ; already nested on this core -> just count the nesting
;
    inc ds:int_core_count       ; first ISR on this core to miss the lock
    mov ax,ds:int_core_sel
    or ax,ax
    jnz tlmNested               ; some core is already recorded as the first waiter
;
    mov ds:int_core_sel,fs      ; record this core as the first that missed the lock

tlmNested:
    mov ds:owner_lock,0         ; release the spinlock word
    sti
    clc                         ; CF = 0 -> scheduler lock NOT acquired
    jmp tlmDone

tlmTake:
    mov ds:owner_sel,fs         ; this core now owns the scheduler lock
    add fs:ps_nesting,1         ; CF = 1 on the -1 -> 0 transition = first-level acquire
    mov ds:owner_lock,0         ; mov does not touch flags, so CF from the add survives to the caller
    sti
; NOTE(review): on the recursive-take path (owner already == fs, nesting >= 0)
; the add leaves CF = 0, same as the failure path -- confirm callers do not
; rely on CF to distinguish recursive success from failure.

tlmDone:
    pop dx
    pop ax
    ret
Unconditional lock of scheduler (called from base-line code where interrupt nesting should always be 0)

Single core version:

Code: Select all

    push ax                     ; Unconditional lock, single-core version (baseline code only).
    mov ax,core_data_sel
    mov fs,ax
    mov fs,fs:ps_sel            ; fs = this core's aliased per-core data segment
    pop ax
    add fs:ps_nesting,1         ; baseline must be unnested: -1 -> 0 sets CF
    jc lsDone
;
    CrashGate    ; this should never happen (nesting was not -1 => called from nested context)

lsDone:     
    ret
Multicore version:

Code: Select all

    push ax                     ; Unconditional lock, multicore version (baseline code only;
    push dx                     ; blocks with hlt until the scheduler lock can be taken).

; Test-and-test-and-set spin on owner_lock (see try-lock for the same pattern).
lmSpinLock:
    sti                         ; service interrupts while spinning
    mov ax,ds:owner_lock
    or ax,ax
    jz lmGet
;
    pause                       ; spin-wait hint
    jmp lmSpinLock

lmGet:
    cli                         ; interrupts off while holding owner_lock
    inc ax
    xchg ax,ds:owner_lock       ; atomic grab of the spinlock word
    or ax,ax
    jnz lmSpinLock              ; lost the race -> spin again
;
    mov ax,core_data_sel
    mov fs,ax
    mov fs,fs:ps_sel            ; fs = this core's aliased per-core data segment
;    
    mov ax,ds:owner_sel         ; current scheduler-lock owner (0 = free)
    or ax,ax
    jz lmTake
;
    mov dx,fs
    cmp ax,dx
    je lmTake                   ; owner is this core (will CrashGate below: baseline must be unnested)

; Lock is owned by another core: mark this core as a blocked waiter and halt.
lmHalt:
    mov ax,1
    xchg ax,fs:ps_wait          ; atomically set "waiting"; old value tells if we already counted ourselves
    or ax,ax
    jnz lmStartWait             ; already counted in owner_wait -> don't double-count
;    
    inc ds:owner_wait           ; one more core blocked on the scheduler lock

lmStartWait:
    mov ds:owner_lock,0         ; release the spinlock word
    sti                         ; sti delays interrupt enable by one instruction,
    hlt                         ; so sti;hlt cannot miss the wake-up interrupt
    jmp lmSpinLock

lmTake:
    mov ds:owner_sel,fs         ; this core now owns the scheduler lock
    add fs:ps_nesting,1         ; baseline must be unnested: -1 -> 0 sets CF
    mov ds:owner_lock,0         ; mov preserves flags; CF from the add is still valid
    jc lmDone
;
    CrashGate   ; this should never happen in baseline code (int nesting is non-zero)

lmDone:     
    sti                         ; return with interrupts on; the lock is logical (owner_sel), not cli-based
    pop dx
    pop ax
    ret
Unlock scheduler from anywhere (both from ISRs and baseline code)

Single core version:

Code: Select all

    push ax                     ; Unlock-from-anywhere, single-core version.
    
tusRetry:    
    cli                         ; nesting check and decrement must be atomic wrt interrupts
    sub fs:ps_nesting,1         ; borrow (CF=1) only on the 0 -> -1 transition = fully unlocked
    jnc tusDone                 ; still nested -> nothing more to do
;
    mov ax,fs:ps_curr_thread    ; no current thread -> nothing to reschedule
    or ax,ax
    jz tusDone
;    
    test fs:ps_flags,PS_FLAG_TIMER      ; timer expired while locked -> must reschedule
    jnz tusSwap
;    
    mov al,ds:has_signal        ; pending signal or ready-list work -> must reschedule
    or al,ds:has_list
    jz tusDone

; Re-take the lock (nesting back to 0) and switch to the scheduler.
tusSwap:
    add fs:ps_nesting,1         ; -1 -> 0 sets CF; jnc is a defensive retry if nesting changed
    jnc tusRetry
;
    sti
; NOTE(review): interrupts are re-enabled before the thread state is saved;
; the follow-up post identifies a save-with-interrupts-enabled window as the
; main bug in this family of routines -- verify this path on the fixed code.
    push OFFSET tusDone         ; SaveLockedThread returns here when the thread is resumed
    call SaveLockedThread
    jmp ContinueCurrentThread

tusDone:
    sti
    pop ax
    ret
Multicore version:

Code: Select all

    push eax                    ; Unlock-from-anywhere, multicore version.
                                ; eax (32-bit) is used by the preempt-mask logic below.

; Grab the owner_lock spinlock word (test-and-test-and-set, as in the lock paths).
tumSpinLock:
    sti
    mov ax,ds:owner_lock
    or ax,ax
    jz tumGet
;
    pause                       ; spin-wait hint
    jmp tumSpinLock

tumGet:
    cli                         ; interrupts off while holding owner_lock
    inc ax
    xchg ax,ds:owner_lock       ; atomic grab
    or ax,ax
    jnz tumSpinLock
    
tumRetry:    
    sub fs:ps_nesting,1         ; borrow (CF=1) only on 0 -> -1 = this core fully releases
    jnc tumUnlock               ; still nested -> just drop owner_lock and maybe preempt others
;
    mov ax,ds:int_core_count    ; any cores with ISRs that missed the lock?
    or ax,ax
    jnz tumInt                  ; yes -> hand-off / bookkeeping path
;
    mov ax,fs
    cmp ax,ds:owner_sel         ; fully releasing: we must be the owner
    je tumOwner
;
    CrashGate    ; core doing the unlock does not own the lock

tumOwner:        
    test fs:ps_flags,PS_FLAG_TIMER      ; timer expired while locked -> reschedule
    jnz tumSwap
;       
    mov al,ds:has_signal        ; pending signal or list work -> reschedule
    or al,ds:has_list
    jz tumWake                  ; nothing pending -> release and wake any blocked waiter

; Re-take the lock (nesting back to 0) and switch to the scheduler.
tumSwap:
    add fs:ps_nesting,1         ; -1 -> 0 sets CF; jnc = nesting changed, retry the release
    jnc tumRetry
;
    mov ds:owner_lock,0         ; drop the spinlock word (scheduler lock still owned by us)
    sti
; NOTE(review): owner_lock is released and interrupts enabled before
; SaveLockedThread runs; the follow-up post says the thread could be saved
; with the core scheduler's ss:sp in this window -- this is the reported bug.
;
    mov ax,fs:ps_curr_thread
    or ax,ax
    jz tumDone                  ; no current thread -> nothing to save
;    
    push OFFSET tumDone         ; resume point after the thread is switched back in
    call SaveLockedThread
    jmp ContinueCurrentThread

; Some core(s) have pending ISRs that missed the lock.
tumInt:
    mov ax,fs
    cmp ax,ds:owner_sel
    je tumIntOwner              ; we are the owner -> hand the lock to an interrupted core
;
    cmp ax,ds:int_core_sel      ; we are one of the interrupted cores finishing our ISR
    jne tumIntCoreOk
;
    mov ds:int_core_sel,0       ; we were the recorded "first waiter"; clear the slot

tumIntCoreOk:
    dec ds:int_core_count       ; one fewer core with a pending missed-lock ISR
    mov ds:owner_lock,0
    jmp tumDone
            
; Owner releases while interrupted cores are waiting: transfer ownership.
tumIntOwner:
    dec ds:int_core_count
    mov ax,ds:int_core_sel      ; preferred new owner: the first core that missed the lock
    or ax,ax
    jnz tumSwitchOwner
;
    push fs                     ; int_core_sel unknown: scan all cores for one with
    push bx                     ; pending nesting (ps_nesting != -1)
    push cx
;    
    mov cx,ds:processor_count
    mov bx,OFFSET processor_arr ; array of per-core selectors, 2 bytes each

tumIntLoop:
    mov fs,ds:[bx]
    mov ax,fs:ps_nesting
    cmp ax,-1
    jne tumIntFound             ; this core has nesting pending -> give it the lock
;
    add bx,2
    loop tumIntLoop
;
    CrashGate    ; this implies that int_nesting logic is malfunctioning
  
tumIntFound:
    mov ax,fs                   ; ax = selector of the new owner core
;
    pop cx    
    pop bx
    pop fs
    
tumSwitchOwner:
    mov ds:owner_sel,ax         ; ownership transferred without ever freeing the lock
    mov ds:int_core_sel,0
    mov ds:owner_lock,0
    jmp tumDone
    
; Full release with no pending work: free the lock and wake one blocked core.
tumWake:
    mov ds:owner_sel,0          ; scheduler lock is now free
    mov al,ds:owner_wait
    or al,al
    jz tumUnlock                ; nobody hlt-blocked on the lock
;
    mov ds:owner_lock,0
    sti
    call WakeProcessor          ; presumably decrements owner_wait / clears ps_wait -- TODO confirm
    jmp tumDone

; Nested release (or no waiters): drop the spinlock word, then check whether
; another core should be preempted.
tumUnlock:
    mov ds:owner_lock,0
    sti
    mov eax,fs:ps_mask          ; this core's bit in the processor bitmask
    not eax
    and eax,ds:processor_preempt ; other cores flagged for preemption
    jz tumDone
;
    push fs
    push bx
    mov bx,OFFSET processor_arr

; Find the lowest set bit; bx tracks the matching processor_arr entry.
; Only ONE core is unblocked per call.
tumPreemptLoop:
    rcr eax,1                   ; and cleared CF, so rcr shifts bit 0 out into CF
    jc tumPreemptDo
;
    add bx,2
    jmp tumPreemptLoop

tumPreemptDo: 
    mov fs,ds:[bx]              ; fs = per-core selector of the core to preempt
    UnblockProcessor
    pop bx
    pop fs

tumDone:
    sti
    pop eax
    ret
And finally the unlock from baseline:

Single core version:

Code: Select all

    cli                         ; Baseline unlock, single-core version.
    sub fs:ps_nesting,1         ; baseline release must be 0 -> -1, which sets CF (borrow)
    jc lulsDone
;
    CrashGate   ; interrupt nesting is incorrect (was not 0 at baseline unlock)

lulsDone:       
; NOTE(review): returns with interrupts still disabled (cli, no sti) --
; the follow-up post says leaving the unlock with interrupts ENABLED was the
; main bug, so this may be deliberate; confirm the caller re-enables them.
    ret
Multicore version:

Code: Select all

    push eax                    ; Baseline unlock, multicore version.
                                ; eax (32-bit) is used by the preempt-mask logic below.

; Grab the owner_lock spinlock word (same test-and-test-and-set pattern).
lumSpinLock:
    sti
    mov ax,ds:owner_lock
    or ax,ax
    jz lumGet
;
    pause                       ; spin-wait hint
    jmp lumSpinLock

lumGet:
    cli                         ; interrupts off while holding owner_lock
    inc ax
    xchg ax,ds:owner_lock       ; atomic grab
    or ax,ax
    jnz lumSpinLock
;    
    sub fs:ps_nesting,1         ; baseline release must be 0 -> -1 (borrow sets CF)
    jc lumNestingOk
;
    CrashGate    ; code is not running on correct int nesting level

lumNestingOk:
    mov ax,fs
    cmp ax,ds:owner_sel         ; baseline unlock is only valid on the owning core
    je lumOwnerOk
;
    CrashGate   ; current core does not own the lock

lumOwnerOk:
    mov ax,ds:int_core_count    ; cores with ISRs that missed the lock?
    or ax,ax
    jnz lumInt                  ; yes -> hand the lock over instead of freeing it
;
    mov ds:owner_sel,0          ; scheduler lock is now free
    mov al,ds:owner_wait
    or al,al
    jz lumUnlock                ; nobody hlt-blocked -> plain release
;
    mov ds:owner_lock,0
    sti
    call WakeProcessor          ; presumably decrements owner_wait / clears ps_wait -- TODO confirm
    jmp lumDone

; Hand ownership directly to an interrupted core.
lumInt:
    dec ds:int_core_count
    mov ax,ds:int_core_sel      ; preferred new owner: first core that missed the lock
    or ax,ax
    jnz lumSwitchOwner
;
    push fs                     ; int_core_sel unknown: scan all cores for one with
    push bx                     ; pending nesting (ps_nesting != -1)
    push cx
;    
    mov cx,ds:processor_count
    mov bx,OFFSET processor_arr ; array of per-core selectors, 2 bytes each

lumIntLoop:
    mov fs,ds:[bx]
    mov ax,fs:ps_nesting
    cmp ax,-1
    jne lumIntFound             ; this core has pending nesting -> new owner
;
    add bx,2
    loop lumIntLoop
;
    CrashGate   ; incorrect int nesting

lumIntFound:
    mov ax,fs                   ; ax = selector of the new owner core
;
    pop cx    
    pop bx
    pop fs
    
lumSwitchOwner:
    mov ds:owner_sel,ax         ; ownership transferred without freeing the lock
    mov ds:int_core_sel,0
    mov ds:owner_lock,0
    sti
    jmp lumDone

; Plain release: drop the spinlock word, then preempt another core if flagged.
lumUnlock:
    mov ds:owner_lock,0
    sti
;
    mov eax,fs:ps_mask          ; this core's bit in the processor bitmask
    not eax
    and eax,ds:processor_preempt ; other cores flagged for preemption
    jz lumDone
;
    push fs
    push bx
    mov bx,OFFSET processor_arr

; Find the lowest set bit; bx tracks the matching processor_arr entry.
; Only ONE core is unblocked per call.
lumPreemptLoop:
    rcr eax,1                   ; and cleared CF, so rcr shifts bit 0 out into CF
    jc lumPreemptDo
;
    add bx,2
    jmp lumPreemptLoop

lumPreemptDo: 
    mov fs,ds:[bx]              ; fs = per-core selector of the core to preempt
    UnblockProcessor
    pop bx
    pop fs

lumDone:
    pop eax
    ret

Re: Why is my scheduler locking malfunctioning on SMP system

Posted: Sun Apr 17, 2011 2:45 pm
by Combuster
Issue #1:
Lack of comments.

Issue #2:
Either issue 1 or undefined values for segment registers.

Re: Why is my scheduler locking malfunctioning on SMP system

Posted: Wed Apr 20, 2011 1:56 pm
by rdos
Problem is solved. The main issue was that the loadunlock procedure left with interrupts enabled. This meant that a task could be saved with the ss:sp of the core scheduler, and when it is switched to again, things go real bad. There was also an issue with one branch that would corrupt the nesting level.

Now RDOS works with advanced, multithreaded applications running on an Intel Atom with hyperthreading, and with 3 real cores on an AMD Athlon. There are some minor issues with Athlon, but they are not related to the scheduler (for instance, keeping real time).