Hi,
01000101 wrote:I was unable to find what the SBC assembly instruction that you used earlier does or how it is used, so I will need a little more info about handling overflows/negation.
That's because I wrote the wrong thing!
I was thinking SBC (SuBtract with Carry) when I should've written SBB (SuBtract with Borrow).
Basically, if RAX is zero and the carry flag is clear then "sbb rax,0" will leave RAX set to zero, and if the carry flag is set then "sbb rax,0" will set RAX to 0xFFFFFFFFFFFFFFFF (or -1 if you look at it as a signed number). Mostly it's the same as "if (carry == clear) { RAX = 0 } else { RAX = 0xFFFFFFFFFFFFFFFF }" except there is no branch.
If you want the function to return "RAX = 1" if an overflow was detected then there are plenty of alternatives (all of them assume RAX was zeroed beforehand) - "setc al" (which sets AL to zero if carry is clear, and sets AL to one if carry is set); or maybe "rcl rax,1" (which just shifts the carry flag into the lowest bit of RAX), or "adc rax, 0" (which works in a similar way to "sbb"), or "cmovc eax,edx" with EDX preloaded with 1 (which does "eax = 1" if carry is set; note that CMOVcc has no immediate form, so "cmovc eax,1" will not assemble). I'm not sure which way would be faster...
01000101 wrote:Also, does the Xeon 'pause' optimization work in this case, or is that only for extremely tight 'spinlock-like' loops?
Most modern CPUs do several instructions at the same time. For tight loops, the CPU can be doing several iterations of the same loop at the same time (e.g. some instructions at the end of the loop being retired/completed while more instructions at the start of the loop are being speculatively executed). For spinlocks on systems with hyper-threading this creates problems because many iterations of the same loop may be being executed, which wastes execution resources that could be used by the other logical CPU. It also increases the costs to exit the loop, because those unnecessary instructions being speculatively executed need to be cleared from the pipeline.
The PAUSE instruction fixes this by telling the CPU to wait until all instructions are retired/completed before continuing (which also uses less CPU resources and lets the other logical CPU use those resources).
For a normal loop (anything that isn't a "polling loop"), you want the CPU to execute as much as it can at the same time to improve performance (e.g. so that by the time the CPU figures out which way the "loopnz" will branch, most of the work for the next iteration of the loop is already done). In this case a PAUSE instruction will hurt performance.
Also note that using any form of loop adds overhead (extra calculations and a branch mis-prediction on exit), so unrolling the loop can help. Unrolling also helps to avoid delays caused by fetching data. For example, load everything at the start of the function, so that by the time the last load is executed the data for the first load has arrived, and then start manipulating the data.
Large (unrolled) functions aren't suitable for inlining, but (assuming these functions would be used from different places) inlining can prevent them from being in the trace cache.
Anyway, here's my attempt at shifting left...
Code: Select all
;Shift left ("result = operand1 << count")
;
;NOTE: the label is "mp_shr" (kept so existing callers still link), but the
;      routine performs a LEFT shift.
;
;Input
; rcx Shift count in bits (used as a value, it is not dereferenced).
;     Counts of 512 or more shift every bit out of the result.
; rsi Address of 512-bit operand 1 (eight qwords, least significant first)
; rdi Address of 512-bit result (may be the same buffer as operand 1,
;     because every load is done before the first store)
;
;Output
; rax Zero if no overflow, non-zero if any set bit was shifted out
;
;Clobbers
; rcx, rdx, r8-r15, flags
; NOTE(review): r12-r15 are callee-saved in the SysV and Microsoft x64 ABIs;
; save/restore them if this is ever called from compiled code.
mp_shr:
        mov     rax,rcx
        and     ecx,0x0000003F          ; cl = bit shift within a qword (0..63)
        shr     rax,6                   ; rax = number of whole qwords to shift
        xor     edx,edx                 ; rdx collects bits pushed out of the top qword
        cmp     rax,512/64
        jae     .s512
        ; Full 64-bit addressing: a 32-bit effective address (e.g. using ebx as
        ; the index) would truncate the table address if it is above 4 GiB.
        lea     r8,[rel .shift_left_table]
        jmp     qword [r8 + rax*8]      ; r8 and rax are reloaded by every target

; The table holds absolute code addresses, so it is kept in .data as before.
section .data
        align   8
.shift_left_table:
        dq      .s000, .s064, .s128, .s192, .s256, .s320, .s384, .s448
section .text

; Each ".sNNN" entry performs the whole-qword part of the shift while loading:
; r8..r15 = result qwords 0..7, rax = OR of every source qword that is shifted
; out completely (non-zero means overflow).

.s512:                                  ; everything is shifted out
        xor     r8,r8
        xor     r9,r9
        xor     r10,r10
        xor     r11,r11
        xor     r12,r12
        xor     r13,r13
        xor     r14,r14
        xor     r15,r15
        mov     rax,[rsi]
        or      rax,[rsi+8]
        or      rax,[rsi+16]
        or      rax,[rsi+24]
        or      rax,[rsi+32]
        or      rax,[rsi+40]
        or      rax,[rsi+48]
        or      rax,[rsi+56]
        jmp     .donePostShift          ; result is all zero and rdx is already 0,
                                        ; so the bit shift would change nothing

.s448:
        xor     r8,r8
        xor     r9,r9
        xor     r10,r10
        xor     r11,r11
        xor     r12,r12
        xor     r13,r13
        xor     r14,r14
        mov     r15,[rsi]
        mov     rax,[rsi+8]
        or      rax,[rsi+16]
        or      rax,[rsi+24]
        or      rax,[rsi+32]
        or      rax,[rsi+40]
        or      rax,[rsi+48]
        or      rax,[rsi+56]
        jmp     .bitShift

.s384:
        xor     r8,r8
        xor     r9,r9
        xor     r10,r10
        xor     r11,r11
        xor     r12,r12
        xor     r13,r13
        mov     r14,[rsi]
        mov     r15,[rsi+8]
        mov     rax,[rsi+16]
        or      rax,[rsi+24]
        or      rax,[rsi+32]
        or      rax,[rsi+40]
        or      rax,[rsi+48]
        or      rax,[rsi+56]
        jmp     .bitShift

.s320:
        xor     r8,r8
        xor     r9,r9
        xor     r10,r10
        xor     r11,r11
        xor     r12,r12
        mov     r13,[rsi]
        mov     r14,[rsi+8]
        mov     r15,[rsi+16]
        mov     rax,[rsi+24]
        or      rax,[rsi+32]
        or      rax,[rsi+40]
        or      rax,[rsi+48]
        or      rax,[rsi+56]
        jmp     .bitShift

.s256:
        xor     r8,r8
        xor     r9,r9
        xor     r10,r10
        xor     r11,r11
        mov     r12,[rsi]
        mov     r13,[rsi+8]
        mov     r14,[rsi+16]
        mov     r15,[rsi+24]
        mov     rax,[rsi+32]
        or      rax,[rsi+40]
        or      rax,[rsi+48]
        or      rax,[rsi+56]
        jmp     .bitShift

.s192:
        xor     r8,r8
        xor     r9,r9
        xor     r10,r10
        mov     r11,[rsi]
        mov     r12,[rsi+8]
        mov     r13,[rsi+16]
        mov     r14,[rsi+24]
        mov     r15,[rsi+32]
        mov     rax,[rsi+40]
        or      rax,[rsi+48]
        or      rax,[rsi+56]
        jmp     .bitShift

.s128:
        xor     r8,r8
        xor     r9,r9
        mov     r10,[rsi]
        mov     r11,[rsi+8]
        mov     r12,[rsi+16]
        mov     r13,[rsi+24]
        mov     r14,[rsi+32]
        mov     r15,[rsi+40]
        mov     rax,[rsi+48]
        or      rax,[rsi+56]
        jmp     .bitShift

.s064:
        xor     r8,r8
        mov     r9,[rsi]
        mov     r10,[rsi+8]
        mov     r11,[rsi+16]
        mov     r12,[rsi+24]
        mov     r13,[rsi+32]
        mov     r14,[rsi+40]
        mov     r15,[rsi+48]
        mov     rax,[rsi+56]
        jmp     .bitShift

.s000:                                  ; no whole-qword shift, nothing lost yet
        xor     eax,eax
        mov     r8,[rsi]
        mov     r9,[rsi+8]
        mov     r10,[rsi+16]
        mov     r11,[rsi+24]
        mov     r12,[rsi+32]
        mov     r13,[rsi+40]
        mov     r14,[rsi+48]
        mov     r15,[rsi+56]

.bitShift:                              ; shared tail: skip the SHLD chain when cl == 0
        test    ecx,ecx
        je      .donePostShift

.donePreShift:
        ; Shift the 512-bit value left by cl bits, most significant qword first,
        ; so each SHLD still sees the unshifted qword below it.
        shld    rdx,r15,cl              ; rdx (was 0) = bits pushed out of the top
        shld    r15,r14,cl
        shld    r14,r13,cl
        shld    r13,r12,cl
        shld    r12,r11,cl
        shld    r11,r10,cl
        shld    r10,r9,cl
        shld    r9,r8,cl
        shl     r8,cl

.donePostShift:
        mov     [rdi],r8
        mov     [rdi+8],r9
        mov     [rdi+16],r10
        mov     [rdi+24],r11
        mov     [rdi+32],r12
        mov     [rdi+40],r13
        mov     [rdi+48],r14
        mov     [rdi+56],r15
        or      rax,rdx                 ; overflow = lost whole qwords | lost top bits
        ret
And 2 different versions for subtraction:
Code: Select all
;Subtract ("result = operand1 - operand2")
;
;Input
; rdx Address of 512-bit operand 1
; rsi Address of 512-bit operand 2
; rdi Address of 512-bit result (may alias either operand: every read of the
;     operands is done before the first store)
;
;Output
; rax Zero if no overflow, non-zero (all ones) if the subtraction borrowed
;     out of the most significant qword
;
;Clobbers
; r8-r15, flags
; NOTE(review): r12-r15 are callee-saved in the SysV and Microsoft x64 ABIs;
; save/restore them if this is ever called from compiled code.
mp_sub_3operand:
        ; rax must be a known zero for the final SBB to produce 0 / -1.
        ; It is cleared here because XOR rewrites CF and therefore cannot be
        ; placed inside or after the SUB/SBB chain.
        xor     eax,eax

        ; Load all of operand 1 first so the loads can overlap.
        mov     r8,[rdx]
        mov     r9,[rdx+8]
        mov     r10,[rdx+16]
        mov     r11,[rdx+24]
        mov     r12,[rdx+32]
        mov     r13,[rdx+40]
        mov     r14,[rdx+48]
        mov     r15,[rdx+56]

        ; Subtract from least to most significant qword, propagating the borrow in CF.
        sub     r8,[rsi]
        sbb     r9,[rsi+8]
        sbb     r10,[rsi+16]
        sbb     r11,[rsi+24]
        sbb     r12,[rsi+32]
        sbb     r13,[rsi+40]
        sbb     r14,[rsi+48]
        sbb     r15,[rsi+56]
        sbb     rax,0                   ; rax = 0 - final borrow (0 or -1)

        ; MOV does not touch flags or rax, so the stores can follow the borrow capture.
        mov     [rdi],r8
        mov     [rdi+8],r9
        mov     [rdi+16],r10
        mov     [rdi+24],r11
        mov     [rdi+32],r12
        mov     [rdi+40],r13
        mov     [rdi+48],r14
        mov     [rdi+56],r15
        ret
;Subtract ("result -= operand2")
;
;Input
; rsi Address of 512-bit operand 2
; rdi Address of 512-bit operand 1 (replaced by result)
;
;Output
; rax Zero if no overflow, non-zero (all ones) if the subtraction borrowed
;     out of the most significant qword
;
;Clobbers
; r8-r15, flags
; NOTE(review): r12-r15 are callee-saved in the SysV and Microsoft x64 ABIs;
; save/restore them if this is ever called from compiled code.
mp_sub_2operand:
        ; rax must be a known zero for the final SBB to produce 0 / -1.
        ; It is cleared here because XOR rewrites CF and therefore cannot be
        ; placed inside or after the SUB/SBB chain.
        xor     eax,eax

        ; Load all of operand 2 before touching the destination, so the routine
        ; still works when rsi and rdi point at the same buffer.
        mov     r8,[rsi]
        mov     r9,[rsi+8]
        mov     r10,[rsi+16]
        mov     r11,[rsi+24]
        mov     r12,[rsi+32]
        mov     r13,[rsi+40]
        mov     r14,[rsi+48]
        mov     r15,[rsi+56]

        ; Subtract in place from least to most significant qword, propagating
        ; the borrow in CF.
        sub     [rdi],r8
        sbb     [rdi+8],r9
        sbb     [rdi+16],r10
        sbb     [rdi+24],r11
        sbb     [rdi+32],r12
        sbb     [rdi+40],r13
        sbb     [rdi+48],r14
        sbb     [rdi+56],r15
        sbb     rax,0                   ; rax = 0 - final borrow (0 or -1)
        ret
Also, something makes me think your inline assembly versions aren't right, but I didn't spend much time thinking about them - my brain has a built-in "NASM syntax" parser, and I assume it's easier to compile them and see if they produce the right results that way...
Cheers,
Brendan