Postmann wrote:Hello
I wrote this function to convert an integer to a string. It is fully working, but I wonder if it's somehow possible to speed it up? I guess it's too much code for such a simple task. "rax" holds the integer, "rcx" is the adress of the string, where I want to store it:
I needed something to brush up on my assembly\C and this piqued my curiosity, so here we go.
First, I do not believe your code is bug free. When you're reversing the order of the characters by popping the values off of the stack you're writing 8 bytes, not just one. This means you're writing past an otherwise correctly sized buffer once you get close to the end of the string. E.g. a buffer with a size of 21 bytes (1 for sign, 19 for digits, 1 for NULL termination) should work in all cases but your code writes 6 bytes past the end of such a buffer when popping the last value off the stack.
Second, your algorithm is the naive approach a 'itoa' style function. You can speed it up drastically by
1. Precomputing the number of digits (so you don't have to reverse them later)
2. Computing multiple digits in parallel.
3. Getting rid of the (very) costly div instruction
Those all come at the cost of using a little bit more memory, but if you're on an x86-64 machine you should be sacrificing modest amounts of memory for speed any chance you get.
Precomputing the number digits is really cheap. You can use the BSR\LZCNT (basically calculating the log base 2) instruction to count the number of leading 0's in the values binary form. You can then use a lookup table to find an "approximation" based on this. There are a couple of instances where this would give you inaccurate results so you need to use a second lookup table to correct it. E.g. 999 and 1000 both have 54 leading 0's in 64-bit binary form. So you have to compare the value to 1000 if it has 54 leading 0's to determine if it has 3 or 4 digits.
The easiest way to compute digits in parallel is to do div\modulo by 100 instead of 10. You then use a lookup table to convert the remainder to the digits that you would have recovered based on two div\modulo's by 10. The lookup table costs 200 bytes of memory. You can also do multiple div\modulo's in parallel. You can do a div\modulo by 100 and 10000 at the "same time" (Using instruction level parallelism and\or SSE) to break up dependencies. Computing the next 2 digits no longer relies on the div\modulo of the first. You can use this method to "unroll" (Note: this isn't actually loop unrolling, it just looks like it) the entire loop of div\modulo's.
Since you're dividing by a constant (100 with the above optimization) you can replace the div instruction with a multiplication and a series of shifts\adds\subs. It can get even faster on x86 by using the LEA instruction to do multiply\adds at same time. I just looked at the disassembly that GCC (with -O3) produced to come up with the assembly code I used.
With that in mind I wrote a function that uses all of the above except for the div\modulo by 10000, 1000000, etc. to "unroll" the loop optimization. Note that this will break if the LZCNT instruction is not supported and in the input is 0. LZCNT will return the operand size (64 in our case) if the input is 0 whereas with BSR (which is what LZCNT shares its encoding with) it is undefined. This is also the reason why the 'dlens' lookup table has 65 entries, because LZCNT can return 0->64 inclusive. It also doesn't give the actual max digit count for a given LZCNT output, but rather is max digit count subtracted by 1. This is because the output of the lookup (after being corrected) is used as an offset into the buffer provided to serve as a starting point. Saves us from having to subtract 1 from the output to get the starting offset into the buffer.
These are for unsigned values. Adding the sign is obviously trivial.
Full Code:
Code: Select all
; itoa_asm.asm
section .text
global myasm
global lzcount
global yourasm
lzcount:
lzcnt rax, rdi
ret
myasm:
lzcnt rcx, rdi
mov rax, maxv
cmp rdi, QWORD [rax + rcx*8]
jae .CorrectGuess
add ecx, 1
.CorrectGuess:
mov rax, dlens
movzx rax, BYTE [rax + rcx]
add rsi, rax
mov rax, rdi
mov rdi, digits
push rbx
mov rbx, 0x28f5c28f5c28f5c3 ; Magic number for div\rem by 100
cmp eax, 100
jb .Below100
.DoTwoDigits:
; Divides RAX by 100. Quotient in RAX, Remainder*2 in RDX. Clobbers RCX, RBX must = 0x28f5c28f5c28f5c3
; As done by GCC
mov rcx, rax
shr rax, 0x2
mov rdx, rax
mul rbx
shr rdx, 0x2
lea rax, [rdx+rdx*4]
lea rax, [rax+rax*4]
shl rax, 0x2
sub rcx, rax
mov rax, rdx
lea edx, [rcx+rcx*1]
; Done
mov cl, BYTE [rdi + rdx+1]
mov BYTE [rsi], cl
mov cl, BYTE [rdi + rdx]
mov BYTE [rsi-1], cl
sub rsi, 2
cmp rax, 100
ja .DoTwoDigits
.Below100:
pop rbx
cmp eax, 10
jae .TwoMore
mov cl, '0'
add cl, al
mov [rsi], cl
ret
.TwoMore:
shl eax, 1
mov cl, BYTE [rdi + rax+1]
mov BYTE [rsi], cl
mov cl, BYTE [rdi + rax]
mov BYTE [rsi-1], cl
ret
yourasm:
; Swap RDI\RAX and RSI\RCX and preserve RBX to match Linux x64 calling conventions
mov rax, rdi
mov rcx, rsi
push rbx
xor rdi, rdi
mov rbx, 10
.ZC2:
xor rdx, rdx
div rbx
add rdx, 48
push rdx
inc rdi
cmp rax, 0
jnz .ZC2
.ZC3:
pop QWORD [rcx]
inc rcx
dec rdi
cmp rdi, 0
jnz .ZC3
pop rbx
ret
section .data
dlens db 19, 18, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, \
14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, \
8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0, 0
maxv dq 10000000000000000000, 1000000000000000000, 1000000000000000000, 1000000000000000000, \
1000000000000000000, 100000000000000000, 100000000000000000, 100000000000000000, \
10000000000000000, 10000000000000000, 10000000000000000, 1000000000000000, 1000000000000000, \
1000000000000000, 1000000000000000, 100000000000000, 100000000000000, 100000000000000, \
10000000000000, 10000000000000, 10000000000000, 1000000000000, 1000000000000, 1000000000000, \
1000000000000, 100000000000, 100000000000, 100000000000, 10000000000, 10000000000, 10000000000, \
1000000000, 1000000000, 1000000000, 1000000000, 100000000, 100000000, 100000000, 10000000, 10000000, \
10000000, 1000000, 1000000, 1000000, 1000000, 100000, 100000, 100000, 10000, 10000, 10000, 1000, 1000, \
1000, 1000, 100, 100, 100, 10, 10, 10, 1, 1, 1
digits db '000102030405060708091011121314151617181920212223242526272829303132333435363738394041424344454647484950\
51525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899'
Code: Select all
// itoa_c.c
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <string.h>
#include <time.h>
unsigned char dlens[65] = {
19, 18, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0, 0 };
unsigned long maxv[64] = {
10000000000000000000U, 1000000000000000000U, 1000000000000000000U, 1000000000000000000U, 1000000000000000000U, 100000000000000000U, 100000000000000000U, 100000000000000000U, 10000000000000000U, 10000000000000000U, 10000000000000000U, 1000000000000000U, 1000000000000000U, 1000000000000000U, 1000000000000000U, 100000000000000U, 100000000000000U, 100000000000000U, 10000000000000U, 10000000000000U, 10000000000000U, 1000000000000U, 1000000000000U, 1000000000000U, 1000000000000U, 100000000000U, 100000000000U, 100000000000U, 10000000000U, 10000000000U, 10000000000U, 1000000000U, 1000000000U, 1000000000U, 1000000000U, 100000000U, 100000000U, 100000000U, 10000000U, 10000000U, 10000000U, 1000000U, 1000000U, 1000000U, 1000000U, 100000U, 100000U, 100000U, 10000U, 10000U, 10000U, 1000U, 1000U, 1000U, 1000U, 100U, 100U, 100U, 10U, 10U, 10U, 1U, 1U, 1U};
char digits[200] = "00010203040506070809101112131415161718192021222324252627282930313233"
"3435363738394041424344454647484950515253545556575859606162636465666"
"76869707172737475767778798081828384858687888990919293949596979899";
extern void myasm(unsigned long i, char* buffer);
extern void yourasm(unsigned long i, char* buffer);
extern unsigned long wtfdiv(unsigned long i);
extern unsigned long lzcount(unsigned long i);
unsigned long llrand()
{
unsigned long r = 0;
for (int i = 0; i < 5; ++i) {
r = (r << 15) | (rand() & 0x7FFF);
}
return r & 0xFFFFFFFFFFFFFFFFUL;
}
void __attribute__ ((noinline)) myc(unsigned long i, char* buffer)
{
int x;
int length = lzcount(i); // Should use __builtin_ia32_lzcnt_u64() if available
if (i < maxv[length])
length++;
length = (int)dlens[length];
while (i >= 100) {
x = (i % 100) * 2;
i /= 100;
buffer[length] = digits[x + 1];
buffer[length - 1] = digits[x];
length -= 2;
}
if (i < 10) {
buffer[length] = '0' + (unsigned char)i;
} else {
x = i * 2;
buffer[length] = digits[x + 1];
buffer[length - 1] = digits[x];
}
}
int main()
{
unsigned long test = 6843654354; // To test to make sure our functions are sane.
char* buffer = malloc(32);
memset(buffer, 0, 32);
myasm(test, buffer);
printf("%s\n", buffer);
memset(buffer, 0, 32);
myc(test, buffer);
printf("%s\n", buffer);
memset(buffer, 0, 32);
yourasm(test, buffer);
printf("%s\n", buffer);
clock_t start = clock();
for (int i = 0; i < 1000000; i++) {
myasm(llrand(), buffer);
}
clock_t end = clock();
double time_taken = ((double)end - start)/CLOCKS_PER_SEC;
printf("myasm took %f seconds to execute \n", time_taken);
start = clock();
for (int i = 0; i < 1000000; i++) {
sprintf(buffer, "%lu", llrand());
}
end = clock();
time_taken = ((double)end - start)/CLOCKS_PER_SEC;
printf("sprintf took %f seconds to execute \n", time_taken);
start = clock();
for (int i = 0; i < 1000000; i++) {
myc(llrand(), buffer);
}
end = clock();
time_taken = ((double)end - start)/CLOCKS_PER_SEC;
printf("myc took %f seconds to execute \n", time_taken);
start = clock();
for (int i = 0; i < 1000000; i++) {
yourasm(llrand(), buffer);
}
end = clock();
time_taken = ((double)end - start)/CLOCKS_PER_SEC;
printf("yourasm took %f seconds to execute \n", time_taken);
return 0;
}
Compiling with:
Code: Select all
gcc -std=c99 -O3 -c itoa_c.c
nasm -f elf64 itoa_asm.asm
gcc -o itoa_test itoa_c.o itoa_asm.o
Produces the following output:
Code: Select all
6843654354
6843654354
6843654354
myasm took 0.058922 seconds to execute
sprintf took 0.130046 seconds to execute
myc took 0.062230 seconds to execute
yourasm took 0.255511 seconds to execute
This was run on an Ubuntu VM with a stock 4770k. As you can see the optimizations significantly speed up the operation. I was actually surprised to see how fast sprintf in this instance. Without the div optimization my assembly was about as fast as the sprintf results. Run multiple times and sometimes my C code as generated by GCC was faster then my assembly, so handwriting the assembly (at least by me) doesn't provide any significant speedups.