So I decided to generate some random data and sin tables so I could at least benchmark something.
After struggling to get GCC to compile something that doesn't segfault (damn PIE code) I've come up with the following:
Code: Select all
#!/usr/bin/env python3
"""Generate C source files containing random 16-bit benchmark data.

Produces /tmp/sin_table.c (0x40000 shorts) and /tmp/data.c (2**22 shorts),
each value uniformly random in [-255, 255]. The emitted format matches what
the benchmark's extern declarations expect.
"""
from random import randint


def write_short_array(path, decl, count):
    """Write a C file defining `decl` as an initialized array of `count`
    random shorts in [-255, 255].

    Each element is written as "value\\n," so the file ends with ",};",
    relying on C's tolerance for a trailing comma in initializer lists.
    """
    with open(path, "w") as f:
        f.write(decl + " = {\n")
        for _ in range(count):
            f.write(str(randint(-255, 255)) + '\n,')
        f.write("};")


if __name__ == "__main__":
    write_short_array("/tmp/sin_table.c", "short sin_table[0x40000]", 0x4_0000)
    write_short_array("/tmp/data.c", "short data[1 << 22]", 1 << 22)
Code: Select all
#include <stdio.h>  /* system header: angle brackets, not quotes */

/* Benchmark fixtures defined in the generated files sin_table.c and data.c;
 * both hold random values in [-255, 255] (see the generator script). */
extern short sin_table[0x40000];
extern short data[1 << 22];
/*
 * Correlate every other sample of `data` against sin_table using a 32-bit
 * phase accumulator (top 18 bits index the table), accumulating the sum of
 * 16x16-bit signed products into *power.
 *
 * data             - pointer to interleaved 16-bit samples; reads data[0],
 *                    data[2], ..., data[2*(size-1)]
 * size             - number of products to accumulate
 * init_phase       - starting phase accumulator value
 * phase_per_sample - phase increment per sample
 * power            - out: the 64-bit correlation sum
 * returns the final phase accumulator value.
 *
 * Fix: the phase accumulator is meant to wrap modulo 2^32, but signed
 * integer overflow is undefined behavior in C.  The wrapping arithmetic
 * is therefore done on an unsigned copy; results on two's-complement
 * targets are unchanged.
 */
static int CalcPower(short int *data, int size, int init_phase, int phase_per_sample, long long *power) {
    unsigned int phase = (unsigned int)init_phase;
    unsigned int step = (unsigned int)phase_per_sample;
    long long sum = 0;
    for (int i = 0; i < size; i++) {
        /* 18-bit table index from bits [31:14] of the accumulator; for the
         * original signed code, arithmetic shift + mask yielded the same
         * low 18 bits, so behavior is preserved. */
        unsigned int p = (phase >> 14) & 0x3ffffu;
        sum += sin_table[p] * data[2 * i];
        phase += step;  /* well-defined wraparound */
    }
    *power = sum;
    /* value-preserving on all mainstream ABIs; wraps like the original */
    return (int)phase;
}
/*
 * Benchmark driver: runs CalcPower 1000 times (100 starting phases x 10
 * phase steps) over half of `data` and prints the last result so the work
 * cannot be optimized away entirely.
 */
int main(int argc, char **argv) {
    /* volatile store forces the compiler to keep each CalcPower call */
    volatile long long dont_optimize_me;
    for (int start_phase = 0; start_phase < 100; start_phase++) {
        for (int step = 10; step < 20; step++) {
            long long power;
            CalcPower(data, sizeof(data) / sizeof(*data) / 2, start_phase, step, &power);
            dont_optimize_me = power;
        }
    }
    /* Print the final value to verify correctness across builds */
    printf("%lld\n", dont_optimize_me);
    return 0;
}
Code: Select all
.intel_syntax noprefix
.globl _CalcFreqPowerA
.globl main
.section .text._CalcFreqPowerA
.p2align 4
# PARAMETERS (cdecl-style stack args, callee-cleaned via ret 20; offsets
# are from esp AFTER the six register pushes below):
#   Data            [esp+0x1C]  pointer to 16-bit samples (every other one read)
#   Size            [esp+0x20]  number of products to accumulate
#   InitPhase       [esp+0x24]  32-bit phase accumulator start value
#   PhasePerSample  [esp+0x28]  phase increment per sample
#   Power           [esp+0x2C]  out: pointer to 64-bit signed sum
#
# RETURNS: End phase
_CalcFreqPowerA:
# Saves every register it touches, including normally caller-saved
# ecx/edx -- the asm main below relies on ecx/edx surviving this call.
push ebx
push ecx
push edx
push esi
push edi
push ebp
#
mov esi,[esp+0x1C]
mov ecx,[esp+0x20]
mov ebp,[esp+0x24]
mov edi,[esp+0x2C]
#
# *Power = 0 (both 32-bit halves)
xor eax,eax
mov [edi + 0],eax
mov [edi + 4],eax
cfpaLoop:
# Byte offset into sin_table = ((phase >> 14) & 0x3FFFF) * 2.
# shr 13 leaves a 19-bit value in ebx; clearing bit 0 (and bl,0xFE) both
# word-aligns the offset and completes the 18-bit mask in one step.
mov ebx,ebp
shr ebx,13
and bl,0xFE
#
mov ax, [ebx + sin_table]
imul word ptr [esi]
movsx edx,dx
# Accumulate the 32-bit product dx:ax into a 48-bit running sum held in
# bytes [edi+0 .. edi+5]: low 16 bits at [edi], bits 16..47 as the dword
# at [edi+2].  NOTE(review): these two overlapping, misaligned
# read-modify-write memory ops per iteration are the likely cause of the
# low IPC seen in perf -- worth confirming with store-forwarding counters.
add word ptr [edi + 0],ax
adc dword ptr [edi + 2],edx
#
add esi,4
add ebp,[esp+0x28]
loop cfpaLoop
#
# Sign-extend the 48-bit sum to the full 64-bit *Power: widen the word at
# [edi+4] (bits 32..47) to a dword and store it back over [edi+4..edi+7].
movsx eax,word ptr [edi + 4]
mov [edi + 4],eax
#
# Return value: final phase accumulator
mov eax,ebp
#
pop ebp
pop edi
pop esi
pop edx
pop ecx
pop ebx
ret 20
.section .text.main
.p2align 4
main:
sub esp, 8 # reserve storage for power value
# for (int i = 0; i < 100; i++)
xor ecx, ecx
2:
# for (int k = 10; k < 20; k++)
mov edx, 10
3:
# CalcPower(data, 1<<21, i, k, &power)
# `push esp` passes the address of the 8-byte slot reserved above: at this
# point esp still points at it, before the remaining argument pushes.
push esp
push edx
push ecx
push 1 << 21
push [data_ptr]
# Callee cleans all five dword args (ret 20) and preserves ecx/edx, so the
# loop counters survive the call without spilling.
call _CalcFreqPowerA
# for k
inc edx
cmp edx, 20
jnz 3b
# for i
inc ecx
cmp ecx, 100
jnz 2b
# printf("%lld\n", power): after pushing the format-string pointer, the
# reserved 8-byte power slot sits directly above it on the stack -- exactly
# where printf reads its %lld vararg from.
push [format_str_lld_ptr]
call printf
# pop the format pointer (4) plus the reserved power slot (8)
add esp, 4 + 8
# return 0
xor eax, eax
ret
# Pointer literals kept near the code for absolute addressing; this (and the
# absolute sin_table reference above) is why the asm target links -no-pie.
data_ptr:
.long data
format_str_lld_ptr:
.long format_str_lld
# NOTE(review): ".text.rodata" is a nonstandard section name -- read-only
# data conventionally goes in .rodata; confirm this is intentional.
.section .text.rodata
format_str_lld:
.asciz "%lld\n"
Code: Select all
# Build flags: -O3 for benchmarking, -Wall for diagnostics.
CARGS += -O3
CARGS += -Wall
#CARGS += -flto

# Fix: make(1) requires recipe lines to start with a TAB; the pasted
# version had lost them. Also use $< (first prerequisite) for the
# single-source compile rules, and mark `all` phony.
.PHONY: all
all: sin2_64_c_native sin2_64_c sin2_32_c sin2_32_asm

# 64-bit C build tuned for the host CPU
sin2_64_c_native: sin2.c sin_table.c data_64.o
	$(CC) $(CARGS) -march=native $^ -o $@

# generic 64-bit C build
sin2_64_c: sin2.c sin_table.c data_64.o
	$(CC) $(CARGS) $^ -o $@

# 32-bit C build (with debug info for inspection)
sin2_32_c: sin2.c sin_table.c data_32.o
	$(CC) $(CARGS) -m32 $^ -o $@ -g3

# 32-bit assembly build; -no-pie because the asm uses absolute addresses
sin2_32_asm: sin2.s sin_table.c data_32.o
	$(CC) $(CARGS) -no-pie -fno-pie -m32 $^ -o $@

data_64.o: data.c
	$(CC) $(CARGS) -c $< -o $@

data_32.o: data.c
	$(CC) $(CARGS) -c -m32 $< -o $@
Benchmarking with `perf stat` yields the following results:
Code: Select all
david@pc1:/tmp$ perf stat ./sin2_64_c_native
-50213557
Performance counter stats for './sin2_64_c_native':
1634.27 msec task-clock # 1.000 CPUs utilized
3 context-switches # 0.002 K/sec
0 cpu-migrations # 0.000 K/sec
184 page-faults # 0.113 K/sec
6498854866 cycles # 3.977 GHz (83.36%)
3982498 stalled-cycles-frontend # 0.06% frontend cycles idle (83.36%)
3418439760 stalled-cycles-backend # 52.60% backend cycles idle (83.36%)
23030851488 instructions # 3.54 insn per cycle
# 0.15 stalled cycles per insn (83.36%)
2090698244 branches # 1279.289 M/sec (83.36%)
47624 branch-misses # 0.00% of all branches (83.22%)
1.634828733 seconds time elapsed
1.634865000 seconds user
0.000000000 seconds sys
david@pc1:/tmp$ perf stat ./sin2_64_c
-50213557
Performance counter stats for './sin2_64_c':
1639.85 msec task-clock # 1.000 CPUs utilized
2 context-switches # 0.001 K/sec
2 cpu-migrations # 0.001 K/sec
183 page-faults # 0.112 K/sec
6457718560 cycles # 3.938 GHz (83.17%)
3505234 stalled-cycles-frontend # 0.05% frontend cycles idle (83.37%)
3482426541 stalled-cycles-backend # 53.93% backend cycles idle (83.41%)
23106438497 instructions # 3.58 insn per cycle
# 0.15 stalled cycles per insn (83.41%)
2098396719 branches # 1279.628 M/sec (83.41%)
51983 branch-misses # 0.00% of all branches (83.22%)
1.640573324 seconds time elapsed
1.640610000 seconds user
0.000000000 seconds sys
david@pc1:/tmp$ perf stat ./sin2_32_c
-50213557
Performance counter stats for './sin2_32_c':
2043.25 msec task-clock # 1.000 CPUs utilized
2 context-switches # 0.001 K/sec
0 cpu-migrations # 0.000 K/sec
175 page-faults # 0.086 K/sec
8239753164 cycles # 4.033 GHz (83.36%)
5344609 stalled-cycles-frontend # 0.06% frontend cycles idle (83.36%)
5192399565 stalled-cycles-backend # 63.02% backend cycles idle (83.36%)
27240135813 instructions # 3.31 insn per cycle
# 0.19 stalled cycles per insn (83.36%)
2097472578 branches # 1026.539 M/sec (83.36%)
58111 branch-misses # 0.00% of all branches (83.20%)
2.043735436 seconds time elapsed
2.043777000 seconds user
0.000000000 seconds sys
david@pc1:/tmp$ perf stat ./sin2_32_asm
-50213557
Performance counter stats for './sin2_32_asm':
3969.39 msec task-clock # 0.999 CPUs utilized
19 context-switches # 0.005 K/sec
7 cpu-migrations # 0.002 K/sec
175 page-faults # 0.044 K/sec
15536969090 cycles # 3.914 GHz (64.97%)
10339765 stalled-cycles-frontend # 0.07% frontend cycles idle (64.86%)
9893809047 stalled-cycles-backend # 63.68% backend cycles idle (64.83%)
23164560938 instructions # 1.49 insn per cycle
# 0.43 stalled cycles per insn (64.77%)
2104231298 branches # 530.114 M/sec (64.85%)
173766 branch-misses # 0.01% of all branches (64.97%)
3.973235657 seconds time elapsed
3.969949000 seconds user
0.000000000 seconds sys
The 64-bit C version is ~25% faster than the 32-bit C version, and the 32-bit C version is ~95% faster than the 32-bit assembly version. I think the results speak for themselves.
It is important to note that while the assembly version executes fewer instructions than the C version, the C version has much better instruction scheduling, which allows it to achieve roughly twice the IPC.