Hi,
shiner wrote:Well, the answer is good. but what i've wrong?! my code was marking those numbers that are not primes(his multiples) and add it when finds a prime number.
Your method should be even faster, as there's no need to mark non-primes. Your code didn't compile though, and had a few other bugs - I changed it to this:
Code: Select all
#include <stdio.h>
#include <string.h>
#include <math.h>
char is_not_prime_table[2000000];
unsigned int i, i2;
unsigned long long curr = 0;
int main(int argc, char** argv) {
memset(is_not_prime_table, 0, 2000000);
is_not_prime_table[0] = 1;
is_not_prime_table[1] = 1;
for(i = 2; i < 2000000; i++) {
if(is_not_prime_table[i] == 0) {
curr += (unsigned long long)i;
for(i2 = i + i; i2 < 2000000; i2 += i) {
is_not_prime_table[i2] = 1;
}
}
}
printf("%llu\n", curr);
}
With "gcc -O3" that takes 0.021 seconds on my computer - the same as my optimized version.
Of course I optimized your version by skipping every multiple of 2, and got this:
Code: Select all
#include <stdio.h>
#include <string.h>
#include <math.h>
char is_not_prime_table[1000000];
unsigned int i, i2;
unsigned long long curr = 2;
int main(int argc, char** argv) {
memset(is_not_prime_table, 0, 1000000);
for(i = 1; i < 1000000; i++) {
if(is_not_prime_table[i] == 0) {
curr += (unsigned long long)i + (unsigned long long)i + 1;
for(i2 = i + i + 1 + i; i2 < 1000000; i2 += i + i + 1) {
is_not_prime_table[i2] = 1;
}
}
}
printf("%llu\n", curr);
}
That takes half as long (0.010 seconds on my computer)...
Then I did a version in assembly with BT and BTS instructions. It didn't help (took 0.12 seconds on my computer), probably because my CPU has 4 MiB L2 caches and the previous versions don't end up thrashing the cache anyway. That annoyed me, so I sunk my teeth in...
In the end I came up with this:
Code: Select all
section .data
align 8
primes: times (1000000) db 0
section .text
getPrimes:
push rbx
push rcx
push rdx
xor ebx,ebx
mov eax,2
jmp .prime1A
.nextGroupA:
cmp qword [primes+rbx],0x0101010101010101
je .doneGroupA
cmp byte [primes+rbx],0
jne .prime1A
lea rdx,[rbx*2 + 0*2 + 1]
lea ecx,[primes+ebx*3 + 0*3 + 1]
add rax,rdx
cmp ecx,primes+1000000
jae .prime1A
.nextMark0:
mov byte [ecx],1
add ecx,edx
cmp ecx,primes+1000000
jb .nextMark0
.prime1A:
cmp byte [primes+rbx+1],0
jne .prime2A
lea rdx,[rbx*2 + 1*2 + 1]
lea ecx,[primes+ebx*3 + 1*3 + 1]
add rax,rdx
cmp ecx,primes+1000000
jae .prime2A
.nextMark1:
mov byte [ecx],1
add ecx,edx
cmp ecx,primes+1000000
jb .nextMark1
.prime2A:
cmp byte [primes+rbx+2],0
jne .prime3A
lea rdx,[rbx*2 + 2*2 + 1]
lea ecx,[primes+ebx*3 + 2*3 + 1]
add rax,rdx
cmp ecx,primes+1000000
jae .prime3A
.nextMark2:
mov byte [ecx],1
add ecx,edx
cmp ecx,primes+1000000
jb .nextMark2
.prime3A:
cmp byte [primes+rbx+3],0
jne .prime4A
lea rdx,[rbx*2 + 3*2 + 1]
lea ecx,[primes+ebx*3 + 3*3 + 1]
add rax,rdx
cmp ecx,primes+1000000
jae .prime4A
.nextMark3:
mov byte [ecx],1
add ecx,edx
cmp ecx,primes+1000000
jb .nextMark3
.prime4A:
cmp byte [primes+rbx+4],0
jne .prime5A
lea rdx,[rbx*2 + 4*2 + 1]
lea ecx,[primes+ebx*3 + 4*3 + 1]
add rax,rdx
cmp ecx,primes+1000000
jae .prime5A
.nextMark4:
mov byte [ecx],1
add ecx,edx
cmp ecx,primes+1000000
jb .nextMark4
.prime5A:
cmp byte [primes+rbx+5],0
jne .prime6A
lea rdx,[rbx*2 + 5*2 + 1]
lea ecx,[primes+ebx*3 + 5*3 + 1]
add rax,rdx
cmp ecx,primes+1000000
jae .prime6A
.nextMark5:
mov byte [ecx],1
add ecx,edx
cmp ecx,primes+1000000
jb .nextMark5
.prime6A:
cmp byte [primes+rbx+6],0
jne .prime7A
lea rdx,[rbx*2 + 6*2 + 1]
lea ecx,[primes+ebx*3 + 6*3 + 1]
add rax,rdx
cmp ecx,primes+1000000
jae .prime7A
.nextMark6:
mov byte [ecx],1
add ecx,edx
cmp ecx,primes+1000000
jb .nextMark6
.prime7A:
cmp byte [primes+rbx+7],0
jne .doneGroupA
lea rdx,[rbx*2 + 7*2 + 1]
lea ecx,[primes+ebx*3 + 7*3 + 1]
add rax,rdx
cmp ecx,primes+1000000
jae .doneGroupA
.nextMark7:
mov byte [ecx],1
add ecx,edx
cmp ecx,primes+1000000
jb .nextMark7
.doneGroupA:
add ebx,8
cmp ebx,(1000000-1)/3+7
jb .nextGroupA
xor edx,edx
.nextGroupB:
cmp qword [primes+rbx],0x0101010101010101
je .doneGroupB
cmp byte [primes+rbx],0
jne .prime1B
lea rax,[rax+rbx*2 + 0*2 + 1]
.prime1B:
cmp byte [primes+rbx+1],0
jne .prime2B
lea rdx,[rdx+rbx*2 + 1*2 + 1]
.prime2B:
cmp byte [primes+rbx+2],0
jne .prime3B
lea rax,[rax+rbx*2 + 2*2 + 1]
.prime3B:
cmp byte [primes+rbx+3],0
jne .prime4B
lea rdx,[rdx+rbx*2 + 3*2 + 1]
.prime4B:
cmp byte [primes+rbx+4],0
jne .prime5B
lea rax,[rax+rbx*2 + 4*2 + 1]
.prime5B:
cmp byte [primes+rbx+5],0
jne .prime6B
lea rdx,[rdx+rbx*2 + 5*2 + 1]
.prime6B:
cmp byte [primes+rbx+6],0
jne .prime7B
lea rax,[rax+rbx*2 + 6*2 + 1]
.prime7B:
cmp byte [primes+rbx+7],0
jne .doneGroupB
lea rdx,[rdx+rbx*2 + 7*2 + 1]
.doneGroupB:
add ebx,8
cmp ebx,1000000
jb .nextGroupB
add rax,rdx
pop rdx
pop rcx
pop rbx
ret
That takes 0.008 seconds on my computer - roughly 20% faster than C...
Cheers,
Brendan