Hi,
I have done some more precise experiments to isolate the problem further, basically running the same code on the double and long types:
Code: Select all
double * d = (double *) calloc (1024*1024*16,sizeof(double));
struct timeval st, et;
gettimeofday(&st,NULL);
for ( int j = 1 ; j < 1024*16 ; j ++)
for ( int i = 0 ; i < 1024*1024*16 ; i ++)
d[i] = d[i] * j + i;
gettimeofday(&et,NULL);
printf ("%lu microsec\n", ((et.tv_sec - st.tv_sec) * 1000000) + (et.tv_usec - st.tv_usec));
free ( d );
return 0;
Code: Select all
long * d = (long *) calloc (1024*1024*16,sizeof(long));
struct timeval st, et;
gettimeofday(&st,NULL);
for ( int j = 1 ; j < 1024*16 ; j ++)
for ( int i = 0 ; i < 1024*1024*16 ; i ++)
d[i] = d[i] * j + i;
gettimeofday(&et,NULL);
printf ("%lu microsec\n", ((et.tv_sec - st.tv_sec) * 1000000) + (et.tv_usec - st.tv_usec));
free ( d );
The "double" code takes 140 seconds on both my kernel and Linux, but the "long" code takes 140 seconds on my kernel compared to 204 seconds on Linux.
I used objdump to dump the assembly generated by the compiler for both. The "double" assembly code is pretty much the same in both cases, with minor variations. The "long" assembly, however, is fairly different. The most important difference is that my kernel's version does not use SSE registers/operations, but the Linux version does. Here is a sample of the dump.
Linux:
Code: Select all
0000000000001080 <main>:
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/time.h>
int main ()
{
1080: 53 push %rbx
long * d = (long *) calloc (1024*1024*16,sizeof(long));
1081: be 08 00 00 00 mov $0x8,%esi
1086: bf 00 00 00 01 mov $0x1000000,%edi
{
108b: 48 83 ec 30 sub $0x30,%rsp
long * d = (long *) calloc (1024*1024*16,sizeof(long));
108f: e8 ac ff ff ff callq 1040 <calloc@plt>
struct timeval st, et;
gettimeofday(&st,NULL);
1094: 31 f6 xor %esi,%esi
1096: 48 8d 7c 24 10 lea 0x10(%rsp),%rdi
long * d = (long *) calloc (1024*1024*16,sizeof(long));
109b: 48 89 c3 mov %rax,%rbx
gettimeofday(&st,NULL);
109e: e8 bd ff ff ff callq 1060 <gettimeofday@plt>
10a3: b9 ff 1f 00 00 mov $0x1fff,%ecx
10a8: 66 44 0f 6f 15 6f 0f movdqa 0xf6f(%rip),%xmm10 # 2020 <_IO_stdin_used+0x20>
10af: 00 00
10b1: 66 0f 6f 3d 77 0f 00 movdqa 0xf77(%rip),%xmm7 # 2030 <_IO_stdin_used+0x30>
10b8: 00
for ( int j = 1 ; j < 1024*16 ; j ++)
10b9: ba 01 00 00 00 mov $0x1,%edx
10be: 48 8d b3 00 00 00 08 lea 0x8000000(%rbx),%rsi
10c5: 48 63 c2 movslq %edx,%rax
10c8: 8d 7a 01 lea 0x1(%rdx),%edi
10cb: 66 41 0f 6f da movdqa %xmm10,%xmm3
10d0: 49 89 d8 mov %rbx,%r8
10d3: 48 89 44 24 08 mov %rax,0x8(%rsp)
10d8: f2 0f 12 54 24 08 movddup 0x8(%rsp),%xmm2
10de: 66 0f 6f f2 movdqa %xmm2,%xmm6
10e2: 48 89 d8 mov %rbx,%rax
10e5: 89 7c 24 08 mov %edi,0x8(%rsp)
10e9: 66 0f 6e 4c 24 08 movd 0x8(%rsp),%xmm1
10ef: 66 0f 73 d6 20 psrlq $0x20,%xmm6
10f4: 66 0f 70 c9 00 pshufd $0x0,%xmm1,%xmm1
for ( int i = 0 ; i < 1024*1024*16 ; i ++)
d[i] = d[i] * j + i;
10f9: 66 0f 6f e1 movdqa %xmm1,%xmm4
10fd: 66 0f 38 25 c9 pmovsxdq %xmm1,%xmm1
1102: 66 0f 73 dc 08 psrldq $0x8,%xmm4
1107: 66 44 0f 6f c1 movdqa %xmm1,%xmm8
110c: 66 0f 38 25 e4 pmovsxdq %xmm4,%xmm4
1111: 66 41 0f 73 d0 20 psrlq $0x20,%xmm8
1117: 66 44 0f 6f cc movdqa %xmm4,%xmm9
111c: 66 41 0f 73 d1 20 psrlq $0x20,%xmm9
1122: 66 44 0f 6f e3 movdqa %xmm3,%xmm12
1127: 66 0f 38 25 eb pmovsxdq %xmm3,%xmm5
112c: 48 83 c0 20 add $0x20,%rax
1130: 66 41 0f 73 dc 08 psrldq $0x8,%xmm12
1136: 66 0f fe df paddd %xmm7,%xmm3
113a: 66 45 0f 38 25 ec pmovsxdq %xmm12,%xmm13
1140: f3 44 0f 6f 60 f0 movdqu -0x10(%rax),%xmm12
1146: 66 45 0f 6f dc movdqa %xmm12,%xmm11
114b: 66 41 0f 6f c4 movdqa %xmm12,%xmm0
1150: 66 41 0f 73 d3 20 psrlq $0x20,%xmm11
1156: 66 44 0f f4 e6 pmuludq %xmm6,%xmm12
115b: 66 44 0f f4 da pmuludq %xmm2,%xmm11
1160: 66 0f f4 c2 pmuludq %xmm2,%xmm0
1164: 66 45 0f d4 dc paddq %xmm12,%xmm11
1169: 66 41 0f 73 f3 20 psllq $0x20,%xmm11
116f: 66 41 0f d4 c3 paddq %xmm11,%xmm0
1174: 66 41 0f d4 c5 paddq %xmm13,%xmm0
1179: 66 44 0f 6f d8 movdqa %xmm0,%xmm11
117e: 66 44 0f 6f e0 movdqa %xmm0,%xmm12
1183: 66 41 0f 73 d3 20 psrlq $0x20,%xmm11
1189: 66 41 0f f4 c1 pmuludq %xmm9,%xmm0
118e: 66 44 0f f4 dc pmuludq %xmm4,%xmm11
1193: 66 44 0f f4 e4 pmuludq %xmm4,%xmm12
1198: 66 44 0f d4 d8 paddq %xmm0,%xmm11
119d: 66 41 0f 73 f3 20 psllq $0x20,%xmm11
11a3: 66 45 0f d4 e3 paddq %xmm11,%xmm12
11a8: 66 45 0f d4 e5 paddq %xmm13,%xmm12
11ad: f3 44 0f 6f 68 e0 movdqu -0x20(%rax),%xmm13
11b3: 44 0f 11 60 f0 movups %xmm12,-0x10(%rax)
11b8: 66 45 0f 6f dd movdqa %xmm13,%xmm11
11bd: 66 41 0f 6f c5 movdqa %xmm13,%xmm0
11c2: 66 41 0f 73 d3 20 psrlq $0x20,%xmm11
11c8: 66 44 0f f4 ee pmuludq %xmm6,%xmm13
11cd: 66 44 0f f4 da pmuludq %xmm2,%xmm11
11d2: 66 0f f4 c2 pmuludq %xmm2,%xmm0
11d6: 66 45 0f d4 dd paddq %xmm13,%xmm11
11db: 66 41 0f 73 f3 20 psllq $0x20,%xmm11
11e1: 66 41 0f d4 c3 paddq %xmm11,%xmm0
11e6: 66 0f d4 c5 paddq %xmm5,%xmm0
11ea: 66 44 0f 6f d8 movdqa %xmm0,%xmm11
11ef: 66 44 0f 6f e8 movdqa %xmm0,%xmm13
11f4: 66 41 0f 73 d3 20 psrlq $0x20,%xmm11
11fa: 66 41 0f f4 c0 pmuludq %xmm8,%xmm0
11ff: 66 44 0f f4 d9 pmuludq %xmm1,%xmm11
1204: 66 44 0f f4 e9 pmuludq %xmm1,%xmm13
1209: 66 44 0f d4 d8 paddq %xmm0,%xmm11
120e: 66 41 0f 73 f3 20 psllq $0x20,%xmm11
1214: 66 45 0f d4 dd paddq %xmm13,%xmm11
1219: 66 41 0f d4 eb paddq %xmm11,%xmm5
121e: 0f 11 68 e0 movups %xmm5,-0x20(%rax)
1222: 48 39 f0 cmp %rsi,%rax
1225: 0f 85 f7 fe ff ff jne 1122 <main+0xa2>
for ( int j = 1 ; j < 1024*16 ; j ++)
122b: 83 c2 02 add $0x2,%edx
122e: 83 e9 01 sub $0x1,%ecx
1231: 0f 85 8e fe ff ff jne 10c5 <main+0x45>
1237: 41 ba ff 3f 00 00 mov $0x3fff,%r10d
123d: 31 f6 xor %esi,%esi
123f: 4c 63 ca movslq %edx,%r9
1242: 41 29 d2 sub %edx,%r10d
1245: 49 8d 51 01 lea 0x1(%r9),%rdx
1249: 49 8b 00 mov (%r8),%rax
124c: 4c 89 c9 mov %r9,%rcx
124f: 4a 8d 3c 12 lea (%rdx,%r10,1),%rdi
1253: eb 07 jmp 125c <main+0x1dc>
1255: 0f 1f 00 nopl (%rax)
1258: 48 83 c2 01 add $0x1,%rdx
d[i] = d[i] * j + i;
125c: 48 0f af c1 imul %rcx,%rax
1260: 48 89 d1 mov %rdx,%rcx
1263: 48 01 f0 add %rsi,%rax
for ( int i = 0 ; i < 1024*1024*16 ; i ++)
1266: 48 39 fa cmp %rdi,%rdx
1269: 75 ed jne 1258 <main+0x1d8>
126b: 48 83 c6 01 add $0x1,%rsi
126f: 49 89 00 mov %rax,(%r8)
1272: 49 83 c0 08 add $0x8,%r8
for ( int j = 1 ; j < 1024*16 ; j ++)
1276: 48 81 fe 00 00 00 01 cmp $0x1000000,%rsi
127d: 75 c6 jne 1245 <main+0x1c5>
gettimeofday(&et,NULL);
127f: 48 8d 7c 24 20 lea 0x20(%rsp),%rdi
1284: 31 f6 xor %esi,%esi
1286: e8 d5 fd ff ff callq 1060 <gettimeofday@plt>
printf ("%lu microsec\n", ((et.tv_sec - st.tv_sec) * 1000000) + (et.tv_usec - st.tv_usec));
128b: 48 8b 74 24 20 mov 0x20(%rsp),%rsi
1290: 48 2b 74 24 10 sub 0x10(%rsp),%rsi
1295: 31 c0 xor %eax,%eax
1297: 48 69 f6 40 42 0f 00 imul $0xf4240,%rsi,%rsi
129e: 48 8d 3d 5f 0d 00 00 lea 0xd5f(%rip),%rdi # 2004 <_IO_stdin_used+0x4>
12a5: 48 03 74 24 28 add 0x28(%rsp),%rsi
12aa: 48 2b 74 24 18 sub 0x18(%rsp),%rsi
12af: e8 7c fd ff ff callq 1030 <printf@plt>
free ( d );
12b4: 48 89 df mov %rbx,%rdi
12b7: e8 94 fd ff ff callq 1050 <free@plt>
return 0;
}
12bc: 48 83 c4 30 add $0x30,%rsp
12c0: 31 c0 xor %eax,%eax
12c2: 5b pop %rbx
12c3: c3 retq
12c4: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
12cb: 00 00 00
12ce: 66 90 xchg %ax,%ax
My Kernel:
Code: Select all
void test_long()
{
246970: 41 54 push %r12
long * d = (long *) calloc (1024*1024*16,sizeof(long));
246972: be 08 00 00 00 mov $0x8,%esi
246977: bf 00 00 00 01 mov $0x1000000,%edi
{
24697c: 55 push %rbp
24697d: 48 83 ec 08 sub $0x8,%rsp
long * d = (long *) calloc (1024*1024*16,sizeof(long));
246981: e8 aa ae fe ff callq 231830 <calloc>
246986: 48 89 c5 mov %rax,%rbp
clock_t clock0 = clock();
246989: e8 f2 23 fe ff callq 228d80 <clock>
24698e: b9 02 00 00 00 mov $0x2,%ecx
246993: 49 89 c4 mov %rax,%r12
for ( int j = 1 ; j < 1024*16 ; j ++)
for ( int i = 0 ; i < 1024*1024*16 ; i ++)
246996: 48 8d 71 ff lea -0x1(%rcx),%rsi
clock_t clock0 = clock();
24699a: 31 d2 xor %edx,%edx
24699c: 89 cf mov %ecx,%edi
24699e: 66 90 xchg %ax,%ax
d[i] = d[i] * j + i;
2469a0: 48 8b 44 d5 00 mov 0x0(%rbp,%rdx,8),%rax
2469a5: 48 0f af c6 imul %rsi,%rax
2469a9: 48 01 d0 add %rdx,%rax
2469ac: 48 0f af c1 imul %rcx,%rax
2469b0: 48 01 d0 add %rdx,%rax
2469b3: 48 89 44 d5 00 mov %rax,0x0(%rbp,%rdx,8)
for ( int i = 0 ; i < 1024*1024*16 ; i ++)
2469b8: 48 83 c2 01 add $0x1,%rdx
2469bc: 48 81 fa 00 00 00 01 cmp $0x1000000,%rdx
2469c3: 75 db jne 2469a0 <_Z9test_longv+0x30>
for ( int j = 1 ; j < 1024*16 ; j ++)
2469c5: 48 83 c1 02 add $0x2,%rcx
2469c9: 8d 47 01 lea 0x1(%rdi),%eax
2469cc: 48 81 f9 00 40 00 00 cmp $0x4000,%rcx
2469d3: 75 c1 jne 246996 <_Z9test_longv+0x26>
2469d5: 41 ba ff 3f 00 00 mov $0x3fff,%r10d
2469db: 49 89 e8 mov %rbp,%r8
for ( int i = 0 ; i < 1024*1024*16 ; i ++)
2469de: 31 f6 xor %esi,%esi
2469e0: 4c 63 c8 movslq %eax,%r9
2469e3: 41 29 c2 sub %eax,%r10d
2469e6: 49 8d 51 01 lea 0x1(%r9),%rdx
2469ea: 49 8b 00 mov (%r8),%rax
2469ed: 4c 89 c9 mov %r9,%rcx
2469f0: 4a 8d 3c 12 lea (%rdx,%r10,1),%rdi
2469f4: eb 0e jmp 246a04 <_Z9test_longv+0x94>
2469f6: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
2469fd: 00 00 00
246a00: 48 83 c2 01 add $0x1,%rdx
d[i] = d[i] * j + i;
246a04: 48 0f af c1 imul %rcx,%rax
246a08: 48 89 d1 mov %rdx,%rcx
246a0b: 48 01 f0 add %rsi,%rax
for ( int i = 0 ; i < 1024*1024*16 ; i ++)
246a0e: 48 39 d7 cmp %rdx,%rdi
246a11: 75 ed jne 246a00 <_Z9test_longv+0x90>
246a13: 48 83 c6 01 add $0x1,%rsi
246a17: 49 89 00 mov %rax,(%r8)
246a1a: 49 83 c0 08 add $0x8,%r8
for ( int j = 1 ; j < 1024*16 ; j ++)
246a1e: 48 81 fe 00 00 00 01 cmp $0x1000000,%rsi
246a25: 75 bf jne 2469e6 <_Z9test_longv+0x76>
clock_t clock1 = clock();
246a27: e8 54 23 fe ff callq 228d80 <clock>
double t = getClockDiff(clock0,clock1);
246a2c: 4c 89 e7 mov %r12,%rdi
clock_t clock1 = clock();
246a2f: 48 89 c6 mov %rax,%rsi
double t = getClockDiff(clock0,clock1);
246a32: e8 59 23 fe ff callq 228d90 <getClockDiff>
printf ("Time taken: %f\n",t);
246a37: bf 3d 3e 25 00 mov $0x253e3d,%edi
246a3c: b8 01 00 00 00 mov $0x1,%eax
246a41: e8 ba 3f fe ff callq 22aa00 <printf>
free ( d );
}
My feeling is that VirtualBox does not execute SSE instructions quickly in my case. I then created a virtual machine on VirtualBox with Linux on it and ran the two programs above on the same bare-metal machine underneath, and I got the same results! I have used the same compilation switches for all programs on both sides.
Maybe there is something I need to set in my kernel to make it faster; that is, there might be some hardware setting for the processor that I am missing for floating point. I did detect the SSE and AVX features of my cores and set them up on all cores: I enabled the SSE and AVX bits in CR4 and in the Extended Control Register (XCR0).
Please let me know if there is anything else I should look at.
Thanks,
Karim.