memcpy & compiler optimization
Posted: Fri Jun 03, 2011 3:15 pm
So I'm optimizing memcpy for my kernel, starting off in using the benchmark program from http://forum.osdev.org/viewtopic.php?f= ... 9&start=45. I prefer writing things in plain, portable C over assembly, so I write a semi-optimized family of functions:
Yes, I prefer 160 lines of C to 80 of assembly any day. Anyway, it is slow (as expected):
$ clang memcpy-speed.c memory.c -o memtest
The output of memtest:
$ clang -Os memcpy-speed.c memory.c -o memtest
1. The numbers are eerily close to those of libc. I don't think that I just happened to come up with nearly exactly the same implementation glibc uses for memcpy... What sort of black magic is clang doing with my code?
2. Will said black magic work with kernels? Again, the numbers are so close it looks like it's just aliasing libc's memcpy. But it shouldn't be... All optimizations should be independent of the environment, methinks, but you never know with such black magic.
Maybe I'm just an unbeliever. I don't know.
As an aside, I've read opinions on -O3 vs -Os. I'm not exactly sure which one is better, and I've seen both sides argued. Any other opinions, perhaps specific to osdev?
Code: Select all
#include <stdint.h>
/* Target-architecture selector: AMD64 enables the 64-bit (u64) copy path. */
#define AMD64
/* Wrapper for the GCC/Clang restrict extension. */
#define RESTRICT __restrict__
/* Short fixed-width aliases used throughout the copiers. */
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
#ifdef AMD64
typedef uint64_t u64;
#endif
/* Integer wide enough to hold a pointer, used for alignment tests.
   NOTE(review): uintptr_t is the portable choice -- confirm every target
   has sizeof(unsigned long) == sizeof(void *) (not true on e.g. Win64). */
typedef unsigned long archint;
/* Freestanding kernel environment, so size_t is declared locally.
   NOTE(review): this collides with a hosted <stddef.h>; fine inside the
   kernel, but guard it if this file is ever built hosted. */
typedef unsigned long size_t;
/* Byte-at-a-time copy: the universal fallback used by the wider variants.
 * Returns vdest, like standard memcpy. Regions must not overlap (restrict). */
void * memcpy_byte(void *RESTRICT vdest, const void *RESTRICT vsrc, size_t num) {
    u8 *d = (u8 *) vdest;
    const u8 *s = (const u8 *) vsrc;
    while (num--) {
        *d++ = *s++;
    }
    return vdest;
}
/* Copy via 16-bit words when both pointers can reach a common 2-byte
 * alignment. Falls back to memcpy_byte for short buffers or when the
 * low address bits differ (alignment is then unreachable).
 * Returns vdest; regions must not overlap (restrict).
 *
 * BUG FIX: ssrc was initialized from itself ("(const u16*)ssrc"), i.e. a
 * read of an uninitialized pointer -- undefined behavior. It must be
 * derived from src. */
void * memcpy_word(void *RESTRICT vdest, const void *RESTRICT vsrc, size_t num) {
    /* Small copies aren't worth the alignment work; mismatched parity can
     * never be brought to common 2-byte alignment. */
    if ((num < 32) || ((((archint)vsrc) & 1) != (((archint)vdest) & 1))) {
        return memcpy_byte(vdest, vsrc, num);
    }
    u8 * dest = (u8*)vdest;
    const u8 * src = (const u8*)vsrc;
    /* Head: one leading byte makes both pointers 2-byte aligned. */
    if (((archint)vsrc) & 1) {
        --num;
        *(dest++) = *(src++);
    }
    /* Bulk: 16-bit word copies. */
    u16 * sdest = (u16*)dest;
    const u16 * ssrc = (const u16*)src;  /* was: (const u16*)ssrc -- UB */
    for (size_t snum = num >> 1; snum > 0; --snum) {
        *(sdest++) = *(ssrc++);
    }
    /* Tail: trailing odd byte, if any. */
    if (num & 1) {
        dest = (u8*) sdest;
        src = (const u8*)ssrc;
        *(dest++) = *(src++);
    }
    return vdest;
}
/* Copy via 32-bit words when both pointers can reach a common 4-byte
 * alignment; otherwise defer to the narrower copiers. On x86 this is the
 * widest path and takes the memcpy_rbmj entry-point name directly.
 * Returns vdest; regions must not overlap (restrict). */
#ifndef X86
void * memcpy_long(void *RESTRICT vdest, const void *RESTRICT vsrc, size_t num) {
#else
void * memcpy_rbmj(void *RESTRICT vdest, const void *RESTRICT vsrc, size_t num) {
#endif
    /* Tiny copies go straight to the byte copier. */
    if (num < 32) {
        return memcpy_byte(vdest, vsrc, num);
    }
    /* If the low two address bits differ, 4-byte alignment is unreachable;
     * the 16-bit copier may still find a shared parity. */
    if ((((archint)vsrc) & 3) != (((archint)vdest) & 3)) {
        return memcpy_word(vdest, vsrc, num);
    }
    const u8 * src = (const u8*) vsrc;
    u8 * dest = (u8*) vdest;
    /* Head: byte-copy until dest is 4-byte aligned (src reaches alignment
     * at the same point, guaranteed by the check above; num >= 32 here,
     * so num cannot underflow). */
    while (((archint)dest) & 3) {
        *(dest++) = *(src++);
        --num;
    }
    /* Bulk: 32-bit word copies. */
    u32 * ldest = (u32*) dest;
    const u32 * lsrc = (const u32*) src;
    for (size_t words = num >> 2; words > 0; --words) {
        *(ldest++) = *(lsrc++);
    }
    /* Tail: up to three leftover bytes. */
    dest = (u8*) ldest;
    src = (const u8*) lsrc;
    for (size_t rest = num & 3; rest > 0; --rest) {
        *(dest++) = *(src++);
    }
    return vdest;
}
#ifdef AMD64
/* 64-bit top-level entry point: copy via 64-bit words when both pointers
 * can reach a common 8-byte alignment; otherwise defer to the 32-bit
 * copier. Returns vdest; regions must not overlap (restrict). */
void * memcpy_rbmj(void *RESTRICT vdest, const void *RESTRICT vsrc, size_t num) {
    /* Tiny copies go straight to the byte copier. */
    if (num < 32) {
        return memcpy_byte(vdest, vsrc, num);
    }
    /* If the low three address bits differ, 8-byte alignment is
     * unreachable; try the 32-bit path instead. */
    if ((((archint)vsrc) & 7) != (((archint)vdest) & 7)) {
        return memcpy_long(vdest, vsrc, num);
    }
    const u8 * src = (const u8*)vsrc;
    u8 * dest = (u8*) vdest;
    /* Head: byte-copy until dest is 8-byte aligned (src reaches alignment
     * at the same point; num >= 32, so no underflow is possible). */
    while (((archint)dest) & 7) {
        *(dest++) = *(src++);
        --num;
    }
    /* Bulk: 64-bit word copies. */
    const u64 * qsrc = (const u64*) src;
    u64 * qdest = (u64*) dest;
    for (size_t quads = num >> 3; quads > 0; --quads) {
        *(qdest++) = *(qsrc++);
    }
    /* Tail: up to seven leftover bytes. */
    src = (const u8*) qsrc;
    dest = (u8*) qdest;
    for (size_t rest = num & 7; rest > 0; --rest) {
        *(dest++) = *(src++);
    }
    return vdest;
}
#endif
$ clang memcpy-speed.c memory.c -o memtest
The output of memtest:
Again, pretty darn slow. However, just by adding one compiler flag...
addr1=0xed4040 addr2=0xed4140
memcpy 64B -- 16777216 loops
aligned blocks
libc memcpy 0.104006 s
rbmj's memcpy 0.524033 s
+0/+4 moderately unaligned blocks
libc memcpy 0.092006 s
rbmj's memcpy 0.852053 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.096006 s
rbmj's memcpy 3.052191 s
addr1=0xed4240 addr2=0xed4380
memcpy 128B -- 8388608 loops
aligned blocks
libc memcpy 0.052003 s
rbmj's memcpy 0.408025 s
+0/+4 moderately unaligned blocks
libc memcpy 0.060004 s
rbmj's memcpy 0.708044 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.060004 s
rbmj's memcpy 2.836177 s
addr1=0xed44c0 addr2=0xed4680
memcpy 256B -- 4194304 loops
aligned blocks
libc memcpy 0.052004 s
rbmj's memcpy 0.368023 s
+0/+4 moderately unaligned blocks
libc memcpy 0.056003 s
rbmj's memcpy 0.672042 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.056004 s
rbmj's memcpy 2.712169 s
addr1=0xed4840 addr2=0xed4b00
memcpy 512B -- 2097152 loops
aligned blocks
libc memcpy 0.040003 s
rbmj's memcpy 0.324020 s
+0/+4 moderately unaligned blocks
libc memcpy 0.044003 s
rbmj's memcpy 0.612038 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.044003 s
rbmj's memcpy 2.656166 s
addr1=0xed4040 addr2=0xed44c0
memcpy 1024B -- 1048576 loops
aligned blocks
libc memcpy 0.032002 s
rbmj's memcpy 0.304019 s
+0/+4 moderately unaligned blocks
libc memcpy 0.036002 s
rbmj's memcpy 0.592037 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.032002 s
rbmj's memcpy 2.644165 s
addr1=0xed4040 addr2=0xed48c0
memcpy 2048B -- 524288 loops
aligned blocks
libc memcpy 0.024002 s
rbmj's memcpy 0.312019 s
+0/+4 moderately unaligned blocks
libc memcpy 0.032002 s
rbmj's memcpy 0.616039 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.032002 s
rbmj's memcpy 2.696168 s
addr1=0xed4040 addr2=0xed50c0
memcpy 4kB -- 262144 loops
aligned blocks
libc memcpy 0.044003 s
rbmj's memcpy 0.296018 s
+0/+4 moderately unaligned blocks
libc memcpy 0.052004 s
rbmj's memcpy 0.636039 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.052004 s
rbmj's memcpy 2.700168 s
addr1=0xed4040 addr2=0xed60c0
memcpy 8kB -- 131072 loops
aligned blocks
libc memcpy 0.044003 s
rbmj's memcpy 0.288018 s
+0/+4 moderately unaligned blocks
libc memcpy 0.052003 s
rbmj's memcpy 0.648041 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.052003 s
rbmj's memcpy 2.700169 s
addr1=0xed4040 addr2=0xed80c0
memcpy 16kB -- 65536 loops
aligned blocks
libc memcpy 0.044003 s
rbmj's memcpy 0.292018 s
+0/+4 moderately unaligned blocks
libc memcpy 0.052003 s
rbmj's memcpy 0.600038 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.048003 s
rbmj's memcpy 2.676167 s
addr1=0xed4040 addr2=0xedc0c0
memcpy 32kB -- 32768 loops
aligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.296019 s
+0/+4 moderately unaligned blocks
libc memcpy 0.072004 s
rbmj's memcpy 0.580036 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.068005 s
rbmj's memcpy 2.704169 s
addr1=0xed4040 addr2=0xee40c0
memcpy 64kB -- 16384 loops
aligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.300019 s
+0/+4 moderately unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.580036 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 2.700169 s
addr1=0xed4040 addr2=0x7f93eb260040
memcpy 128kB -- 8192 loops
aligned blocks
libc memcpy 0.060004 s
rbmj's memcpy 0.312019 s
+0/+4 moderately unaligned blocks
libc memcpy 0.056004 s
rbmj's memcpy 0.584036 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.060004 s
rbmj's memcpy 2.276142 s
addr1=0x7f93eb240040 addr2=0x7f93eb1ff040
memcpy 256kB -- 4096 loops
aligned blocks
libc memcpy 0.068005 s
rbmj's memcpy 0.324020 s
+0/+4 moderately unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.588037 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 2.284143 s
addr1=0x7f93eb200040 addr2=0x7f93eb17f040
memcpy 512kB -- 2048 loops
aligned blocks
libc memcpy 0.080005 s
rbmj's memcpy 0.320020 s
+0/+4 moderately unaligned blocks
libc memcpy 0.080005 s
rbmj's memcpy 0.588037 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.084005 s
rbmj's memcpy 2.280142 s
addr1=0x7f93eb180040 addr2=0x7f93eabf2040
memcpy 1024kB -- 1024 loops
aligned blocks
libc memcpy 0.152009 s
rbmj's memcpy 0.324020 s
+0/+4 moderately unaligned blocks
libc memcpy 0.176011 s
rbmj's memcpy 0.588037 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.180011 s
rbmj's memcpy 2.300144 s
$ clang -Os memcpy-speed.c memory.c -o memtest
The runtime speed approaches, and in some cases undercuts, that of libc memcpy! How are such over-an-order-of-magnitude performance improvements possible? The numbers are close enough that it almost seems like clang is replacing the calls to memcpy_rbmj with calls to memcpy... but it can't be, or the other benchmarks (omitted) wouldn't work. So my questions:
addr1=0x20e0040 addr2=0x20e0140
memcpy 64B -- 16777216 loops
aligned blocks
libc memcpy 0.088005 s
rbmj's memcpy 0.168011 s
+0/+4 moderately unaligned blocks
libc memcpy 0.076004 s
rbmj's memcpy 0.244016 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.076004 s
rbmj's memcpy 0.256016 s
addr1=0x20e0240 addr2=0x20e0380
memcpy 128B -- 8388608 loops
aligned blocks
libc memcpy 0.040003 s
rbmj's memcpy 0.096006 s
+0/+4 moderately unaligned blocks
libc memcpy 0.040002 s
rbmj's memcpy 0.136009 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.040002 s
rbmj's memcpy 0.136009 s
addr1=0x20e04c0 addr2=0x20e0680
memcpy 256B -- 4194304 loops
aligned blocks
libc memcpy 0.036002 s
rbmj's memcpy 0.064004 s
+0/+4 moderately unaligned blocks
libc memcpy 0.056004 s
rbmj's memcpy 0.100006 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.044003 s
rbmj's memcpy 0.092005 s
addr1=0x20e0840 addr2=0x20e0b00
memcpy 512B -- 2097152 loops
aligned blocks
libc memcpy 0.032002 s
rbmj's memcpy 0.044003 s
+0/+4 moderately unaligned blocks
libc memcpy 0.040003 s
rbmj's memcpy 0.064004 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.036002 s
rbmj's memcpy 0.060004 s
addr1=0x20e0040 addr2=0x20e04c0
memcpy 1024B -- 1048576 loops
aligned blocks
libc memcpy 0.028001 s
rbmj's memcpy 0.032002 s
+0/+4 moderately unaligned blocks
libc memcpy 0.036003 s
rbmj's memcpy 0.048003 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.032002 s
rbmj's memcpy 0.044002 s
addr1=0x20e0040 addr2=0x20e08c0
memcpy 2048B -- 524288 loops
aligned blocks
libc memcpy 0.028002 s
rbmj's memcpy 0.024002 s
+0/+4 moderately unaligned blocks
libc memcpy 0.036002 s
rbmj's memcpy 0.036002 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.032002 s
rbmj's memcpy 0.036002 s
addr1=0x20e0040 addr2=0x20e10c0
memcpy 4kB -- 262144 loops
aligned blocks
libc memcpy 0.040003 s
rbmj's memcpy 0.044003 s
+0/+4 moderately unaligned blocks
libc memcpy 0.048003 s
rbmj's memcpy 0.052003 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.052003 s
rbmj's memcpy 0.052003 s
addr1=0x20e0040 addr2=0x20e20c0
memcpy 8kB -- 131072 loops
aligned blocks
libc memcpy 0.040003 s
rbmj's memcpy 0.040002 s
+0/+4 moderately unaligned blocks
libc memcpy 0.052004 s
rbmj's memcpy 0.048003 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.048003 s
rbmj's memcpy 0.048003 s
addr1=0x20e0040 addr2=0x20e40c0
memcpy 16kB -- 65536 loops
aligned blocks
libc memcpy 0.044002 s
rbmj's memcpy 0.044003 s
+0/+4 moderately unaligned blocks
libc memcpy 0.052003 s
rbmj's memcpy 0.048003 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.052004 s
rbmj's memcpy 0.052003 s
addr1=0x20e0040 addr2=0x20e80c0
memcpy 32kB -- 32768 loops
aligned blocks
libc memcpy 0.064004 s
rbmj's memcpy 0.068004 s
+0/+4 moderately unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.072005 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.068004 s
addr1=0x20e0040 addr2=0x20f00c0
memcpy 64kB -- 16384 loops
aligned blocks
libc memcpy 0.064004 s
rbmj's memcpy 0.068005 s
+0/+4 moderately unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.072004 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.068005 s
rbmj's memcpy 0.072004 s
addr1=0x20e0040 addr2=0x7f7c9408e040
memcpy 128kB -- 8192 loops
aligned blocks
libc memcpy 0.060004 s
rbmj's memcpy 0.060004 s
+0/+4 moderately unaligned blocks
libc memcpy 0.064004 s
rbmj's memcpy 0.060003 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.064004 s
rbmj's memcpy 0.060004 s
addr1=0x7f7c9406e040 addr2=0x7f7c9402d040
memcpy 256kB -- 4096 loops
aligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.072005 s
+0/+4 moderately unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.072005 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.068004 s
addr1=0x7f7c9402e040 addr2=0x7f7c93fad040
memcpy 512kB -- 2048 loops
aligned blocks
libc memcpy 0.084005 s
rbmj's memcpy 0.080005 s
+0/+4 moderately unaligned blocks
libc memcpy 0.080005 s
rbmj's memcpy 0.076005 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.084005 s
rbmj's memcpy 0.076005 s
addr1=0x7f7c93fae040 addr2=0x7f7c93a20040
memcpy 1024kB -- 1024 loops
aligned blocks
libc memcpy 0.152010 s
rbmj's memcpy 0.152009 s
+0/+4 moderately unaligned blocks
libc memcpy 0.184012 s
rbmj's memcpy 0.184011 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.184012 s
rbmj's memcpy 0.184011 s
1. The numbers are eerily close to those of libc. I don't think that I just happened to come up with nearly exactly the same implementation glibc uses for memcpy... What sort of black magic is clang doing with my code?
2. Will said black magic work with kernels? Again, the numbers are so close it looks like it's just aliasing libc's memcpy. But it shouldn't be... All optimizations should be independent of the environment, methinks, but you never know with such black magic.
Maybe I'm just an unbeliever. I don't know.
As an aside, I've read opinions on -O3 vs -Os. I'm not exactly sure which one is better, and I've seen both sides argued. Any other opinions, perhaps specific to osdev?