memcpy & compiler optimization

rbmj
Posts: 11
Joined: Tue Jan 13, 2009 4:24 pm

memcpy & compiler optimization

Post by rbmj »

So I'm optimizing memcpy for my kernel, starting off by using the benchmark program from http://forum.osdev.org/viewtopic.php?f= ... 9&start=45. I prefer writing things in plain, portable C over assembly, so I wrote a semi-optimized family of functions:

Code: Select all

#include <stdint.h>

#define AMD64
#define RESTRICT __restrict__

typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
#ifdef AMD64
typedef uint64_t u64;
#endif
typedef unsigned long archint;
typedef unsigned long size_t;

void * memcpy_byte(void *RESTRICT vdest, const void *RESTRICT vsrc, size_t num) {
	u8 * dest = (u8*)vdest;
	const u8 * src = (const u8*) vsrc;
	for (; num > 0; --num) {
		*(dest++) = *(src++);
	}
	return vdest;
}

void * memcpy_word(void *RESTRICT vdest, const void *RESTRICT vsrc, size_t num) {
	if ((num < 32) || ((((archint)vsrc) & 1) != (((archint)vdest) & 1))) {
		return memcpy_byte(vdest, vsrc, num);
	}
	u8 * dest = (u8*)vdest;
	const u8 * src = (const u8*)vsrc;
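	/* src and dest have the same parity here (checked above), so peeling one byte aligns both to 2 */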
	if (((archint)vsrc) & 1) {
		--num;
		*(dest++) = *(src++);
	}
	u16 * sdest = (u16*)dest;
	const u16 * ssrc = (const u16*)src;
	for (size_t snum = num >> 1; snum > 0; --snum) {
		*(sdest++) = *(ssrc++);
	}
	if (num & 1) {
		dest = (u8*) sdest;
		src = (const u8*)ssrc;
		*(dest++) = *(src++);
	}
	return vdest;
}

#ifndef X86
void * memcpy_long(void *RESTRICT vdest, const void *RESTRICT vsrc, size_t num) {
#else
void * memcpy_rbmj(void *RESTRICT vdest, const void *RESTRICT vsrc, size_t num) {
#endif
	if (num < 32) {
		return memcpy_byte(vdest, vsrc, num);
	}
	if ((((archint)vsrc) & 3) != (((archint)vdest) & 3)) {
		return memcpy_word(vdest, vsrc, num);
	}
	const u8 * src = (const u8*) vsrc;
	u8 * dest = (u8*) vdest;
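	/* fall-through switch: copy the 1-3 leading bytes needed to reach a 4-byte boundary (0 if already aligned) */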
	switch (((archint)dest) & 3) {
		case 1:
			*(dest++) = *(src++);
			--num;
		case 2:
			*(dest++) = *(src++);
			--num;
		case 3:
			*(dest++) = *(src++);
			--num;
		case 0:
			break;
	}
	const u32 * lsrc = (const u32*) src;
	u32 * ldest = (u32*) dest;
	for (size_t lnum = num >> 2; lnum > 0; --lnum) {
		*(ldest++) = *(lsrc++);
	}
	src = (const u8*) lsrc;
	dest = (u8*) ldest;
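	/* fall-through switch: copy the 0-3 trailing bytes the 32-bit loop didn't cover */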
	switch (num & 3) {
		case 3:
			*(dest++) = *(src++);
		case 2:
			*(dest++) = *(src++);
		case 1:
			*(dest++) = *(src++);
		case 0:
			break;
	}
	return vdest;
}

#ifdef AMD64

void * memcpy_rbmj(void *RESTRICT vdest, const void *RESTRICT vsrc, size_t num) {
	if (num < 32) {
		return memcpy_byte(vdest, vsrc, num);
	}
	if ((((archint)vsrc) & 7) != (((archint)vdest) & 7)) {
		return memcpy_long(vdest, vsrc, num);
	}
	const u8 * src = (const u8*)vsrc;
	u8 * dest = (u8*) vdest;
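	/* same idea with 8-byte alignment: fall through to copy the 1-7 leading bytes (0 if already aligned) */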
	switch (((archint)dest) & 7) {
		case 1:
			*(dest++) = *(src++);
			--num;
		case 2:
			*(dest++) = *(src++);
			--num;
		case 3:
			*(dest++) = *(src++);
			--num;
		case 4:
			*(dest++) = *(src++);
			--num;
		case 5:
			*(dest++) = *(src++);
			--num;
		case 6:
			*(dest++) = *(src++);
			--num;
		case 7:
			*(dest++) = *(src++);
			--num;
		case 0:
			break;
	}
	const u64 * qsrc = (const u64*) src;
	u64 * qdest = (u64*) dest;
	for (size_t qnum = num >> 3; qnum > 0; --qnum) {
		*(qdest++) = *(qsrc++);
	}
	src = (const u8*) qsrc;
	dest = (u8*) qdest;
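	/* fall-through switch: copy the 0-7 trailing bytes the 64-bit loop didn't cover */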
	switch (num & 7) {
		case 7:
			*(dest++) = *(src++);
		case 6:
			*(dest++) = *(src++);
		case 5:
			*(dest++) = *(src++);
		case 4:
			*(dest++) = *(src++);
		case 3:
			*(dest++) = *(src++);
		case 2:
			*(dest++) = *(src++);
		case 1:
			*(dest++) = *(src++);
		case 0:
			break;
	}
	return vdest;
}
#endif
Yes, I prefer 160 lines of C to 80 of assembly any day :) Anyways, it is slow (as expected):
$ clang memcpy-speed.c memory.c -o memtest
The output of memtest:
addr1=0xed4040 addr2=0xed4140
memcpy 64B -- 16777216 loops
aligned blocks
libc memcpy 0.104006 s
rbmj's memcpy 0.524033 s
+0/+4 moderately unaligned blocks
libc memcpy 0.092006 s
rbmj's memcpy 0.852053 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.096006 s
rbmj's memcpy 3.052191 s

addr1=0xed4240 addr2=0xed4380
memcpy 128B -- 8388608 loops
aligned blocks
libc memcpy 0.052003 s
rbmj's memcpy 0.408025 s
+0/+4 moderately unaligned blocks
libc memcpy 0.060004 s
rbmj's memcpy 0.708044 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.060004 s
rbmj's memcpy 2.836177 s

addr1=0xed44c0 addr2=0xed4680
memcpy 256B -- 4194304 loops
aligned blocks
libc memcpy 0.052004 s
rbmj's memcpy 0.368023 s
+0/+4 moderately unaligned blocks
libc memcpy 0.056003 s
rbmj's memcpy 0.672042 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.056004 s
rbmj's memcpy 2.712169 s

addr1=0xed4840 addr2=0xed4b00
memcpy 512B -- 2097152 loops
aligned blocks
libc memcpy 0.040003 s
rbmj's memcpy 0.324020 s
+0/+4 moderately unaligned blocks
libc memcpy 0.044003 s
rbmj's memcpy 0.612038 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.044003 s
rbmj's memcpy 2.656166 s

addr1=0xed4040 addr2=0xed44c0
memcpy 1024B -- 1048576 loops
aligned blocks
libc memcpy 0.032002 s
rbmj's memcpy 0.304019 s
+0/+4 moderately unaligned blocks
libc memcpy 0.036002 s
rbmj's memcpy 0.592037 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.032002 s
rbmj's memcpy 2.644165 s

addr1=0xed4040 addr2=0xed48c0
memcpy 2048B -- 524288 loops
aligned blocks
libc memcpy 0.024002 s
rbmj's memcpy 0.312019 s
+0/+4 moderately unaligned blocks
libc memcpy 0.032002 s
rbmj's memcpy 0.616039 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.032002 s
rbmj's memcpy 2.696168 s

addr1=0xed4040 addr2=0xed50c0
memcpy 4kB -- 262144 loops
aligned blocks
libc memcpy 0.044003 s
rbmj's memcpy 0.296018 s
+0/+4 moderately unaligned blocks
libc memcpy 0.052004 s
rbmj's memcpy 0.636039 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.052004 s
rbmj's memcpy 2.700168 s

addr1=0xed4040 addr2=0xed60c0
memcpy 8kB -- 131072 loops
aligned blocks
libc memcpy 0.044003 s
rbmj's memcpy 0.288018 s
+0/+4 moderately unaligned blocks
libc memcpy 0.052003 s
rbmj's memcpy 0.648041 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.052003 s
rbmj's memcpy 2.700169 s

addr1=0xed4040 addr2=0xed80c0
memcpy 16kB -- 65536 loops
aligned blocks
libc memcpy 0.044003 s
rbmj's memcpy 0.292018 s
+0/+4 moderately unaligned blocks
libc memcpy 0.052003 s
rbmj's memcpy 0.600038 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.048003 s
rbmj's memcpy 2.676167 s

addr1=0xed4040 addr2=0xedc0c0
memcpy 32kB -- 32768 loops
aligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.296019 s
+0/+4 moderately unaligned blocks
libc memcpy 0.072004 s
rbmj's memcpy 0.580036 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.068005 s
rbmj's memcpy 2.704169 s

addr1=0xed4040 addr2=0xee40c0
memcpy 64kB -- 16384 loops
aligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.300019 s
+0/+4 moderately unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.580036 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 2.700169 s

addr1=0xed4040 addr2=0x7f93eb260040
memcpy 128kB -- 8192 loops
aligned blocks
libc memcpy 0.060004 s
rbmj's memcpy 0.312019 s
+0/+4 moderately unaligned blocks
libc memcpy 0.056004 s
rbmj's memcpy 0.584036 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.060004 s
rbmj's memcpy 2.276142 s

addr1=0x7f93eb240040 addr2=0x7f93eb1ff040
memcpy 256kB -- 4096 loops
aligned blocks
libc memcpy 0.068005 s
rbmj's memcpy 0.324020 s
+0/+4 moderately unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.588037 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 2.284143 s

addr1=0x7f93eb200040 addr2=0x7f93eb17f040
memcpy 512kB -- 2048 loops
aligned blocks
libc memcpy 0.080005 s
rbmj's memcpy 0.320020 s
+0/+4 moderately unaligned blocks
libc memcpy 0.080005 s
rbmj's memcpy 0.588037 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.084005 s
rbmj's memcpy 2.280142 s

addr1=0x7f93eb180040 addr2=0x7f93eabf2040
memcpy 1024kB -- 1024 loops
aligned blocks
libc memcpy 0.152009 s
rbmj's memcpy 0.324020 s
+0/+4 moderately unaligned blocks
libc memcpy 0.176011 s
rbmj's memcpy 0.588037 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.180011 s
rbmj's memcpy 2.300144 s
Again, pretty darn slow. However, just by adding one compiler flag...
$ clang -Os memcpy-speed.c memory.c -o memtest
addr1=0x20e0040 addr2=0x20e0140
memcpy 64B -- 16777216 loops
aligned blocks
libc memcpy 0.088005 s
rbmj's memcpy 0.168011 s
+0/+4 moderately unaligned blocks
libc memcpy 0.076004 s
rbmj's memcpy 0.244016 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.076004 s
rbmj's memcpy 0.256016 s

addr1=0x20e0240 addr2=0x20e0380
memcpy 128B -- 8388608 loops
aligned blocks
libc memcpy 0.040003 s
rbmj's memcpy 0.096006 s
+0/+4 moderately unaligned blocks
libc memcpy 0.040002 s
rbmj's memcpy 0.136009 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.040002 s
rbmj's memcpy 0.136009 s

addr1=0x20e04c0 addr2=0x20e0680
memcpy 256B -- 4194304 loops
aligned blocks
libc memcpy 0.036002 s
rbmj's memcpy 0.064004 s
+0/+4 moderately unaligned blocks
libc memcpy 0.056004 s
rbmj's memcpy 0.100006 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.044003 s
rbmj's memcpy 0.092005 s

addr1=0x20e0840 addr2=0x20e0b00
memcpy 512B -- 2097152 loops
aligned blocks
libc memcpy 0.032002 s
rbmj's memcpy 0.044003 s
+0/+4 moderately unaligned blocks
libc memcpy 0.040003 s
rbmj's memcpy 0.064004 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.036002 s
rbmj's memcpy 0.060004 s

addr1=0x20e0040 addr2=0x20e04c0
memcpy 1024B -- 1048576 loops
aligned blocks
libc memcpy 0.028001 s
rbmj's memcpy 0.032002 s
+0/+4 moderately unaligned blocks
libc memcpy 0.036003 s
rbmj's memcpy 0.048003 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.032002 s
rbmj's memcpy 0.044002 s

addr1=0x20e0040 addr2=0x20e08c0
memcpy 2048B -- 524288 loops
aligned blocks
libc memcpy 0.028002 s
rbmj's memcpy 0.024002 s
+0/+4 moderately unaligned blocks
libc memcpy 0.036002 s
rbmj's memcpy 0.036002 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.032002 s
rbmj's memcpy 0.036002 s

addr1=0x20e0040 addr2=0x20e10c0
memcpy 4kB -- 262144 loops
aligned blocks
libc memcpy 0.040003 s
rbmj's memcpy 0.044003 s
+0/+4 moderately unaligned blocks
libc memcpy 0.048003 s
rbmj's memcpy 0.052003 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.052003 s
rbmj's memcpy 0.052003 s

addr1=0x20e0040 addr2=0x20e20c0
memcpy 8kB -- 131072 loops
aligned blocks
libc memcpy 0.040003 s
rbmj's memcpy 0.040002 s
+0/+4 moderately unaligned blocks
libc memcpy 0.052004 s
rbmj's memcpy 0.048003 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.048003 s
rbmj's memcpy 0.048003 s

addr1=0x20e0040 addr2=0x20e40c0
memcpy 16kB -- 65536 loops
aligned blocks
libc memcpy 0.044002 s
rbmj's memcpy 0.044003 s
+0/+4 moderately unaligned blocks
libc memcpy 0.052003 s
rbmj's memcpy 0.048003 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.052004 s
rbmj's memcpy 0.052003 s

addr1=0x20e0040 addr2=0x20e80c0
memcpy 32kB -- 32768 loops
aligned blocks
libc memcpy 0.064004 s
rbmj's memcpy 0.068004 s
+0/+4 moderately unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.072005 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.068004 s

addr1=0x20e0040 addr2=0x20f00c0
memcpy 64kB -- 16384 loops
aligned blocks
libc memcpy 0.064004 s
rbmj's memcpy 0.068005 s
+0/+4 moderately unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.072004 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.068005 s
rbmj's memcpy 0.072004 s

addr1=0x20e0040 addr2=0x7f7c9408e040
memcpy 128kB -- 8192 loops
aligned blocks
libc memcpy 0.060004 s
rbmj's memcpy 0.060004 s
+0/+4 moderately unaligned blocks
libc memcpy 0.064004 s
rbmj's memcpy 0.060003 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.064004 s
rbmj's memcpy 0.060004 s

addr1=0x7f7c9406e040 addr2=0x7f7c9402d040
memcpy 256kB -- 4096 loops
aligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.072005 s
+0/+4 moderately unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.072005 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.068004 s

addr1=0x7f7c9402e040 addr2=0x7f7c93fad040
memcpy 512kB -- 2048 loops
aligned blocks
libc memcpy 0.084005 s
rbmj's memcpy 0.080005 s
+0/+4 moderately unaligned blocks
libc memcpy 0.080005 s
rbmj's memcpy 0.076005 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.084005 s
rbmj's memcpy 0.076005 s

addr1=0x7f7c93fae040 addr2=0x7f7c93a20040
memcpy 1024kB -- 1024 loops
aligned blocks
libc memcpy 0.152010 s
rbmj's memcpy 0.152009 s
+0/+4 moderately unaligned blocks
libc memcpy 0.184012 s
rbmj's memcpy 0.184011 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.184012 s
rbmj's memcpy 0.184011 s
The runtime speed approaches, and in some cases undercuts, that of libc memcpy! How is a performance improvement of over an order of magnitude possible? The numbers are close enough that it almost seems like clang is replacing the calls to memcpy_rbmj with calls to memcpy... but it can't be, or the other benchmarks (omitted) wouldn't work. So my questions:

1. The numbers are eerily close to those of libc. I don't think that I just happened to come up with nearly exactly the same implementation glibc uses for memcpy... What sort of black magic is clang doing with my code?
2. Will said black magic work with kernels? Again, the numbers are so close it looks like it's just aliasing libc's memcpy. But it shouldn't be... All optimizations should be independent of the environment, methinks, but you never know with such black magic.

Maybe I'm just an unbeliever. I don't know.

As an aside, I've read opinions on -O3 vs -Os. I'm not exactly sure which one is better, and I've seen both sides argued. Any other opinions, perhaps specific to osdev?
JamesM
Member
Posts: 2935
Joined: Tue Jul 10, 2007 5:27 am
Location: York, United Kingdom

Re: memcpy & compiler optimization

Post by JamesM »

rbmj wrote:The runtime speed approaches, and in some cases undercuts, that of libc memcpy! How is a performance improvement of over an order of magnitude possible? The numbers are close enough that it almost seems like clang is replacing the calls to memcpy_rbmj with calls to memcpy... but it can't be, or the other benchmarks (omitted) wouldn't work. So my questions:
So interestingly your first command line had no optimisation parameter in it. I'm going to assume -O3 because you mentioned it later, and I doubt you'd be silly enough to compare -O0 against -Os...

Check the generated assembly. Clang, along with most compilers, ships with its own version of memcpy which it will silently slip in place of any call to memcpy. If this is happening, your speed will look identical to glibc's because they're both using Clang's own!
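
A related thing to check: at -O1 and above both clang and gcc can recognise a plain byte-copy loop as the memcpy idiom and rewrite the loop itself into a call to (or an inline expansion of) the builtin memcpy, unless you build with -fno-builtin or -ffreestanding. A sketch of the kind of loop that gets matched (illustrative only, not your exact code):

Code: Select all

#include <stddef.h>

/* Illustrative only: a byte-copy loop like this is the pattern the
 * optimiser's loop-idiom recognition looks for; at -O1 and above it may be
 * rewritten into a call to the builtin memcpy unless you compile with
 * -fno-builtin or -ffreestanding (which is what you'd do for a kernel). */
void *copy_bytes(void *__restrict__ dest, const void *__restrict__ src, size_t n) {
	unsigned char *d = dest;
	const unsigned char *s = src;
	while (n--)
		*d++ = *s++;
	return dest;
}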

Secondly, -Os vs -O3 -- generally -O3 is more performant on x86. The x86 has a large icache, and this is what -O3 is optimised for (not caring about codesize). If you're on ARM, for example, -Os is going to be *much* better because of the icache reduction (cache misses dominate most of what a compiler can cleverly do).

Generally though, I'll give you some advice: memcpy is one of the very few functions that can be hand-tuned better than a compiler can possibly do. It is simple in nature, its inputs and outputs can easily be verified by a human, and because it requires no calculation it is trivial to vectorise.

Cheers,
James
rbmj
Posts: 11
Joined: Tue Jan 13, 2009 4:24 pm

Re: memcpy & compiler optimization

Post by rbmj »

Interestingly enough, this particular segment of code runs slower with -O3. And I'm no expert on assembly, but it does not optimize out the function pointers, and all the pointers are in the right place. It appears to be using hard-coded offsets, but that might just be inlining.

It runs fast, but even optimizing for size the resulting function is 814 bytes long (this includes the 8 bit, 16 bit, 32 bit, AND 64 bit versions, so really the collection of 4 functions).

It's kind of nice to read the generated assembly and realize that it's not just blind hope, the compiler actually DOES optimize the tail calls and other such things that you normally just naively hope it does :D.
JamesM
Member
Posts: 2935
Joined: Tue Jul 10, 2007 5:27 am
Location: York, United Kingdom

Re: memcpy & compiler optimization

Post by JamesM »

rbmj wrote:Interestingly enough, this particular segment of code runs slower with -O3. And I'm no expert on assembly, but it does not optimize out the function pointers, and all the pointers are in the right place. It appears to be using hard-coded offsets, but that might just be inlining.

It runs fast, but even optimizing for size the resulting function is 814 bytes long (this includes the 8 bit, 16 bit, 32 bit, AND 64 bit versions, so really the collection of 4 functions).

It's kind of nice to read the generated assembly and realize that it's not just blind hope, the compiler actually DOES optimize the tail calls and other such things that you normally just naively hope it does :D.
Sorry, I can't see any function pointers in your code, can you be more specific?

Also, try GCC. LLVM (and especially clang) isn't as highly optimised as GCC yet. You mention hard-coded offsets - what offsets are these? Anything it can promote to a constant it will do...
rbmj
Posts: 11
Joined: Tue Jan 13, 2009 4:24 pm

Re: memcpy & compiler optimization

Post by rbmj »

The function pointers are in the benchmarking code - it has a table of pointers to memcpy functions. This has the nice side effect of keeping the compiler from doing stuff it wouldn't normally do.
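
Roughly like this (names made up for illustration, this isn't the actual memcpy-speed.c source):

Code: Select all

#include <string.h>

/* Hypothetical sketch of the benchmark's dispatch table, not the real
 * memcpy-speed.c code. Calling through a function pointer is what keeps
 * the compiler from substituting its builtin for the routine being timed. */
typedef void *(*memcpy_fn)(void *, const void *, size_t);

void *memcpy_rbmj(void *dest, const void *src, size_t num); /* from memory.c */

static const struct {
	const char *name;
	memcpy_fn fn;
} impls[] = {
	{ "libc memcpy",   memcpy      },
	{ "rbmj's memcpy", memcpy_rbmj },
};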

The compiler unrolls the loop that iterates through the table and uses constant offsets, but other than that it *appears* to call two separate functions. I'm not sure why the performance is so eerily similar, especially since the code generated is so different (I'm looking at http://repo.or.cz/w/glibc.git/blob/4c55 ... py-ssse3.S).

I will try GCC. I normally prefer Clang, simply because the error messages are (in my experience) an order of magnitude better and I'm a loser who cares about licensing rights they have but never use :D.

About coding memcpy in C vs ASM: I understand that it might be slightly better in assembly, but for me assembly (especially the inline variety) is horribly difficult to maintain. I've never been great with assembly; I can look at C code, think of CPU instructions, and figure out how to get the compiler to make more efficient code, but writing actual assembly has always been a pain for me. Additionally, C code is more portable, and I can let the compiler take advantage of SSE or whatever else is best instead of having to change the code as newer and better tools are released.

OK so GCC:
$ gcc -O3 memcpy-speed.c memory.c -o memtest_gcc -std=gnu99

One note:
GCC's -O3 is much faster than GCC's -Os, the opposite of Clang.

Benchmarks
addr1=0xffb040 addr2=0xffb140
memcpy 64B -- 16777216 loops
aligned blocks
libc memcpy 0.092005 s
rbmj's memcpy 0.216014 s
+0/+4 moderately unaligned blocks
libc memcpy 0.080005 s
rbmj's memcpy 0.252016 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.080005 s
rbmj's memcpy 0.440027 s

addr1=0xffb240 addr2=0xffb380
memcpy 128B -- 8388608 loops
aligned blocks
libc memcpy 0.040003 s
rbmj's memcpy 0.176011 s
+0/+4 moderately unaligned blocks
libc memcpy 0.052003 s
rbmj's memcpy 0.168010 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.052004 s
rbmj's memcpy 0.248015 s

addr1=0xffb4c0 addr2=0xffb680
memcpy 256B -- 4194304 loops
aligned blocks
libc memcpy 0.040003 s
rbmj's memcpy 0.156009 s
+0/+4 moderately unaligned blocks
libc memcpy 0.056004 s
rbmj's memcpy 0.104006 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.048003 s
rbmj's memcpy 0.160010 s

addr1=0xffb840 addr2=0xffbb00
memcpy 512B -- 2097152 loops
aligned blocks
libc memcpy 0.028002 s
rbmj's memcpy 0.152010 s
+0/+4 moderately unaligned blocks
libc memcpy 0.040002 s
rbmj's memcpy 0.092006 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.036002 s
rbmj's memcpy 0.116007 s

addr1=0xffb040 addr2=0xffb4c0
memcpy 1024B -- 1048576 loops
aligned blocks
libc memcpy 0.024002 s
rbmj's memcpy 0.152009 s
+0/+4 moderately unaligned blocks
libc memcpy 0.036003 s
rbmj's memcpy 0.080005 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.032002 s
rbmj's memcpy 0.092005 s

addr1=0xffb040 addr2=0xffb8c0
memcpy 2048B -- 524288 loops
aligned blocks
libc memcpy 0.028002 s
rbmj's memcpy 0.144009 s
+0/+4 moderately unaligned blocks
libc memcpy 0.032002 s
rbmj's memcpy 0.080005 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.032002 s
rbmj's memcpy 0.084005 s

addr1=0xffb040 addr2=0xffc0c0
memcpy 4kB -- 262144 loops
aligned blocks
libc memcpy 0.040003 s
rbmj's memcpy 0.144009 s
+0/+4 moderately unaligned blocks
libc memcpy 0.048003 s
rbmj's memcpy 0.076005 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.048003 s
rbmj's memcpy 0.076004 s

addr1=0xffb040 addr2=0xffd0c0
memcpy 8kB -- 131072 loops
aligned blocks
libc memcpy 0.044003 s
rbmj's memcpy 0.140009 s
+0/+4 moderately unaligned blocks
libc memcpy 0.052003 s
rbmj's memcpy 0.072005 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.048003 s
rbmj's memcpy 0.076004 s

addr1=0xffb040 addr2=0xfff0c0
memcpy 16kB -- 65536 loops
aligned blocks
libc memcpy 0.036003 s
rbmj's memcpy 0.144009 s
+0/+4 moderately unaligned blocks
libc memcpy 0.048003 s
rbmj's memcpy 0.072004 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.056004 s
rbmj's memcpy 0.068004 s

addr1=0xffb040 addr2=0x10030c0
memcpy 32kB -- 32768 loops
aligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.140009 s
+0/+4 moderately unaligned blocks
libc memcpy 0.072004 s
rbmj's memcpy 0.072005 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.072005 s

addr1=0xffb040 addr2=0x100b0c0
memcpy 64kB -- 16384 loops
aligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.140009 s
+0/+4 moderately unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.072004 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.072005 s
rbmj's memcpy 0.068004 s

addr1=0xffb040 addr2=0x7f0d8a13f040
memcpy 128kB -- 8192 loops
aligned blocks
libc memcpy 0.060004 s
rbmj's memcpy 0.144009 s
+0/+4 moderately unaligned blocks
libc memcpy 0.060004 s
rbmj's memcpy 0.076004 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.064004 s
rbmj's memcpy 0.072005 s

addr1=0x7f0d8a11f040 addr2=0x7f0d8a0de040
memcpy 256kB -- 4096 loops
aligned blocks
libc memcpy 0.072004 s
rbmj's memcpy 0.140009 s
+0/+4 moderately unaligned blocks
libc memcpy 0.068004 s
rbmj's memcpy 0.108007 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.072005 s
rbmj's memcpy 0.100006 s

addr1=0x7f0d8a0df040 addr2=0x7f0d8a05e040
memcpy 512kB -- 2048 loops
aligned blocks
libc memcpy 0.084005 s
rbmj's memcpy 0.144009 s
+0/+4 moderately unaligned blocks
libc memcpy 0.080005 s
rbmj's memcpy 0.108007 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.084005 s
rbmj's memcpy 0.104007 s

addr1=0x7f0d8a05f040 addr2=0x7f0d89ad1040
memcpy 1024kB -- 1024 loops
aligned blocks
libc memcpy 0.152009 s
rbmj's memcpy 0.188012 s
+0/+4 moderately unaligned blocks
libc memcpy 0.196012 s
rbmj's memcpy 0.160010 s
+10/+13 cruelly unaligned blocks
libc memcpy 0.196012 s
rbmj's memcpy 0.160010 s
This shows significantly different results. Clang seems to give better results for libc. In addition, my code actually runs faster on the unaligned blocks for GCC, whereas the more intuitive opposite is true with Clang.
Brendan
Member
Posts: 8561
Joined: Sat Jan 15, 2005 12:00 am
Location: At his keyboard!

Re: memcpy & compiler optimization

Post by Brendan »

Hi,
rbmj wrote:I'm not sure why the performance is so eerily similar, especially since the code generated is so different (I'm looking at http://repo.or.cz/w/glibc.git/blob/4c55 ... py-ssse3.S).
The bottleneck in both cases is likely to be the computer's RAM bandwidth.
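Back-of-the-envelope, and depending entirely on the machine the benchmark ran on: the 1024 kB case copies roughly 1 GiB per run (1 MiB per iteration, 1024 iterations), which is about 2 GiB of memory traffic once you count both the reads and the writes; doing that in ~0.15 s works out to something like 13 GiB/s, which is the kind of number you'd expect from main memory rather than from any cleverness in the copy loop.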

In general; if you're copying a small amount of data you want to avoid setup overhead (branches, instruction fetch, etc); and something extremely simple (like a "rep movsb") with very little setup overhead is likely to be faster. For copying large amounts of data you should redesign the calling code so that it isn't poo (e.g. put the data where it should've been to begin with, copy a pointer to the data instead of copying the data itself, make sure it's aligned on a page boundary and move page table entries, etc); because it's the only way to avoid the RAM bandwidth limit.

With this in mind, the best possible "memcpy()" would be something like:

Code: Select all

memcpy(dest, src, size) {
    if(size > 12345) {
        printf("ERROR: The calling code is crap!\n");
        exit(1);
    }
    __asm__( " rep movsb");
}
:D
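
More seriously, if you do want the dumb-but-tiny version, the operands have to be wired up through the asm constraints. A sketch using GCC/Clang extended asm on x86 - illustrative, not a drop-in libc replacement:

Code: Select all

#include <stddef.h>

/* Sketch only: "rep movsb" with the destination, source and count bound to
 * rdi/rsi/rcx (edi/esi/ecx on 32-bit) via extended-asm constraints. Tiny
 * setup cost, correct for non-overlapping buffers. */
static void *memcpy_rep_movsb(void *dest, const void *src, size_t n) {
	void *d = dest;
	__asm__ volatile ("rep movsb"
	                  : "+D" (d), "+S" (src), "+c" (n)
	                  :
	                  : "memory");
	return dest;
}
Whether that actually beats a vectorised copy depends heavily on the CPU, which is rather the point of benchmarking it.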


Cheers,

Brendan
For all things; perfection is, and will always remain, impossible to achieve in practice. However; by striving for perfection we create things that are as perfect as practically possible. Let the pursuit of perfection be our guide.
JamesM
Member
Posts: 2935
Joined: Tue Jul 10, 2007 5:27 am
Location: York, United Kingdom

Re: memcpy & compiler optimization

Post by JamesM »

Brendan wrote:For copying large amounts of data you should redesign the calling code so that it isn't poo (e.g. put the data where it should've been to begin with, copy a pointer to the data instead of copying the data itself, make sure it's aligned on a page boundary and move page table entries, etc); because it's the only way to avoid the RAM bandwidth limit.
No, clever design of algorithms will make prefetching work well, and will reduce the number of cache misses. You can't always rely on the calling code being able to align its memcpys - especially implicit structure copies, for which the compiler inserts a call out to memcpy.
Brendan
Member
Posts: 8561
Joined: Sat Jan 15, 2005 12:00 am
Location: At his keyboard!

Re: memcpy & compiler optimization

Post by Brendan »

Hi,
JamesM wrote:
For copying large amounts of data you should redesign the calling code so that it isn't poo (e.g. put the data where it should've been to begin with, copy a pointer to the data instead of copying the data itself, make sure it's aligned on a page boundary and move page table entries, etc); because it's the only way to avoid the RAM bandwidth limit.
No, clever design of algorithms will make prefetching work well, and will reduce the number of cache misses.
Clever design of algorithms (so that they avoid copying large amounts of data), or the "I'm a miserable failure and therefore couldn't avoid copying large amounts of data, and want to use "prefetch, move, flush" to minimise the pain" type of "clever"?
JamesM wrote:You can't always rely on the calling code being able to align its memcpys - especially implicit structure copying which has a call out to memcpy inserted by the compiler.
Under which conditions are large (e.g. 512 KiB or more) structures implicitly copied?

I can't remember ever needing to (implicitly or explicitly) copy anything larger than 10 KiB.


Cheers,

Brendan
For all things; perfection is, and will always remain, impossible to achieve in practice. However; by striving for perfection we create things that are as perfect as practically possible. Let the pursuit of perfection be our guide.
JamesM
Member
Posts: 2935
Joined: Tue Jul 10, 2007 5:27 am
Location: York, United Kingdom

Re: memcpy & compiler optimization

Post by JamesM »

Brendan wrote:I can't remember ever needing to (implicitly or explicitly) copy anything larger than 10 KiB.
10K is still large enough that an optimised memcpy provides serious benefits.
rbmj
Posts: 11
Joined: Tue Jan 13, 2009 4:24 pm

Re: memcpy & compiler optimization

Post by rbmj »

Does anyone have any idea why the unaligned copy is faster? This seems very strange... disregarding overhead, memory blocks relatively aligned at 8 bytes would be most efficient, those relatively aligned at 4 half as fast, and those relatively aligned at 2 a quarter as fast. However, the speeds don't follow any such pattern in the benchmarks, and are sometimes even faster at less-aligned addresses.

Speeds (fastest to slowest):
1. clang -Os
2. gcc -O3
3. clang -O3
4. gcc -Os