Hey, I decided to attempt to make an SSE-optimized (and maybe MMX later) library of core functions such as memcpy, memcmp, strlen, strcmp, memmove, etc...
I'm trying to weigh non-temporal moves, which need serializing fences to keep their ordering sane, against regular (temporal) moves that can take full advantage of out-of-order execution. I'm using PREFETCHNTA before moving large blocks and a cache-line flush (CLFLUSH) at the end of each major iteration, which I hope somewhat balances the scale.
I'd like to post a few functions here and, hopefully as a bit of a by-proxy team effort, have the code scrutinized into an efficient function.
I am aware of existing libraries out there that do similar things, but I'd like to make one especially for small OS's and optimize the code for easy porting (to different OS's, not different architectures). Well, let's get started. Here's the big hitter, the memcpy! (I'm aware it's not 100% POSIX compliant, but I never said it would be.)
This function gets called by a stub (memcpy) when the stub detects SSE 4.1 support. There are currently three memcpys this stub can call: memcpy_std (standard, non-SSE), memcpy_sse2 (used when SSE2 is available but 4.1 isn't), and memcpy_sse4_1. The SSE 4.1 (MOVNTDQA) version is the big block below; first, a rough sketch of the stub.
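The stub itself is nothing fancy, just a one-time feature check that picks an implementation. Conceptually it looks something like this (a sketch only; I'm using GCC's __builtin_cpu_init/__builtin_cpu_supports for brevity, whereas a freestanding kernel would more likely issue CPUID itself, and the exact signatures here are my own):
Code:
#include <stddef.h>

/* the three real implementations (names as described above) */
void *memcpy_std(void *dst, const void *src, size_t n);
void *memcpy_sse2(void *dst, const void *src, size_t n);
void *memcpy_sse4_1(void *dst, const void *src, size_t n);

/* chosen once, on the first call */
static void *(*memcpy_impl)(void *, const void *, size_t) = NULL;

void *memcpy(void *dst, const void *src, size_t n)
{
    if(memcpy_impl == NULL)
    {
        /* GCC/Clang builtins; a freestanding kernel would instead
         * read the SSE2/SSE4.1 feature bits from CPUID directly */
        __builtin_cpu_init();
        if(__builtin_cpu_supports("sse4.1"))
            memcpy_impl = memcpy_sse4_1;
        else if(__builtin_cpu_supports("sse2"))
            memcpy_impl = memcpy_sse2;
        else
            memcpy_impl = memcpy_std;
    }
    return memcpy_impl(dst, src, n);
}
With that out of the way, here's the SSE 4.1 (MOVNTDQA) memcpy itself.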
Code:
/* =====================================================================
 * ===== ALIGNED SSE4.1 MEMORY COPY OPERATIONS (MOVNTDQA == SSE4.1) ====
* ===================================================================== */
static const void * memcpy_sse4_1_aligned(const void * const dst, const void * const src, const size_t m_count)
{
size_t i;
i = 0;
/* is "src" aligned on a SSE_XMM_SIZE boundary */
if(!((size_t)src & (SSE_XMM_SIZE-1)))
{ }
else
{
/* lets make sure we don't copy 'too' many bytes (i < m_count) */
while((((size_t)src + i) & (SSE_XMM_SIZE-1)) && i < m_count)
{
asm("movsb;"::"S"((size_t)src + i), "D"((size_t)dst + i));
i++;
}
}
/* check to see if "dst" is aligned on a SSE_XMM_SIZE boundary */
if(!((size_t)dst & (SSE_XMM_SIZE-1)))
{
/* each iteration consumes a 128-byte chunk of memory (256 bytes on x86_64, using all 16 XMM registers) */
#ifdef __x86_64__
for(; i + 256 < m_count; i += 256)
#else
for(; i + 128 < m_count; i += 128)
#endif
{
/* fill all the XMM 128-bit SSE registers! */
asm (" mfence; "
" prefetchnta 0(%0); "
" prefetchnta 32(%0); "
" prefetchnta 64(%0); "
" prefetchnta 96(%0); "
" prefetchnta 0(%1); "
" prefetchnta 32(%1); "
" prefetchnta 64(%1); "
" prefetchnta 96(%1); "
" movntdqa 0(%0) , %%xmm0; "
" movntdqa 16(%0), %%xmm1; "
" movntdqa 32(%0), %%xmm2; "
" movntdqa 48(%0), %%xmm3; "
" movntdqa 64(%0), %%xmm4; "
" movntdqa 80(%0), %%xmm5; "
" movntdqa 96(%0), %%xmm6; "
" movntdqa 112(%0), %%xmm7; "
#ifdef __x86_64__
" movntdqa 128(%0), %%xmm8; "
" movntdqa 144(%0), %%xmm9; "
" movntdqa 160(%0), %%xmm10; "
" movntdqa 176(%0), %%xmm11; "
" movntdqa 192(%0), %%xmm12; "
" movntdqa 208(%0), %%xmm13; "
" movntdqa 224(%0), %%xmm14; "
" movntdqa 240(%0), %%xmm15; "
#endif
" movntdq %%xmm0, 0(%1); "
" movntdq %%xmm1, 16(%1); "
" movntdq %%xmm2, 32(%1); "
" movntdq %%xmm3, 48(%1); "
" movntdq %%xmm4, 64(%1); "
" movntdq %%xmm5, 80(%1); "
" movntdq %%xmm6, 96(%1); "
" movntdq %%xmm7, 112(%1); "
#ifdef __x86_64__
" movntdq %%xmm8, 128(%1); "
" movntdq %%xmm9, 144(%1); "
" movntdq %%xmm10, 160(%1); "
" movntdq %%xmm11, 176(%1); "
" movntdq %%xmm12, 192(%1); "
" movntdq %%xmm13, 208(%1); "
" movntdq %%xmm14, 224(%1); "
" movntdq %%xmm15, 240(%1); "
" clflush 0(%0); "
" clflush 32(%0); "
" clflush 64(%0); "
" clflush 96(%0); "
" clflush 0(%1); "
" clflush 32(%1); "
" clflush 64(%1); "
" clflush 96(%1); "
#endif
::"r"((size_t)src + i), "r"((size_t)dst + i));
}
}
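/* destination isn't 16-byte aligned: same non-temporal loads from the
 * source, but plain unaligned (MOVDQU) stores into the destination */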
else
{
#ifdef __x86_64__
for(; i + 256 < m_count; i += 256)
#else
for(; i + 128 < m_count; i += 128)
#endif
{
asm (" mfence; "
" prefetchnta 0(%0); "
" prefetchnta 32(%0); "
" prefetchnta 64(%0); "
" prefetchnta 96(%0); "
" prefetchnta 0(%1); "
" prefetchnta 32(%1); "
" prefetchnta 64(%1); "
" prefetchnta 96(%1); "
" movntdqa 0(%0) , %%xmm0; "
" movntdqa 16(%0), %%xmm1; "
" movntdqa 32(%0), %%xmm2; "
" movntdqa 48(%0), %%xmm3; "
" movntdqa 64(%0), %%xmm4; "
" movntdqa 80(%0), %%xmm5; "
" movntdqa 96(%0), %%xmm6; "
" movntdqa 112(%0), %%xmm7; "
#ifdef __x86_64__
" movntdqa 128(%0), %%xmm8; "
" movntdqa 144(%0), %%xmm9; "
" movntdqa 160(%0), %%xmm10; "
" movntdqa 176(%0), %%xmm11; "
" movntdqa 192(%0), %%xmm12; "
" movntdqa 208(%0), %%xmm13; "
" movntdqa 224(%0), %%xmm14; "
" movntdqa 240(%0), %%xmm15; "
#endif
" movdqu %%xmm0, 0(%1); "
" movdqu %%xmm1, 16(%1); "
" movdqu %%xmm2, 32(%1); "
" movdqu %%xmm3, 48(%1); "
" movdqu %%xmm4, 64(%1); "
" movdqu %%xmm5, 80(%1); "
" movdqu %%xmm6, 96(%1); "
" movdqu %%xmm7, 112(%1); "
#ifdef __x86_64__
" movdqu %%xmm8, 128(%1); "
" movdqu %%xmm9, 144(%1); "
" movdqu %%xmm10, 160(%1); "
" movdqu %%xmm11, 176(%1); "
" movdqu %%xmm12, 192(%1); "
" movdqu %%xmm13, 208(%1); "
" movdqu %%xmm14, 224(%1); "
" movdqu %%xmm15, 240(%1); "
" clflush 0(%0); "
" clflush 32(%0); "
" clflush 64(%0); "
" clflush 96(%0); "
" clflush 0(%1); "
" clflush 32(%1); "
" clflush 64(%1); "
" clflush 96(%1); "
#endif
::"r"((size_t)src + i), "r"((size_t)dst + i));
}
}
/* copy whatever tail remains (fewer than one full block) with REP MOVSB;
 * read-write constraints because it advances ESI/EDI and zeroes ECX */
{
size_t s = (size_t)src + i, d = (size_t)dst + i, n = m_count - i;
asm volatile(" rep movsb; " : "+S"(s), "+D"(d), "+c"(n) : : "memory");
}
i = m_count;
return (void *)(((size_t)dst) + i);
}
In the above, I assume PREFETCHNTA will load a bare minimum of 32 bytes per instruction, and that those prefetches cover the entire iteration's worth of data for both the source and destination areas.
I have a few questions myself to get started. First off, how can I tell how much PREFETCHNTA actually prefetches? Does it pull in the entire cache line that the given address falls in, or just part of it (from that address onward)? Also, is there any latency involved in PREFETCHNTA, or is it negligible? I know it's generally better to reuse the same XMM register, but isn't it more efficient here to do all the loads into 8/16 registers at once instead of load-store-load-store?
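On the first question, I suppose I could at least stop hard-coding the 32-byte assumption and ask the CPU for its line size at runtime, then derive the prefetch/CLFLUSH stride from that. Something like this sketch (it uses GCC's <cpuid.h> helper; a freestanding build might have to issue CPUID by hand, and CPUID.01H:EBX[15:8] is technically the CLFLUSH line size, which I'm assuming matches the prefetch granularity):
Code:
#include <stddef.h>
#include <cpuid.h>  /* GCC/Clang helper; a kernel might issue CPUID by hand instead */

/* CPUID leaf 1, EBX bits 15:8 = CLFLUSH line size in 8-byte units.
 * On the CPUs I care about this matches the data-cache line size,
 * which is the granularity CLFLUSH (and, I assume, PREFETCHNTA) works at. */
static size_t cache_line_size(void)
{
    unsigned int eax, ebx, ecx, edx;
    if(!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 32; /* fall back to my current hard-coded assumption */
    return (size_t)((ebx >> 8) & 0xff) * 8;
}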