1) The compiler usually provides an intrinsic implementation, which is very efficient if your project flags are set appropriately. The compiler's version is also usually inlined, which saves us from setting up a new stack frame. However, the code in this thread is almost as fast. If you need even more speed, use SSE to implement an ultra-fast 128-bit memcpy/memset.
2) You have to disable link-time code generation (set Whole Program Optimization to No) in the property page for memory.cpp, or for whichever file contains the code. All other optimizations may, and preferably should, be kept on. This avoids the pesky error that everybody complains about when trying to implement memset: an undefined external symbol _memset.
The compiler will emit the efficient rep stosb and rep stosd instructions for this code. The bitwise AND with (alignment - 1) is a cheap way to compute an address modulo the alignment.
memory.h
//========================================================================================
#pragma once
//========================================================================================
// __cdecl is a Microsoft calling-convention keyword. On compilers that do not provide it,
// define it to nothing so the prototypes below still parse.
#if !defined ( _MSC_VER ) && !defined ( __cdecl )
#define __cdecl
#endif
//========================================================================================
// Replacement C runtime memory routines.
//
// Both functions are given C linkage so that calls generated by the compiler resolve to
// these definitions. The linkage block is guarded so the header can also be included from
// a C translation unit, where extern "C" is not valid syntax.
//
// memcpy: copies _Size bytes from _Source to _Dest (regions must not overlap); returns _Dest.
// memset: stores ( unsigned char )_Val into the first _Size bytes of _Dest; returns _Dest.
//========================================================================================
#ifdef __cplusplus
extern "C" {
#endif

void* __cdecl memcpy ( void* _Dest, const void* _Source, size_t _Size );
void* __cdecl memset ( void* _Dest, int _Val, size_t _Size );

#ifdef __cplusplus
}
#endif
//========================================================================================
//========================================================================================
// Tell MSVC not to use the intrinsic forms, so that the definitions below are the ones
// that get called. Other compilers have no such pragma, so it is skipped for them.
#ifdef _MSC_VER
#pragma function ( memcpy, memset )
#endif

// __cdecl is a Microsoft keyword; define it away on compilers that do not provide it.
#if !defined ( _MSC_VER ) && !defined ( __cdecl )
#define __cdecl
#endif
//========================================================================================
// Copies _Size bytes from _Source to _Dest and returns _Dest.
// The regions must not overlap (the copy always runs front to back).
//
// Strategy: if both pointers are misaligned by the same amount, copy single bytes up to
// the next word boundary, then whole machine words, then the remaining tail bytes.
// If the two pointers can never be word-aligned at the same time, or the request is too
// short to contain a full word, everything is copied byte by byte so that no misaligned
// word access is ever performed.
//
// All arithmetic is done in size_t: it is pointer-sized on the supported targets, so
// neither the address nor the byte count is truncated on 64-bit builds.
//========================================================================================
void* __cdecl memcpy ( void* _Dest, const void* _Source, size_t _Size )
{
    const size_t         uiWordMask   = sizeof ( size_t ) - 1;
    unsigned char*       pbDest       = ( unsigned char* )_Dest;
    const unsigned char* pbSource     = ( const unsigned char* )_Source;

    // Bytes needed to reach the next word boundary; 0 when already aligned.
    size_t               uiHeadDest   = ( ( size_t )0 - ( size_t )pbDest )   & uiWordMask;
    const size_t         uiHeadSource = ( ( size_t )0 - ( size_t )pbSource ) & uiWordMask;

    if ( uiHeadDest == uiHeadSource && _Size >= uiHeadDest + sizeof ( size_t ) )
    {
        size_t*       pwDest;
        const size_t* pwSource;
        size_t        uiWords;

        // Leading bytes: get both pointers onto a word boundary.
        _Size -= uiHeadDest;
        while ( uiHeadDest-- )
        {
            *pbDest++ = *pbSource++;
        }

        // Bulk of the copy, one aligned word at a time.
        pwDest   = ( size_t* )pbDest;
        pwSource = ( const size_t* )pbSource;
        uiWords  = _Size / sizeof ( size_t );
        _Size   -= uiWords * sizeof ( size_t );
        while ( uiWords-- )
        {
            *pwDest++ = *pwSource++;
        }

        pbDest   = ( unsigned char* )pwDest;
        pbSource = ( const unsigned char* )pwSource;
    }

    // Tail bytes (or the whole request when the word path was not usable).
    while ( _Size-- )
    {
        *pbDest++ = *pbSource++;
    }

    return _Dest;
}
//========================================================================================
// Stores ( unsigned char )_Val into each of the first _Size bytes of _Dest and returns
// _Dest.
//
// Strategy: single bytes up to the next word boundary, then whole machine words holding
// the fill byte replicated into every byte lane, then the remaining tail bytes.
//========================================================================================
void* __cdecl memset ( void* _Dest, int _Val, size_t _Size )
{
    const size_t        uiWordMask = sizeof ( size_t ) - 1;
    const unsigned char bFill      = ( unsigned char )_Val;   // only the low byte is used
    unsigned char*      pbDest     = ( unsigned char* )_Dest;

    // Bytes needed to reach the next word boundary; 0 when already aligned.
    size_t              uiHead     = ( ( size_t )0 - ( size_t )pbDest ) & uiWordMask;

    if ( _Size >= uiHead + sizeof ( size_t ) )
    {
        // ~0 / 0xFF is 0x0101...01 for any word width; multiplying by the fill byte
        // places a copy of it in every byte of the word.
        const size_t wFill = ( ~( size_t )0 / 0xFF ) * bFill;
        size_t*      pwDest;
        size_t       uiWords;

        // Leading bytes: get the pointer onto a word boundary.
        _Size -= uiHead;
        while ( uiHead-- )
        {
            *pbDest++ = bFill;
        }

        // Bulk of the fill, one aligned word at a time.
        pwDest  = ( size_t* )pbDest;
        uiWords = _Size / sizeof ( size_t );
        _Size  -= uiWords * sizeof ( size_t );
        while ( uiWords-- )
        {
            *pwDest++ = wFill;
        }

        pbDest = ( unsigned char* )pwDest;
    }

    // Tail bytes (or the whole request when it is too short for a full word).
    while ( _Size-- )
    {
        *pbDest++ = bFill;
    }

    return _Dest;
}
Comment