No, but your results also seem to indicate that the member copy is the
fastest ... maybe I should have put my question like this: what is the
fastest way to copy arrays of pod?
It depends on the compiler. Good candidates are std::copy,
std::memcpy, memberwise loop, and, often the winner for poorer
compilers, Duff's Device.
In my tests, the results depend very much on the precise optimization
settings. With loop unrolling, iCopy, mCopy and dCopy are generally
better than cCopy (memcpy) and sCopy (std::copy). Without, dCopy is
generally best, followed by the rest all about even.
Here are my results (all with max optimization), followed by the code:
GCC 2.95 (-O3 -funroll-loops):
time for iCopy : 0.546
time for mCopy : 0.625
time for sCopy : 1.11
time for cCopy : 1.093
time for dCopy : 0.61
time for iCopy : 0.531
time for mCopy : 0.703
time for sCopy : 1.094
time for cCopy : 1.094
time for dCopy : 0.593
GCC 3.2 (-O3 -funroll-loops -mcpu=athlon):
time for iCopy : 0.516
time for mCopy : 0.609
time for sCopy : 1.156
time for cCopy : 1.032
time for dCopy : 0.64
time for iCopy : 0.531
time for mCopy : 0.625
time for sCopy : 1.204
time for cCopy : 1.046
time for dCopy : 0.641
VC7.1 (/Ox /Og /Oi /Ot /Oy /G7 /GA):
time for iCopy : 0.813
time for mCopy : 0.625
time for sCopy : 1.093
time for cCopy : 1.032
time for dCopy : 0.656
time for iCopy : 0.812
time for mCopy : 0.672
time for sCopy : 1.11
time for cCopy : 1.031
time for dCopy : 0.656
#include <time.h>
#include <algorithm>
#include <iostream>
#include <string.h>
using namespace std;
// Element type moved by every copy routine below and stored by Vector.
typedef double Element;
// Pointer-increment copy: assigns n elements from src to dst front to back,
// advancing both pointers after each assignment.
// Does nothing when n <= 0.
void iCopy( Element *dst, const Element *src, int n )
{
    for ( int remaining = n; remaining > 0; --remaining )
    {
        *dst = *src;
        ++dst;
        ++src;
    }
}
// Indexed copy: assigns dst[i] = src[i] for i = n-1 down to 0, i.e. the
// arrays are walked from the last element to the first.
// Does nothing when n <= 0.
void mCopy( Element *dst, const Element *src, int n )
{
    for ( int idx = n - 1; idx >= 0; --idx )
        dst[ idx ] = src[ idx ];
}
// Copies n elements from src to dst with std::copy.
// Does nothing when n <= 0, matching iCopy, mCopy and dCopy; without the
// guard a negative n would hand std::copy an invalid range [src, src+n).
void sCopy( Element *dst, const Element *src, int n )
{
    if ( n <= 0 )
        return;
    std::copy( src, src + n, dst );
}
// Copies n elements from src to dst with a single memcpy call.
// Does nothing when n <= 0, matching iCopy, mCopy and dCopy; without the
// guard a negative n would be converted to a huge unsigned byte count.
// memcpy requires that the source and destination ranges do not overlap.
void cCopy( Element *dst, const Element *src, int n )
{
    if ( n <= 0 )
        return;
    memcpy( dst, src, static_cast<size_t>( n ) * sizeof *dst );
}
// Copies n elements from src to dst using a Duff's-Device style loop that is
// unrolled eight times.
// Does nothing when n <= 0.
void dCopy( Element *dst, const Element *src, int n )
{
    if ( n <= 0 )
        return;

    // Computed only after the guard so that a negative n never forms a
    // pointer in front of src.
    const Element* const end = src + n;

    // The switch jumps into the middle of the loop body, so the first pass
    // copies n % 8 elements (or a full 8 when the remainder is 0). Every
    // later pass copies exactly 8, which means src lands exactly on end.
    switch ( n % 8 )
    {
        while ( src != end )
        {
        case 0: *dst++ = *src++; // fall through
        case 7: *dst++ = *src++; // fall through
        case 6: *dst++ = *src++; // fall through
        case 5: *dst++ = *src++; // fall through
        case 4: *dst++ = *src++; // fall through
        case 3: *dst++ = *src++; // fall through
        case 2: *dst++ = *src++; // fall through
        case 1: *dst++ = *src++;
        }
    }
}
class Vector
{
public:
Vector( unsigned int newsize ) { array = new Element[ newsize ];
size = newsize; }
~Vector() { delete[] array; }
void init() { for ( unsigned int i = 0; i<size; i++ ) array[ i ] =
i; }
void print() { for ( unsigned int i = 0; i<size; i++ ) cout <<
array[ i ] << " "; cout << endl; }
unsigned int size;
Element *array;
};
int main()
{
Vector a( 10 ), b( 10 ), c( 10 ); a.init();
cout << "a : "; a.print();
cout << "b : "; b.print();
cout << "c : "; c.print();
iCopy( b.array, a.array, 7 );
cout << "b after iCopy : "; b.print();
sCopy( c.array, a.array, 7 );
cout << "c after sCopy : "; c.print();
clock_t start, stop;
double taken;
int VECTOR_SIZE = 100;
int LOOP_COUNT = 5000000;
Vector d( VECTOR_SIZE ), e( VECTOR_SIZE );
d.init();
start = clock();
for ( int i = 0 ; i < LOOP_COUNT; i++ ) iCopy( e.array, d.array,
VECTOR_SIZE-10 );
stop = clock();
taken = ( stop - start ) / (double)CLOCKS_PER_SEC;
cerr << "time for iCopy : " << taken << endl;
start = clock();
for ( int i = 0 ; i < LOOP_COUNT; i++ ) mCopy( e.array, d.array,
VECTOR_SIZE-10 );
stop = clock();
taken = ( stop - start ) / (double)CLOCKS_PER_SEC;
cerr << "time for mCopy : " << taken << endl;
start = clock();
for ( int i = 0; i < LOOP_COUNT; i++ ) sCopy( e.array, d.array,
VECTOR_SIZE-10 );
stop = clock();
taken = ( stop - start ) / (double)CLOCKS_PER_SEC;
cerr << "time for sCopy : " << taken << endl;
start = clock();
for ( int i = 0; i < LOOP_COUNT; i++ ) cCopy( e.array, d.array,
VECTOR_SIZE-10 );
stop = clock();
taken = ( stop - start ) / (double)CLOCKS_PER_SEC;
cerr << "time for cCopy : " << taken << endl;
start = clock();
for ( int i = 0; i < LOOP_COUNT; i++ ) dCopy( e.array, d.array,
VECTOR_SIZE-10 );
stop = clock();
taken = ( stop - start ) / (double)CLOCKS_PER_SEC;
cerr << "time for dCopy : " << taken << endl;
start = clock();
for ( int i = 0 ; i < LOOP_COUNT; i++ ) iCopy( e.array, d.array,
VECTOR_SIZE-10 );
stop = clock();
taken = ( stop - start ) / (double)CLOCKS_PER_SEC;
cerr << "time for iCopy : " << taken << endl;
start = clock();
for ( int i = 0 ; i < LOOP_COUNT; i++ ) mCopy( e.array, d.array,
VECTOR_SIZE-10 );
stop = clock();
taken = ( stop - start ) / (double)CLOCKS_PER_SEC;
cerr << "time for mCopy : " << taken << endl;
start = clock();
for ( int i = 0; i < LOOP_COUNT; i++ ) sCopy( e.array, d.array,
VECTOR_SIZE-10 );
stop = clock();
taken = ( stop - start ) / (double)CLOCKS_PER_SEC;
cerr << "time for sCopy : " << taken << endl;
start = clock();
for ( int i = 0; i < LOOP_COUNT; i++ ) cCopy( e.array, d.array,
VECTOR_SIZE-10 );
stop = clock();
taken = ( stop - start ) / (double)CLOCKS_PER_SEC;
cerr << "time for cCopy : " << taken << endl;
start = clock();
for ( int i = 0; i < LOOP_COUNT; i++ ) dCopy( e.array, d.array,
VECTOR_SIZE-10 );
stop = clock();
taken = ( stop - start ) / (double)CLOCKS_PER_SEC;
cerr << "time for dCopy : " << taken << endl;
return 0;
}