?
Per Nordlöw
Hi all
I am using the boost::array template class to try to generalize my
handcrafted vector specializations for dimensions 2 (class vec2),
3 (class vec3), etc.
As performance is of the greatest importance, I have written an initial
benchmark that tests how well g++ can unroll loops whose number of
iterations can be determined at compile time or upon entry to the loop.
The gcc switch "-funroll-loops" should do just that. The test program
calculates the dot product of two four-dimensional arrays of int
10 million times and looks as follows:
#include "../array.hh"
#include "../Timer.hh"
using boost::array;
using std::cout;
using std::endl;
/// Dot product of two fixed-size arrays of the same element type and length.
///
/// @tparam T  element type; must be constructible from 0 and support
///            operator* and operator+=.
/// @tparam N  number of elements, known at compile time.
/// @param  a  first operand.
/// @param  b  second operand.
/// @return    a[0]*b[0] + a[1]*b[1] + ... + a[N-1]*b[N-1], or 0 when N is 0.
template <typename T, std::size_t N>
inline T general_dot(const array<T, N> & a, const array<T, N> & b)
{
    T c = 0;
    // The trip count N is a compile-time constant, so the compiler is free
    // to unroll this loop completely.
    for (std::size_t i = 0; i < N; ++i)
    {
        // Multiply element-wise; the arrays themselves must not be multiplied.
        c += a[i] * b[i];
    }
    return c;
}
/// Hand-unrolled dot product for exactly four elements.
///
/// The four products are summed strictly left to right.
///
/// @tparam T  element type; must support operator* and operator+.
/// @param  a  first operand.
/// @param  b  second operand.
/// @return    a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3].
template <typename T>
inline T special_dot(const array<T, 4> & a, const array<T, 4> & b)
{
    const T first_two   = a[0] * b[0] + a[1] * b[1];
    const T first_three = first_two + a[2] * b[2];
    return first_three + a[3] * b[3];
}
/// Benchmark driver.
///
/// Calls general_dot() and special_dot() ten million times each on the same
/// pair of four-element int arrays, prints the timer after each pass, and
/// finally reports whether both passes accumulated the same checksum.
/// The command-line arguments are not used.
int main(int argc, char * argv[])
{
    typedef array<int, 4> T;

    // Print the first operand before and after its elements are assigned.
    T a(3);  // NOTE(review): presumably fills every element with 3 -- depends on array.hh, confirm
    cout << "a: " << a << endl;
    a[0] = 11;
    a[1] = 13;
    a[2] = 17;
    a[3] = 19;
    cout << "a: " << a << endl;

    // The second operand is a copy of the first.
    T b = a;

    Timer stopwatch;
    const unsigned int iterations = 10000000;

    // Pass 1: loop-based dot product.
    unsigned int general_checksum = 0;
    stopwatch.reset();
    for (unsigned int pass = 0; pass != iterations; ++pass)
    {
        general_checksum += general_dot(a, b);
    }
    stopwatch.read();  // NOTE(review): presumably latches the elapsed time -- confirm in Timer.hh
    cout << "general: " << stopwatch << endl;

    // Pass 2: hand-unrolled dot product.
    unsigned int special_checksum = 0;
    stopwatch.reset();
    for (unsigned int pass = 0; pass != iterations; ++pass)
    {
        special_checksum += special_dot(a, b);
    }
    stopwatch.read();
    cout << "special: " << stopwatch << endl;

    // Both passes must have produced the same accumulated value.
    const bool checksums_match = (general_checksum == special_checksum);
    cout << (checksums_match ? "Checksums are equal. OK"
                             : "Checksums are not equal. NOT OK")
         << endl;

    return 0;
}
The calculation is performed with a general and a specialized version of
the dot product: general_dot() and special_dot() respectively.
However, the performance of general_dot() is terrible compared to
special_dot(): it is around 35 times slower when I compile with gcc-3.3.2
using the switches "-O3 -funroll-all-loops".
Is gcc really that lame or have I forgotten something?
Many thanks in advance,
Per Nordlöw
Swedish Defence Research Agency
Linköping
Sweden
I am using the boost::array template class to try to generalize my
handcrafted vector specializations for dimensions 2 (class vec2),
3 (class vec3), etc.
As performance is of the greatest importance, I have written an initial
benchmark that tests how well g++ can unroll loops whose number of
iterations can be determined at compile time or upon entry to the loop.
The gcc switch "-funroll-loops" should do just that. The test program
calculates the dot product of two four-dimensional arrays of int
10 million times and looks as follows:
#include "../array.hh"
#include "../Timer.hh"
using boost::array;
using std::cout;
using std::endl;
/// Dot product of two fixed-size arrays of the same element type and length.
///
/// @tparam T  element type; must be constructible from 0 and support
///            operator* and operator+=.
/// @tparam N  number of elements, known at compile time.
/// @param  a  first operand.
/// @param  b  second operand.
/// @return    a[0]*b[0] + a[1]*b[1] + ... + a[N-1]*b[N-1], or 0 when N is 0.
template <typename T, std::size_t N>
inline T general_dot(const array<T, N> & a, const array<T, N> & b)
{
    T c = 0;
    // The trip count N is a compile-time constant, so the compiler is free
    // to unroll this loop completely.
    for (std::size_t i = 0; i < N; ++i)
    {
        // Multiply element-wise; the arrays themselves must not be multiplied.
        c += a[i] * b[i];
    }
    return c;
}
/// Hand-unrolled dot product for exactly four elements.
///
/// The four products are summed strictly left to right.
///
/// @tparam T  element type; must support operator* and operator+.
/// @param  a  first operand.
/// @param  b  second operand.
/// @return    a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3].
template <typename T>
inline T special_dot(const array<T, 4> & a, const array<T, 4> & b)
{
    const T first_two   = a[0] * b[0] + a[1] * b[1];
    const T first_three = first_two + a[2] * b[2];
    return first_three + a[3] * b[3];
}
/// Benchmark driver.
///
/// Calls general_dot() and special_dot() ten million times each on the same
/// pair of four-element int arrays, prints the timer after each pass, and
/// finally reports whether both passes accumulated the same checksum.
/// The command-line arguments are not used.
int main(int argc, char * argv[])
{
    typedef array<int, 4> T;

    // Print the first operand before and after its elements are assigned.
    T a(3);  // NOTE(review): presumably fills every element with 3 -- depends on array.hh, confirm
    cout << "a: " << a << endl;
    a[0] = 11;
    a[1] = 13;
    a[2] = 17;
    a[3] = 19;
    cout << "a: " << a << endl;

    // The second operand is a copy of the first.
    T b = a;

    Timer stopwatch;
    const unsigned int iterations = 10000000;

    // Pass 1: loop-based dot product.
    unsigned int general_checksum = 0;
    stopwatch.reset();
    for (unsigned int pass = 0; pass != iterations; ++pass)
    {
        general_checksum += general_dot(a, b);
    }
    stopwatch.read();  // NOTE(review): presumably latches the elapsed time -- confirm in Timer.hh
    cout << "general: " << stopwatch << endl;

    // Pass 2: hand-unrolled dot product.
    unsigned int special_checksum = 0;
    stopwatch.reset();
    for (unsigned int pass = 0; pass != iterations; ++pass)
    {
        special_checksum += special_dot(a, b);
    }
    stopwatch.read();
    cout << "special: " << stopwatch << endl;

    // Both passes must have produced the same accumulated value.
    const bool checksums_match = (general_checksum == special_checksum);
    cout << (checksums_match ? "Checksums are equal. OK"
                             : "Checksums are not equal. NOT OK")
         << endl;

    return 0;
}
The calculation is performed with a general and a specialized version of
the dot product: general_dot() and special_dot() respectively.
However, the performance of general_dot() is terrible compared to
special_dot(): it is around 35 times slower when I compile with gcc-3.3.2
using the switches "-O3 -funroll-all-loops".
Is gcc really that lame or have I forgotten something?
Many thanks in advance,
Per Nordlöw
Swedish Defence Research Agency
Linköping
Sweden