A
AndrewD
Hey C++ folks,
I created this today, just for fun.
You can make object allocation for any class around 6 times faster,
simply by doing the following.
class MyClass : public TXpQAlloc<MyClass,N>
N is a chunking factor (defaults to 10).
No other code change needed to apply this to your own classes.
It seemed pretty cool to me, so I'm posting it here.
It's written on Windows, but the same approach should work anywhere.
Hope it's useful to someone. Let me know if it is.
....AndrewD...
//
// CRTP.cpp : A test program for am efficient class scope allocator
scheme.
// making use of the Curiously Recursive Template Pattern
(CRTP)..
//
#include "stdafx.h"
#include <iostream>
#define WIN32_LEAN_AND_MEAN
#include "windows.h"
using namespace std;
//
// TXpQAlloc - An efficient Class Allocator.
//
// Problems with heap allocation/deallocation.
// 1. Processing overhead per allocation/deallocation.
// 2. Space overhead per allocation, especially for many small
allocations.
// 3. Microsoft small heap allocation strategy will not free memory
back to
// O/S when many small allocations are deallocated. Try allocating a
million
// 8 byte classes, then freeing them all. Quite apart from the
processing and
// space allocation overheads during the processing, you will now
find that
// you have several megabytes of space assigned to your process that
will not
// be returned to the O/S for use by other apps.
// If these are problems for your design, then TXpQAlloc may help.
//
// TXpQAlloc allocates heap space in chunks large enough to contain
many instances
// of your class. Individual allocations of your class will be assigned
space within
// a heap allocated chunk. When a chunk is full, a new chunk is
allocated on the heap.
//
// On delete(), the space is hooked onto a chain of free space, to be
recycled.
// On new(), the free space chain is checked first.
// Recycling occurs in order of the Most Recently Used space, to
improve utilisation
// of processor cache.
//
// Proper alignment of object instances is guaranteed.
//
// Usage: As per "Curiously Recursive Template Pattern".
//
// class MyClass : public TXpQAlloc<MyClass,N>
//
// - N is the chunking factor (defaults to 10)..
// Instances of MyClass per heap allocated chunk.
// If you know ahead of time that your program will allocate a
particular number
// of objects of your class, then use that number as the
chunking factor.
// If you just know there will be a large number of
allocations, then choose a
// chunking factor N, such that the chunk size (N *
sizeof(MyClass)) is > 1KB,
// ensuring use of MS large allocation strategy. Larger may be
marginally better,
// but increases the overhead of allocated but unused space in
the latest chunk.
//
// Then use new/delete as usual.
// When all instances of the class are delete()'d, all chunks will
be free()'d from the heap.
// Until then, all allocated chunks will remain, because TXpQAlloc
can not tell when an
// individual chunk is unused.
//
// Overheads:
//
// 32 bytes in class scope.
// 0 bytes per object instance.
// 16 bytes per chunk. For a chunk factor of 10, this equates to 1.6
bytes per object instance.
// Whatever unused space there may be in the latest chunk.
//
// Performance appears too be around 6 times better than regular
new/delete.
//
typedef unsigned char XByte;
typedef __int64 XLongLong;
#pragma warning(disable : 4355) // warning C4355: 'this' : used in
base member initializer list
#pragma warning(disable : 4291) // warning C4291: no matching operator
delete found;
class CXpQAllocChunk
{
public:
CXpQAllocChunk(CXpQAllocChunk *poPrevChunk, size_t siChunk)
: poPrevChunk_m (poPrevChunk)
, pyEndOfChunk_m ((XByte *)(this + 1) + siChunk) {}
~CXpQAllocChunk() { delete
poPrevChunk_m; }
void *operator new (size_t siThis, size_t siChunk) { return
malloc(siThis + siChunk); }
void operator delete(void *pvMem) { free(pvMem);
}
XByte *GetEndOfChunk () { return
pyEndOfChunk_m; }
private:
CXpQAllocChunk *poPrevChunk_m;
XByte *pyEndOfChunk_m;
};
template<class T, int N=10> class TXpQAlloc
{
public:
TXpQAlloc() {}
TXpQAlloc(const TXpQAlloc<T>&) {}
~TXpQAlloc() {}
void *operator new(size_t /*siThis*/)
{
void *pvAlloc = pvFreeMRU_sm;
if (pvAlloc)
pvFreeMRU_sm = *(void **)pvAlloc;
else
{
if ((pvAlloc = (void *)pyNextFreeByte_sm) == NULL)
{
poLastChunk_sm = new(N*sizeof(T))
CXpQAllocChunk(poLastChunk_sm, N*sizeof(T));
pyNextFreeByte_sm = (XByte *)(poLastChunk_sm + 1); // Space
after chunk header.
pvAlloc = (void *)pyNextFreeByte_sm;
}
pyNextFreeByte_sm += sizeof(T);
if (pyNextFreeByte_sm >= poLastChunk_sm->GetEndOfChunk()) //
Chunk is full now.
pyNextFreeByte_sm = NULL; // Don't allocate new chunk
until needed.
}
++iAllocCount_sm;
return pvAlloc;
}
void operator delete(void *pvMem)
{
if (pvMem) // delete NULL is no-op.
{
// Hook deleted object space into MRU free list.
*(void **)pvMem = pvFreeMRU_sm;
pvFreeMRU_sm = pvMem;
if (--iAllocCount_sm == 0)
{
// Cascading destruction of all chunks when zero instances
left.
delete poLastChunk_sm;
poLastChunk_sm = NULL;
pyNextFreeByte_sm = NULL;
pvFreeMRU_sm = NULL;
}
}
}
private:
// Trail of chunk allocations per class.
static CXpQAllocChunk *poLastChunk_sm; // Last chunk allocated
which has link to one before etc.
static XByte *pyNextFreeByte_sm; // Position in last chunk
for next allocation.
static int iAllocCount_sm; // Count of object
allocated.
static void *pvFreeMRU_sm; // Most recently used free
slot.
};
template<class T,int N> CXpQAllocChunk *TXpQAlloc<T>:oLastChunk_sm
= NULL;
template<class T,int N> XByte *TXpQAlloc<T>:yNextFreeByte_sm
= NULL;
template<class T,int N> int TXpQAlloc<T>::iAllocCount_sm
= 0;
template<class T,int N> void *TXpQAlloc<T>:vFreeMRU_sm
= NULL;
class CXpBlah //: public TXpQAlloc<CXpBlah, 10000>
{
public:
CXpBlah(int iBlah1, int iBlah2)
: iBlah1_m (iBlah1)
, iBlah2_m (iBlah2) {}
~CXpBlah() {}
private:
int iBlah1_m;
int iBlah2_m;
};
typedef CXpBlah *PCXpBlah;
#define NumAlloc_L 1000000
int main(int argc, char* argv[])
{
PCXpBlah *papoBlah = new PCXpBlah[NumAlloc_L];
int i;
XLongLong llStartCounter;
QueryPerformanceCounter((LARGE_INTEGER *)&llStartCounter);
// Allocate all.
for (i=0; i<NumAlloc_L; i++)
{
*(papoBlah+i) = new CXpBlah(i, i);
}
// Deallocate every second one.
for (i=0; i<NumAlloc_L; i+=2)
{
delete *(papoBlah+i);
}
// Allocate every second one.again
for (i=0; i<NumAlloc_L; i++)
{
*(papoBlah+i) = new CXpBlah(i, i);
}
// Deallocate all.
for (i=0; i<NumAlloc_L; i++)
{
delete *(papoBlah+i);
}
XLongLong llFinalCounter;
QueryPerformanceCounter((LARGE_INTEGER *)&llFinalCounter);
XLongLong llCounterFrequency;
QueryPerformanceFrequency((LARGE_INTEGER *)&llCounterFrequency);
double dMsPerCount = (double)1000000.00 /
(double)llCounterFrequency;
XLongLong llDeltaMs = (XLongLong)(dMsPerCount *
(double)(llFinalCounter - llStartCounter));
printf("DeltaTime = %I64d.\n", llDeltaMs);
return 0;
}
I created this today, just for fun.
You can make object allocation for any class around 6 times faster,
simply by doing the following.
class MyClass : public TXpQAlloc<MyClass,N>
N is a chunking factor (defaults to 10).
No other code change needed to apply this to your own classes.
It seemed pretty cool to me, so I'm posting it here.
It's written on Windows, but the same approach should work anywhere.
Hope it's useful to someone. Let me know if it is.
....AndrewD...
//
// CRTP.cpp : A test program for am efficient class scope allocator
scheme.
// making use of the Curiously Recursive Template Pattern
(CRTP)..
//
#include "stdafx.h"
#include <iostream>
#define WIN32_LEAN_AND_MEAN
#include "windows.h"
using namespace std;
//
// TXpQAlloc - An efficient Class Allocator.
//
// Problems with heap allocation/deallocation.
// 1. Processing overhead per allocation/deallocation.
// 2. Space overhead per allocation, especially for many small
allocations.
// 3. Microsoft small heap allocation strategy will not free memory
back to
// O/S when many small allocations are deallocated. Try allocating a
million
// 8 byte classes, then freeing them all. Quite apart from the
processing and
// space allocation overheads during the processing, you will now
find that
// you have several megabytes of space assigned to your process that
will not
// be returned to the O/S for use by other apps.
// If these are problems for your design, then TXpQAlloc may help.
//
// TXpQAlloc allocates heap space in chunks large enough to contain
many instances
// of your class. Individual allocations of your class will be assigned
space within
// a heap allocated chunk. When a chunk is full, a new chunk is
allocated on the heap.
//
// On delete(), the space is hooked onto a chain of free space, to be
recycled.
// On new(), the free space chain is checked first.
// Recycling occurs in order of the Most Recently Used space, to
improve utilisation
// of processor cache.
//
// Proper alignment of object instances is guaranteed.
//
// Usage: As per "Curiously Recursive Template Pattern".
//
// class MyClass : public TXpQAlloc<MyClass,N>
//
// - N is the chunking factor (defaults to 10)..
// Instances of MyClass per heap allocated chunk.
// If you know ahead of time that your program will allocate a
particular number
// of objects of your class, then use that number as the
chunking factor.
// If you just know there will be a large number of
allocations, then choose a
// chunking factor N, such that the chunk size (N *
sizeof(MyClass)) is > 1KB,
// ensuring use of MS large allocation strategy. Larger may be
marginally better,
// but increases the overhead of allocated but unused space in
the latest chunk.
//
// Then use new/delete as usual.
// When all instances of the class are delete()'d, all chunks will
be free()'d from the heap.
// Until then, all allocated chunks will remain, because TXpQAlloc
can not tell when an
// individual chunk is unused.
//
// Overheads:
//
// 32 bytes in class scope.
// 0 bytes per object instance.
// 16 bytes per chunk. For a chunk factor of 10, this equates to 1.6
bytes per object instance.
// Whatever unused space there may be in the latest chunk.
//
// Performance appears too be around 6 times better than regular
new/delete.
//
typedef unsigned char XByte;
typedef __int64 XLongLong;
#pragma warning(disable : 4355) // warning C4355: 'this' : used in
base member initializer list
#pragma warning(disable : 4291) // warning C4291: no matching operator
delete found;
class CXpQAllocChunk
{
public:
CXpQAllocChunk(CXpQAllocChunk *poPrevChunk, size_t siChunk)
: poPrevChunk_m (poPrevChunk)
, pyEndOfChunk_m ((XByte *)(this + 1) + siChunk) {}
~CXpQAllocChunk() { delete
poPrevChunk_m; }
void *operator new (size_t siThis, size_t siChunk) { return
malloc(siThis + siChunk); }
void operator delete(void *pvMem) { free(pvMem);
}
XByte *GetEndOfChunk () { return
pyEndOfChunk_m; }
private:
CXpQAllocChunk *poPrevChunk_m;
XByte *pyEndOfChunk_m;
};
template<class T, int N=10> class TXpQAlloc
{
public:
TXpQAlloc() {}
TXpQAlloc(const TXpQAlloc<T>&) {}
~TXpQAlloc() {}
void *operator new(size_t /*siThis*/)
{
void *pvAlloc = pvFreeMRU_sm;
if (pvAlloc)
pvFreeMRU_sm = *(void **)pvAlloc;
else
{
if ((pvAlloc = (void *)pyNextFreeByte_sm) == NULL)
{
poLastChunk_sm = new(N*sizeof(T))
CXpQAllocChunk(poLastChunk_sm, N*sizeof(T));
pyNextFreeByte_sm = (XByte *)(poLastChunk_sm + 1); // Space
after chunk header.
pvAlloc = (void *)pyNextFreeByte_sm;
}
pyNextFreeByte_sm += sizeof(T);
if (pyNextFreeByte_sm >= poLastChunk_sm->GetEndOfChunk()) //
Chunk is full now.
pyNextFreeByte_sm = NULL; // Don't allocate new chunk
until needed.
}
++iAllocCount_sm;
return pvAlloc;
}
void operator delete(void *pvMem)
{
if (pvMem) // delete NULL is no-op.
{
// Hook deleted object space into MRU free list.
*(void **)pvMem = pvFreeMRU_sm;
pvFreeMRU_sm = pvMem;
if (--iAllocCount_sm == 0)
{
// Cascading destruction of all chunks when zero instances
left.
delete poLastChunk_sm;
poLastChunk_sm = NULL;
pyNextFreeByte_sm = NULL;
pvFreeMRU_sm = NULL;
}
}
}
private:
// Trail of chunk allocations per class.
static CXpQAllocChunk *poLastChunk_sm; // Last chunk allocated
which has link to one before etc.
static XByte *pyNextFreeByte_sm; // Position in last chunk
for next allocation.
static int iAllocCount_sm; // Count of object
allocated.
static void *pvFreeMRU_sm; // Most recently used free
slot.
};
template<class T,int N> CXpQAllocChunk *TXpQAlloc<T>:oLastChunk_sm
= NULL;
template<class T,int N> XByte *TXpQAlloc<T>:yNextFreeByte_sm
= NULL;
template<class T,int N> int TXpQAlloc<T>::iAllocCount_sm
= 0;
template<class T,int N> void *TXpQAlloc<T>:vFreeMRU_sm
= NULL;
class CXpBlah //: public TXpQAlloc<CXpBlah, 10000>
{
public:
CXpBlah(int iBlah1, int iBlah2)
: iBlah1_m (iBlah1)
, iBlah2_m (iBlah2) {}
~CXpBlah() {}
private:
int iBlah1_m;
int iBlah2_m;
};
typedef CXpBlah *PCXpBlah;
#define NumAlloc_L 1000000
int main(int argc, char* argv[])
{
PCXpBlah *papoBlah = new PCXpBlah[NumAlloc_L];
int i;
XLongLong llStartCounter;
QueryPerformanceCounter((LARGE_INTEGER *)&llStartCounter);
// Allocate all.
for (i=0; i<NumAlloc_L; i++)
{
*(papoBlah+i) = new CXpBlah(i, i);
}
// Deallocate every second one.
for (i=0; i<NumAlloc_L; i+=2)
{
delete *(papoBlah+i);
}
// Allocate every second one.again
for (i=0; i<NumAlloc_L; i++)
{
*(papoBlah+i) = new CXpBlah(i, i);
}
// Deallocate all.
for (i=0; i<NumAlloc_L; i++)
{
delete *(papoBlah+i);
}
XLongLong llFinalCounter;
QueryPerformanceCounter((LARGE_INTEGER *)&llFinalCounter);
XLongLong llCounterFrequency;
QueryPerformanceFrequency((LARGE_INTEGER *)&llCounterFrequency);
double dMsPerCount = (double)1000000.00 /
(double)llCounterFrequency;
XLongLong llDeltaMs = (XLongLong)(dMsPerCount *
(double)(llFinalCounter - llStartCounter));
printf("DeltaTime = %I64d.\n", llDeltaMs);
return 0;
}