M
Markus Dehmann
Do I have to handle hash collisions in a hash_set myself?
I did a test in which I use find() to look for objects in a hash_set.
These objects are definitely not contained, but find() sometimes finds
them anyway. See this code:
<code>
#include <iostream>
#include <stdexcept>
#include <ctime>
#include <set>
// also include header that defines gnu_namespace and includes
hash_set
class MyContainer {
std::vector<int> v;
public:
MyContainer(){}
std::size_t hashcode() const {
std::size_t hash = 0;
for(unsigned i=0; i<v.size(); ++i){ // sdbm function gives
collisions sometimes
hash = v + (hash << 6) + (hash << 16) - hash;
}
return hash;
}
void add(int i){v.push_back(i);}
};
struct eqPtrs {
bool operator()(const MyContainer* h1, const MyContainer* h2) const
{
return h1->hashcode() == h2->hashcode(); // problematic b/c of
collisions?
}
};
namespace gnu_namespace {
template<> struct hash<const MyContainer*> {
inline size_t operator()(const MyContainer* d) const {
return d->hashcode();
}
};
}
int getRand(int min, int max){
return ((rand() % (max-min+1)) + min);
}
int main(int argc, char** argv){
srand(time(0));
typedef hash_set<const MyContainer*, hash<const MyContainer*>,
eqPtrs> MyMap;
MyMap myMap;
int repeat = 100000;
int size = 10;
for(int i=0; i<repeat; ++i){
MyContainer* h = new MyContainer();
for(int j=0; j<size; ++j){
h->add(getRand(0, 1000));
}
myMap.insert(h);
}
for(int i=0; i<repeat; ++i){
MyContainer* h = new MyContainer();
for(int j=0; j<size; ++j){
h->add(getRand(2000, 3000));
}
MyMap::const_iterator found = myMap.find(h);
assert(found == myMap.end()); // aborts!
}
// TODO: finally delete elements in myMap
return EXIT_SUCCESS;
}
</code>
The solution seems to be to adapt the equality condition in eqPtrs to
also test for actual equality of the members, not just equality of the
hash codes:
struct eqPtrs {
bool operator()(const MyContainer* h1, const MyContainer* h2) const
{
return h1->hashcode() == h2->hashcode() && haveSameElements(*h1,
*h2); // added condition
}
};
Is that the solution, or am I doing something wrong in general?
General comments on my code are welcome!
Markus
I did a test in which I use find() to look for objects in a hash_set.
These objects are definitely not contained, but find() sometimes finds
them anyway. See this code:
<code>
#include <iostream>
#include <stdexcept>
#include <ctime>
#include <set>
// also include header that defines gnu_namespace and includes
hash_set
class MyContainer {
std::vector<int> v;
public:
MyContainer(){}
std::size_t hashcode() const {
std::size_t hash = 0;
for(unsigned i=0; i<v.size(); ++i){ // sdbm function gives
collisions sometimes
hash = v + (hash << 6) + (hash << 16) - hash;
}
return hash;
}
void add(int i){v.push_back(i);}
};
struct eqPtrs {
bool operator()(const MyContainer* h1, const MyContainer* h2) const
{
return h1->hashcode() == h2->hashcode(); // problematic b/c of
collisions?
}
};
namespace gnu_namespace {
template<> struct hash<const MyContainer*> {
inline size_t operator()(const MyContainer* d) const {
return d->hashcode();
}
};
}
int getRand(int min, int max){
return ((rand() % (max-min+1)) + min);
}
int main(int argc, char** argv){
srand(time(0));
typedef hash_set<const MyContainer*, hash<const MyContainer*>,
eqPtrs> MyMap;
MyMap myMap;
int repeat = 100000;
int size = 10;
for(int i=0; i<repeat; ++i){
MyContainer* h = new MyContainer();
for(int j=0; j<size; ++j){
h->add(getRand(0, 1000));
}
myMap.insert(h);
}
for(int i=0; i<repeat; ++i){
MyContainer* h = new MyContainer();
for(int j=0; j<size; ++j){
h->add(getRand(2000, 3000));
}
MyMap::const_iterator found = myMap.find(h);
assert(found == myMap.end()); // aborts!
}
// TODO: finally delete elements in myMap
return EXIT_SUCCESS;
}
</code>
The solution seems to be to adapt the equality condition in eqPtrs to
also test for actual equality of the members, not just equality of the
hash codes:
struct eqPtrs {
bool operator()(const MyContainer* h1, const MyContainer* h2) const
{
return h1->hashcode() == h2->hashcode() && haveSameElements(*h1,
*h2); // added condition
}
};
Is that the solution, or am I doing something wrong in general?
General comments on my code are welcome!
Markus