S
s0suk3
The task: Write a program that reads a set of words from standard
input and prints the number of distinct words.
I came across a website that listed a few programs to accomplish this
task: http://unthought.net/c++/c_vs_c++.html (ignore all the language
flaming , and thought that all of them did unnecessary operations,
so I wrote my own. But for some reason, my version turned out slower
that ALL of the versions in the website, even though it seems to
perform less operations (yes, I benchmarked them on my own computer).
According to the website, the slowest version is:
#include <set>
#include <string>
#include <iostream>
int main(int argc, char **argv)
{
// Declare and Initialize some variables
std::string word;
std::set<std::string> wordcount;
// Read words and insert in rb-tree
while (std::cin >> word) wordcount.insert(word);
// Print the result
std::cout << "Words: " << wordcount.size() << std::endl;
return 0;
}
My version is about 12 times slower than that. It uses lower-level
constructs. Here it is:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
struct SetNode
{
char *word;
struct SetNode *next;
};
// An unorderd set of words
//
static struct SetNode *gSet = 0;
static int gSetSize = 0;
#define kInitWordSize 32
// Returns a word read from stdin. The returned pointer must be
// deallocated with free().
//
static char *
ReadOneWord(void)
{
int ch = getchar();
while (ch != EOF && isspace(ch))
ch = getchar();
if (ch == EOF)
return 0;
char *word = (char *) malloc(kInitWordSize);
if (!word)
return 0;
int size = kInitWordSize;
int i = 0;
while (ch != EOF && !isspace(ch)) {
if (i >= size) {
size *= 2;
char *newWord = (char *) realloc(word, size);
if (!newWord) {
free(word);
return 0;
}
word = newWord;
}
word[i++] = ch;
ch = getchar();
}
if (i >= size) {
size *= 2;
char *newWord = (char *) realloc(word, size);
if (!newWord) {
free(word);
return 0;
}
word = newWord;
}
word = '\0';
return word;
}
// Inserts a word into the set if it isn't in the set.
// The passed string is expected to have been allocated with
// a memory allocation function, and it should be considered
// lost after passed to this function.
//
static void
InsertWord(char *aWord)
{
struct SetNode *node;
for (node = gSet; node; node = node->next) {
if (strcmp(node->word, aWord) == 0) {
free(aWord);
return;
}
}
node = (struct SetNode *) malloc(sizeof(struct SetNode));
if (!node) {
free(aWord);
return;
}
node->word = aWord;
node->next = gSet;
gSet = node;
++gSetSize;
}
static void
DeleteSet(void)
{
struct SetNode *node = gSet;
struct SetNode *temp;
while (node) {
temp = node;
node = node->next;
free(temp->word);
free(temp);
}
gSet = 0;
gSetSize = 0;
}
int
main(void)
{
char *word;
while ((word = ReadOneWord()))
InsertWord(word);
printf("Words: %d\n", gSetSize);
// Skip cleanup for now...
//DeleteSet();
}
Any ideas as to what causes the big slowdown?
Sebastian
input and prints the number of distinct words.
I came across a website that listed a few programs to accomplish this
task: http://unthought.net/c++/c_vs_c++.html (ignore all the language
flaming , and thought that all of them did unnecessary operations,
so I wrote my own. But for some reason, my version turned out slower
that ALL of the versions in the website, even though it seems to
perform less operations (yes, I benchmarked them on my own computer).
According to the website, the slowest version is:
#include <set>
#include <string>
#include <iostream>
int main(int argc, char **argv)
{
// Declare and Initialize some variables
std::string word;
std::set<std::string> wordcount;
// Read words and insert in rb-tree
while (std::cin >> word) wordcount.insert(word);
// Print the result
std::cout << "Words: " << wordcount.size() << std::endl;
return 0;
}
My version is about 12 times slower than that. It uses lower-level
constructs. Here it is:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
struct SetNode
{
char *word;
struct SetNode *next;
};
// An unorderd set of words
//
static struct SetNode *gSet = 0;
static int gSetSize = 0;
#define kInitWordSize 32
// Returns a word read from stdin. The returned pointer must be
// deallocated with free().
//
static char *
ReadOneWord(void)
{
int ch = getchar();
while (ch != EOF && isspace(ch))
ch = getchar();
if (ch == EOF)
return 0;
char *word = (char *) malloc(kInitWordSize);
if (!word)
return 0;
int size = kInitWordSize;
int i = 0;
while (ch != EOF && !isspace(ch)) {
if (i >= size) {
size *= 2;
char *newWord = (char *) realloc(word, size);
if (!newWord) {
free(word);
return 0;
}
word = newWord;
}
word[i++] = ch;
ch = getchar();
}
if (i >= size) {
size *= 2;
char *newWord = (char *) realloc(word, size);
if (!newWord) {
free(word);
return 0;
}
word = newWord;
}
word = '\0';
return word;
}
// Inserts a word into the set if it isn't in the set.
// The passed string is expected to have been allocated with
// a memory allocation function, and it should be considered
// lost after passed to this function.
//
static void
InsertWord(char *aWord)
{
struct SetNode *node;
for (node = gSet; node; node = node->next) {
if (strcmp(node->word, aWord) == 0) {
free(aWord);
return;
}
}
node = (struct SetNode *) malloc(sizeof(struct SetNode));
if (!node) {
free(aWord);
return;
}
node->word = aWord;
node->next = gSet;
gSet = node;
++gSetSize;
}
static void
DeleteSet(void)
{
struct SetNode *node = gSet;
struct SetNode *temp;
while (node) {
temp = node;
node = node->next;
free(temp->word);
free(temp);
}
gSet = 0;
gSetSize = 0;
}
int
main(void)
{
char *word;
while ((word = ReadOneWord()))
InsertWord(word);
printf("Words: %d\n", gSetSize);
// Skip cleanup for now...
//DeleteSet();
}
Any ideas as to what causes the big slowdown?
Sebastian