K
Kevin
C++: 94 seconds.
my java: 227 seconds.
Time including all (CPU, IO, etc).
Task: read in data line by line (required), and find the word frequency
count from a large file (1.2G).
A c++ code (I do not have its source), it takes total 94 seconds, using
constant 600k memory as reported from Win TaskManager.
My code (I comment out some lines and run the code several times to get
these statistics):
1) Scan the data file line by line and convert each line to a customed
"raw string" class, total use 60 seconds. I use InputStream and read
byte[] of 32k each time (the data file is plain ascii file). And do my
own "NextLine()" function from the byte[]. In the follow code, this is
done by comment out lines from Line A to Line C.
2) on top of 1), adding splitting each line into words, use another 40
seconds. (total 100 seconds). In the follow code, this is done by
comment out lines from Line B to Line C.
3) on top of 2), adding a hashtable (hashmap similar result) frequency
count, use another 117 seconds (so total is 227 seconds).
Reading 1.2G plain text file line by line using 60 seconds seems not
that bad.
Splitting each line (byte[]) into words, which are many user-defined
"raw string" class, using 40 seconds, not that bad.
However, hash counting those words needs another 127 seconds,
hum..........
So:
Number 1 issue: counting the words using hashtable takes soooo long
time. What is the problem?
Number 2 issue: the split() function to split a byte[] seems not good
enough.
I list the codes below. Suggestions greatly appreciate.
Have a nice weekend!
///================================================================
Several major classes are:
My own raw string, which is basically a byte[] array.
public class KDRawString
{
private
byte[] Data = null;
public int hashCode()
{
int c = 1;
int v = 31;
for (int i=0; i<Data.length; i++)
{
c = c + Data*v;
v = v*31;
};
return c;
}
public boolean equals(Object o)
{
if (o instanceof KDRawString)
{
KDRawString K = (KDRawString) o;
if (K.Data.length != this.Data.length)
return false;
for (int i=0; i<this.Data.length; i++)
{
if (this.Data != K.Data)
return false;
};
return true;
}
else
return false;
}
public KDRawString(final byte[] data)
{
Data = Utility.Duplicate(data);
// Utility.Duplicate is just like System.arraycopy.
}
public KDRawString(byte[] data, final boolean ReUse)
{
if (ReUse) // sometimes, we do not need to create new array, if we
can use the input one.
Data = data;
else
Data = Utility.Duplicate(data);
// Utility.Duplicate is just like System.arraycopy.
}
//////////
public KDRawString[] split(final char c)
{
// basically just call the static split, and convert each byte[]
into KDRawString.
byte[][] b = split(Data, c);
KDRawString[] R = new KDRawString[b.length];
for (int i=0; i<b.length; i++)
{
R = new KDRawString(b, true);
// we can reuse the byte[] here since they are newly created and
not used any where else. This saved some time in my test run.
};
return R;
}
//
public static byte[][] split(final byte[] data, final char c)
{
// split it by char c.
// first, decide how many segment we need.
int count = 0;
for (int i=0; i<data.length; i++)
if (data == c)
count ++;
//
byte[][] result = new byte[count+1][];
// then split it by char c.
int resultIndex = 0;
int prePos = 0;
for (int i=0; i<data.length; i++)
{
if (data == c)
{
byte[] b = Utility.Duplicate(data, prePos, i-prePos);
prePos = i+1;
result[resultIndex] = b;
resultIndex ++;
};
};
byte[] b = Utility.Duplicate(data, prePos, data.length-prePos);
result[resultIndex] = b;
//
return result;
}
////////
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
Another class is KDInt, which is just a int, used as counter (so I do
not need to new many Integer in the counting process).
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
for the main function, it is:
Hashtable ht = new Hashtable(10000); // total number of words is
about 4000.
// Using HashMap can save 1-2 seconds, as I have tested. So not a
major problem.
KDASCIITextFileReader kr = new KDASCIITextFileReader(fileName);
// This is a class to read in plain text file line by line as
byte[].
byte[] b = kr.NextLine_AsBytes();
int lineCount = 0;
while (b != null)
{
lineCount++;
//
KDRawString ks = new KDRawString(b);
//< ------------------------------------------------- Line A
KDRawString[] K = ks.split('\t');
//< ------------------------------------------------- Line B
for (int i=1; i<K.length; i++)
{
KDInt c = (KDInt) ht.get(K);
if (c == null)
{
c = new KDInt(0);
ht.put(K, c);
};
c.IncreaseValue();
};
//< ------------------------------------------------- Line C
b = kr.NextLine_AsBytes();
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// KDASCIITextFileReader is a class to read in text file line by line
into byte[].
public class KDASCIITextFileReader
{
private
boolean hasError = false;
private
InputStream is = null;
private
byte[] DataBuffer = null;
private
int DataBufferReadCount = -1; // bytes in the DataBuffer,
private
int DataBufferPointer = -1; // file seek pointer.
private
int initialBufferSize = 32768; // 32K.
public byte[] NextLine_AsBytes()
{
byte[] b = NextLine_PossibleBlank_AsBytes();
while ((b != null) && (b.length ==0) )
b = NextLine_PossibleBlank_AsBytes();
//
return b;
}
public byte[] NextLine_PossibleBlank_AsBytes()
{
// break at both \r and \n
// UNIX is \n, Win is \r\n, Apple is \r
//
if (is == null)
return null;
//
byte[] R = null; // bytes to return.
while (true)
{
int seekPos = DataBufferPointer;
while (seekPos < DataBufferReadCount)
{
if ( (DataBuffer[seekPos] == '\r') || (DataBuffer[seekPos] ==
'\n') )
{
byte[] b = Utility.Append(R, DataBuffer, DataBufferPointer,
seekPos-DataBufferPointer);
DataBufferPointer = seekPos+1;
return b;
};
seekPos++;
};
// so need to refill. add current to R before that.
R = Utility.Append(R, DataBuffer, DataBufferPointer,
DataBufferReadCount-DataBufferPointer);
fillBuffer();
if (DataBufferReadCount <0)
{
// end of file.
close();
return R;
};
}
}
public KDASCIITextFileReader(final String FileName)
{
initFile(FileName);
}
////////////////////
private void initFile(final String FileName)
{
hasError = false;
try
{
File f = new File(FileName);
FileInputStream fis = new FileInputStream(f);
//
String fnUpper = FileName.toLowerCase();
is = fis;
DataBuffer = new byte[initialBufferSize];
//
fillBuffer();
}
catch (Exception e)
{
e.printStackTrace();
setError(e.toString());
};
}
////////////////////
private void fillBuffer()
{
try
{
DataBufferReadCount = is.read(DataBuffer);
DataBufferPointer = 0;
}
catch (Exception e)
{
e.printStackTrace();
setError(e.toString());
}
}
public void close()
{
try
{
if (is != null)
is.close();
is = null;
DataBufferReadCount = -1;
DataBufferPointer = -1;
}
catch (Exception e)
{
e.printStackTrace();
setError(e.toString());
};
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// End of post.
my java: 227 seconds.
Time including all (CPU, IO, etc).
Task: read in data line by line (required), and find the word frequency
count from a large file (1.2G).
A c++ code (I do not have its source), it takes total 94 seconds, using
constant 600k memory as reported from Win TaskManager.
My code (I comment out some lines and run the code several times to get
these statistics):
1) Scan the data file line by line and convert each line to a customed
"raw string" class, total use 60 seconds. I use InputStream and read
byte[] of 32k each time (the data file is plain ascii file). And do my
own "NextLine()" function from the byte[]. In the follow code, this is
done by comment out lines from Line A to Line C.
2) on top of 1), adding splitting each line into words, use another 40
seconds. (total 100 seconds). In the follow code, this is done by
comment out lines from Line B to Line C.
3) on top of 2), adding a hashtable (hashmap similar result) frequency
count, use another 117 seconds (so total is 227 seconds).
Reading 1.2G plain text file line by line using 60 seconds seems not
that bad.
Splitting each line (byte[]) into words, which are many user-defined
"raw string" class, using 40 seconds, not that bad.
However, hash counting those words needs another 127 seconds,
hum..........
So:
Number 1 issue: counting the words using hashtable takes soooo long
time. What is the problem?
Number 2 issue: the split() function to split a byte[] seems not good
enough.
I list the codes below. Suggestions greatly appreciate.
Have a nice weekend!
///================================================================
Several major classes are:
My own raw string, which is basically a byte[] array.
public class KDRawString
{
private
byte[] Data = null;
public int hashCode()
{
int c = 1;
int v = 31;
for (int i=0; i<Data.length; i++)
{
c = c + Data*v;
v = v*31;
};
return c;
}
public boolean equals(Object o)
{
if (o instanceof KDRawString)
{
KDRawString K = (KDRawString) o;
if (K.Data.length != this.Data.length)
return false;
for (int i=0; i<this.Data.length; i++)
{
if (this.Data != K.Data)
return false;
};
return true;
}
else
return false;
}
public KDRawString(final byte[] data)
{
Data = Utility.Duplicate(data);
// Utility.Duplicate is just like System.arraycopy.
}
public KDRawString(byte[] data, final boolean ReUse)
{
if (ReUse) // sometimes, we do not need to create new array, if we
can use the input one.
Data = data;
else
Data = Utility.Duplicate(data);
// Utility.Duplicate is just like System.arraycopy.
}
//////////
public KDRawString[] split(final char c)
{
// basically just call the static split, and convert each byte[]
into KDRawString.
byte[][] b = split(Data, c);
KDRawString[] R = new KDRawString[b.length];
for (int i=0; i<b.length; i++)
{
R = new KDRawString(b, true);
// we can reuse the byte[] here since they are newly created and
not used any where else. This saved some time in my test run.
};
return R;
}
//
public static byte[][] split(final byte[] data, final char c)
{
// split it by char c.
// first, decide how many segment we need.
int count = 0;
for (int i=0; i<data.length; i++)
if (data == c)
count ++;
//
byte[][] result = new byte[count+1][];
// then split it by char c.
int resultIndex = 0;
int prePos = 0;
for (int i=0; i<data.length; i++)
{
if (data == c)
{
byte[] b = Utility.Duplicate(data, prePos, i-prePos);
prePos = i+1;
result[resultIndex] = b;
resultIndex ++;
};
};
byte[] b = Utility.Duplicate(data, prePos, data.length-prePos);
result[resultIndex] = b;
//
return result;
}
////////
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
Another class is KDInt, which is just a int, used as counter (so I do
not need to new many Integer in the counting process).
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
for the main function, it is:
Hashtable ht = new Hashtable(10000); // total number of words is
about 4000.
// Using HashMap can save 1-2 seconds, as I have tested. So not a
major problem.
KDASCIITextFileReader kr = new KDASCIITextFileReader(fileName);
// This is a class to read in plain text file line by line as
byte[].
byte[] b = kr.NextLine_AsBytes();
int lineCount = 0;
while (b != null)
{
lineCount++;
//
KDRawString ks = new KDRawString(b);
//< ------------------------------------------------- Line A
KDRawString[] K = ks.split('\t');
//< ------------------------------------------------- Line B
for (int i=1; i<K.length; i++)
{
KDInt c = (KDInt) ht.get(K);
if (c == null)
{
c = new KDInt(0);
ht.put(K, c);
};
c.IncreaseValue();
};
//< ------------------------------------------------- Line C
b = kr.NextLine_AsBytes();
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// KDASCIITextFileReader is a class to read in text file line by line
into byte[].
public class KDASCIITextFileReader
{
private
boolean hasError = false;
private
InputStream is = null;
private
byte[] DataBuffer = null;
private
int DataBufferReadCount = -1; // bytes in the DataBuffer,
private
int DataBufferPointer = -1; // file seek pointer.
private
int initialBufferSize = 32768; // 32K.
public byte[] NextLine_AsBytes()
{
byte[] b = NextLine_PossibleBlank_AsBytes();
while ((b != null) && (b.length ==0) )
b = NextLine_PossibleBlank_AsBytes();
//
return b;
}
public byte[] NextLine_PossibleBlank_AsBytes()
{
// break at both \r and \n
// UNIX is \n, Win is \r\n, Apple is \r
//
if (is == null)
return null;
//
byte[] R = null; // bytes to return.
while (true)
{
int seekPos = DataBufferPointer;
while (seekPos < DataBufferReadCount)
{
if ( (DataBuffer[seekPos] == '\r') || (DataBuffer[seekPos] ==
'\n') )
{
byte[] b = Utility.Append(R, DataBuffer, DataBufferPointer,
seekPos-DataBufferPointer);
DataBufferPointer = seekPos+1;
return b;
};
seekPos++;
};
// so need to refill. add current to R before that.
R = Utility.Append(R, DataBuffer, DataBufferPointer,
DataBufferReadCount-DataBufferPointer);
fillBuffer();
if (DataBufferReadCount <0)
{
// end of file.
close();
return R;
};
}
}
public KDASCIITextFileReader(final String FileName)
{
initFile(FileName);
}
////////////////////
private void initFile(final String FileName)
{
hasError = false;
try
{
File f = new File(FileName);
FileInputStream fis = new FileInputStream(f);
//
String fnUpper = FileName.toLowerCase();
is = fis;
DataBuffer = new byte[initialBufferSize];
//
fillBuffer();
}
catch (Exception e)
{
e.printStackTrace();
setError(e.toString());
};
}
////////////////////
private void fillBuffer()
{
try
{
DataBufferReadCount = is.read(DataBuffer);
DataBufferPointer = 0;
}
catch (Exception e)
{
e.printStackTrace();
setError(e.toString());
}
}
public void close()
{
try
{
if (is != null)
is.close();
is = null;
DataBufferReadCount = -1;
DataBufferPointer = -1;
}
catch (Exception e)
{
e.printStackTrace();
setError(e.toString());
};
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// End of post.