J
James Aguilar
Hey all,
I'm working on an encoding scheme where I am running into a problem with
reading a file off a stream. Looking at the binary encoding of the file
(using a simple hex editor), there is no problem, and the whole file is
there. However, when I try to read it from cin, at certain times, cin stops
reading. I cannot force cin to go around the bad character, nor, indeed, do
I know what the bad character is.
I am including code at the bottom, but I do not think that will be helpful.
Does anyone know how to read past an end of file character (supposing that
one comes in on the stream from a text file or some similar source but is
not -actually- the end of the file)? The location of the problem is marked
with *** below.
-JFA1
#include <algorithm>
#include <cassert>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <map>
#include <utility>
#include <vector>
#ifdef _WIN32
#include <fcntl.h>
#include <io.h>
#endif
using namespace std;
// Short aliases used throughout the coder.
typedef unsigned char uchar;
typedef unsigned long ulong;
// A code word: .first is its start value (left-aligned so the first bit sits
// at lmsbmask), .second is its length in bits.
typedef pair<ulong, int> range;
// A code word paired with the byte value it encodes.
typedef pair<range, uchar> rangewchar;
// Mask selecting bit 31, the first bit of every code word.
const ulong lmsbmask = 0x80000000;
// Sum of all entries in numEnc. numEnc starts with 256 counts of 1, so the
// matching initial total is UCHAR_MAX + 1 (256), not UCHAR_MAX.
int total = UCHAR_MAX + 1;
// Occurrence count per byte value; every byte starts with a count of 1.
vector<int> numEnc = vector<int>(256, 1);
// Lookup tables rebuilt by recalculate().
map<range, uchar> rangetochar;
map<uchar, range> chartorange;
map<ulong, range> starttorange;
// Symbols processed since the tables were last rebuilt.
int recalcCount = 0;
// Output bit buffer and the number of bits currently held in it.
int writeCounter = 0;
uchar writeBuf = 0;
// Input bit buffer and the number of unread bits left in it.
int readCounter = 0;
uchar readBuf = 0;
void compress();
void recalculate();
ulong neededSpace(uchar c);
void decompress();
uchar readNextChar();
void flushWriteBuffer();
void writeBits(ulong lng, const int nbits);
void writeBit(bool bit);
bool readBit();
bool tripless(const rangewchar &r1, const rangewchar &r2);
bool tripgreat(const rangewchar &r1, const rangewchar &r2);
bool rless(const range &r1, const range &r2);
bool rgreat(const range &r1, const range &r2);
/*
 * Entry point. Expects exactly one flag:
 *   -c  compress stdin to stdout
 *   -u  decompress stdin to stdout
 * Returns 0 on success and EXIT_FAILURE on a usage error.
 */
int main(int argc, char *argv[])
{
    if (argc != 2) {
        cerr << "Error: incorrect number of command line flags specified. Bailing.\n"
             << "Usage: ARC [-c|-u]\n";
        return EXIT_FAILURE;
    }
#ifdef _WIN32
    // The Windows C runtime opens stdin/stdout in text mode, where byte 0x1A
    // is treated as end-of-file and line endings are translated. Both streams
    // carry arbitrary bytes here, so switch them to binary.
    _setmode(_fileno(stdin), _O_BINARY);
    _setmode(_fileno(stdout), _O_BINARY);
#endif
    if (!strcmp(argv[1], "-c")) {
        compress();
    } else if (!strcmp(argv[1], "-u")) {
        decompress();
    } else {
        // Previously an unknown flag fell through and reported success.
        cerr << "Error: unrecognised flag '" << argv[1] << "'. Bailing.\n"
             << "Usage: ARC [-c|-u]\n";
        return EXIT_FAILURE;
    }
    // Emit any bits still waiting in the output buffer.
    if (writeCounter != 0)
        flushWriteBuffer();
    return 0;
}
void compress()
{
recalculate();
while (cin.peek() != (char) EOF) {
if (recalcCount == (1 << 8)) {
recalculate();
recalcCount = 0;
}
char next;
cin.get(next);
range nextRange((chartorange.find((uchar) next))->second);
writeBits(nextRange.first, nextRange.second);
++recalcCount; ++numEnc[(uchar) next]; ++total;
}
}
void recalculate()
{
vector<rangewchar> totalinfo(UCHAR_MAX);
for (int i = 0; i < UCHAR_MAX; ++i) {
range pr;
totalinfo.first.second = neededSpace(i);
totalinfo.second = i;
}
sort(totalinfo.begin(), totalinfo.end(), &tripless);
ulong previous = 0;
for (int i = 0; i < UCHAR_MAX; ++i) {
totalinfo.first.first = previous;
previous = previous + (lmsbmask >> (totalinfo.first.second - 1));
chartorange[totalinfo.second] = totalinfo.first;
starttorange[totalinfo.first.first] = totalinfo.first;
rangetochar[totalinfo.first] = totalinfo.second;
}
}
/*
 * Returns the code length, in bits, for byte value c: the smallest n >= 1
 * for which 2^-n does not exceed numEnc[c] / total.
 */
ulong neededSpace(uchar c)
{
    const double share = static_cast<double>(numEnc[c]) / static_cast<double>(total);
    int bits = 1;
    // Halve the candidate range until it fits inside the symbol's share.
    for (double width = .5; width > share; width /= 2)
        ++bits;
    return bits;
}
void decompress()
{
recalculate();
while (cin.peek() != EOF) { //****The problem seems to happen HERE****
if (recalcCount == (1 << 8)) {
recalculate();
recalcCount = 0;
}
uchar nextchar = readNextChar();
cout << nextchar;
++recalcCount; ++numEnc[nextchar]; ++total;
}
}
/*
 * Reads bits through readBit() until they spell a complete code word and
 * returns the byte that code word stands for.
 *
 * Bits are accumulated left-aligned (first bit at lmsbmask). After each bit
 * the accumulated value is looked up in starttorange; a hit counts only when
 * the stored code length equals the number of bits read so far.
 *
 * If no code word matches within 32 bits the input cannot be valid, so the
 * program reports the error and exits instead of looping forever.
 */
uchar readNextChar()
{
    // Code words are positioned with a 32-bit mask, so none is longer.
    const int maxCodeBits = 32;
    ulong tmp(0);
    for (int nread = 0; nread < maxCodeBits; ++nread) {
        if (readBit())
            tmp |= (lmsbmask >> nread);
        const map<ulong, range>::const_iterator it(starttorange.find(tmp));
        if (it == starttorange.end())
            continue; // no code word starts at this value
        assert(nread <= it->second.second);
        if (it->second.second != nread + 1)
            continue; // right start, but not yet the right number of bits
        const map<range, uchar>::const_iterator decoded(rangetochar.find(it->second));
        if (decoded != rangetochar.end())
            return decoded->second; // Bingo
        break; // tables disagree; treat as corrupt input
    }
    cerr << "Error: compressed input is corrupt or truncated.\n";
    exit(EXIT_FAILURE);
}
// Lowest bit of a byte: where writeBit() inserts each new bit.
const uchar clsbmask = 1;
// Highest bit of a byte: the bit readBit() hands out next.
const uchar cmsbmask = 1 << 7;
/*
 * Writes the first nbits bits of lng, starting at the bit selected by
 * lmsbmask and moving towards lower bits, one writeBit() call per bit.
 */
void writeBits(ulong lng, const int nbits)
{
    int remaining = nbits;
    while (remaining > 0) {
        const bool topBitSet = (lng & lmsbmask) != 0;
        writeBit(topBitSet);
        lng <<= 1;
        --remaining;
    }
}
void writeBit(bool bit)
{
if (writeCounter == 8) {
cout.put(writeBuf);
writeCounter = 0;
writeBuf = 0;
}
writeBuf <<= 1;
if (bit)
writeBuf |= clsbmask;
++writeCounter;
}
void flushWriteBuffer()
{
while (writeCounter!=1) {
writeBit(false);
}
}
bool readBit()
{
if (readCounter == 0) {
readCounter = 8;
cin.get(reinterpret_cast<char &>(readBuf));
}
bool retBit = (readBuf & cmsbmask) == cmsbmask;
readBuf <<= 1;
--readCounter;
return retBit;
}
// Orders ranges by ascending code length (.second).
bool rless(const range &r1, const range &r2)
{
    const int leftBits = r1.second;
    const int rightBits = r2.second;
    return leftBits < rightBits;
}
// Orders ranges by descending code length (.second).
bool rgreat(const range &r1, const range &r2)
{
    const int leftBits = r1.second;
    const int rightBits = r2.second;
    return leftBits > rightBits;
}
// Orders (range, byte) entries by ascending code length (.first.second).
bool tripless(const rangewchar &r1, const rangewchar &r2)
{
    const int leftBits = r1.first.second;
    const int rightBits = r2.first.second;
    return leftBits < rightBits;
}
// Orders (range, byte) entries by descending code length (.first.second).
bool tripgreat(const rangewchar &r1, const rangewchar &r2)
{
    const int leftBits = r1.first.second;
    const int rightBits = r2.first.second;
    return leftBits > rightBits;
}
I'm working on an encoding scheme where I am running into a problem with
reading a file off a stream. Looking at the binary encoding of the file
(using a simple hex editor), there is no problem, and the whole file is
there. However, when I try to read it from cin, at certain times, cin stops
reading. I cannot force cin to go around the bad character, nor, indeed, do
I know what the bad character is.
I am including code at the bottom, but I do not think that will be helpful.
Does anyone know how to read past an end of file character (supposing that
one comes in on the stream from a text file or some similar source but is
not -actually- the end of the file)? The location of the problem is marked
with *** below.
-JFA1
#include <iostream>
#include <vector>
#include <utility>
#include <algorithm>
#include <map>
#include <cassert>
using namespace std;
// Short aliases used throughout the coder.
typedef unsigned char uchar;
typedef unsigned long ulong;
// A code word: .first is its start value (left-aligned so the first bit sits
// at lmsbmask), .second is its length in bits.
typedef pair<ulong, int> range;
// A code word paired with the byte value it encodes.
typedef pair<range, uchar> rangewchar;
// Mask selecting bit 31, the first bit of every code word.
const ulong lmsbmask = 0x80000000;
// Sum of all entries in numEnc. numEnc starts with 256 counts of 1, so the
// matching initial total is UCHAR_MAX + 1 (256), not UCHAR_MAX.
int total = UCHAR_MAX + 1;
// Occurrence count per byte value; every byte starts with a count of 1.
vector<int> numEnc = vector<int>(256, 1);
// Lookup tables rebuilt by recalculate().
map<range, uchar> rangetochar;
map<uchar, range> chartorange;
map<ulong, range> starttorange;
// Symbols processed since the tables were last rebuilt.
int recalcCount = 0;
// Output bit buffer and the number of bits currently held in it.
int writeCounter = 0;
uchar writeBuf = 0;
// Input bit buffer and the number of unread bits left in it.
int readCounter = 0;
uchar readBuf = 0;
void compress();
void recalculate();
ulong neededSpace(uchar c);
void decompress();
uchar readNextChar();
void flushWriteBuffer();
void writeBits(ulong lng, const int nbits);
void writeBit(bool bit);
bool readBit();
bool tripless(const rangewchar &r1, const rangewchar &r2);
bool tripgreat(const rangewchar &r1, const rangewchar &r2);
bool rless(const range &r1, const range &r2);
bool rgreat(const range &r1, const range &r2);
/*
 * Entry point. Expects exactly one flag:
 *   -c  compress stdin to stdout
 *   -u  decompress stdin to stdout
 * Returns 0 on success and EXIT_FAILURE on a usage error.
 */
int main(int argc, char *argv[])
{
    if (argc != 2) {
        cerr << "Error: incorrect number of command line flags specified. Bailing.\n"
             << "Usage: ARC [-c|-u]\n";
        return EXIT_FAILURE;
    }
#ifdef _WIN32
    // The Windows C runtime opens stdin/stdout in text mode, where byte 0x1A
    // is treated as end-of-file and line endings are translated. Both streams
    // carry arbitrary bytes here, so switch them to binary.
    _setmode(_fileno(stdin), _O_BINARY);
    _setmode(_fileno(stdout), _O_BINARY);
#endif
    if (!strcmp(argv[1], "-c")) {
        compress();
    } else if (!strcmp(argv[1], "-u")) {
        decompress();
    } else {
        // Previously an unknown flag fell through and reported success.
        cerr << "Error: unrecognised flag '" << argv[1] << "'. Bailing.\n"
             << "Usage: ARC [-c|-u]\n";
        return EXIT_FAILURE;
    }
    // Emit any bits still waiting in the output buffer.
    if (writeCounter != 0)
        flushWriteBuffer();
    return 0;
}
void compress()
{
recalculate();
while (cin.peek() != (char) EOF) {
if (recalcCount == (1 << 8)) {
recalculate();
recalcCount = 0;
}
char next;
cin.get(next);
range nextRange((chartorange.find((uchar) next))->second);
writeBits(nextRange.first, nextRange.second);
++recalcCount; ++numEnc[(uchar) next]; ++total;
}
}
void recalculate()
{
vector<rangewchar> totalinfo(UCHAR_MAX);
for (int i = 0; i < UCHAR_MAX; ++i) {
range pr;
totalinfo.first.second = neededSpace(i);
totalinfo.second = i;
}
sort(totalinfo.begin(), totalinfo.end(), &tripless);
ulong previous = 0;
for (int i = 0; i < UCHAR_MAX; ++i) {
totalinfo.first.first = previous;
previous = previous + (lmsbmask >> (totalinfo.first.second - 1));
chartorange[totalinfo.second] = totalinfo.first;
starttorange[totalinfo.first.first] = totalinfo.first;
rangetochar[totalinfo.first] = totalinfo.second;
}
}
/*
 * Returns the code length, in bits, for byte value c: the smallest n >= 1
 * for which 2^-n does not exceed numEnc[c] / total.
 */
ulong neededSpace(uchar c)
{
    const double share = static_cast<double>(numEnc[c]) / static_cast<double>(total);
    int bits = 1;
    // Halve the candidate range until it fits inside the symbol's share.
    for (double width = .5; width > share; width /= 2)
        ++bits;
    return bits;
}
void decompress()
{
recalculate();
while (cin.peek() != EOF) { //****The problem seems to happen HERE****
if (recalcCount == (1 << 8)) {
recalculate();
recalcCount = 0;
}
uchar nextchar = readNextChar();
cout << nextchar;
++recalcCount; ++numEnc[nextchar]; ++total;
}
}
/*
 * Reads bits through readBit() until they spell a complete code word and
 * returns the byte that code word stands for.
 *
 * Bits are accumulated left-aligned (first bit at lmsbmask). After each bit
 * the accumulated value is looked up in starttorange; a hit counts only when
 * the stored code length equals the number of bits read so far.
 *
 * If no code word matches within 32 bits the input cannot be valid, so the
 * program reports the error and exits instead of looping forever.
 */
uchar readNextChar()
{
    // Code words are positioned with a 32-bit mask, so none is longer.
    const int maxCodeBits = 32;
    ulong tmp(0);
    for (int nread = 0; nread < maxCodeBits; ++nread) {
        if (readBit())
            tmp |= (lmsbmask >> nread);
        const map<ulong, range>::const_iterator it(starttorange.find(tmp));
        if (it == starttorange.end())
            continue; // no code word starts at this value
        assert(nread <= it->second.second);
        if (it->second.second != nread + 1)
            continue; // right start, but not yet the right number of bits
        const map<range, uchar>::const_iterator decoded(rangetochar.find(it->second));
        if (decoded != rangetochar.end())
            return decoded->second; // Bingo
        break; // tables disagree; treat as corrupt input
    }
    cerr << "Error: compressed input is corrupt or truncated.\n";
    exit(EXIT_FAILURE);
}
// Lowest bit of a byte: where writeBit() inserts each new bit.
const uchar clsbmask = 1;
// Highest bit of a byte: the bit readBit() hands out next.
const uchar cmsbmask = 1 << 7;
/*
 * Writes the first nbits bits of lng, starting at the bit selected by
 * lmsbmask and moving towards lower bits, one writeBit() call per bit.
 */
void writeBits(ulong lng, const int nbits)
{
    int remaining = nbits;
    while (remaining > 0) {
        const bool topBitSet = (lng & lmsbmask) != 0;
        writeBit(topBitSet);
        lng <<= 1;
        --remaining;
    }
}
void writeBit(bool bit)
{
if (writeCounter == 8) {
cout.put(writeBuf);
writeCounter = 0;
writeBuf = 0;
}
writeBuf <<= 1;
if (bit)
writeBuf |= clsbmask;
++writeCounter;
}
void flushWriteBuffer()
{
while (writeCounter!=1) {
writeBit(false);
}
}
bool readBit()
{
if (readCounter == 0) {
readCounter = 8;
cin.get(reinterpret_cast<char &>(readBuf));
}
bool retBit = (readBuf & cmsbmask) == cmsbmask;
readBuf <<= 1;
--readCounter;
return retBit;
}
// Orders ranges by ascending code length (.second).
bool rless(const range &r1, const range &r2)
{
    const int leftBits = r1.second;
    const int rightBits = r2.second;
    return leftBits < rightBits;
}
// Orders ranges by descending code length (.second).
bool rgreat(const range &r1, const range &r2)
{
    const int leftBits = r1.second;
    const int rightBits = r2.second;
    return leftBits > rightBits;
}
// Orders (range, byte) entries by ascending code length (.first.second).
bool tripless(const rangewchar &r1, const rangewchar &r2)
{
    const int leftBits = r1.first.second;
    const int rightBits = r2.first.second;
    return leftBits < rightBits;
}
// Orders (range, byte) entries by descending code length (.first.second).
bool tripgreat(const rangewchar &r1, const rangewchar &r2)
{
    const int leftBits = r1.first.second;
    const int rightBits = r2.first.second;
    return leftBits > rightBits;
}