Finding strings in binary files

R

Robert Manea

Hello everyone,

I wrote, simply as an exercise, a small piece of code to find 'strings'
(defined as an amount of at least 3 ASCII characters followed by a non
ASCII character) in binary files.

The purpose of the program is to serve as a facile 'strings' (Unix
command) replacement and to be 100% ANSI C. Unfortunatelly it operates
notedly slower than the original 'strings' from the fileutils package.

Maybe someone has some hints on how to improve performance and keep the
code at the same time pure ANSI C.

Any other remarks to obvoius or not so obvoius errors are highly
appreciated, too.

#v+

/* Seek for ASCII-strings in binary streams and output them including
* their byte-position in the stream
*/

#include<stdio.h>
#include<stdlib.h>
#include<ctype.h>

#define MAXSTRING 128
#define ISSTRSIZE 3

int main(int argc, char *argv[])
{
FILE *inp;
size_t i=0, j=MAXSTRING;
int ch;
char *buf;

switch(argc) {
case 0:
case 1:
if( !(inp=fdopen(fileno(stdin), "r")) ) {
perror("Error");
return EXIT_FAILURE;
}
break;
case 2:
if( !(inp=fopen(argv[1], "rb")) ) {
perror("Error");
return EXIT_FAILURE;
}
break;
default:
fprintf(stderr, "Syntax: %s [File]\n", argv[0]);
return EXIT_FAILURE;
}

if( !(buf = malloc(MAXSTRING)) )
return EXIT_FAILURE;


while( !feof(inp) ) {

ch = fgetc(inp);

if ( isprint(ch) ) {
if( i>j ) {
buf = realloc(buf, j*2);
j *= 2;
}
buf[i++] = (char) ch;
}
else {
if( i>ISSTRSIZE ) {
#ifdef POSITION
printf("%6lu: ", ftell(inp)-i-1);
#endif
buf = '\0';
puts(buf);
}
i=0;
}
}

free(buf);
return EXIT_SUCCESS;
}

#v-

TIA & Greets, Rob
 
J

Jens.Toerring

Robert Manea said:
I wrote, simply as an exercise, a small piece of code to find 'strings'
(defined as an amount of at least 3 ASCII characters followed by a non
ASCII character) in binary files.
The purpose of the program is to serve as a facile 'strings' (Unix
command) replacement and to be 100% ANSI C. Unfortunatelly it operates
notedly slower than the original 'strings' from the fileutils package.
Maybe someone has some hints on how to improve performance and keep the
code at the same time pure ANSI C.
Any other remarks to obvoius or not so obvoius errors are highly
appreciated, too.
/* Seek for ASCII-strings in binary streams and output them including
* their byte-position in the stream
*/

#define MAXSTRING 128
#define ISSTRSIZE 3
int main(int argc, char *argv[])
{
FILE *inp;
size_t i=0, j=MAXSTRING;
int ch;
char *buf;
switch(argc) {
case 0:
case 1:
if( !(inp=fdopen(fileno(stdin), "r")) ) {
perror("Error");
return EXIT_FAILURE;
}
break;
case 2:
if( !(inp=fopen(argv[1], "rb")) ) {
perror("Error");
return EXIT_FAILURE;
}
break;
default:
fprintf(stderr, "Syntax: %s [File]\n", argv[0]);
return EXIT_FAILURE;
}

Why do you open stdin with "r" but a file with "rb"? What you get from
stdin could be a binary file (e.g. via a pipe).
if( !(buf = malloc(MAXSTRING)) )

Some people might object to using logical negation operator in this
case for stylistic reasons, and I would think

if ( ( buf = malloc( MAXSTRING ) ) == NULL )

could make your intentions easier to see.
return EXIT_FAILURE;

while( !feof(inp) ) {

ch = fgetc(inp);

feof() doesn't work as you seem to assume. It only will return a
useful value _after_ you have tried to read something. Why don't
you go for the much simpler

while ( ( ch = fgetc( inp ) ) != EOF ) {

That should cover all cases nicely, both end of file and read errors.
if ( isprint(ch) ) {
if( i>j ) {

That should be "i >= j" - if i is already as large as j with buf[ i ]
you would already be one past the end of the buffer.
buf = realloc(buf, j*2);
j *= 2;
}
buf[i++] = (char) ch;
}
else {
if( i>ISSTRSIZE ) {
#ifdef POSITION
printf("%6lu: ", ftell(inp)-i-1);
#endif
buf = '\0';


You might need here another check - if i > j - 1 this would write past
the end of the buffer.
puts(buf);
}
i=0;
}
}
free(buf);
return EXIT_SUCCESS;
}

I guess that some of the effects of the original strings implementation
being faster might result from reading in larger chunks of the file at
once into memory and then operating on that buffer instead of calling
fgetc() for each character. That's something you could also implement.
But since they aren't bound by strict ANSI C conformance they also can
use additional, platform dependend tricks like the use of mmap() where
available...
Regards, Jens
 
A

Alex Fraser

Robert Manea said:
I wrote, simply as an exercise, a small piece of code to find 'strings'
(defined as an amount of at least 3 ASCII characters followed by a non
ASCII character) in binary files.

ITYM at least 3 printable characters.
The purpose of the program is to serve as a facile 'strings' (Unix
command) replacement and to be 100% ANSI C. Unfortunatelly it operates
notedly slower than the original 'strings' from the fileutils package.

Maybe someone has some hints on how to improve performance and keep the
code at the same time pure ANSI C.

Any other remarks to obvoius or not so obvoius errors are highly
appreciated, too.

#v+

/* Seek for ASCII-strings in binary streams and output them including
* their byte-position in the stream
*/

#include<stdio.h>
#include<stdlib.h>
#include<ctype.h>

#define MAXSTRING 128
#define ISSTRSIZE 3

int main(int argc, char *argv[])
{
FILE *inp;
size_t i=0, j=MAXSTRING;
int ch;
char *buf;

switch(argc) {
case 0:
case 1:
if( !(inp=fdopen(fileno(stdin), "r")) ) {

fdopen() and fileno() are not ANSI C functions. You can simply:
inp = stdin;

Or perhaps use freopen() to open stdin in binary mode.
perror("Error");
return EXIT_FAILURE;
}
break;
case 2:
if( !(inp=fopen(argv[1], "rb")) ) {
perror("Error");
return EXIT_FAILURE;
}
break;
default:
fprintf(stderr, "Syntax: %s [File]\n", argv[0]);
return EXIT_FAILURE;
}

if( !(buf = malloc(MAXSTRING)) )
return EXIT_FAILURE;


while( !feof(inp) ) {
ch = fgetc(inp);

while ((ch = getc(inp)) != EOF) {

For reasons given elsewhere.
if ( isprint(ch) ) {
if( i>j ) {
buf = realloc(buf, j*2);

Always use a temporary pointer and test the result for success.
j *= 2;
}
buf[i++] = (char) ch;
}
else {
if( i>ISSTRSIZE ) {
#ifdef POSITION
printf("%6lu: ", ftell(inp)-i-1);
#endif
buf = '\0';
puts(buf);
}
i=0;
}
}

free(buf);
return EXIT_SUCCESS;
}


You may be able to improve performance by using fread() to read larger
chunks. Also, there is no reason to buffer entire string; after you've
collected enough chars to decide it's a string, you can simply write them to
stdout and then continue to copy stdin to stdout until you see a
non-printable character. This might make a big performance difference if I
offered a rather large text file as input :).

Alex
 
M

Malcolm

Robert Manea said:
while( !feof(inp) ) {

ch = fgetc(inp);
There's no gross inefficiency here, but you are making two function calls
that could be replaced with a single macro call to getc(). As others have
noted the use of feof() is incorrect anyway, though it hardly matters (it
means the last call to fgetc() will return EOF, which you handle as a normal
character).
This could well speed you up.
if ( isprint(ch) ) {
if( i>j ) {
buf = realloc(buf, j*2);
You need a test here for out of memory.
j *= 2;
}
buf[i++] = (char) ch;
}
else {
if( i>ISSTRSIZE ) {
#ifdef POSITION
printf("%6lu: ", ftell(inp)-i-1);
#endif
buf = '\0';
puts(buf);
}
i=0;
}
}

free(buf);
return EXIT_SUCCESS;
}
 
R

Robert Manea

Segfault in module "Alex Fraser" - dump details are as follows:

Thanks a lot for your suggestions Jens and Alex! I followed your
advices and achieved a real boost in speed.

For anyone interested here is the new version (I'm sure it still isn't
perfect, but way faster than the one before) including some benchmark
results.

First of all the benchmarks:

Tested on the following file:
$ dd if=/dev/urandom of=foo bs=1024 count=131072
$ ls -lh foo
-rw-rw-r-- 1 robert robert 128M 29. Jul 19:50 foo

$ time strings foo > /dev/null
7,01s user 0,46s system 99% cpu 7,544 total

$ time ./my_strings_OLD > /dev/null
8,60s user 0,65s system 99% cpu 9,292 total

$ time ./my_strings_NEW > /dev/null
2,02s user 0,48s system 98% cpu 2,547 total


And The Code:

#v+

#include<stdio.h>
#include<stdlib.h>
#include<ctype.h>

#define ISSTRSIZE 3
#define READBUF 2048


int main(int argc, char *argv[])
{
FILE *inp;
size_t i=0, k=0, c_read;
char buf[ISSTRSIZE+1], rbuf[READBUF];

switch(argc) {
case 0:
case 1:
inp = stdin;
break;
case 2:
if( !(inp=fopen(argv[1], "rb")) ) {
perror("Error");
return EXIT_FAILURE;
}
break;
default:
fprintf(stderr, "Syntax: %s [File]\n", argv[0]);
return EXIT_FAILURE;
}

while ( (c_read=fread(rbuf, 1, READBUF, inp)) > 0 ) {

while(k <= c_read) { /* Not really sure here if '<' or '<=' */
if ( isprint(rbuf[k]) ) {
if(i<ISSTRSIZE)
buf[i++] = rbuf[k];
else if(i == ISSTRSIZE) {
buf[ISSTRSIZE] = '\0';
fputs(buf, stdout);
fputc(rbuf[k], stdout);
i++;
}
else {
fputc(rbuf[k], stdout);
i++;
}
}
else {
if (i>ISSTRSIZE)
putchar('\n');
i=0;
}
k++;
}
k=0;
}

return EXIT_SUCCESS;
}

#v-

Greets, Rob
 
A

Arthur J. O'Dwyer

Thanks a lot for your suggestions Jens and Alex! I followed your
advices and achieved a real boost in speed.

[I tried profiling your original program with gprof, but it ran too
quickly to generate any data, even on several-megabyte inputs. It
looks like you have enough disk to run gigantic tests; have you
tried profiling the code to see where its bottlenecks are? Google
'gprof manual'.]

As for your code, it may well be as fast as possible. So I'm
going to inflict style tips on it.
#include<stdio.h>
#include<stdlib.h>
#include<ctype.h>

#define ISSTRSIZE 3
#define READBUF 2048

Neither of these names seems really correct. 'ISSTRSIZE' sounds
like a boolean, and 'READBUF' sounds like an action. But both of
them are really integer buffer sizes.
int main(int argc, char *argv[])
{
FILE *inp;
size_t i=0, k=0, c_read;
char buf[ISSTRSIZE+1], rbuf[READBUF];

switch(argc) {
case 0:
case 1:
inp = stdin;
break;
case 2:
if( !(inp=fopen(argv[1], "rb")) ) {
perror("Error");
return EXIT_FAILURE;
}
break;
default:
fprintf(stderr, "Syntax: %s [File]\n", argv[0]);
return EXIT_FAILURE;
}

while ( (c_read=fread(rbuf, 1, READBUF, inp)) > 0 ) {

while(k <= c_read) { /* Not really sure here if '<' or '<=' */

A bad sign. 'c_read' is the number of bytes read from the file,
correct? And at the beginning of this loop, 'k' is... [scan the
file looking for initialization of 'k'...] zero. And you're...
[scan the file looking for increment...] incrementing 'k' and
accessing 'rbuf[k]' for each 'k'. So if 'c_read' is 'READBUF',
then 'k' ought to go only up to 'READBUF-1'. You meant '<', not
'<='. (This is almost always a safe bet in C.)
if ( isprint(rbuf[k]) ) {
if(i<ISSTRSIZE)
buf[i++] = rbuf[k];
else if(i == ISSTRSIZE) {
buf[ISSTRSIZE] = '\0';
fputs(buf, stdout);
fputc(rbuf[k], stdout);
i++;
}
else {
fputc(rbuf[k], stdout);
i++;
}

Here you write 'i++' three times in three different control
branches. Only one increment is really needed. Pull it out of
the branches into the body of the enclosing 'if'.
}
else {
if (i>ISSTRSIZE)
putchar('\n');
i=0;
}
k++;
}
k=0;

The re-initialization of 'k' is shoved all the way down here,
far from where it's used. This is bad. (As with the '++i', you're
duplicating code in the wrong places rather than putting it in the
right place to begin with.)
}

return EXIT_SUCCESS;
}

Rewriting to incorporate all these style changes, we have:


#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>

#define MIN_STRING_SIZE 3
#define BUFFER_SIZE 2048

int main(int argc, char *argv[])
{
FILE *inp;
size_t i, c_read;
char buf[MIN_STRING_SIZE];
char rbuf[BUFFER_SIZE];

switch(argc) {
case 0:
case 1:
inp = stdin;
break;
case 2:
inp = fopen(argv[1], "rb");
if (inp == NULL) {
fprintf(stderr, "Could not open file '%s'\n", argv[1]);
return EXIT_FAILURE;
}
break;
default:
fprintf(stderr, "Syntax: %s [File]\n", argv[0]);
return EXIT_FAILURE;
}

i = 0;
while ((c_read = fread(rbuf, 1, sizeof rbuf, inp)) > 0)
{
size_t k;
for (k=0; k < c_read; ++k) {
if (isprint(rbuf[k])) {
if (i < MIN_STRING_SIZE)
buf = rbuf[k];
else if (i == MIN_STRING_SIZE) {
printf("%.*s", sizeof buf, buf);
putchar(rbuf[k]);
}
else {
putchar(rbuf[k]);
}
++i;
}
else {
if (i > MIN_STRING_SIZE)
putchar('\n');
i = 0;
}
}
}

return EXIT_SUCCESS;
}

The scope of 'i' is still kind of icky-looking to me, and I don't
like the three-way branch depending on the comparison of 'i' and
'MIN_STR_SIZE'; but I'm not sure there's a better approach that
would retain this general algorithm.

HTH,
-Arthur
 
C

CBFalconer

Robert said:
I wrote, simply as an exercise, a small piece of code to find
'strings' (defined as an amount of at least 3 ASCII characters
followed by a non ASCII character) in binary files.

The purpose of the program is to serve as a facile 'strings'
(Unix command) replacement and to be 100% ANSI C. Unfortunatelly
it operates notedly slower than the original 'strings' from the
fileutils package.

Maybe someone has some hints on how to improve performance and
keep the code at the same time pure ANSI C.

Here is something I threw together a while ago. As you can see
from the comments, Leor Zolman also published something here. The
point of mine was to avoid any backtracking, and to handle both
binary and text file inputs. All standard C, except the
dependency on input redirection to handle text files.

/*
Leor said:
I think so. Here's a version I just threw together:
*/

/* And heres another throw -- binfsrch.c by CBF */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>

/* The difference between a binary and a text file, on read,
is the conversion of end-of-line delimiters. What those
delimiters are does not affect the action. In some cases
the presence of 0x1a EOF markers (MsDos) does.

This is a version of Knuth-Morris-Pratt algorithm. The
point of using this is to avoid any backtracking in file
reading, and thus avoiding any use of buffer arrays.
*/

size_t chrcount; /* debuggery, count of input chars, zeroed */

/* --------------------- */

/* Almost straight out of Sedgewick */
/* The next array indicates what index in id should next be
compared to the current char. Once the (lgh - 1)th char
has been successfully compared, the id has been found.
The array is formed by comparing id to itself. */
void initnext(int *next, const char *id, int lgh)
{
int i, j;

assert(lgh > 0);
next[0] = -1; i = 0; j = -1;
while (i < lgh) {
while ((j >= 0) && (id != id[j])) j = next[j];
i++; j++;
next = j;
}
#if (0)
for (i = 0; i < lgh; i++)
printf("id[%d] = '%c' next[%d] = %d\n",
i, id, i, next);
#endif
} /* initnext */

/* --------------------- */

/* reads f without rewinding until either EOF or *marker
has been found. Returns EOF if not found. At exit the
last matching char has been read, and no further. */
int kmpffind(const char *marker, int lgh, int *next, FILE *f)
{
int j; /* char position in marker to check */
int ch; /* current char */

assert(lgh > 0);
j = 0;
while ((j < lgh) && (EOF != (ch = getc(f)))) {
chrcount++;
while ((j >= 0) && (ch != marker[j])) j = next[j];
j++;
}
return ch;
} /* kmpffind */

/* --------------------- */

/* Find marker in f, display following printing chars
up to some non printing character or EOF */
int binfsrch(const char *marker, FILE *f)
{
int *next;
int lgh;
int ch;
int items; /* count of markers found */

lgh = strlen(marker);
if (!(next = malloc(lgh * sizeof *next))) {
puts("No memory");
exit(EXIT_FAILURE);
}
else {
initnext(next, marker, lgh);
items = 0;
while (EOF != kmpffind(marker, lgh, next, f)) {
/* found, take appropriate action */
items++;
printf("%d %s : \"", items, marker);
while (isprint(ch = getc(f))) {
chrcount++;
putchar(ch);
}
puts("\"");
if (EOF == ch) break;
else chrcount++;
}
free(next);
return items;
}
} /* binfsrch */

/* --------------------- */

int main(int argc, char **argv)
{
FILE *f;

f = stdin;
if (3 == argc) {
if (!(f = fopen(argv[2], "rb"))) {
printf("Can't open %s\n", argv[2]);
exit(EXIT_FAILURE);
}
argc--;
}
if (2 != argc) {
puts("Usage: binfsrch name [binaryfile]");
puts(" (file defaults to stdin text mode)");
}
else if (binfsrch(argv[1], f)) {
printf("\"%s\" : found\n", argv[1]);
}
else printf("\"%s\" : not found\n", argv[1]);
printf("%lu chars\n", (unsigned long)chrcount);
return 0;
} /* main binfsrch */
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only take a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

Forum statistics

Threads
473,995
Messages
2,570,230
Members
46,819
Latest member
masterdaster

Latest Threads

Top