Program to remove C comments (long signature)

James Hu · Nov 12, 2003

This program is long. I don't really want to bore everyone with the
details, but it handles wierd cases like:

/\
* this is a comment *\
/

#define FOO ??/* this is not a comment */

char *a = /* this is a comment "\*/"this is a string"/*" another comment */;

I intend this program to be an example of how to write a kind of state
machine, not really an example of tight coding, but any comments would
be welcome.

Thanks,

-- James
--
/*
* cstripc: A C program to strip comments from C files.
* Usage:
* cstripc [file [...]]
* cstripc [-t]
*
* The '-t' option is used for testing. It prints some pointers to strings
* that are interlaced with comment characters.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*****************/
/**** GLOBALS ****/
/*****************/

static const char *progname;
static int debug_flag;

/**********************/
/**** MAIN PROGRAM ****/
/**********************/

static void print_usage(void);
static void print_test(void);

static FILE * open_input_file(const char *filename);
static void close_input_file(FILE *infile);
static void parse_input_file(FILE *infile);

/*
 * Program entry point.
 *
 * Leading arguments that begin with '-' (other than a lone "-") are options:
 *   -t  print the built-in test strings and exit successfully
 *   -d  turn on state tracing
 * Any other option is reported together with the usage text and the program
 * exits with EXIT_FAILURE.
 *
 * The remaining arguments are input files, processed in order.  With no
 * file arguments, standard input is processed.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if any input file could not be
 * opened (the remaining files are still processed).
 */
int
main(int argc, char *argv[])
{
    int status = EXIT_SUCCESS;

    progname = argv[0];
    if (progname == 0) {
        progname = "cstripc";
    }

    /* Option processing stops at the first non-option argument. */
    while (argc > 1) {

        if ((*argv[1] != '-') || (strcmp(argv[1], "-") == 0)) {
            break;
        }

        if (strcmp(argv[1], "-t") == 0) {
            print_test();
            exit(0);
        } else if (strcmp(argv[1], "-d") == 0) {
            debug_flag = 1;
        } else {
            fprintf(stderr, "%s: Unrecognized option '%s'\n",
                    progname, argv[1]);
            print_usage();
            exit(EXIT_FAILURE);
        }

        --argc;
        ++argv;
    }

    if (argc <= 1) {
        parse_input_file(stdin);
        return EXIT_SUCCESS;
    }

    while (argc > 1) {
        FILE *infile = open_input_file(argv[1]);

        if (infile == 0) {
            /* open_input_file has already printed a diagnostic. */
            status = EXIT_FAILURE;
        } else {
            parse_input_file(infile);
            close_input_file(infile);
        }

        --argc;
        ++argv;
    }

    return status;
}

/**************************/
/**** PRINT USAGE/TEST ****/
/**************************/

/*
 * Usage text, used as a printf-style format.  Every "%s" is filled in with
 * the program name, so print_usage() must pass exactly as many program-name
 * arguments as there are "%s" conversions here.
 */
static const char *usage_string =
"%s: A C program to strip comments from C files.\n"
"Usage:\n"
" %s [-d] [file [...]]\n"
" %s [-t]\n"
"\n"
"The '-d' option prints the name of the current scanner state to stderr\n"
"before each input character is processed.\n"
"The '-t' option is used for testing. It prints some pointers to strings\n"
"that are interlaced with comment characters.\n"
;

/* Write the usage text to stderr. */
static void
print_usage(void)
{
    fprintf(stderr, usage_string, progname, progname, progname);
}

/* Test strings; declared here and given their values later in the file. */
static const char *a;
static const char *b;
static const char *c;

/* Print each test string that is set, one per line, in the order a, b, c. */
static void
print_test(void)
{
    const char *const samples[] = { a, b, c };
    size_t i;

    for (i = 0; i < sizeof samples / sizeof samples[0]; ++i) {
        if (samples[i] != 0) {
            puts(samples[i]);
        }
    }
}

/*******************************/
/**** OPEN/CLOSE INPUT FILE ****/
/*******************************/

/* Name most recently passed to open_input_file(); read when reporting a close failure. */
static const char *input_file_name;

/*
 * Open an input file for reading.
 *
 * filename: path to open; "-" selects stdin; a null pointer yields a null
 *           result without any diagnostic.
 *
 * Returns the stream, or a null pointer if fopen failed (a message is
 * written to stderr in that case).  The name is remembered in
 * input_file_name in every case.
 */
static FILE *
open_input_file(const char *filename)
{
    FILE *fp = 0;

    input_file_name = filename;

    if (filename != 0) {
        if (strcmp(filename, "-") == 0) {
            fp = stdin;
        } else {
            fp = fopen(filename, "r");
            if (fp == 0) {
                fprintf(stderr, "%s: Could not open '%s' for reading.\n",
                        progname, filename);
            }
        }
    }

    return fp;
}

/*
 * Release an input stream obtained from open_input_file().
 *
 * A null pointer is ignored.  stdin is never closed; its error and
 * end-of-file indicators are cleared instead.  Any other stream is closed,
 * and a failure to close is reported on stderr using the remembered
 * input file name.
 */
static void
close_input_file(FILE *infile)
{
    if (infile == 0) {
        return;
    }

    if (infile == stdin) {
        clearerr(stdin);
        return;
    }

    if (fclose(infile) == EOF) {
        /* Same "program: message" shape as the other diagnostics. */
        fprintf(stderr, "%s: Could not close '%s'.\n",
                progname, input_file_name);
    }
}

/**************************/
/**** PARSE INPUT FILE ****/
/**************************/

typedef struct scan_state scan_state;
typedef struct scan_context scan_context;

/* Scanner context: the current state plus a growable text buffer. */
struct scan_context {
    scan_state *ss;     /* current state */
    char *sbuf;         /* buffered text, or null before the first append */
    unsigned sbufsz;    /* allocated size of sbuf in bytes */
    unsigned sbufcnt;   /* number of characters currently stored in sbuf */
};

/* One scanner state: its transition function and a printable name. */
struct scan_state {
    /* Consumes one input character and returns the next state. */
    scan_state *(*scan)(scan_context *ctx, int input);
    const char *name;   /* state name, printed when tracing is enabled */
};

/* Context every scan starts from; its value is supplied further down in the file. */
static scan_context initial_scan_context;

static void
parse_input_file(FILE *infile)
{
int c;
scan_context ctx;

if (infile == 0) {
return;
}

ctx = initial_scan_context;

while ((c = fgetc(infile)) != EOF) {
if (debug_flag) {
fprintf(stderr, "%s\n", ctx.ss->name);
}
ctx.ss = ctx.ss->scan(&ctx, c);
}
}

/***********************/
/**** STATE MACHINE ****/
/***********************/

/*
*
***************************************************************************
* Assume input is a syntactically correct C program.
*
* The basic algorithm is:
* Scan character by character:
* Treat trigraphs as a single character.
* If the sequence does not start a comment, emit the sequence.
* Otherwise,
* Scan character by character:
* Treat trigraphs as a single character.
* Treat the sequence '\\' '\n' as no character.
* If the sequence does not end a comment, continue consuming.
* Otherwise, emit a space, and loop back to top.
***************************************************************************
*
*/

/*
 * SCAN_STATE_DEFINE(name) declares the transition function name_func and
 * defines the state object name_state, initialized with that function and
 * with the stringized state name.
 */
#define SCAN_STATE_DEFINE(name) \
static scan_state * name##_func(scan_context *ctx, int input); \
static scan_state name##_state = { name##_func, #name }

/* All scanner states.  Each transition function is defined below. */
SCAN_STATE_DEFINE(normal);
SCAN_STATE_DEFINE(normal_maybe_tri_1);
SCAN_STATE_DEFINE(normal_maybe_tri_2);
SCAN_STATE_DEFINE(string);
SCAN_STATE_DEFINE(string_maybe_tri_1);
SCAN_STATE_DEFINE(string_maybe_tri_2);
SCAN_STATE_DEFINE(string_maybe_splice);
SCAN_STATE_DEFINE(char);
SCAN_STATE_DEFINE(char_maybe_tri_1);
SCAN_STATE_DEFINE(char_maybe_tri_2);
SCAN_STATE_DEFINE(char_maybe_splice);
SCAN_STATE_DEFINE(slash);
SCAN_STATE_DEFINE(slash_maybe_tri_1);
SCAN_STATE_DEFINE(slash_maybe_tri_2);
SCAN_STATE_DEFINE(slash_maybe_splice);
SCAN_STATE_DEFINE(slashslash);
SCAN_STATE_DEFINE(slashslash_maybe_tri_1);
SCAN_STATE_DEFINE(slashslash_maybe_tri_2);
SCAN_STATE_DEFINE(slashslash_maybe_splice);
SCAN_STATE_DEFINE(slashsplat);
SCAN_STATE_DEFINE(slashsplat_splat);
SCAN_STATE_DEFINE(slashsplat_splat_maybe_tri_1);
SCAN_STATE_DEFINE(slashsplat_splat_maybe_tri_2);
SCAN_STATE_DEFINE(slashsplat_splat_maybe_splice);

/* SCAN_STATE(name) yields a pointer to the state object name_state. */
#define SCAN_STATE(name) (&name##_state)

/* Scanning starts in the 'normal' state with no buffer allocated. */
static scan_context initial_scan_context = { SCAN_STATE(normal), 0, 0, 0 };

static void sbuf_append_char(scan_context *ctx, int c);
static void sbuf_append_string(scan_context *ctx, char *s);
static void sbuf_clear(scan_context *ctx);
static void sbuf_emit(scan_context *ctx);

/*
 * State: ordinary code, outside any literal or comment.
 *
 * Buffered text is written out first.  A '?' or '/' is then held in the
 * buffer, since it may begin a trigraph or a comment.  Quote characters are
 * echoed and enter the string or character-constant state.  Everything else
 * is echoed unchanged.
 */
static scan_state *
normal_func(scan_context *ctx, int input)
{
    sbuf_emit(ctx);

    if (input == '?' || input == '/') {
        sbuf_append_char(ctx, input);
        return (input == '?') ? SCAN_STATE(normal_maybe_tri_1)
                              : SCAN_STATE(slash);
    }

    putchar(input);

    if (input == '"') {
        return SCAN_STATE(string);
    }
    if (input == '\'') {
        return SCAN_STATE(char);
    }
    return SCAN_STATE(normal);
}

/*
 * State: ordinary code, one '?' held in the buffer.
 *
 * A second '?' is buffered as well and the scan moves on to the second
 * trigraph state.  Any other character means there is no trigraph: the
 * buffer is written out and the character is handled by the normal state.
 */
static scan_state *
normal_maybe_tri_1_func(scan_context *ctx, int input)
{
    if (input == '?') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(normal_maybe_tri_2);
    }

    sbuf_emit(ctx);
    return SCAN_STATE(normal)->scan(ctx, input);
}

/*
 * State: ordinary code, two '?' held in the buffer.
 *
 * Another '?' is echoed immediately and the state is kept, so two question
 * marks stay buffered.  A character that completes a trigraph is written
 * after the buffered text and the scan returns to normal.  Anything else
 * flushes the buffer and is handled by the normal state.
 */
static scan_state *
normal_maybe_tri_2_func(scan_context *ctx, int input)
{
    if (input == '?') {
        putchar(input);
        return SCAN_STATE(normal_maybe_tri_2);
    }

    sbuf_emit(ctx);

    /* strchr also matches the terminating NUL, so a NUL input is excluded. */
    if (input != '\0' && strchr("=()<>!'-/", input) != 0) {
        putchar(input);
        return SCAN_STATE(normal);
    }

    return SCAN_STATE(normal)->scan(ctx, input);
}

/*
 * State: inside a string literal.
 *
 * Buffered text is written out first.  A '?' or backslash is held in the
 * buffer (possible trigraph, or escape / line splice).  A double quote is
 * echoed and ends the literal.  Everything else is echoed unchanged.
 */
static scan_state *
string_func(scan_context *ctx, int input)
{
    sbuf_emit(ctx);

    if (input == '?' || input == '\\') {
        sbuf_append_char(ctx, input);
        return (input == '?') ? SCAN_STATE(string_maybe_tri_1)
                              : SCAN_STATE(string_maybe_splice);
    }

    putchar(input);
    return (input == '"') ? SCAN_STATE(normal) : SCAN_STATE(string);
}

/*
 * State: inside a string literal, one '?' held in the buffer.
 *
 * A second '?' is buffered and the scan moves to the second trigraph
 * state.  Otherwise the buffer is written out and the character is handled
 * by the string state.
 */
static scan_state *
string_maybe_tri_1_func(scan_context *ctx, int input)
{
    if (input == '?') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(string_maybe_tri_2);
    }

    sbuf_emit(ctx);
    return SCAN_STATE(string)->scan(ctx, input);
}

/*
 * State: inside a string literal, two '?' held in the buffer.
 *
 * - '?'  : echoed immediately; two question marks stay buffered.
 * - '/'  : completes the backslash trigraph; it is buffered and handled
 *          like a backslash (escape or line splice).
 * - other trigraph-completing characters: written after the buffered text,
 *          back to the string state.
 * - anything else: the buffer is written out and the character is handled
 *          by the string state.
 */
static scan_state *
string_maybe_tri_2_func(scan_context *ctx, int input)
{
    switch (input) {
    case '?':
        putchar(input);
        return SCAN_STATE(string_maybe_tri_2);
    case '/':
        sbuf_append_char(ctx, input);
        return SCAN_STATE(string_maybe_splice);
    case '=':
    case '(':
    case ')':
    case '<':
    case '>':
    case '!':
    case '\'':
    case '-':
        sbuf_emit(ctx);
        putchar(input);
        return SCAN_STATE(string);
    default:
        sbuf_emit(ctx);
        return SCAN_STATE(string)->scan(ctx, input);
    }
}

/*
 * State: inside a string literal, directly after a backslash (written
 * plainly or as a trigraph) that is held in the buffer.
 *
 * Whatever follows, newline included, is written after the buffered text
 * and the scan returns to the string state.
 */
static scan_state *
string_maybe_splice_func(scan_context *ctx, int input)
{
    sbuf_emit(ctx);
    putchar(input);
    return SCAN_STATE(string);
}

/*
 * State: inside a character constant.
 *
 * Buffered text is written out first.  A '?' or backslash is held in the
 * buffer (possible trigraph, or escape / line splice).  A single quote is
 * echoed and ends the constant.  Everything else is echoed unchanged.
 */
static scan_state *
char_func(scan_context *ctx, int input)
{
    sbuf_emit(ctx);

    if (input == '?' || input == '\\') {
        sbuf_append_char(ctx, input);
        return (input == '?') ? SCAN_STATE(char_maybe_tri_1)
                              : SCAN_STATE(char_maybe_splice);
    }

    putchar(input);
    return (input == '\'') ? SCAN_STATE(normal) : SCAN_STATE(char);
}

/*
 * State: inside a character constant, one '?' held in the buffer.
 *
 * A second '?' is buffered and the scan moves to the second trigraph
 * state.  Otherwise the buffer is written out and the character is handed
 * to the char state, which is the only place it is echoed; echoing it here
 * as well would write it twice.
 */
static scan_state *
char_maybe_tri_1_func(scan_context *ctx, int input)
{
    switch (input) {
    case '?':
        sbuf_append_char(ctx, input);
        return SCAN_STATE(char_maybe_tri_2);
    default:
        sbuf_emit(ctx);
        return SCAN_STATE(char)->scan(ctx, input);
    }
}

/*
 * State: inside a character constant, two '?' held in the buffer.
 *
 * Another '?' is echoed immediately and the state is kept.  A '/' completes
 * the backslash trigraph and is buffered for the splice state.  Any other
 * trigraph-completing character is written after the buffered text and the
 * scan returns to the char state.  Anything else flushes the buffer and is
 * handled by the char state.
 */
static scan_state *
char_maybe_tri_2_func(scan_context *ctx, int input)
{
    if (input == '?') {
        putchar(input);
        return SCAN_STATE(char_maybe_tri_2);
    }

    if (input == '/') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(char_maybe_splice);
    }

    sbuf_emit(ctx);

    /* strchr also matches the terminating NUL, so a NUL input is excluded. */
    if (input != '\0' && strchr("=()<>!'-", input) != 0) {
        putchar(input);
        return SCAN_STATE(char);
    }

    return SCAN_STATE(char)->scan(ctx, input);
}

/*
 * State: inside a character constant, directly after a backslash (written
 * plainly or as a trigraph) that is held in the buffer.
 *
 * Whatever follows, newline included, is written after the buffered text
 * and the scan returns to the char state.
 */
static scan_state *
char_maybe_splice_func(scan_context *ctx, int input)
{
    sbuf_emit(ctx);
    putchar(input);
    return SCAN_STATE(char);
}

/*
 * State: a '/' (possibly followed by line splices) is held in the buffer.
 *
 * - '?' or backslash: buffered; a line splice may still follow.
 * - '/' or '*': a comment starts; the buffered text is discarded.
 * - anything else: not a comment; the buffer is written out and the
 *   character is handled by the normal state.
 */
static scan_state *
slash_func(scan_context *ctx, int input)
{
    if (input == '?' || input == '\\') {
        sbuf_append_char(ctx, input);
        return (input == '?') ? SCAN_STATE(slash_maybe_tri_1)
                              : SCAN_STATE(slash_maybe_splice);
    }

    if (input == '/' || input == '*') {
        sbuf_clear(ctx);
        return (input == '/') ? SCAN_STATE(slashslash)
                              : SCAN_STATE(slashsplat);
    }

    sbuf_emit(ctx);
    return SCAN_STATE(normal)->scan(ctx, input);
}

/*
 * State: after '/', one '?' held in the buffer.
 *
 * A second '?' moves to the second trigraph state without being buffered
 * here; that state adds it to the buffer itself.  Any other character
 * flushes the buffer and is handled by the normal state.
 */
static scan_state *
slash_maybe_tri_1_func(scan_context *ctx, int input)
{
    if (input == '?') {
        return SCAN_STATE(slash_maybe_tri_2);
    }

    sbuf_emit(ctx);
    return SCAN_STATE(normal)->scan(ctx, input);
}

/*
 * State: after '/', two '?' seen, of which only the first is in the buffer.
 *
 * - '?': the buffer is written out, two question marks are buffered again
 *   and the scan continues in the normal second-trigraph state.
 * - otherwise the pending second '?' is buffered first, and then:
 *   - '/': completes the backslash trigraph; it is buffered and a line
 *     splice may follow.
 *   - other trigraph-completing characters: buffered, everything is
 *     written out, back to normal.
 *   - anything else: the buffer is written out and the character is
 *     handled by the normal state.
 */
static scan_state *
slash_maybe_tri_2_func(scan_context *ctx, int input)
{
    if (input == '?') {
        sbuf_emit(ctx);
        sbuf_append_string(ctx, "??");
        return SCAN_STATE(normal_maybe_tri_2);
    }

    sbuf_append_char(ctx, '?');

    if (input == '/') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(slash_maybe_splice);
    }

    /* strchr also matches the terminating NUL, so a NUL input is excluded. */
    if (input != '\0' && strchr("=()<>!'-", input) != 0) {
        sbuf_append_char(ctx, input);
        sbuf_emit(ctx);
        return SCAN_STATE(normal);
    }

    sbuf_emit(ctx);
    return SCAN_STATE(normal)->scan(ctx, input);
}

/*
 * State: after '/', a backslash (plain or trigraph) is held in the buffer.
 *
 * A newline completes a line splice; it is buffered and the scan goes back
 * to the slash state.  Anything else flushes the buffer and is handled by
 * the normal state.
 */
static scan_state *
slash_maybe_splice_func(scan_context *ctx, int input)
{
    if (input == '\n') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(slash);
    }

    sbuf_emit(ctx);
    return SCAN_STATE(normal)->scan(ctx, input);
}

/*
 * State: inside a line comment.
 *
 * Nothing is echoed while in the comment.  '?' and backslash may lead to a
 * line splice and have their own states.  A newline ends the comment: a
 * space and the newline are written and the scan returns to normal.
 */
static scan_state *
slashslash_func(scan_context *ctx, int input)
{
    (void)ctx; /* unused */

    if (input == '?') {
        return SCAN_STATE(slashslash_maybe_tri_1);
    }
    if (input == '\\') {
        return SCAN_STATE(slashslash_maybe_splice);
    }
    if (input == '\n') {
        putchar(' ');
        putchar(input);
        return SCAN_STATE(normal);
    }
    return SCAN_STATE(slashslash);
}

/*
 * State: inside a line comment, one '?' seen.
 *
 * A second '?' moves to the second trigraph state; anything else is handled
 * by the line-comment state.
 */
static scan_state *
slashslash_maybe_tri_1_func(scan_context *ctx, int input)
{
    if (input == '?') {
        return SCAN_STATE(slashslash_maybe_tri_2);
    }
    return SCAN_STATE(slashslash)->scan(ctx, input);
}

/*
 * State: inside a line comment, two '?' seen.
 *
 * Another '?' keeps this state.  A '/' completes the backslash trigraph, so
 * a line splice may follow.  Any other trigraph-completing character is
 * consumed as part of the comment.  Anything else is handled by the
 * line-comment state.
 */
static scan_state *
slashslash_maybe_tri_2_func(scan_context *ctx, int input)
{
    if (input == '?') {
        return SCAN_STATE(slashslash_maybe_tri_2);
    }
    if (input == '/') {
        return SCAN_STATE(slashslash_maybe_splice);
    }
    /* strchr also matches the terminating NUL, so a NUL input is excluded. */
    if (input != '\0' && strchr("=()<>!'-", input) != 0) {
        return SCAN_STATE(slashslash);
    }
    return SCAN_STATE(slashslash)->scan(ctx, input);
}

/*
 * State: inside a line comment, directly after a backslash (plain or
 * trigraph).
 *
 * A newline here is a line splice, so the comment continues on the next
 * line.  Anything else is handled by the line-comment state.
 */
static scan_state *
slashslash_maybe_splice_func(scan_context *ctx, int input)
{
    if (input == '\n') {
        return SCAN_STATE(slashslash);
    }
    return SCAN_STATE(slashslash)->scan(ctx, input);
}

/*
 * State: inside a block comment.
 *
 * Every character is consumed silently.  A '*' may begin the closing
 * delimiter and moves to the splat state.
 */
static scan_state *
slashsplat_func(scan_context *ctx, int input)
{
    (void)ctx; /* unused */

    return (input == '*') ? SCAN_STATE(slashsplat_splat)
                          : SCAN_STATE(slashsplat);
}

/*
 * State: inside a block comment, directly after a '*'.
 *
 * A '/' closes the comment: a single space is written in its place and the
 * scan returns to normal.  '?' and backslash may lead to a line splice
 * between the two delimiter characters and have their own states.
 * Anything else is handled by the block-comment state.
 */
static scan_state *
slashsplat_splat_func(scan_context *ctx, int input)
{
    if (input == '/') {
        putchar(' ');
        return SCAN_STATE(normal);
    }
    if (input == '?') {
        return SCAN_STATE(slashsplat_splat_maybe_tri_1);
    }
    if (input == '\\') {
        return SCAN_STATE(slashsplat_splat_maybe_splice);
    }
    return SCAN_STATE(slashsplat)->scan(ctx, input);
}

/*
 * State: inside a block comment, after '*' and one '?'.
 *
 * A second '?' moves to the second trigraph state; anything else is handled
 * by the block-comment state.
 */
static scan_state *
slashsplat_splat_maybe_tri_1_func(scan_context *ctx, int input)
{
    if (input == '?') {
        return SCAN_STATE(slashsplat_splat_maybe_tri_2);
    }
    return SCAN_STATE(slashsplat)->scan(ctx, input);
}

/*
 * State: inside a block comment, after '*' and two '?'.
 *
 * A '/' completes the backslash trigraph, so a line splice may follow.  Any
 * other trigraph-completing character is consumed as part of the comment.
 * Anything else is handled by the block-comment state.
 */
static scan_state *
slashsplat_splat_maybe_tri_2_func(scan_context *ctx, int input)
{
    if (input == '/') {
        return SCAN_STATE(slashsplat_splat_maybe_splice);
    }
    /* strchr also matches the terminating NUL, so a NUL input is excluded. */
    if (input != '\0' && strchr("=()<>!'-", input) != 0) {
        return SCAN_STATE(slashsplat);
    }
    return SCAN_STATE(slashsplat)->scan(ctx, input);
}

/*
 * State: inside a block comment, after '*' and a backslash (plain or
 * trigraph).
 *
 * A newline here is a line splice, so the preceding '*' still counts and
 * the scan returns to the splat state.  Anything else is handled by the
 * block-comment state.
 */
static scan_state *
slashsplat_splat_maybe_splice_func(scan_context *ctx, int input)
{
    if (input == '\n') {
        return SCAN_STATE(slashsplat_splat);
    }
    return SCAN_STATE(slashsplat)->scan(ctx, input);
}

/*************************/
/**** BUFFER HANDLING ****/
/*************************/

/*
 * Append one character to the context's buffer, keeping it NUL-terminated.
 *
 * ctx: scanner context that owns the buffer.
 * c:   character to append.
 *
 * The buffer starts at 128 bytes and doubles whenever the new character
 * plus the terminating NUL would not fit.  If memory cannot be obtained, a
 * message is written to stderr and the program exits with EXIT_FAILURE.
 */
static void
sbuf_append_char(scan_context *ctx, int c)
{
    /* Room is needed for the character and for the terminator after it. */
    if (ctx->sbuf == 0 || ctx->sbufcnt + 1 >= ctx->sbufsz) {
        unsigned newsz = 128;
        char *p = 0;

        if (ctx->sbuf != 0 && ctx->sbufsz != 0) {
            newsz = ctx->sbufsz * 2;
        }

        /* newsz <= sbufsz means the doubling wrapped around. */
        if (ctx->sbuf == 0 || newsz > ctx->sbufsz) {
            /* realloc on a null pointer behaves like malloc. */
            p = realloc(ctx->sbuf, newsz);
        }
        if (p == 0) {
            fprintf(stderr, "%s: memory allocation failure\n", progname);
            exit(EXIT_FAILURE);
        }

        /* Commit the new size only once the allocation has succeeded. */
        ctx->sbuf = p;
        ctx->sbufsz = newsz;
    }

    ctx->sbuf[ctx->sbufcnt++] = (char)c;
    ctx->sbuf[ctx->sbufcnt] = '\0';
}

/*
 * Append every character of the NUL-terminated string s to the context's
 * buffer, one character at a time.
 */
static void
sbuf_append_string(scan_context *ctx, char *s)
{
    for (; *s != '\0'; ++s) {
        sbuf_append_char(ctx, *s);
    }
}

/*
 * Empty the context's buffer.  The character count is reset and, if a
 * buffer has been allocated, it is made an empty string.  The allocation
 * itself is kept.
 */
static void
sbuf_clear(scan_context *ctx)
{
    if (ctx->sbuf != 0) {
        ctx->sbuf[0] = '\0';
    }
    ctx->sbufcnt = 0;
}

/*
 * Write the buffered text to stdout and empty the buffer.  Does nothing
 * when no buffer has been allocated or when it holds no characters.
 */
static void
sbuf_emit(scan_context *ctx)
{
    if (ctx->sbuf != 0 && ctx->sbufcnt != 0) {
        fputs(ctx->sbuf, stdout);

        /* Reset to an empty string, keeping the allocation. */
        ctx->sbufcnt = 0;
        ctx->sbuf[0] = '\0';
    }
}

/********************/
/**** TEST CASES ****/
/********************/

/* a comment */
/\
* a comment split */
/\
\
* a comment split twice */
/*
block comment
*/
/* comment, trailing delimiter split *\
/
/* comment, trailing delimiter split twice *\
\
/
/* comment, trailing delimiter split once, and again by trigraph *\
??/
/

/* Test strings printed by the '-t' option; the comments between the pieces are deliberate test input. */
static const char *a = /* comment in code line "*/"Hello, "/**/"World!";
static const char *b = /\
* comment on code line split */ "Hello, " /\
\
* comment on code line split twice */ "World!";

#define FOO ??/* this does not start a comment */

/* Line-comment test cases; compiled only when the compiler reports C99 or later. */
#if defined(__STDC__) && (__STDC__ == 1)
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
//*** MORE TEST CASES ***//
/\
/ // comment split
/\
\
/ // comment split twice
static const char *c = // // comment on code line
"Hello, " /\
/ // comment on code line split
"World!" /\
\
/ // comment on code line split twice.
;

#define BAR ??// this does not start a comment

// This is a // comment \
on two lines

#else
static const char *c = "STDC without STD_VERSION";
#endif
#endif

Martijn · Nov 12, 2003

James said:
This program is long. I don't really want to bore everyone with the
details, but it handles wierd cases like:

^^^^^
<rant>
Weird how this word is one of the most misspelled words I have ever come
across.
</rant>

#define FOO ??/* this is not a comment */

Why is this not a comment? I have always wondered about the behaviour of
comments in "defines". And if it really becomes part of the define, isn't
omitting it here not the same as omitting it in all the places where FOO
would be replaced?

Thanks for the info,

Joona I Palaste · Nov 12, 2003

Martijn said:
^^^^^
<rant>
Weird how this word is one of the most misspelled words I have ever come
across .
</rant>

Why is this not a comment? I have always wondered about the behaviour of
comments in "defines". And if it really becomes part of the define, isn't
omitting it here not the same as omitting it in all the places where FOO
would be replaced?

It's not a comment because the ??/ forms a trigraph, expanding into a \
(backslash).

Irrwahn Grausewitz · Nov 12, 2003

Martijn said:
James Hu wrote:

Why is this not a comment? I have always wondered about the behaviour of
comments in "defines".

The problem is not to have a comment after a #define directive, the
"problem" is that ??/ is a trigraph sequence that will be replaced
by a backslash by the C preprocessor /prior/ to comment stripping.

And if it really becomes part of the define, isn't
omitting it here not the same as omitting it in all the places where FOO
would be replaced?

Mu. Comments will be replaced by a single blank each, /before/ the
#define directive is handled by the preprocessor. (Otherwise it would
be impossible to 'comment out' preprocessor directives.)

For more info about translation phases, I suggest reading section
5.1.1.2 of the C99 Standard.

Thanks for the info,

HTH
Regards

James Hu · Nov 13, 2003

James Hu said:
This program is long. I don't really want to bore everyone with the
details, but it handles wierd cases like:

That should be "weird".

Found a bug, a cut and paste error. Discovery of the bug and the fix
will be left as an exercise to the interested reader.

-- James

Fao, Sean · Nov 13, 2003

Martijn said:
^^^^^
<rant>
Weird how this word is one of the most misspelled words I have ever come
across .
</rant>

Because the rule says I before E, except after C. The rhyme did
acknowledge "weird" and another word as exceptions but I forget that
part of it.

Why is this not a comment? I have always wondered about the behaviour of
comments in "defines". And if it really becomes part of the define, isn't
omitting it here not the same as omitting it in all the places where FOO
would be replaced?

It's not because it's in a #define. The following is a comment:

#define FOO 123 /* This is a comment */

Sean

Jeremy Yallop · Nov 13, 2003

Because the rule says I before E, except after C.

$ grep -ci '[^c]ei' /usr/dict/words
764

Alan Balmer · Nov 13, 2003

That should be "weird".

Found a bug, a cut and paste error. Discovery of the bug and the fix
will be left as an exercise to the interested reader.

-- James

Just recently I was reading someone who said that cut and paste should
be disallowed in program editors.

Fao, Sean · Nov 14, 2003

Jeremy said:
Because the rule says I before E, except after C.

Click to expand...

$ grep -ci '[^c]ei' /usr/dict/words
764

I hate English.

Martijn · Nov 14, 2003

Because the rule says I before E, except after C. The rhyme did
acknowledge "weird" and another word as exceptions but I forget that
part of it.

leisure? (and any derivative?)

Anyway, thanks everybody for the help! I suspected the ??/ was the gotcha
in this comment

I wasn't even aware of this kind of escape characters...

Thanks!

James Hu · Nov 14, 2003

My spelling is pretty good, but I am no spelling B contender. Perhaps
if I stuck with Latin a few more years in college...

leisure? (and any derivative?)

Either. Neither. Lein. Heinous. Apparently there are many exceptions.

Anyway, thanks everybody for the help! I suspected the ??/ was the
gotcha in this comment I wasn't even aware of this kind of escape
characters...

As others have noted, these are trigraphs. There are also token
sequences that are called digraphs, but they are treated like regular C
tokens, and not globally substituted during the first translation phase
as trigraphs are. My comment stripping program does not give special
treatment to digraphs, because none of the digraph sequences interfere
with a comment.

In an older version of this program (I don't think I ever posted it
here), I converted trigraphs and digraphs to their corresponding "single
character" equivalents. The primary motivation for doing so was that
I wanted to avoid buffering up the scanned input. This meant when
I encountered a line splice after a slash, I would just count how
many line splices occurred until I either did or didn't see a splat.
If eventually a non line splice occurs that is not a splat, then I
would have to output the slash and the number of splices that I had
encountered. The comment program just output the counted number of
line splices using "\\\n", and did not know if a particular splice was
created using a trigraph or not in the original source.

In the version I posted, I decided it was better to output the program
using the original text of the program (modulo turning comments into
a single space character). I suppose if I was really serious about
saving memory, I would buffer up using some sort of RLE compression.

-- James

CBFalconer · Nov 14, 2003

Fao said:
Jeremy said:

Because the rule says I before E, except after C.

Click to expand...

$ grep -ci '[^c]ei' /usr/dict/words
764

Click to expand...

I hate English .

I get 2447. Here are the first few:

abeigh
absenteeism
Acanthodei
acetylenediurein
acetyltropeine
acheilia

iterators	10	Jul 8, 2013
Command Line Arguments	0	Mar 7, 2023
Help in this program.	2	May 14, 2022
C pipe	1	Dec 9, 2021
// comments	35	Apr 26, 2008
Adding adressing of IPv6 to program	1	Feb 16, 2023
Using Do-While loop statement for a selection Menu	3	May 1, 2023
How to alter the program so that when user types z or Z or 0, the program sets both a and b to zero?	0	Oct 11, 2022

Program to remove C comments (long signature)

James Hu

Martijn

Joona I Palaste

Irrwahn Grausewitz

James Hu

Fao, Sean

Jeremy Yallop

Alan Balmer

Fao, Sean

Martijn

James Hu

CBFalconer

Ask a Question

Similar Threads

Members online

Forum statistics

Latest Threads