CBFalconer said:
I posted recently asking where it failed, and got no replies. ...
It fails the split comment cases, such as these:
/\
* this is a comment */
/\
/ this is a comment too
Also, from the previous thread I learned not to touch preprocessor
directives, so both your program and mine failed on cases like:
#define COMMENT_START /* blah blah blah
#define COMMENT_END blah blah blah */
Of course, keep in mind corner cases like:
/* hey */ #define FOO \
/* bzzt */
I have fixed my program to properly deal with preprocessor directives.
--
- James
/*
* cstripc: A C program to strip comments from C files.
* Usage:
* cstripc [file [...]]
* cstripc [-t]
*
* The '-t' option is used for testing. It prints some test strings
* whose definitions are interlaced with comment characters.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*****************/
/**** GLOBALS ****/
/*****************/
/* Name used as the prefix of diagnostics; main() sets it from argv[0],
 * or to "cstripc" when argv[0] is a null pointer. */
static const char *progname;
/* Set to 1 by main() when the '-d' option is given; parse_input_file()
 * then prints the current state name to stderr for every character. */
static int debug_flag;
/**********************/
/**** MAIN PROGRAM ****/
/**********************/
/* Forward declarations of the helpers main() calls; the definitions
 * follow later in this file. */
static void print_usage(void);
static void print_test(void);
static FILE * open_input_file(const char *filename);
static void close_input_file(FILE *infile);
static void parse_input_file(FILE *infile);
/*
 * Program entry point.
 *
 * Leading arguments that start with '-' (other than a lone "-") are
 * options:
 *   -t  print the built-in test strings and exit successfully
 *   -d  set debug_flag
 * Any other option prints a diagnostic plus the usage text and exits
 * with EXIT_FAILURE.
 *
 * The remaining arguments are input files, processed in order; with no
 * file arguments standard input is processed instead.
 *
 * Returns EXIT_SUCCESS when every named file could be opened, and
 * EXIT_FAILURE if at least one could not (the remaining files are still
 * processed).
 */
int
main(int argc, char *argv[])
{
    int status = EXIT_SUCCESS;

    progname = argv[0];
    if (progname == 0) {
        progname = "cstripc";
    }

    /* Option parsing stops at the first non-option or at a lone "-". */
    while (argc > 1) {
        if ((*argv[1] != '-') || (strcmp(argv[1], "-") == 0)) {
            break;
        }
        if (strcmp(argv[1], "-t") == 0) {
            print_test();
            exit(0);
        } else if (strcmp(argv[1], "-d") == 0) {
            debug_flag = 1;
        } else {
            fprintf(stderr, "%s: Unrecognized option '%s'\n",
                    progname, argv[1]);
            print_usage();
            exit(EXIT_FAILURE);
        }
        --argc;
        ++argv;
    }

    if (argc <= 1) {
        parse_input_file(stdin);
        return status;
    }

    while (argc > 1) {
        FILE *infile = open_input_file(argv[1]);

        if (infile == 0) {
            /* open_input_file() has already reported the problem;
             * remember it so the exit status reflects the failure. */
            status = EXIT_FAILURE;
        } else {
            parse_input_file(infile);
            close_input_file(infile);
        }
        --argc;
        ++argv;
    }
    return status;
}
/**************************/
/**** PRINT USAGE/TEST ****/
/**************************/
/*
 * printf-style format for the usage message. It contains three "%s"
 * conversions, all of which print_usage() fills with the program name.
 * NOTE(review): the '-d' option accepted by main() is not listed here.
 */
static const char *usage_string =
"%s: A C program to strip comments from C files.\n"
"Usage:\n"
" %s [file [...]]\n"
" %s [-t]\n"
"\n"
"The '-t' options is used for testing. "
"It prints some pointers to strings\n"
"that are interlaced with comment characters.\n"
;
/* Write the usage message to stderr. */
static void
print_usage(void)
{
fprintf(stderr, usage_string, progname, progname, progname);
}
/*
 * Test strings. Only declared here; their initialisers live in the
 * test-case section at the end of the file.
 */
static const char *a;
static const char *b;
static const char *c;

/*
 * Print each test string that is non-null on a line of its own
 * (this is what the '-t' option runs).
 */
static void
print_test(void)
{
    const char *samples[3];
    unsigned i;

    samples[0] = a;
    samples[1] = b;
    samples[2] = c;

    for (i = 0; i < 3; ++i) {
        if (samples[i] != 0) {
            puts(samples[i]);
        }
    }
}
/*******************************/
/**** OPEN/CLOSE INPUT FILE ****/
/*******************************/
/* Name most recently passed to open_input_file(); used by
 * close_input_file() in its diagnostic. */
static const char *input_file_name;

/*
 * Open `filename` for reading.
 *
 * A null filename yields a null stream without any diagnostic; the name
 * "-" yields stdin. If fopen() fails, a message is written to stderr and
 * a null pointer is returned.
 */
static FILE *
open_input_file(const char *filename)
{
    FILE *fp;

    input_file_name = filename;

    if (filename == 0)
        return 0;
    if (strcmp(filename, "-") == 0)
        return stdin;

    fp = fopen(filename, "r");
    if (fp != 0)
        return fp;

    fprintf(stderr, "%s: Could not open '%s' for reading.\n",
            progname, filename);
    return fp;
}
/*
 * Release a stream obtained from open_input_file().
 *
 * A null stream is ignored. stdin is not closed; only its error and
 * end-of-file indicators are cleared. Any other stream is closed, and a
 * failure to close is reported on stderr.
 */
static void
close_input_file(FILE *infile)
{
    if (infile == 0)
        return;

    if (infile == stdin) {
        clearerr(stdin);
        return;
    }

    if (fclose(infile) == EOF) {
        fprintf(stderr, "%s, Could not close '%s'.\n",
                progname, input_file_name);
    }
}
/**************************/
/**** PARSE INPUT FILE ****/
/**************************/
typedef struct scan_state scan_state;
typedef struct scan_context scan_context;
/*
 * Mutable scanner state for one input file.
 * The member order matters: initial_scan_context is initialised
 * positionally elsewhere in this file.
 */
struct scan_context {
/* Current state of the state machine. */
const scan_state *ss;
/* Heap buffer of held-back input that is either emitted later or
 * discarded; null until the first character is buffered. */
char *sbuf;
/* Allocated size of sbuf in bytes. */
unsigned sbufsz;
/* Number of characters currently held in sbuf. */
unsigned sbufcnt;
/* Nonzero while only blanks have been seen on the current line, so that
 * a '#' there starts a preprocessor directive. */
int bol;
};
/*
 * One state of the scanner: `scan` consumes a single input character and
 * returns the state to use for the next one; `name` is the text printed
 * in debug mode.
 */
struct scan_state {
const scan_state *(*scan)(scan_context *ctx, int input);
const char *name;
};
/* Tentative definition; the initialiser appears with the state machine. */
static scan_context initial_scan_context;
static void
parse_input_file(FILE *infile)
{
int c;
scan_context ctx;
if (infile == 0) {
return;
}
ctx = initial_scan_context;
while ((c = fgetc(infile)) != EOF) {
if (debug_flag) {
fprintf(stderr, "%s\n", ctx.ss->name);
}
ctx.ss = ctx.ss->scan(&ctx, c);
}
}
/***********************/
/**** STATE MACHINE ****/
/***********************/
/*
*
*********************************************************************
* Assume input is a syntactically correct C program.
*
* The basic algorithm is:
* Scan character by character:
* Treat trigraphs as a single character.
* If the sequence does not start a comment, emit the sequence.
* Otherwise,
* Scan character by character:
* Treat trigraphs as a single character.
* Treat the sequence '\\' '\n' as no character.
* If the sequence does not end a comment, continue consuming.
* Otherwise, emit a space, and loop back to top.
*********************************************************************
*
*/
/*
 * SCAN_STATE_DEFINE(name) declares the handler function name_func and
 * defines the constant state object name_state, which pairs that handler
 * with the state's name as a string.
 */
#define SCAN_STATE_DEFINE(name) \
static const scan_state * name##_func(scan_context *ctx, int input); \
static const scan_state name##_state = { name##_func, #name }
/* Ordinary program text. */
SCAN_STATE_DEFINE(normal);
SCAN_STATE_DEFINE(normal_maybe_tri_1);
SCAN_STATE_DEFINE(normal_maybe_tri_2);
SCAN_STATE_DEFINE(normal_maybe_splice);
/* Inside a string literal. */
SCAN_STATE_DEFINE(string);
SCAN_STATE_DEFINE(string_maybe_tri_1);
SCAN_STATE_DEFINE(string_maybe_tri_2);
SCAN_STATE_DEFINE(string_maybe_splice);
/* Inside a character constant. */
SCAN_STATE_DEFINE(char);
SCAN_STATE_DEFINE(char_maybe_tri_1);
SCAN_STATE_DEFINE(char_maybe_tri_2);
SCAN_STATE_DEFINE(char_maybe_splice);
/* After a '/' that may or may not open a comment. */
SCAN_STATE_DEFINE(slash);
SCAN_STATE_DEFINE(slash_maybe_tri_1);
SCAN_STATE_DEFINE(slash_maybe_tri_2);
SCAN_STATE_DEFINE(slash_maybe_splice);
/* Inside a line comment. */
SCAN_STATE_DEFINE(slashslash);
SCAN_STATE_DEFINE(slashslash_maybe_tri_1);
SCAN_STATE_DEFINE(slashslash_maybe_tri_2);
SCAN_STATE_DEFINE(slashslash_maybe_splice);
/* Inside a block comment; the _splat states follow a '*'. */
SCAN_STATE_DEFINE(slashsplat);
SCAN_STATE_DEFINE(slashsplat_splat);
SCAN_STATE_DEFINE(slashsplat_splat_maybe_tri_1);
SCAN_STATE_DEFINE(slashsplat_splat_maybe_tri_2);
SCAN_STATE_DEFINE(slashsplat_splat_maybe_splice);
/* Inside a preprocessor directive. */
SCAN_STATE_DEFINE(preproc);
SCAN_STATE_DEFINE(preproc_maybe_tri_1);
SCAN_STATE_DEFINE(preproc_maybe_tri_2);
SCAN_STATE_DEFINE(preproc_maybe_splice);
/* SCAN_STATE(name) yields a pointer to the state object name_state. */
#define SCAN_STATE(name) (&name##_state)
/* Starting context: normal state, no buffer allocated, empty buffer,
 * and positioned at the beginning of a line (bol = 1). */
static scan_context initial_scan_context = {
SCAN_STATE(normal), 0, 0, 0, 1
};
/* Helpers for the hold-back buffer in scan_context; defined in the
 * buffer-handling section. */
static void sbuf_append_char(scan_context *ctx, int c);
static void sbuf_append_string(scan_context *ctx, char *s);
static void sbuf_clear(scan_context *ctx);
static void sbuf_emit(scan_context *ctx);
/*
 * normal: handler for ordinary program text (outside literals, comments
 * and preprocessor directives).
 *
 * Every case first flushes any held-back text with sbuf_emit(). The
 * character is then either written straight to stdout, or buffered when
 * the following input decides what it means ('?', '/', backslash).
 * ctx->bol is cleared by a quote or any other non-blank character, set
 * by a newline, and left untouched by '#', '?', '/', backslash, space
 * and tab.
 */
static const scan_state *
normal_func(scan_context *ctx, int input)
{
switch (input) {
/* '#' starts a preprocessor directive only when nothing but blanks
 * preceded it on the line. */
case '#': sbuf_emit(ctx);
putchar(input);
return ctx->bol ? SCAN_STATE(preproc)
: SCAN_STATE(normal);
/* Possible first character of a trigraph: hold it back. */
case '?': sbuf_emit(ctx);
sbuf_append_char(ctx, input);
return SCAN_STATE(normal_maybe_tri_1);
case '"': ctx->bol = 0;
sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(string);
case '\'': ctx->bol = 0;
sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(char);
/* Possible start of a comment: hold the slash back until the next
 * character shows whether it must be emitted or discarded. */
case '/': sbuf_emit(ctx);
sbuf_append_char(ctx, input);
return SCAN_STATE(slash);
/* Backslash is buffered; the splice state emits it with the next
 * character. */
case '\\': sbuf_emit(ctx);
sbuf_append_char(ctx, input);
return SCAN_STATE(normal_maybe_splice);
case '\n': ctx->bol = 1;
/* fallthrough */
case ' ':
case '\t': sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(normal);
default: ctx->bol = 0;
sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(normal);
}
}
/*
 * normal_maybe_tri_1: one '?' is held in the buffer.
 *
 * A second '?' is buffered as well and the trigraph check continues.
 * Anything else means the '?' was an ordinary character: clear bol,
 * flush the buffer and let the normal state handle the input.
 */
static const scan_state *
normal_maybe_tri_1_func(scan_context *ctx, int input)
{
    if (input == '?') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(normal_maybe_tri_2);
    }

    ctx->bol = 0;
    sbuf_emit(ctx);
    return SCAN_STATE(normal)->scan(ctx, input);
}
/*
 * normal_maybe_tri_2: two question marks are held in the buffer and the
 * input decides whether they begin a trigraph.
 */
static const scan_state *
normal_maybe_tri_2_func(scan_context *ctx, int input)
{
switch (input) {
/* A further '?': write it out directly and keep the buffered pair
 * pending. The output is a run of identical characters, so writing the
 * newest one first does not change the text. */
case '?': ctx->bol = 0;
putchar(input);
return SCAN_STATE(normal_maybe_tri_2);
/* Trigraph for '#': like '#' in the normal state, it opens a
 * preprocessor directive only at the beginning of a line. */
case '=': sbuf_emit(ctx);
putchar(input);
return ctx->bol ? SCAN_STATE(preproc)
: SCAN_STATE(normal);
/* The remaining trigraphs stand for ordinary characters. */
case '(':
case ')':
case '<':
case '>':
case '!':
case '\'':
case '-': ctx->bol = 0;
sbuf_emit(ctx);
putchar(input);
return SCAN_STATE(normal);
/* Trigraph for backslash: buffer it and treat it as a possible line
 * splice. */
case '/': sbuf_append_char(ctx, input);
return SCAN_STATE(normal_maybe_splice);
/* Not a trigraph: flush the pair and reprocess the input normally. */
default: sbuf_emit(ctx);
return SCAN_STATE(normal)->scan(ctx, input);
}
}
/*
 * normal_maybe_splice: a backslash (or its trigraph spelling) is held in
 * the buffer.
 *
 * The buffer and the input character are always written out and the
 * scanner returns to the normal state. bol is left unchanged when the
 * input is a newline and cleared for any other character.
 */
static const scan_state *
normal_maybe_splice_func(scan_context *ctx, int input)
{
    if (input != '\n') {
        ctx->bol = 0;
    }
    sbuf_emit(ctx);
    putchar(input);
    return SCAN_STATE(normal);
}
/*
 * string: inside a string literal.
 *
 * Pending buffer text is flushed first. '?' and backslash are buffered
 * because the next character decides their meaning; a double quote is
 * written and ends the literal; everything else is copied through.
 */
static const scan_state *
string_func(scan_context *ctx, int input)
{
    sbuf_emit(ctx);

    if (input == '?') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(string_maybe_tri_1);
    }
    if (input == '\\') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(string_maybe_splice);
    }

    putchar(input);
    if (input == '"') {
        return SCAN_STATE(normal);
    }
    return SCAN_STATE(string);
}
/*
 * string_maybe_tri_1: one '?' from inside a string literal is buffered.
 *
 * A second '?' is buffered too; any other character flushes the buffer
 * and is handled by the string state.
 */
static const scan_state *
string_maybe_tri_1_func(scan_context *ctx, int input)
{
    if (input == '?') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(string_maybe_tri_2);
    }

    sbuf_emit(ctx);
    return SCAN_STATE(string)->scan(ctx, input);
}
/*
 * string_maybe_tri_2: two question marks from inside a string literal
 * are buffered.
 *
 * - another '?' is written directly and the pair stays pending;
 * - '/' completes the backslash trigraph, which is buffered and treated
 *   as a possible escape or splice;
 * - any other trigraph-completing character is written after the pair;
 * - anything else flushes the pair and is handled by the string state.
 */
static const scan_state *
string_maybe_tri_2_func(scan_context *ctx, int input)
{
    if (input == '?') {
        putchar(input);
        return SCAN_STATE(string_maybe_tri_2);
    }
    if (input == '/') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(string_maybe_splice);
    }

    sbuf_emit(ctx);
    if (input == '=' || input == '(' || input == ')' || input == '<' ||
        input == '>' || input == '!' || input == '\'' || input == '-') {
        putchar(input);
        return SCAN_STATE(string);
    }
    return SCAN_STATE(string)->scan(ctx, input);
}
/*
 * string_maybe_splice: a backslash inside a string literal is buffered.
 *
 * Whatever follows (newline or an escaped character, including a double
 * quote) is written after the buffered text and the literal continues.
 */
static const scan_state *
string_maybe_splice_func(scan_context *ctx, int input)
{
    sbuf_emit(ctx);
    putchar(input);
    return SCAN_STATE(string);
}
/*
 * char: inside a character constant.
 *
 * Pending buffer text is flushed first. '?' and backslash are buffered
 * because the next character decides their meaning; a single quote is
 * written and ends the constant; everything else is copied through.
 */
static const scan_state *
char_func(scan_context *ctx, int input)
{
    sbuf_emit(ctx);

    if (input == '?') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(char_maybe_tri_1);
    }
    if (input == '\\') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(char_maybe_splice);
    }

    putchar(input);
    if (input == '\'') {
        return SCAN_STATE(normal);
    }
    return SCAN_STATE(char);
}
/*
 * char_maybe_tri_1: one '?' from inside a character constant is
 * buffered.
 *
 * A second '?' is buffered too; any other character flushes the buffer
 * and is handled by the char state.
 */
static const scan_state *
char_maybe_tri_1_func(scan_context *ctx, int input)
{
    if (input == '?') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(char_maybe_tri_2);
    }

    sbuf_emit(ctx);
    return SCAN_STATE(char)->scan(ctx, input);
}
/*
 * char_maybe_tri_2: two question marks from inside a character constant
 * are buffered.
 *
 * - another '?' is written directly and the pair stays pending;
 * - '/' completes the backslash trigraph, which is buffered and treated
 *   as a possible escape or splice;
 * - any other trigraph-completing character is written after the pair;
 * - anything else flushes the pair and is handled by the char state.
 */
static const scan_state *
char_maybe_tri_2_func(scan_context *ctx, int input)
{
    if (input == '?') {
        putchar(input);
        return SCAN_STATE(char_maybe_tri_2);
    }
    if (input == '/') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(char_maybe_splice);
    }

    sbuf_emit(ctx);
    if (input == '=' || input == '(' || input == ')' || input == '<' ||
        input == '>' || input == '!' || input == '\'' || input == '-') {
        putchar(input);
        return SCAN_STATE(char);
    }
    return SCAN_STATE(char)->scan(ctx, input);
}
/*
 * char_maybe_splice: a backslash inside a character constant is
 * buffered.
 *
 * Whatever follows (newline or an escaped character, including a single
 * quote) is written after the buffered text and the constant continues.
 */
static const scan_state *
char_maybe_splice_func(scan_context *ctx, int input)
{
    sbuf_emit(ctx);
    putchar(input);
    return SCAN_STATE(char);
}
/*
 * slash: a '/' (possibly followed by line splices) is held in the
 * buffer, and the input decides whether it opens a comment.
 *
 * - '/' or '*' confirms a comment: the buffered text is discarded;
 * - '?' or backslash may be the start of a line splice and is buffered;
 * - anything else means the slash was ordinary text: flush the buffer
 *   and let the normal state handle the input.
 */
static const scan_state *
slash_func(scan_context *ctx, int input)
{
    if (input == '/' || input == '*') {
        sbuf_clear(ctx);
        return (input == '/') ? SCAN_STATE(slashslash)
                              : SCAN_STATE(slashsplat);
    }

    if (input == '?') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(slash_maybe_tri_1);
    }
    if (input == '\\') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(slash_maybe_splice);
    }

    sbuf_emit(ctx);
    return SCAN_STATE(normal)->scan(ctx, input);
}
/*
 * slash_maybe_tri_1: the buffer holds a slash followed by one '?'.
 *
 * A second '?' moves on to the next trigraph check; note that this
 * second '?' is NOT added to the buffer here. Any other character
 * flushes the buffer and is handled by the normal state.
 */
static const scan_state *
slash_maybe_tri_1_func(scan_context *ctx, int input)
{
    if (input != '?') {
        sbuf_emit(ctx);
        return SCAN_STATE(normal)->scan(ctx, input);
    }
    return SCAN_STATE(slash_maybe_tri_2);
}
/*
 * slash_maybe_tri_2: a slash and two question marks have been read, but
 * only the slash and the first '?' are in the buffer (the previous state
 * does not append the second one). Each branch below accounts for that
 * missing '?'.
 */
static const scan_state *
slash_maybe_tri_2_func(scan_context *ctx, int input)
{
switch (input) {
/* Third '?': the slash and first '?' are plain text, so flush them and
 * restart trigraph detection with the last two question marks. */
case '?': sbuf_emit(ctx);
sbuf_append_string(ctx, "??");
return SCAN_STATE(normal_maybe_tri_2);
/* Backslash trigraph: add the missing '?' and the '/' to the buffer and
 * check whether a line splice follows the slash. */
case '/': sbuf_append_char(ctx, '?');
sbuf_append_char(ctx, input);
return SCAN_STATE(slash_maybe_splice);
/* Any other trigraph: the slash did not start a comment; emit the slash
 * followed by the complete trigraph. */
case '=':
case '(':
case ')':
case '<':
case '>':
case '!':
case '\'':
case '-': sbuf_append_char(ctx, '?');
sbuf_append_char(ctx, input);
sbuf_emit(ctx);
return SCAN_STATE(normal);
/* Not a trigraph: emit the slash and both question marks, then
 * reprocess the input in the normal state. */
default: sbuf_append_char(ctx, '?');
sbuf_emit(ctx);
return SCAN_STATE(normal)->scan(ctx, input);
}
}
/*
 * slash_maybe_splice: the buffer holds a slash followed by a backslash
 * (or its trigraph spelling).
 *
 * A newline completes a line splice: it is buffered and the scanner
 * goes back to waiting for the character that follows the slash. Any
 * other character flushes the buffer and is handled by the normal
 * state.
 */
static const scan_state *
slash_maybe_splice_func(scan_context *ctx, int input)
{
    if (input == '\n') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(slash);
    }

    sbuf_emit(ctx);
    return SCAN_STATE(normal)->scan(ctx, input);
}
/*
 * slashslash: inside a line comment; input is discarded.
 *
 * - '?' and backslash may begin a line splice that continues the
 *   comment on the next line;
 * - a newline ends the comment: a space (standing in for the comment)
 *   and the newline are written, and the scanner returns to normal.
 *
 * The newline also marks the beginning of a new line in ctx->bol, just
 * as a newline read in the normal state does. Without this, a '#'
 * directive on the line after "code; // comment" would not be
 * recognised and comments inside it would be stripped.
 */
static const scan_state *
slashslash_func(scan_context *ctx, int input)
{
    switch (input) {
    case '?':
        return SCAN_STATE(slashslash_maybe_tri_1);
    case '\\':
        return SCAN_STATE(slashslash_maybe_splice);
    case '\n':
        ctx->bol = 1;
        putchar(' ');
        putchar(input);
        return SCAN_STATE(normal);
    default:
        return SCAN_STATE(slashslash);
    }
}
/*
 * slashslash_maybe_tri_1: one '?' seen inside a line comment.
 *
 * A second '?' continues the trigraph check; anything else is handled
 * by the line-comment state.
 */
static const scan_state *
slashslash_maybe_tri_1_func(scan_context *ctx, int input)
{
    if (input == '?') {
        return SCAN_STATE(slashslash_maybe_tri_2);
    }
    return SCAN_STATE(slashslash)->scan(ctx, input);
}
/*
 * slashslash_maybe_tri_2: two question marks seen inside a line comment.
 *
 * - another '?' keeps the last two question marks pending;
 * - '/' completes the backslash trigraph, so a line splice may follow;
 * - any other trigraph-completing character is simply comment text;
 * - anything else is handled by the line-comment state.
 */
static const scan_state *
slashslash_maybe_tri_2_func(scan_context *ctx, int input)
{
    if (input == '?') {
        return SCAN_STATE(slashslash_maybe_tri_2);
    }
    if (input == '/') {
        return SCAN_STATE(slashslash_maybe_splice);
    }
    if (input == '=' || input == '(' || input == ')' || input == '<' ||
        input == '>' || input == '!' || input == '\'' || input == '-') {
        return SCAN_STATE(slashslash);
    }
    return SCAN_STATE(slashslash)->scan(ctx, input);
}
/*
 * slashslash_maybe_splice: a backslash (or its trigraph spelling) seen
 * inside a line comment.
 *
 * A newline is swallowed, so the comment continues on the next line.
 * Any other character is handled by the line-comment state.
 */
static const scan_state *
slashslash_maybe_splice_func(scan_context *ctx, int input)
{
    if (input != '\n') {
        return SCAN_STATE(slashslash)->scan(ctx, input);
    }
    return SCAN_STATE(slashslash);
}
/*
 * slashsplat: inside a block comment; input is discarded.
 *
 * A '*' may begin the closing delimiter; every other character keeps
 * the scanner in this state.
 */
static const scan_state *
slashsplat_func(scan_context *ctx, int input)
{
    (void) ctx; /* the context is not needed in this state */

    if (input == '*') {
        return SCAN_STATE(slashsplat_splat);
    }
    return SCAN_STATE(slashsplat);
}
/*
 * slashsplat_splat: inside a block comment, just after a '*'.
 *
 * - '/' closes the comment; a single space is written in its place;
 * - '?' and backslash may begin a line splice between the '*' and the
 *   closing '/';
 * - anything else is handled by the block-comment state (so a run of
 *   '*' characters stays a candidate for the closing delimiter).
 */
static const scan_state *
slashsplat_splat_func(scan_context *ctx, int input)
{
    if (input == '/') {
        putchar(' ');
        return SCAN_STATE(normal);
    }
    if (input == '?') {
        return SCAN_STATE(slashsplat_splat_maybe_tri_1);
    }
    if (input == '\\') {
        return SCAN_STATE(slashsplat_splat_maybe_splice);
    }
    return SCAN_STATE(slashsplat)->scan(ctx, input);
}
/*
 * slashsplat_splat_maybe_tri_1: a '*' followed by one '?' inside a block
 * comment.
 *
 * A second '?' continues the trigraph check; anything else is handled
 * by the block-comment state.
 */
static const scan_state *
slashsplat_splat_maybe_tri_1_func(scan_context *ctx, int input)
{
    if (input == '?') {
        return SCAN_STATE(slashsplat_splat_maybe_tri_2);
    }
    return SCAN_STATE(slashsplat)->scan(ctx, input);
}
/*
 * slashsplat_splat_maybe_tri_2: a '*' followed by two question marks
 * inside a block comment.
 *
 * - '/' completes the backslash trigraph, so a line splice may follow;
 * - any other trigraph-completing character is plain comment text;
 * - anything else is handled by the block-comment state.
 */
static const scan_state *
slashsplat_splat_maybe_tri_2_func(scan_context *ctx, int input)
{
    if (input == '/') {
        return SCAN_STATE(slashsplat_splat_maybe_splice);
    }
    if (input == '=' || input == '(' || input == ')' || input == '<' ||
        input == '>' || input == '!' || input == '\'' || input == '-') {
        return SCAN_STATE(slashsplat);
    }
    return SCAN_STATE(slashsplat)->scan(ctx, input);
}
/*
 * slashsplat_splat_maybe_splice: a '*' followed by a backslash (or its
 * trigraph spelling) inside a block comment.
 *
 * A newline completes a line splice, so the '*' is still a candidate
 * for the closing delimiter. Any other character is handled by the
 * block-comment state.
 */
static const scan_state *
slashsplat_splat_maybe_splice_func(scan_context *ctx, int input)
{
    if (input != '\n') {
        return SCAN_STATE(slashsplat)->scan(ctx, input);
    }
    return SCAN_STATE(slashsplat_splat);
}
/*
 * preproc: inside a preprocessor directive. The directive is copied
 * through verbatim, comments included, until an unspliced newline.
 *
 * - '?' may begin a trigraph (in particular the trigraph spelling of
 *   backslash, which can splice the directive onto the next line), so it
 *   is buffered and checked by the preproc_maybe_tri states. Without
 *   this case those states are unreachable and a trigraph-spelled
 *   continuation would end the directive early.
 * - backslash is buffered; the splice state writes it together with the
 *   following character.
 * - newline ends the directive and returns to the normal state.
 * - everything else is written unchanged.
 */
static const scan_state *
preproc_func(scan_context *ctx, int input)
{
    switch (input) {
    case '?':
        sbuf_emit(ctx);
        sbuf_append_char(ctx, input);
        return SCAN_STATE(preproc_maybe_tri_1);
    case '\\':
        sbuf_emit(ctx);
        sbuf_append_char(ctx, input);
        return SCAN_STATE(preproc_maybe_splice);
    case '\n':
        sbuf_emit(ctx);
        putchar(input);
        return SCAN_STATE(normal);
    default:
        sbuf_emit(ctx);
        putchar(input);
        return SCAN_STATE(preproc);
    }
}
/*
 * preproc_maybe_tri_1: one '?' from a preprocessor directive is
 * buffered.
 *
 * A second '?' is buffered too; any other character flushes the buffer
 * and is handled by the preproc state.
 */
static const scan_state *
preproc_maybe_tri_1_func(scan_context *ctx, int input)
{
    if (input == '?') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(preproc_maybe_tri_2);
    }

    sbuf_emit(ctx);
    return SCAN_STATE(preproc)->scan(ctx, input);
}
/*
 * preproc_maybe_tri_2: two question marks from a preprocessor directive
 * are buffered.
 *
 * - another '?' is written directly and the pair stays pending;
 * - '/' completes the backslash trigraph, which is buffered and treated
 *   as a possible line splice;
 * - any other trigraph-completing character is written after the pair;
 * - anything else flushes the pair and is handled by the preproc state.
 */
static const scan_state *
preproc_maybe_tri_2_func(scan_context *ctx, int input)
{
    if (input == '?') {
        putchar(input);
        return SCAN_STATE(preproc_maybe_tri_2);
    }
    if (input == '/') {
        sbuf_append_char(ctx, input);
        return SCAN_STATE(preproc_maybe_splice);
    }

    sbuf_emit(ctx);
    if (input == '=' || input == '(' || input == ')' || input == '<' ||
        input == '>' || input == '!' || input == '\'' || input == '-') {
        putchar(input);
        return SCAN_STATE(preproc);
    }
    return SCAN_STATE(preproc)->scan(ctx, input);
}
/*
 * preproc_maybe_splice: a backslash inside a preprocessor directive is
 * buffered.
 *
 * Whatever follows, newline included, is written after the buffered
 * text and the directive continues.
 */
static const scan_state *
preproc_maybe_splice_func(scan_context *ctx, int input)
{
    sbuf_emit(ctx);
    putchar(input);
    return SCAN_STATE(preproc);
}
/*************************/
/**** BUFFER HANDLING ****/
/*************************/
/*
 * Append character `c` to the hold-back buffer of `ctx`, keeping the
 * buffer NUL-terminated.
 *
 * The buffer is allocated on first use (128 bytes) and doubled whenever
 * there is no room left for the new character plus the terminating NUL.
 * (Growing only when the count equals the size would let the terminator
 * land one byte past the end of the allocation.)
 *
 * On allocation failure, or if the size cannot be doubled without
 * overflow, a diagnostic is printed and the program exits with
 * EXIT_FAILURE.
 */
static void
sbuf_append_char(scan_context *ctx, int c)
{
    if (ctx->sbuf == 0 || ctx->sbufcnt + 1 >= ctx->sbufsz) {
        unsigned newsz;
        char *p = 0;

        if (ctx->sbuf == 0 || ctx->sbufsz == 0) {
            newsz = 128;
        } else {
            newsz = ctx->sbufsz * 2;
        }

        /* newsz <= sbufsz can only happen if the doubling wrapped. */
        if (ctx->sbuf == 0 || newsz > ctx->sbufsz) {
            /* realloc() on a null pointer behaves like malloc(). */
            p = realloc(ctx->sbuf, newsz);
        }
        if (p == 0) {
            fprintf(stderr, "%s: memory allocation failure\n",
                    progname);
            exit(EXIT_FAILURE);
        }
        ctx->sbuf = p;
        ctx->sbufsz = newsz;
    }

    ctx->sbuf[ctx->sbufcnt++] = (char) c;
    ctx->sbuf[ctx->sbufcnt] = '\0';
}
/*
 * Append every character of the NUL-terminated string `s` to the
 * hold-back buffer of `ctx`. `s` itself is not modified.
 */
static void
sbuf_append_string(scan_context *ctx, char *s)
{
    const char *cursor;

    for (cursor = s; *cursor != '\0'; ++cursor) {
        sbuf_append_char(ctx, *cursor);
    }
}
/*
 * Discard the contents of the hold-back buffer of `ctx`. The allocation
 * is kept; if a buffer exists it is reset to the empty string.
 */
static void
sbuf_clear(scan_context *ctx)
{
    ctx->sbufcnt = 0;
    if (ctx->sbuf != 0) {
        ctx->sbuf[0] = '\0';
    }
}
/*
 * Write the contents of the hold-back buffer of `ctx` to stdout and
 * empty it. Does nothing when no buffer has been allocated or when the
 * buffer is empty.
 */
static void
sbuf_emit(scan_context *ctx)
{
    if (ctx->sbuf != 0 && ctx->sbufcnt != 0) {
        fputs(ctx->sbuf, stdout);
        sbuf_clear(ctx);
    }
}
/********************/
/**** TEST CASES ****/
/********************/
/* a comment */
/\
* a comment split */
/\
\
* a comment split twice */
/*
block comment
*/
/* comment, trailing delimiter split *\
/
/* comment, trailing delimiter split twice *\
\
/
/* comment, trailing delimiter split once, and again by trigraph *\
??/
/
/* Test strings a and b: once their interleaved comments are removed,
 * the adjacent literals of each concatenate to "Hello, World!". The
 * '-t' option prints them. */
static const char *a = /* comment in code "*/"Hello, "/**/"World!";
static const char *b = /\
* comment on code line split */ "Hello, " /\
\
* comment on code line split twice */ "World!";
/* Preprocessor test cases: a trigraph that prevents a comment from
 * starting, and directives whose trailing comments are marked as text
 * the stripper must leave alone. */
#if 0
??/* this does not start a comment */
#endif
#define FOO1 /* don't touch this */
#define FOO2 \
/* don't touch this */
/* comment */ #define FOO3 /* don't touch this */
#define FOO4 /* don't touch
#define FOO5 this */
/* Line-comment test cases, compiled only by a C99 or later compiler.
 * The version check uses the standard macro __STDC_VERSION__; with the
 * misspelling __STD_VERSION__ the test was always false, so these cases
 * were never compiled and c was always the fallback string below. */
#if defined(__STDC__) && (__STDC__ == 1)
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
//*** MORE TEST CASES ***//
/\
/ // comment split
/\
\
/ // comment split twice
static const char *c = // // comment on code line
"Hello, " /\
/ // comment on code line split
"World!" /\
\
/ // comment on code line split twice.
;
#if 0
??// this does not start a comment
#endif
// This is a // comment \
on two lines
#else
static const char *c = "STDC without STD_VERSION";
#endif
#endif