delete comments in .c file

C

CBFalconer

*** rude top-posting fixed ***

Stephen said:
Timex said:
I want to delete all comments in .c file.

Size of .c file is very big.

Any good idea to do this?

Please show me example code.

Here's a perl script which will handle *MOST* sane C code...

Some things that it will miss (scan manually for them first):

a double quote inside of single quotes (e.g.)
char confusion = '"';

C-99 // comments like this

I'm sure that some people can come up with other convoluted
counter-examples.

It reads and plays with the entire file, so it will need to
hold at least two or three copies of it in RAM. (for today's
computers, that would be some number of megabytes).

If you want any of the above fixed, feel free to send me a
cheque.
____________________________________________________
#!/usr/bin/perl
# Read every input line (named files, or standard input) and join them
# into one string, so a match may span several lines.
$s=join("",<>);
# Disabled debugging aid: would print the input before substitution.
# printf "[[%s]]\n\n",$s;
# One global substitution with three alternatives, tried in this order:
#   1. a double-quoted string literal (escaped backslashes and escaped
#      quotes allowed inside); this is the only alternative held in $1
#   2. a slash-star ... star-slash block comment
#   3. a double-slash comment running to the end of the line
# Every match is replaced by "[[", the contents of $1, one space and "]]".
# $1 is only set when alternative 1 matched, so the text of a comment is
# not carried into the output, while a string literal is kept (followed
# by the extra space).
$s=~ s/("(\\\\|\\"|[^"])*")|(\/\*([^*]|\*(?=[^\/]))*\*\/)|(\/\/.*)/[[$1 ]]/g;
# Print the rewritten text between "[[" and "]]", then a blank line.
printf "[[%s]]\n\n",$s;
____________________________________________________
Yep, That's it... 5 lines including the shell header.

Please do not top-post.

The following AFAIK does not have the above faults, and does not
need to store any file copies, in fact not even any line copies.
It will probably be at least an order of magnitude faster.

/* File uncmntc.c - demo of a text filter
Strips C comments. Tested to strip itself
by C.B. Falconer. 2002-08-15
Public Domain. Attribution appreciated
report bugs to <mailto:[email protected]>
*/

/* With gcc3.1, must omit -ansi to compile eol comments */

#include <stdio.h>
#include <stdlib.h>

/* Two-slot character pipeline shared by the whole filter:
   ch     - the character just read (0 once it has been handed on)
   lastch - the character before it, not yet written (0 = nothing pending) */
static int ch, lastch;

/* ---------------- */

/* Write the pending character, if there is one, to stdout; then move
   the current character into the pending slot and clear the current
   slot. */
static void putlast(void)
{
    if (lastch) {
        putchar(lastch);
    }
    lastch = ch, ch = 0;
} /* putlast */

/* ---------------- */

/* Discard characters from stdin up to and including the star-slash
   that closes a block comment.  The caller has already consumed the
   opening delimiter.  Returns '/' (the last character of the closing
   delimiter), or EOF if input ends before the comment is closed. */
static int stdcomment(void)
{
    int prev = 0;   /* character read on the previous pass; 0 = none yet */
    int cur;

    for (;;) {
        cur = fgetc(stdin);
        if (cur == EOF) {
            return EOF;
        }
        if (prev == '*' && cur == '/') {
            return cur;
        }
        prev = cur;
    }
} /* stdcomment */

/* ---------------- */

/* Discard the rest of an end-of-line comment from stdin.  A newline
   that directly follows a backslash does not end the comment, so a
   continued comment is swallowed whole.  Returns '\n' when the
   terminating newline has been consumed, or EOF if input ends first. */
static int eolcomment(void)
{
    int prev = '\0';    /* character read on the previous pass */
    int cur;

    while ((cur = fgetc(stdin)) != EOF) {
        if (cur == '\n' && prev != '\\') {
            return cur;
        }
        prev = cur;
    }
    return EOF;
} /* eolcomment */

/* ---------------- */

/* Echo a string literal through the ch/lastch pipeline.

   On entry ch holds the opening '"' and lastch the character before
   it.  Characters are passed on with putlast() until the closing,
   unescaped '"' is read.

   Returns '"' with ch holding the closing quote and lastch holding the
   not-yet-written character before it, or EOF if input ends inside the
   literal.

   Escapes are tracked as a running state rather than by looking at the
   single previous character, so a literal that ends in an escaped
   backslash is closed correctly, and the very first character after
   the opening quote is checked too, so an empty literal is recognised. */
static int echostring(void)
{
    int escaped = 0;    /* previous char was a backslash that is not itself escaped */

    putlast();          /* flush the char before the literal; opening quote is now pending */
    for (;;) {
        if (EOF == (ch = fgetc(stdin))) return EOF;
        if (('"' == ch) && !escaped) return ch;
        /* a backslash escapes the next char unless it is itself escaped */
        escaped = ('\\' == ch) && !escaped;
        putlast();
    }
} /* echostring */

/* ---------------- */

/* Copy stdin to stdout with comments removed.  Every character is held
   back for one step in lastch so that a '/' can be dropped if the next
   character shows it opened a comment.  Returns 0 at end of input, or
   EXIT_FAILURE if input ends inside a string literal.  Several odd
   comments and strings below presumably serve as test data for the
   file header's claim that the program can strip itself. */
int main(void)
{
lastch = '\0';
while (EOF != (ch = fgetc(stdin))) {
/* A pending '/' may open a comment; the character just read decides. */
if ('/' == lastch)
if (ch == '*') {
/* Block comment: forget the pending '/', skip to the closing
   delimiter, and leave one space pending in the comment's place. */
lastch = '\0';
if (EOF == stdcomment()) break;
ch = ' ';
putlast();
}
else if (ch == '/') {
/* End-of-line comment: forget the pending '/', skip the rest of the
   line, and leave a newline pending in its place. */
lastch = '\0';
if (EOF == eolcomment()) break;
ch = '\n';
putlast(); // Eolcomment here
// Eolcomment line \
with continuation line.
}
else {
/* The '/' was ordinary text: write it and keep going. */
putlast();
}
/* This else pairs with the outer if ('/' == lastch).  A double quote
   starts a string literal unless it follows a backslash or a single
   quote. */
else if (('"' == ch) && ('\\' != lastch)
&& ('\'' != lastch)) {
if ('"' != (ch = echostring())) {
/* Input ended inside the literal: report on stderr and fail. */
fputs("\"Unterminated\" string\n", stderr);
fputs("checking for\
continuation line string\n", stderr);
fputs("checking for" "concat string\n", stderr);
return EXIT_FAILURE;
}
putlast();
}
else {
putlast();
}
} /* while */
/* Write whatever character is still pending. */
putlast(/* embedded comment */);
return 0;
} /* main */
 
E

Ed Morton

Timex said:
I want to delete all comments in .c file.

Size of .c file is very big.

Any good idea to do this?

Please show me example code.

Try "ncsl": http://www.lucentssg.com/displayProduct.cfm?prodid=33
It strips all comments and indentation so just run an indenter (e.g.
"indent") or a C beautifier (e.g. "cb" - google for "cb download
beautifier" and take your pick) on the output to get it back in readable
format. Disclaimer - I've never used this specific download of "ncsl",
I've just used the version provided on UNIX boxes within Lucent.

Ed.
 
S

Stephen Samuel

Here's a perl script which will handle *MOST* sane C code...

Some things that it will miss (scan manually for them first):

a double quote inside of single quotes (e.g.)
char confusion = '"';

C-99 // comments like this

I'm sure that some people can come up with other convoluted counter-examples.

It reads and plays with the entire file, so it will need to hold
at least two or three copies of it in RAM. (for today's computers,
that would be some number of megabytes).

If you want any of the above fixed, feel free to send me a cheque.
____________________________________________________
#!/usr/bin/perl
# Read every input line (named files, or standard input) and join them
# into one string, so a match may span several lines.
$s=join("",<>);
# Disabled debugging aid: would print the input before substitution.
# printf "[[%s]]\n\n",$s;
# One global substitution with three alternatives, tried in this order:
#   1. a double-quoted string literal (escaped backslashes and escaped
#      quotes allowed inside); this is the only alternative held in $1
#   2. a slash-star ... star-slash block comment
#   3. a double-slash comment running to the end of the line
# Every match is replaced by "[[", the contents of $1, one space and "]]".
# $1 is only set when alternative 1 matched, so the text of a comment is
# not carried into the output, while a string literal is kept (followed
# by the extra space).
$s=~ s/("(\\\\|\\"|[^"])*")|(\/\*([^*]|\*(?=[^\/]))*\*\/)|(\/\/.*)/[[$1 ]]/g;
# Print the rewritten text between "[[" and "]]", then a blank line.
printf "[[%s]]\n\n",$s;
____________________________________________________
Yep, That's it... 5 lines including the shell header.

One bug: Quoted strings have a space inserted after them.
Again: fixable, but not worth the trouble for free.
 
S

Stephen Samuel

Here's a perl script which will handle *MOST* sane C code...

Some things that it will miss (scan manually for them first):

a double quote inside of single quotes (e.g.)
char confusion = '"';

C-99 // comments like this

I'm sure that some people can come up with other convoluted counter-examples.

It reads and plays with the entire file, so it will need to hold
at least two or three copies of it in RAM. (for today's computers,
that would be some number of megabytes).

If you want any of the above fixed, feel free to send me a cheque.
____________________________________________________
#!/usr/bin/perl
# Read every input line (named files, or standard input) and join them
# into one string, so a match may span several lines.
$s=join("",<>);
# Disabled debugging aid: would print the input before substitution.
# printf "[[%s]]\n\n",$s;
# One global substitution with three alternatives, tried in this order:
#   1. a double-quoted string literal (escaped backslashes and escaped
#      quotes allowed inside); this is the only alternative held in $1
#   2. a slash-star ... star-slash block comment
#   3. a double-slash comment running to the end of the line
# Every match is replaced by "[[", the contents of $1, one space and "]]".
# $1 is only set when alternative 1 matched, so the text of a comment is
# not carried into the output, while a string literal is kept (followed
# by the extra space).
$s=~ s/("(\\\\|\\"|[^"])*")|(\/\*([^*]|\*(?=[^\/]))*\*\/)|(\/\/.*)/[[$1 ]]/g;
# Print the rewritten text between "[[" and "]]", then a blank line.
printf "[[%s]]\n\n",$s;
____________________________________________________
Yep, That's it... 5 lines including the shell header.

One bug: Quoted strings have a space inserted after them.
Again: fixable, but not worth the trouble for free.
 
S

Stephen Samuel

Irrwahn said:
<snip>

Since when is perl topical in c.l.c?
It's a C solution .. But Perl is written in C, so if you like,
I can just
#include said:
BTW:
Does your "solution" account for comment delimiters inside string
literals? (I'm unfortunately unable to decrypt the line-noise
provided.)

Yes. It accounts for comment delimiters in quotes and quote
delimiters in comments (One side effect is that double quote
strings have a space added after them. Given the way that I
wrote it, it was a choice between that, replacing comments with
Nothing (possible to cause syntax errors) or added complexity.)

It also handles quoted double-quotes inside of strings.

It does NOT handle double-quote or comment-start delimiters inside
of single-quotes (char literals), but that would be easy enough to add.
 
J

Joona I Palaste

Stephen Samuel said:
It's a C solution .. But Perl is written in C, so if you like,
I can just
#include <perl-source.c>

Are Perl implementations *required* to be written in C? And are
Perl implementations *required* to ship with the source code?

--
/-- Joona Palaste ([email protected]) ------------- Finland --------\
\-- http://www.helsinki.fi/~palaste --------------------- rules! --------/
"'So called' means: 'There is a long explanation for this, but I have no
time to explain it here.'"
- JIPsoft
 
S

Stephen Samuel

CBFalconer said:
*** rude top-posting fixed ***
Hmm.. This must be a relatively recent addition to usenet
etiquette (i.e. in the last decade or so).

Apologies. I'm an old fogey, and it's probably been a decade
since I've posted here.
 
I

Irrwahn Grausewitz

Stephen Samuel said:
It's a C solution

Err, no.
.. But Perl is written in C, so if you like,
I can just
#include <perl-source.c>

Non-standard header file. ;-)
Nice.

It accounts for comment delimiters in quotes and quote
delimiters in comments (One side effect is that double quote
strings have a space added after them. Given the way that I
wrote it, it was a choice between that, replacing comments with
Nothing (possible to cause syntax errors) or added complexity.)

Hm. AFAICT that shouldn't cause much trouble, OK.
It also handles quoted double-quotes inside of strings.

ITYM something like "\""?
It does NOT handle double-quote or comment-start delimiters inside
of single-quotes (char literals), but that would be easy enough to
add.

Fair enough.
But there might still be "strange" cases where your script may
fail. Consider:

/* gotcha! *\
/

A C preprocessor would have deleted the <backslash><new-line> sequence
in translation phase 2 *before* the tokenization and comment replacement
takes place in phase 3. And if the backslash is written as a trigraph
sequence we need to "fake" translation phase 1 as well... :-(

Admittedly, these are rare situations, but you see: sophisticated
comment replacement in C files isn't /that/ easy after all, you have to
provide quite an amount of preprocessor functionality to get it right.

Best Regards
 
K

Keith Thompson

Joona I Palaste said:
Are Perl implementations *required* to be written in C? And are
Perl implementations *required* to ship with the source code?

<OT>
Perl is pretty much defined by its implementation, not by a language
standard. The implementation (there's basically only one) is written
in C. It's distributed under one of two open source licenses, both of
which require the source to be available (but not necessarily shipped
with the binaries).

This is probably incorrect in some minor details. If I had posted to
a more appropriate newsgroup, someone would jump in and correct me.
</OT>
 
J

Joona I Palaste

<OT>
Perl is pretty much defined by its implementation, not by a language
standard. The implementation (there's basically only one) is written
in C. It's distributed under one of two open source licenses, both of
which require the source to be available (but not necessarily shipped
with the binaries).
This is probably incorrect in some minor details. If I had posted to
a more appropriate newsgroup, someone would jump in and correct me.
</OT>

OK, I have to concede that, but Samuel's answer still wasn't
sufficient. Writing #include <perl_source.h> at the top of the Perl
file will change the program into a mix-and-match of C and Perl,
which will not compile as either language.
 
K

Keith Thompson

Irrwahn Grausewitz said:
<snip>

Since when is perl topical in c.l.c?

This is an interesting edge case with respect to topicality. One could
argue that we're talking *about* C (which is clearly topical), but
we're using a mixture of Perl and English to discuss it. Think of the
Perl regular expression as a description of how to strip comments from
C source code.

On the other hand, not everyone here can be expected to speak Perl
regexps fluently.
 
I

Irrwahn Grausewitz

Stephen Samuel said:
Hmm.. This must be a relatively recent addition to usenet
etiquette (i.e. in the last decade or so).

It's a convention in comp.lang.c (and several other technical
newsgroups) to place your comments after the part of the original
post you are responding to, in order to retain context. Thus
top-posting is discouraged in c.l.c.
Apologies. I'm an old fogey, and it's probably been a decade
since I've posted here.

Again, please do not send email copies of your replies; thank you.

Regards
 
I

Irrwahn Grausewitz

Keith Thompson said:
This is an interesting edge case with respect to topicality. One could
argue that we're talking *about* C (which is clearly topical), but
we're using a mixture of Perl and English to discuss it. Think of the
Perl regular expression as a description of how to strip comments from
C source code.

That would make any solution to manipulate C sources implemented in
any language other than C topical in c.l.c. IMHO that would not be a
Good Thing[tm].
On the other hand, not everyone here can be expected to speak Perl
regexps fluently.

Indeed.

Regards
 
A

Arthur J. O'Dwyer

Stephen said:
Timex said:
I want to delete all comments in .c file.

#!/usr/bin/perl
# Read every input line (named files, or standard input) and join them
# into one string, so a match may span several lines.
$s=join("",<>);
# Disabled debugging aid: would print the input before substitution.
# printf "[[%s]]\n\n",$s;
# One global substitution with three alternatives, tried in this order:
#   1. a double-quoted string literal (escaped backslashes and escaped
#      quotes allowed inside); this is the only alternative held in $1
#   2. a slash-star ... star-slash block comment
#   3. a double-slash comment running to the end of the line
# Every match is replaced by "[[", the contents of $1, one space and "]]".
# $1 is only set when alternative 1 matched, so the text of a comment is
# not carried into the output, while a string literal is kept (followed
# by the extra space).
$s=~ s/("(\\\\|\\"|[^"])*")|(\/\*([^*]|\*(?=[^\/]))*\*\/)|(\/\/.*)/[[$1 ]]/g;
# Print the rewritten text between "[[" and "]]", then a blank line.
printf "[[%s]]\n\n",$s;
/* File uncmntc.c - demo of a text filter
Strips C comments. Tested to strip itself
by C.B. Falconer. 2002-08-15
Public Domain. Attribution appreciated
report bugs to <mailto:[email protected]>
*/
<snip code>

I ran your program through some hurdles, and found that
it couldn't handle multibyte character constants for some
reason. I didn't bother to track down why; I just re-wrote
the filter from scratch. ;-) Here's my version, whose
algorithm may be completely different from yours.
This algorithm, on the other hand, completely fails to
handle line-splicing in the middle of comment delimiters: /\
* this is a comment */ does not work, nor does /* this either *\
/. Comment removal really is tricky in the most general case!
Proper error-checking on getc() and putc(), and a good
command-line interface, left as exercises for the interested
reader.


/* File uncmntc2.c - demo of a different text filter
Strips C comments. Tested to strip itself
Improves on CBFalconer's design by correctly handling '/*'
and by having a C89/C99 switch, but doesn't handle the /\
* delimiter correctly.
by Arthur O'Dwyer, 2002-11-03
Public Domain. Attribution appreciated
don't bother reporting bugs, just fix 'em...
*/

#include <stdio.h>
#include <stdlib.h>

/* Strip C99-style end-of-line comments?
   Nonzero: a double slash starts a comment that strip_comments() removes.
   Zero: a double slash is copied through as ordinary text. */
int AllowEOLComments = 1;

/* Copies the text read from fp to outfp with comments removed. */
int strip_comments(FILE *fp, FILE *outfp);
/* Output helper for strip_comments(): holds back a '/' until the next
   character shows whether it opened a comment. */
static int put_carefully(int lastch, int ch, FILE *outfp);


/* Filter standard input to standard output, stripping comments.
   Returns EXIT_FAILURE if strip_comments() reports a non-zero status or
   if the buffered output cannot be flushed (for example a full disk or
   a closed pipe); otherwise EXIT_SUCCESS. */
int main(void)
{
    if (strip_comments(stdin, stdout) != 0)
        return EXIT_FAILURE;
    /* Flush explicitly so that a write error is seen before exiting. */
    if (fflush(stdout) == EOF)
        return EXIT_FAILURE;
    return EXIT_SUCCESS;
}


/*
 * Copy C source text from fp to outfp with its comments removed.
 *
 * A block comment is replaced by a single space.  An end-of-line
 * comment (recognised only when AllowEOLComments is nonzero) is
 * removed up to, but not including, the newline that ends it, so the
 * line structure of the input is preserved.  Text inside string
 * literals and character constants is copied unchanged.
 *
 * Returns 0 on success, or EOF if either stream reports an error.
 *
 * Limitations: trigraphs are not translated, and a backslash-newline
 * that splits a comment delimiter in two is not recognised.
 */
int strip_comments(FILE *fp, FILE *outfp)
{
    int ch;
    int lastch;
    int inchotes = 0;       /* inside a '...' character constant */
    int inquotes = 0;       /* inside a "..." string literal */
    int incomment = 0;      /* inside a block comment */
    int ineolcomment = 0;   /* inside an end-of-line comment */
    int backslashed = 0;    /* lastch is a backslash that is not itself escaped */

    for (lastch = ' '; (ch = getc(fp)) != EOF; lastch = ch)
    {
        /* Output is decided by the state in force BEFORE ch is examined. */
        if (!incomment && !ineolcomment)
        {
            if (inquotes || inchotes)
                putc(ch, outfp);
            else
                put_carefully(lastch, ch, outfp);
        }

        if (inchotes || inquotes) {
            /* Track escape parity so that an escaped backslash followed
             * by the closing quote ends the literal, while an escaped
             * quote does not. */
            backslashed = (lastch == '\\') ? !backslashed : 0;
            if (!backslashed && ch == (inchotes ? '\'' : '"'))
                inchotes = inquotes = 0;
        } else if (incomment) {
            if (ch == '/' && lastch == '*') {
                incomment = 0;
                ch = ' ';   /* the closing '/' must not look like a pending slash */
            }
        } else if (ineolcomment) {
            /* A newline preceded by a backslash is a line splice and
             * continues the comment onto the next line. */
            if (ch == '\n' && lastch != '\\') {
                ineolcomment = 0;
                putc(ch, outfp);    /* keep the newline; only the comment goes */
            }
        } else {
            if (ch == '\'')
                inchotes = 1;
            else if (ch == '"')
                inquotes = 1;
            else if (lastch == '/' && ch == '*') {
                putc(' ', outfp);
                incomment = 1;
                ch = ' ';   /* the opener's '*' must not pair with a following '/' */
            }
            else if (AllowEOLComments && lastch == '/' && ch == '/')
                ineolcomment = 1;
        }
    }

    /* Flush a '/' that put_carefully() is still holding back.  Inside a
     * comment the slash is comment text, and inside a literal it has
     * already been written, so only plain code needs the flush. */
    if (lastch == '/' && !inchotes && !inquotes && !incomment && !ineolcomment)
        putc(lastch, outfp);

    return (ferror(fp) || ferror(outfp)) ? EOF : 0;
}


static int put_carefully(int lastch, int ch, FILE *outfp)
{
    /* Write 'ch' while holding back anything that could still turn
     * out to be half of a comment opener.  A '/' is never written
     * when it arrives as 'ch'; it is written one call later, as
     * 'lastch', once the following character shows that it did not
     * open a comment.  Always returns 0.
     */
    const int slash_pending = (lastch == '/');
    const int opens_comment =
        slash_pending && (ch == '*' || (AllowEOLComments && ch == '/'));

    if (opens_comment)
        return 0;
    if (slash_pending)
        putc('/', outfp);
    if (ch != '/')
        putc(ch, outfp);
    return 0;
}
 
C

CBFalconer

Arthur J. O'Dwyer said:
.... snip ...

<snip code>

I ran your program through some hurdles, and found that
it couldn't handle multibyte character constants for some
reason. I didn't bother to track down why; I just re-wrote
the filter from scratch. ;-) Here's my version, whose
algorithm may be completely different from yours.
.... snip ...

A known failing. It also fails miserably with trigraphs. The
multibyte char is probably easily handled analogously to handling
quoted strings.
/* File uncmntc2.c - demo of a different text filter
Strips C comments. Tested to strip itself
Improves on CBFalconer's design by correctly handling '/*'
and by having a C89/C99 switch, but doesn't handle the /\
* delimiter correctly.
by Arthur O'Dwyer, 2002-11-03
^^^^
That is the year I wrote mine :)

All of which shows that there are multiple ways to implement a
black box. I omitted any reference to cats because I happen to
like them.
 
P

Patrick Foley

In said:
[snip] Comment removal really is tricky in the most general case!

Since this is exercise 1-23 in K&R2, there are several solutions
available at Richard's site:

http://users.powernet.co.uk/eton/kandr2/index.html

including a 556-line entry from Chris Torek that I think also brews
coffee...

Pat

BTW, Richard: Would you consider adding a plaintext version of the
"naming conventions" page to the zipfile as a sort of "README"?
 
R

Richard Heathfield

Patrick said:
BTW, Richard: Would you consider adding a plaintext version of the
"naming conventions" page to the zipfile as a sort of "README"?

I am currently re-evaluating the Answers section of my site. I'll get back
to you when I have a bit more time.
 
C

Chris Torek

In said:
[snip] Comment removal really is tricky in the most general case!

Since this is exercise 1-23 in K&R2, there are several solutions
available at Richard's site:

http://users.powernet.co.uk/eton/kandr2/index.html

including a 556-line entry from Chris Torek that I think also brews
coffee...

But it has (gasp!) a *bug*. :) The "level 2 state machine" for
handling comments fails to reconsider characters in a few cases.
I think the main (only?) problem can be fixed without too much
fuss:

case L2_SLASH:
if (c == '*')
l2state = L2_COMM;
else if (c99 && c == '/')
l2state = L2_SLASHSLASH;
else {
SYNCLINES();
OUTPUT('/', 0);
--> if (c != '/') {
--> if (c != EOF)
--> COPY();
--> l2state = L2_NORMAL;
--> }
}
break;

The bug is in the marked lines, which output the first slash
and then change the level-2 state. But the new state should
be "that which results in seeing character c as if the initial
state had been L2_NORMAL", so we could replace all of them with:

l2state = L2_NORMAL;
goto l2_normal_case;

and add an "l2_normal_case" label under case L2_NORMAL: above.
Alternatively, the assignment to l2state can be changed to:

l2state = c == '\'' ? L2_CC :
c == '"' ? L2_SC : L2_NORMAL;

which avoids the dreaded "goto", and simply duplicates what would
have happened in L2_NORMAL state (except of course that instead of
replacing l2state with L2_SLASH for '/', we have to replace it with
L2_NORMAL for characters that are not in [/'"]).
 
T

Thomas Matthews

Timex said:
I want to delete all comments in .c file.

Size of .c file is very big.

Any good idea to do this?

Please show me example code.

Perhaps a better idea is to break the file into
smaller pieces upon better themes.

I believe that deleting all the comments is a crime
against programming ethics. After all, one of
the greatest ideals to achieve is to make a
program readable by a programming-illiterate
person.

--
Thomas Matthews

C++ newsgroup welcome message:
http://www.slack.net/~shiva/welcome.txt
C++ Faq: http://www.parashift.com/c++-faq-lite
C Faq: http://www.eskimo.com/~scs/c-faq/top.html
alt.comp.lang.learn.c-c++ faq:
http://www.raos.demon.uk/acllc-c++/faq.html
Other sites:
http://www.josuttis.com -- C++ STL Library book
 
G

Gary E. Ansok

I tested Arthur's program and, despite its claim, it couldn't
even strip its own comments (it left in the comment in
put_carefully()). The bug is that it thought the backslash
meant that '\\' was not a complete character constant (nor
would it think "\\" was a complete string).

Is this a complete C99-style comment?
// \\
If it is, a similar fix may be needed in that part of the code.

Lesson:
Comment removal really is tricky in the most general case!
Agreed.

-- Gary

My attempt at a bug fix:
/* File uncmntc2.c - demo of a different text filter
Strips C comments. Tested to strip itself
Improves on CBFalconer's design by correctly handling '/*'
and by having a C89/C99 switch, but doesn't handle the /\
* delimiter correctly.
by Arthur O'Dwyer, 2002-11-03
bug fix by Gary Ansok, 2003-11-06 to handle '\\' and "\\"
Public Domain. Attribution appreciated
don't bother reporting bugs, just fix 'em...
*/

#include <stdio.h>
#include <stdlib.h>

/* Strip C99-style end-of-line comments?
   Nonzero: a double slash starts a comment that strip_comments() removes.
   Zero: a double slash is copied through as ordinary text. */
int AllowEOLComments = 1;

/* Copies the text read from fp to outfp with comments removed. */
int strip_comments(FILE *fp, FILE *outfp);
/* Output helper for strip_comments(): holds back a '/' until the next
   character shows whether it opened a comment. */
static int put_carefully(int lastch, int ch, FILE *outfp);


/* Filter standard input to standard output, stripping comments.
   Returns EXIT_FAILURE if strip_comments() reports a non-zero status or
   if the buffered output cannot be flushed (for example a full disk or
   a closed pipe); otherwise EXIT_SUCCESS. */
int main(void)
{
    if (strip_comments(stdin, stdout) != 0)
        return EXIT_FAILURE;
    /* Flush explicitly so that a write error is seen before exiting. */
    if (fflush(stdout) == EOF)
        return EXIT_FAILURE;
    return EXIT_SUCCESS;
}


/*
 * Copy C source text from fp to outfp with its comments removed.
 *
 * A block comment is replaced by a single space.  An end-of-line
 * comment (recognised only when AllowEOLComments is nonzero) is
 * removed up to, but not including, the newline that ends it, so the
 * line structure of the input is preserved.  Text inside string
 * literals and character constants is copied unchanged.
 *
 * Returns 0 on success, or EOF if either stream reports an error.
 *
 * Limitations: trigraphs are not translated, and a backslash-newline
 * that splits a comment delimiter in two is not recognised.
 */
int strip_comments(FILE *fp, FILE *outfp)
{
    int ch;
    int lastch;
    int inchotes = 0;       /* inside a '...' character constant */
    int inquotes = 0;       /* inside a "..." string literal */
    int incomment = 0;      /* inside a block comment */
    int ineolcomment = 0;   /* inside an end-of-line comment */
    int backslashed = 0;    /* lastch is a backslash that is not itself escaped */

    for (lastch = ' '; (ch = getc(fp)) != EOF; lastch = ch)
    {
        /* Output is decided by the state in force BEFORE ch is examined. */
        if (!incomment && !ineolcomment)
        {
            if (inquotes || inchotes)
                putc(ch, outfp);
            else
                put_carefully(lastch, ch, outfp);
        }

        if (inchotes || inquotes) {
            /* Track escape parity so that an escaped backslash followed
             * by the closing quote ends the literal, while an escaped
             * quote does not. */
            backslashed = (lastch == '\\') ? !backslashed : 0;
            if (!backslashed && ch == (inchotes ? '\'' : '"'))
                inchotes = inquotes = 0;
        } else if (incomment) {
            if (ch == '/' && lastch == '*') {
                incomment = 0;
                ch = ' ';   /* the closing '/' must not look like a pending slash */
            }
        } else if (ineolcomment) {
            /* A newline preceded by a backslash is a line splice and
             * continues the comment onto the next line. */
            if (ch == '\n' && lastch != '\\') {
                ineolcomment = 0;
                putc(ch, outfp);    /* keep the newline; only the comment goes */
            }
        } else {
            if (ch == '\'')
                inchotes = 1;
            else if (ch == '"')
                inquotes = 1;
            else if (lastch == '/' && ch == '*') {
                putc(' ', outfp);
                incomment = 1;
                ch = ' ';   /* the opener's '*' must not pair with a following '/' */
            }
            else if (AllowEOLComments && lastch == '/' && ch == '/')
                ineolcomment = 1;
        }
    }

    /* Flush a '/' that put_carefully() is still holding back.  Inside a
     * comment the slash is comment text, and inside a literal it has
     * already been written, so only plain code needs the flush. */
    if (lastch == '/' && !inchotes && !inquotes && !incomment && !ineolcomment)
        putc(lastch, outfp);

    return (ferror(fp) || ferror(outfp)) ? EOF : 0;
}


static int put_carefully(int lastch, int ch, FILE *outfp)
{
    /* Output 'ch', taking care never to write a character that may
     * yet prove to be the first half of a comment opener.  A '/'
     * arriving as 'ch' is deferred; it is written on the following
     * call, as 'lastch', when the character after it rules out a
     * comment.  Always returns 0.
     */
    const int deferred_slash = (lastch == '/');

    if (deferred_slash) {
        if (ch == '*')
            return 0;
        if (AllowEOLComments && ch == '/')
            return 0;
        putc('/', outfp);
    }
    if (ch != '/')
        putc(ch, outfp);
    return 0;
}
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only takes a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

No members online now.

Forum statistics

Threads
474,302
Messages
2,571,547
Members
48,349
Latest member
JolieEey92
Top