Hi
I'm trying to write a simple parser for my application. The purpose is to
let the application understand command-line arguments of the form:
my_app 1-3,5,9
or
my_app 1,4,8-24
...
so it should support both ranges and individually listed values. But my
function doesn't print what I expect:
/*
 * Print each comma-separated item of buf followed by its dash-separated
 * parts, one item per line, e.g. "1-3,5,8" produces
 *
 *     1-3: 1 3
 *     5: 5
 *     8: 8
 *
 * Empty items and empty parts (from repeated ',' or '-') are skipped, as
 * strtok() would skip them.  buf is only read, never modified.
 *
 * Always returns 0.
 */
int parseLine(char *buf)
{
    const char *token;

    if (buf == NULL)
        return 0;

    /*
     * strtok() keeps a single hidden position, so an inner strtok() loop
     * destroys the state of the outer one.  Walking the string with
     * strspn()/strcspn() needs no hidden state and no scratch copy.
     */
    token = buf + strspn(buf, ",");
    while (*token != '\0') {
        size_t token_len = strcspn(token, ",");
        const char *token_end = token + token_len;
        const char *sub = token + strspn(token, "-");

        printf("%.*s: ", (int)token_len, token);

        /* A ',' also ends a part, so parts never run past token_end. */
        while (sub < token_end) {
            size_t sub_len = strcspn(sub, "-,");

            printf("%.*s ", (int)sub_len, sub);
            sub += sub_len;
            sub += strspn(sub, "-");
        }
        putchar('\n');

        token = token_end + strspn(token_end, ",");
    }
    return 0;
}
For example, with buf="1-3,5,8" I'd expect the following output:
1-3: 1 3
5: 5
8: 8
Where is my mistake?
Thanks!
I have been through this so many times: hacking up a little parser
with strtok() and sscanf()/atoi(), then throwing it away when the
input language gets just a bit more sophisticated. These days I
always go ahead and implement a traditional scanner and simple EBNF
parser. Once you have the framework, it's very quick to adapt it to
new problems, and it's liberating to know this extra power can be
tapped with no code rewriting. Here's what I'm talking about:
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
// Kinds of token the scanner can report.
typedef enum token_e {
    T_NULL = 0,         // Nothing scanned yet (initial scanner state).
    T_ERROR = 1,        // Unrecognized character or failed match.
    T_END_OF_INPUT = 2, // The terminating '\0' of the text.
    T_INT = 3,          // A run of decimal digits.
    T_COMMA = 4,        // ','
    T_DASH = 5          // '-'
} TOKEN;
// Everything a scanner remembers between calls.
// The most recent token occupies text[p0..p1).
typedef struct scanner_state_s {
    char *text;  // NUL-terminated input being scanned.
    TOKEN token; // Kind of the most recent token.
    int p0;      // Index of the first character of that token.
    int p1;      // Index one past that token; also the read position.
} SCANNER_STATE;
// Point a scanner at the start of text.
// The token is T_NULL and both positions are 0 until scan() is called.
void init_scanner_state(SCANNER_STATE *ss, char *text)
{
    *ss = (SCANNER_STATE){
        .text = text,
        .token = T_NULL,
        .p0 = 0,
        .p1 = 0,
    };
}
// Return the character at the read position ('\0' at the end of the text).
//
// The value is converted through unsigned char so it is never negative:
// scan() passes it to isspace()/isdigit(), whose behavior is undefined for
// negative arguments other than EOF, and plain char may be a signed type.
static int current_char(SCANNER_STATE *ss)
{
    return (unsigned char)ss->text[ss->p1];
}
// Move the read position forward by one character.
// At the terminating '\0' this does nothing, so the position never
// runs past the end of the text.
static void advance(SCANNER_STATE *ss)
{
    if (current_char(ss) == '\0')
        return;
    ss->p1 += 1;
}
// Report the kind of token currently held in the scanner state.
TOKEN current_token(SCANNER_STATE *ss)
{
    TOKEN held = ss->token;

    return held;
}
// Convert the current INT token to an int.
//
// Returns 0 and stores the number in *value on success.
// Returns 1 and leaves *value untouched if the current token is not an INT.
// Returns 1 and stores INT_MAX in *value if the digits do not fit in an int
// (sscanf("%d") has undefined behavior in that case, so strtol() is used).
int get_int_value(SCANNER_STATE *ss, int *value)
{
    if (ss->token != T_INT)
        return 1;

    // scan() starts an INT token on a digit, so strtol() skips nothing
    // and the result cannot be negative.
    errno = 0;
    long parsed = strtol(&ss->text[ss->p0], NULL, 10);
    if (errno == ERANGE || parsed > INT_MAX) {
        *value = INT_MAX;
        return 1;
    }

    *value = (int)parsed;
    return 0;
}
// Begin a token of the given kind at the current read position.
static void start_token(SCANNER_STATE *ss, TOKEN token)
{
    ss->token = token;
    ss->p0 = ss->p1;
}
// Hook called after the last character of a token has been consumed.
// This scanner has nothing to record at that point.
static void end_token(SCANNER_STATE *ss)
{
    (void)ss; // Intentionally unused.
}
// Record a token of the given kind that consumes no input:
// it starts and ends at the current read position.
static void scan_zero_char_token(SCANNER_STATE *ss, TOKEN token)
{
    start_token(ss, token);
    end_token(ss);
}
// Record a token of the given kind made of the single character at
// the read position, and step past that character.
static void scan_one_char_token(SCANNER_STATE *ss, TOKEN token)
{
    start_token(ss, token);
    advance(ss);
    end_token(ss);
}
// Read the next token from the input into the scanner state.
//
// Leading whitespace is skipped.  A run of digits becomes T_INT, ',' and
// '-' become T_COMMA and T_DASH, the terminating '\0' becomes
// T_END_OF_INPUT, and any other character becomes T_ERROR.  The last two
// consume no input.
void scan(SCANNER_STATE *ss)
{
    int c;

    // Step over whitespace; c ends up holding the first other character.
    for (c = current_char(ss); isspace(c); c = current_char(ss))
        advance(ss);

    // Digits are tested with the ctype.h predicate, punctuation by value.
    if (isdigit(c)) {
        start_token(ss, T_INT);
        advance(ss);
        while (isdigit(current_char(ss)))
            advance(ss);
        end_token(ss);
        return;
    }

    switch (c) {
    case ',':
        scan_one_char_token(ss, T_COMMA);
        break;
    case '-':
        scan_one_char_token(ss, T_DASH);
        break;
    case '\0':
        scan_zero_char_token(ss, T_END_OF_INPUT);
        break;
    default:
        scan_zero_char_token(ss, T_ERROR);
        break;
    }
}
// Require the current token to be `token`.
//
// On a match, scan ahead to the next token.  Otherwise print a syntax
// error to stderr (the numeric code of the token found, and the input up
// to the read position), set the current token to T_ERROR and consume
// nothing.  A larger parser would usually longjmp out on error instead.
void match(SCANNER_STATE *ss, TOKEN token)
{
    if (current_token(ss) != token) {
        fprintf(stderr, "syntax error (%d) at end of '%.*s'\n",
                ss->token, ss->p1 + 1, ss->text);
        ss->token = T_ERROR;
        return;
    }
    scan(ss);
}
// Read the INT token under the scanner into *value, then scan past it.
// Returns 0 on success and nonzero if no usable integer was read; in that
// case a diagnostic has been written to stderr.
static int expect_int(SCANNER_STATE *ss, int *value)
{
    int is_int = (current_token(ss) == T_INT);
    int failed = get_int_value(ss, value);

    // match() only complains when the token is not an INT, so a failed
    // conversion of a real INT token needs its own message.
    if (is_int && failed)
        fprintf(stderr, "bad integer at end of '%.*s'\n", ss->p1, ss->text);

    match(ss, T_INT);
    return failed;
}

// Parse the EBNF form: <range> ::= INT [ '-' INT ]
//
// Prints "lo" for a single value (or equal bounds) and "[lo-hi]" for a
// range.  Nothing is printed when either bound could not be read, so the
// action never sees an uninitialized value.
static void range(SCANNER_STATE *ss)
{
    int lo = 0;
    int hi = 0;
    int failed = expect_int(ss, &lo);

    if (current_token(ss) == T_DASH) {
        scan(ss); // Step over the '-'.
        failed |= expect_int(ss, &hi);
    } else {
        hi = lo;
    }

    if (failed)
        return;

    // Action code.
    if (lo == hi)
        printf("%d\n", lo);
    else
        printf("[%d-%d]\n", lo, hi);
}
// Parse one line of input according to the EBNF form:
// <line> ::= [ <range> { ',' <range> } ] END_OF_INPUT
void parse_line(char *text)
{
    SCANNER_STATE state;
    SCANNER_STATE *ss = &state;

    init_scanner_state(ss, text);
    scan(ss); // Load the first token.

    if (current_token(ss) == T_INT) {
        for (;;) {
            range(ss);
            if (current_token(ss) != T_COMMA)
                break;
            scan(ss); // Step over the ','.
        }
    }

    // Anything left over at this point is a syntax error.
    match(ss, T_END_OF_INPUT);
}
// Simple test driver: parse the single command-line argument, if given.
// Any other argument count does nothing.  Always exits with status 0.
int main(int argc, char *argv[])
{
    if (argc != 2)
        return 0;

    parse_line(argv[1]);
    return 0;
}