G
Gene
Ben Bacarisse said:How would you write the OP matcher?
I posted the code below. I hope this does all the things I suggested!
There is a function lex() which extracts the next token from the input ina
general manner (and reports, to some extent, what type of token it is).
And there is a function parsetest(), which takes a given string, and checks
it corresponds to the OP's syntax spec.
Here, the details of the character-level handling, and the syntax parsing,
are largely separate.
(I haven't bothered with a formal table lookup; lex() uses file globals for
simplicity; and overflow of the lextoken[] array isn't checked. Some checks,
eg. lex()!='A', could probably be dispensed with. And there's supposed to be
something funny about using atoi(), but I can't remember what..)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
/* globals set by lex() */
int lextype; /* One of 'A' 'N' 'P' 'L' 'E' '?' means Alpha, Number,
Punctuation, EOL, End of string, Error */
int lexleadsp; /* Set to 1 when leading whitespace seen (requirement
of OP) */
char lextoken[1000]; /* Contains token for 'A', 'N' and 'P' lexical types*/
char *lexptr; /* Points to next character of input */
int lex(void) {
char c;
int i;
lextoken[0]=0;
lexleadsp=0;
while (*lexptr!='\n' && isspace(*lexptr)) {
lexleadsp=1;
++lexptr;
}
c=*lexptr;
i=0;
if (c==0)
lextype='E';
else if (isalpha(c)) {
do
lextoken[i++]=toupper(*lexptr++);
while (isalpha(*lexptr));
lextype='A';
}
else if (isdigit(c)) {
do
lextoken[i++]=toupper(*lexptr++);
while (isdigit(*lexptr));
lextype='N';
}
else if (c=='\n') {
lextype='L';
++lexptr;
}
else if (ispunct(c)) {
lextoken[i++]=*lexptr++;
lextype='P';
}
else {
lextype='?';
++lexptr;
}
lextoken=0;
return lextype;
}
/* Make use of lex() function to test an input string in this syntax:
OPEN|CLOSE|LOCK DOOR [+]n where n must be 1 to 10, and without
leading or trailing white space.
Returns 1 (passed) or 0 (failed)
*/
int parsetest(char *input) {
int n,t;
lexptr=input;
if (lex()!='A' || lexleadsp) return 0;
if (strcmp(lextoken,"OPEN")!=0 &&
strcmp(lextoken,"CLOSE")!=0 &&
strcmp(lextoken,"LOCK")!=0)
return 0;
if (lex()!='A' || strcmp(lextoken,"DOOR")!=0) return 0;
t=lex();
if (t=='P' && lextoken[0]=='+') t=lex();
if (t!='N') return 0;
n=atoi(lextoken);
if (n<1 || n>10) return 0;
switch (lex()) {
case 'E': case 'L':
if (lexleadsp) return 0;
break;
default:
return 0;
}
return 1;
}
int main(void) {
char* teststr = "Lock DOOR 10";
int status;
printf("Testing: \"%s\"\n",teststr);
status=parsetest(teststr);
printf("Result: %s\n",status?"Passed":"Failed");
}
If there is a realistic chance the language will expand in the future
to a branching syntax, then tokenizing now is probably a good idea.
But if the language is such that the parser will always know what to
expect based on what it's already read (essentially the grammar is
LR(0)), then a separate tokenizing step is overkill. It adds error
opportunities and slows things down.
What the OP did is fine, but I'll bet his tests didn't work the first
time. Character-by-character approaches like this are fiddly.
A middle ground is to develop a set of prefix matchers that can be
reused. I've tried many different APIs and idioms and settled on the
following over the years. With this approach the code of the parser
can look just like the spec. With a good inlining compiler, the
result will be as good or better than OP's code for performance.
typedef enum token_e {
OPEN_TOKEN,
CLOSE_TOKEN,
LOCK_TOKEN,
DOOR_TOKEN,
} TOKEN;
/*
* Return true iff the given token is a proper directive.
*/
int is_directive(TOKEN token)
{
return OPEN_TOKEN <= token && token <= LOCK_TOKEN;
}
/* Must be in alpha order, lower case */
KEYWORD_PAIR keywords[] = {
{ "close", CLOSE_TOKEN },
{ "door", DOOR_TOKEN }, // Could handle door specially....
{ "lock", LOCK_TOKEN },
{ "open", OPEN_TOKEN },
};
/*
* Return true iff the given integer is a proper door number.
*/
int door_number_ok(unsigned n)
{
return n <= 10;
}
/*
* Parse a string with the following format:
* - a directive token, case insensitive
* - one or more whitespaces (spaces, tabs, newlines...)
* - the word "door" (without quotes), case insensitive
* - an optional number of whitespaces, even zero
* - a door number in the range 0..10
* - no other characters after the number
* Return the directive as a token and the door number.
* Function return is error code or 0 for success.
*/
int parse(char *s, TOKEN *directive, unsigned *door_number)
{
int index;
// Probably overkill to demand no whitespace.
if (match_whitespace(s, &s))
return -1;
if (!match_keyword(s, &s,
keywords, TABLE_SIZE(keywords),
keyword_cmp_ignoring_case,
&index))
return -2; // Missing token.
*directive = (TOKEN)keywords[index].token;
if (!is_directive(*directive))
return -3; // Token not a directive.
if (!match_token(s, &s,
keywords, TABLE_SIZE(keywords),
keyword_cmp_ignoring_case,
DOOR_TOKEN))
return -4; // Door keyword missing or misspelled
if (!match_unsigned(s, &s, door_number))
return -5; // Door number missing
if (!door_number_ok(*door_number))
return -6; // Door number out of range
if (*s)
return -7; // Junk at end check.
return 0; // 0 is more conventional for success
}
.... and the reusable code looks essentially like this. I've de-
parameterized some of my actual code to shorten it.
/*
* Set *end one char past whitespace prefix of s.
* Return true iff any characters matched.
*/
int match_whitespace(char *s, char **end)
{
char *p = s;
while (isspace(*p))
++p;
*end = p;
return p > s;
}
/*
* Set *end one char past alpha prefix of s.
* Return true iff any characters matched.
*/
int match_alpha(char *s, char **end)
{
char *p = s;
while (isalpha(*p))
++p;
*end = p;
return p > s;
}
/*
* Skip whitespace and then scan as many decimal digits
* as possible. Convert to an unsigned value with no
* overflow check. Return true iff any digits were
* matched. Set *end one past the scanned chars.
* FIXME: Add overflow check.
*/
int match_unsigned(char *s, char **end, unsigned *val_rtn)
{
char *p;
unsigned val;
match_whitespace(s, &s);
for (val = 0, p = s; isdigit(*p); p++)
val = 10 * val + (*p - '0');
*end = p;
*val_rtn = val;
return p > s;
}
/*
* Convert a character to lower case.
*/
char lowercase(char c)
{
return ('A' <= c && c <= 'Z') ? c - 'A' + 'a' : c;
}
/*
* Key comparison function for keyword lookups.
*/
typedef int (*KEYWORD_CMP)(char *s, unsigned n, char *kw);
/*
* Case insensitive keyword comparision. This compares
* a prefix of s of up to n characters with kw.
*/
int keyword_cmp_ignoring_case(char *s, size_t n, char *kw)
{
while (n > 0 && *s && *kw && lowercase(*s) == *kw) {
--n; ++s; ++kw;
}
return (n == 0) ? -*kw : lowercase(*s) - *kw;
}
/*
* Keyword entries and tables. An empty key string in
* a table produces undefined results.
*/
typedef struct keyword_pair_s {
char *key;
int token;
} KEYWORD_PAIR, KEYWORD_TABLE[];
#define TABLE_SIZE(A) (sizeof A / sizeof A[0])
/*
* Match a maximal prefix of alphabetic characters of
* string s to keywords in a table. Binary search is
* used, so the keys must be sorted. The keyword table
* index is returned (not the token value).
* On success return true and set *end one past the keyword.
* If none found, set *end one past any whitespace and return 0
* with index unchanged.
*/
int match_keyword(char *s, char **end,
KEYWORD_TABLE tbl, unsigned tbl_size,
KEYWORD_CMP cmp,
int *index)
{
int diff, lo, hi, mid;
match_whitespace(s, &s);
if (match_alpha(s, end)) {
lo = 0;
hi = tbl_size - 1;
while (lo <= hi) {
mid = (lo + hi) / 2;
diff = cmp(s, *end - s, tbl[mid].key);
if (diff < 0)
hi = mid - 1;
else if (diff > 0)
lo = mid + 1;
else {
*index = mid;
return 1;
}
}
}
*end = s;
return 0;
}
/*
* Match a maximal prefix of string s to keywords in a table.
* If found, verify its token matches the given value.
* On success return true and set *end to character past
* scanned keyword. On any failure, set *end one past any
* leading whitespace.
*/
int match_token(char *s, char **end,
KEYWORD_TABLE tbl, unsigned tbl_size,
KEYWORD_CMP cmp,
int token)
{
int index;
match_whitespace(s, &s);
if (!match_keyword(s, end, tbl, tbl_size, cmp, &index) ||
tbl[index].token != token)
return 0;
return 1;
}