token buffer not correct
Jason
jason2... at jasonjobe.com
Tue Feb 27 19:40:42 UTC 2007
BTW, this is a pull scanner.
Essentially I see a possible problem / bug in 2 areas. One is the
return value of a single character. Specifically in the enclosed
example TK_Char should return the int value of the char matched
(e.g. '{') but it doesn't.
The second involves the *non* advancement of the token / data pointer
illustrated below. The digraph value, "<:" appears at the beginning of
the output of the following token TK_TEXT.
My input is this
(app MyApp
New: (
User.Submit.Application: (
>> Submitted
<: {#<xml> pre_action </xml>#}
:> { (one) }
)
)
)
The problem tokens are printed thusly
parser: TK_PRE(271):6 "<:"
parser: TK_TEXT(266):6 "<: {#<xml> pre_action </xml>#}"
The excerpted rule is
# Consume text delimited by <xml> ... </xml>
# The action below is embedded with @, so it runs on the transition that
# consumes the last character of the closing "</xml>".
xml := (any_count_line* -- "</xml>") :>> "</xml>"
@{
/* Record the token and return straight out of the scanner. p is still
   on the character that triggered this action, so the saved resume
   position is p + 1. s->data is not assigned in this action: s->len is
   measured from whatever s->data already holds. */
s->token = TK_XML;
s->token_name = "TK_XML";
s->p = p + 1;
s->pe = pe;
s->len = s->p - s->data;
return TK_XML;
};
I've tried a number of things with the grammar to no avail. So I don't
know if this is a problem with ragel or my spec.
Any pointers would be most appreciated.
many thanks,
Jason
---------------------------- Full .rl
----------------------------------------------------------------------
#include "reader_s.h"
#ifndef SCOPE
#define SCOPE
#endif
%%{
machine Scanner;
write data;
}%%
/*
 * Prepare `s` for scanning the text in `buf`.
 *
 * Every field of the scanner is cleared first, then the buffer is attached,
 * the read position is placed on its first character and line counting
 * starts at 1. The Ragel "write init" directive expands to the state
 * machine's own initialisation code.
 */
SCOPE void scan_init_buf( Scanner *s, char *buf )
{
	memset( s, 0, sizeof *s );

	s->buf = buf;
	s->p = buf;
	s->curline = 1;

	%%{ write init; }%%
}
/*
 * Counterpart of scan_init_buf(). It currently does nothing: the function
 * body releases no resources and leaves `s` untouched.
 */
SCOPE void scan_finalize( Scanner *s )
{
	(void) s;
}
/*
 * Token-recording helpers for scanner actions. Both expand in a context
 * that provides a scanner pointer named `s` and an int named `token`.
 *
 * Each definition is kept on logically one line (with backslash
 * continuations) and wrapped in do { } while (0) so that it behaves as a
 * single statement wherever it is followed by a semicolon.
 */

/*
 * ret_tok( TK_X ): store TK_X in both `token` and s->token, point s->data
 * at the start of the current match and set s->token_name to the spelling
 * of the argument (via stringification).
 */
#define ret_tok( _tok ) \
	do { \
		token = (_tok); \
		s->token = (_tok); \
		s->data = s->tokstart; \
		s->token_name = #_tok; \
	} while (0)

/*
 * ret_char( c ): store `c` in `token`, store the first character of the
 * current match in s->token, point s->data at the start of the match and
 * label the token "TK_Char".
 */
#define ret_char( _tok ) \
	do { \
		token = (_tok); \
		s->token = *s->tokstart; \
		s->data = s->tokstart; \
		s->token_name = "TK_Char"; \
	} while (0)
SCOPE int scan( Scanner *s )
{
char *p = s->p;
char *pe = s->pe;
int token = TK_NO_TOKEN;
while ( 1 ) {
%%{
machine Scanner;
access s->;
newline = '\n' @{s->curline += 1;};
any_count_line = any | newline;
# Consume a C comment.
c_comment := any_count_line* :>> '*/' @{fgoto main;};
# Consume text delimited by <xml> ... </xml>
xml := (any_count_line* -- "</xml>") :>> "</xml>"
@{
/* Save p and pe. fbreak does not advance p. */
s->token = TK_XML;
s->token_name = "TK_XML";
s->p = p + 1;
s->pe = pe;
s->len = s->p - s->data;
return TK_XML;
};
text_block := (any_count_line* -- '#}') :>> '#}'
@{
/* Save p and pe. fbreak does not advance p. */
s->token = TK_TEXT;
s->token_name = "TK_TEXT";
s->p = p + 1;
s->pe = pe;
s->len = s->p - s->data;
return TK_XML;
};
main := |*
newline;
# Alpha numberic characters or underscore.
alnum_u = alnum | '_';
# Alpha charactres or underscore.
alpha_u = alpha | '_';
ident = alpha_u alnum_u*;
# Identifiers
ident =>
{ ret_tok( TK_Identifier ); fbreak; };
# Keypath
keypath = ident ('.' ident)*;
keypath => { ret_tok( TK_Keypath ); fbreak; };
# Keywords
ident ':' => {
ret_tok (TK_Keyword); fbreak;
};
keypath ':' => {
ret_tok (TK_Keyword); fbreak;
};
# Strings and Text
"'" ( [^'\\] | /\\./ )* "'" => { ret_tok (TK_String); fbreak; };
'"' ( [^"\\] | /\\./ )* '"' => { ret_tok (TK_String); fbreak; };
"<xml>" { fgoto xml; };
'{#' { fgoto text_block; };
# Special Digraphs
">>" @ { ret_tok (TK_SHIFT_RT); fbreak; };
"<<" @ { ret_tok (TK_SHIFT_LT); fbreak; };
":>" @ { ret_tok (TK_POST); fbreak; };
"<:" @ { ret_tok (TK_PRE); fbreak; };
"<=" => { ret_tok (TK_LE); fbreak; };
">=" => { ret_tok (TK_GE); fbreak; };
"!=" => { ret_tok (TK_NE); fbreak; };
"++" => { ret_tok (TK_Increment); fbreak; };
"--" => { ret_tok (TK_Decrement); fbreak; };
# Whitespace
[ \t\n];
# Numbers
digit+ => {
ret_tok (TK_Integer); fbreak;
};
digit+'.' digit+ => {
ret_tok (TK_Real); fbreak;
};
digit{1,3} (',' digit{3})+ => { ret_tok (TK_Integer); fbreak; };
digit{1,3} (',' digit{3})+ '.' digit+ => { ret_tok (TK_Real);
fbreak; };
'0x' xdigit+ => { ret_tok (TK_Hex); fbreak; };
# Describe both c style comments and c++ style comments. The
# priority bump on tne terminator of the comments brings us
# out of the extend* which matches everything.
'//' [^\n]* newline;
'/*' { fgoto c_comment; };
# EOF
0 =>
{ ret_tok( TK_EOF ); fbreak; };
# Anything else
any =>
{ ret_char( *p ); fbreak; };
*|;
write exec;
}%%
if ( s->cs == Scanner_error )
return TK_ERR;
if ( token != TK_NO_TOKEN ) {
/* Save p and pe. fbreak does not advance p. */
s->p = p + 1;
s->pe = pe;
s->len = s->p - s->data;
s->token = token;
return token;
}
}
}
#ifdef TEST
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/*
 * Drain the scanner and print one line per token to stdout.
 *
 * Each ordinary token is printed as
 *     parser: NAME(code):line "text"
 * The loop ends after printing "parser: EOF" when scan() returns TK_EOF,
 * or "parser: ERR" when it returns TK_ERR.
 */
void output(Scanner *ss)
{
	for ( ;; ) {
		int code = scan (ss);

		if ( code == TK_EOF ) {
			printf ("parser: EOF\n");
			return;
		}

		if ( code == TK_ERR ) {
			printf ("parser: ERR\n");
			return;
		}

		/* Token text is written with fwrite because it is a slice of
		   the input buffer described by data/len. */
		printf ("parser: %s(%d):%d \"", ss->token_name, code, ss->curline);
		fwrite ( ss->data, 1, ss->len, stdout );
		printf ("\"\n" );
	}
}
/* Size of the stdin buffer in bytes, including the terminating NUL. */
#define BUFSIZE 4096

/*
 * Test driver: read standard input into a fixed buffer, NUL-terminate it,
 * and print every token the scanner produces.
 *
 * At most BUFSIZE - 1 bytes are read so that the terminator always fits
 * inside the array; reading BUFSIZE bytes and then writing buf[len] would
 * store one byte past the end of buf when the input fills the buffer.
 * Input beyond BUFSIZE - 1 bytes is not read.
 */
int main (int argc, char** argv)
{
	Scanner ss;
	char buf[BUFSIZE];
	size_t len;

	(void) argc;
	(void) argv;

	len = fread ( buf, sizeof(char), BUFSIZE - 1, stdin );
	buf[len] = '\0';

	scan_init_buf (&ss, buf);
	// char *input = "(do with:1,345.99 and: \"some string\")";
	// scan_init_buf(&ss, input);
	output (&ss);
	scan_finalize (&ss);
	return 0;
}
#endif
More information about the ragel-users
mailing list