[ragel-users] token buffer not correct

Adrian Thurston thurs... at cs.queensu.ca
Fri Mar 2 22:36:59 UTC 2007


Hi Jason,

In scanner pattern actions, *p is not reliable. Sometimes the action is
executed on the character following the match; other times it is executed on
the last character of the match. This is an optimization that depends on
the nature of the patterns. Use *tokstart instead. This will fix the problem
with '{'.

As for the second issue, I noticed that you jump to a new machine definition
to process { ... } blocks, but before you return you don't set s->data. It
stays at the old value, which is why it appears that the data pointer is not
advanced.

Adrian

Jason wrote:
> BTW, this is a pull scanner.
> 
> Essentially I see a possible problem / bug in 2 areas. One is the
> return value of a single character. Specifically, in the enclosed
> example TK_Char should return the int value of the char matched
> (e.g. '{') but it doesn't.
> 
> The second involves the *non*-advancement of the token / data pointer
> illustrated below. The digraph value, "<:", appears at the beginning of
> the output of the following token, TK_TEXT.
> 
> My input is this
> 
> (app MyApp
> 
> 	New: (
> 		User.Submit.Application: (
> 			>> Submitted
> 			<: {#<xml> pre_action </xml>#}
> 			:>  { (one) }
> 		)
> 	)
> )
> 
> The problem tokens are printed thusly
> 
> 	parser: TK_PRE(271):6 "<:"
> 	parser: TK_TEXT(266):6 "<: {#<xml> pre_action </xml>#}"
> 
> The excerpted rule is
> 
> 			# Consume text delimited by <xml> ... </xml>
> 			xml  := (any_count_line* -- "</xml>") :>> "</xml>"
> 			@{
> 				/* Save p and pe. fbreak does not advance p. */
> 				s->token = TK_XML;
> 				s->token_name = "TK_XML";
> 
> 				s->p = p + 1;
> 				s->pe = pe;
> 				s->len = s->p - s->data;
> 				return TK_XML;
> 			};
> 
> I've tried a number of things with the grammar to no avail. So I don't
> know if this is a problem with ragel or my spec.
> 
> Any pointers would be most appreciated.
> 
> many thanks,
> Jason
> 
> ---------------------------- Full .rl
> ----------------------------------------------------------------------
> #include "reader_s.h"
> 
> #ifndef SCOPE
> #define SCOPE
> #endif
> 
> 
> %%{
> 	machine Scanner;
> 	write data;
> }%%
> 
> 
> SCOPE void scan_init_buf( Scanner *s, char *buf )
> {
> 	memset (s, '\0', sizeof(Scanner));
> 	s->curline = 1;
> 	s->buf = buf;
> 	s->p = s->buf;
> 	%%{ write init; }%%
> }
> 
> SCOPE void scan_finalize( Scanner *s )
> {
> }
> 
> #define ret_tok( _tok ) token = _tok; s->token = _tok; s->data = s->tokstart; s->token_name = #_tok
> #define ret_char( _tok ) token = _tok; s->token = *s->tokstart; s->data = s->tokstart; s->token_name = "TK_Char"
> 
> SCOPE int scan( Scanner *s )
> {
> 	char *p = s->p;
> 	char *pe = s->pe;
> 	int token = TK_NO_TOKEN;
> 
> 	while ( 1 ) {
> 
> 		%%{
> 			machine Scanner;
> 			access s->;
> 
> 			newline = '\n' @{s->curline += 1;};
> 			any_count_line = any | newline;
> 
> 			# Consume a C comment.
> 			c_comment := any_count_line* :>> '*/' @{fgoto main;};
> 
> 			# Consume text delimited by <xml> ... </xml>
> 			xml  := (any_count_line* -- "</xml>") :>> "</xml>"
> 			@{
> 				/* Save p and pe. fbreak does not advance p. */
> 				s->token = TK_XML;
> 				s->token_name = "TK_XML";
> 
> 				s->p = p + 1;
> 				s->pe = pe;
> 				s->len = s->p - s->data;
> 				return TK_XML;
> 			};
> 
> 			text_block := (any_count_line* -- '#}') :>> '#}'
> 			@{
> 				/* Save p and pe. fbreak does not advance p. */
> 				s->token = TK_TEXT;
> 				s->token_name = "TK_TEXT";
> 
> 				s->p = p + 1;
> 				s->pe = pe;
> 				s->len = s->p - s->data;
> 				return TK_XML;
> 			};
> 
> 			main := |*
> 
> 			newline;
> 
> 			# Alphanumeric characters or underscore.
> 			alnum_u = alnum | '_';
> 
> 			# Alpha characters or underscore.
> 			alpha_u = alpha | '_';
> 
> 			ident = alpha_u alnum_u*;
> 
> 			# Identifiers
> 			ident =>
> 				{ ret_tok( TK_Identifier ); fbreak; };
> 
> 			# Keypath
> 			keypath = ident ('.' ident)*;
> 			keypath => { ret_tok( TK_Keypath ); fbreak; };
> 
> 			# Keywords
> 			ident ':' => {
> 				ret_tok (TK_Keyword); fbreak;
> 			};
> 
> 			keypath ':' => {
> 				ret_tok (TK_Keyword); fbreak;
> 			};
> 
> 			# Strings and Text
> 			"'" ( [^'\\] | /\\./ )* "'" => { ret_tok (TK_String); fbreak; };
> 			'"' ( [^"\\] | /\\./ )* '"' => { ret_tok (TK_String); fbreak; };
> 
> 			"<xml>" { fgoto xml; };
> 			'{#' 	{ fgoto text_block; };
> 
> 			# Special Digraphs
> 			">>" @ { ret_tok (TK_SHIFT_RT); fbreak; };
> 			"<<" @ { ret_tok (TK_SHIFT_LT); fbreak; };
> 			":>" @ { ret_tok (TK_POST); fbreak; };
> 			"<:" @ { ret_tok (TK_PRE); fbreak; };
> 
> 			"<=" => { ret_tok (TK_LE); fbreak; };
> 			">=" => { ret_tok (TK_GE); fbreak; };
> 			"!=" => { ret_tok (TK_NE); fbreak; };
> 
> 			"++" => { ret_tok (TK_Increment); fbreak; };
> 			"--" => { ret_tok (TK_Decrement); fbreak; };
> 
> 			# Whitespace
> 			[ \t\n];
> 
> 	# Numbers
> 	digit+ => {
> 		ret_tok (TK_Integer); fbreak;
> 	};
> 
> 	digit+'.' digit+ => {
> 		ret_tok (TK_Real); fbreak;
> 	};
> 
> 	digit{1,3} (',' digit{3})+ => { ret_tok (TK_Integer); fbreak; };
> 
> 	digit{1,3} (',' digit{3})+ '.' digit+ => { ret_tok (TK_Real); fbreak; };
> 
> 	'0x' xdigit+ => { ret_tok (TK_Hex); fbreak; };
> 
>         # Describe both C-style comments and C++-style comments. The
>         # priority bump on the terminator of the comments brings us
>         # out of the extend* which matches everything.
>         '//' [^\n]* newline;
> 
> 	'/*' { fgoto c_comment; };
> 
> 
> 			# EOF
> 			0 =>
> 				{ ret_tok( TK_EOF ); fbreak; };
> 
> 			# Anything else
> 			any =>
> 				{ ret_char( *p ); fbreak; };
> 
> 			*|;
> 
> 			write exec;
> 		}%%
> 
> 		if ( s->cs == Scanner_error )
> 			return TK_ERR;
> 
> 		if ( token != TK_NO_TOKEN ) {
> 			/* Save p and pe. fbreak does not advance p. */
> 			s->p = p + 1;
> 			s->pe = pe;
> 			s->len = s->p - s->data;
> 			s->token = token;
> 			return token;
> 		}
> 	}
> }
> 
> #ifdef TEST
> 
> #include <stdlib.h>
> #include <stdio.h>
> #include <string.h>
> 
> 
> void output(Scanner *ss)
> {
> 	int tok;
> 
> 	while ( 1 ) {
> 		tok = scan (ss);
> 		if ( tok == TK_EOF ) {
> 			printf ("parser: EOF\n");
> 			break;
> 		}
> 		else if ( tok == TK_ERR ) {
> 			printf ("parser: ERR\n");
> 			break;
> 		}
> 		else {
> 			printf ("parser: %s(%d):%d \"", ss->token_name, tok, ss->curline);
> 			fwrite ( ss->data, 1, ss->len, stdout );
> 			printf ("\"\n" );
> 		}
> 	}
> }
> 
> #define BUFSIZE 4096
> 
> int main (int argc, char** argv)
> {
> 	Scanner ss;
>    	char buf[BUFSIZE];
> 
> 	int len = fread ( buf, sizeof(char), BUFSIZE, stdin );
> 	buf[len] = '\0';
> 	scan_init_buf (&ss, buf);
> 
> //	char *input = "(do with:1,345.99 and: \"some string\")";
> //	scan_init_buf(&ss, input);
> 
> 	output (&ss);
> 	scan_finalize (&ss);
> 
> 	return 0;
> }
> 
> #endif
> 
> 
> 



More information about the ragel-users mailing list