token buffer not correct

Jason jason2... at jasonjobe.com
Tue Feb 27 19:40:42 UTC 2007


BTW, this is a pull scanner.

Essentially I see a possible problem / bug in 2 areas. One is the
return value of a single character. Specifically, in the enclosed
example TK_Char should return the int value of the char matched
(e.g. '{') but it doesn't.

The second involves the *non* advancement of the token / data pointer
illustrated below. The digraph value, "<:" appears at the beginning of
the output of the following token TK_TEXT.

My input is this

(app MyApp

	New: (
		User.Submit.Application: (
			>> Submitted
			<: {#<xml> pre_action </xml>#}
			:>  { (one) }
		)
	)
)

The problem tokens are printed thusly

	parser: TK_PRE(271):6 "<:"
	parser: TK_TEXT(266):6 "<: {#<xml> pre_action </xml>#}"

The excerpted rule is

			# Consume text delimited by <xml> ... </xml>
			xml  := (any_count_line* -- "</xml>") :>> "</xml>"
			@{
				/* Save p and pe. fbreak does not advance p. */
				s->token = TK_XML;
				s->token_name = "TK_XML";

				s->p = p + 1;
				s->pe = pe;
				s->len = s->p - s->data;
				return TK_XML;
			};

I've tried a number of things with the grammar to no avail. So I don't
know if this is a problem with ragel or my spec.

Any pointers would be most appreciated.

many thanks,
Jason

---------------------------- Full .rl
----------------------------------------------------------------------
#include "reader_s.h"

#ifndef SCOPE
#define SCOPE
#endif


%%{
	machine Scanner;
	write data;
}%%


/*
 * Reset a scanner and point it at the start of `buf`.
 *
 * Every field of *s is zeroed first, so anything not assigned below
 * (including s->pe) is left as all-zero bytes. Line counting starts at 1.
 */
SCOPE void scan_init_buf( Scanner *s, char *buf )
{
	/* Wipe the whole structure before filling in the live fields. */
	memset (s, 0, sizeof *s);

	s->buf = buf;
	s->p = buf;		/* scanning begins at the first byte of the buffer */
	s->curline = 1;

	/* Ragel emits the state machine's initialisation code here. */
	%%{ write init; }%%
}

/*
 * Counterpart to scan_init_buf(). Currently performs no work; the scanner
 * passed in is not touched.
 */
SCOPE void scan_finalize( Scanner *s )
{
	(void) s;	/* intentionally unused */
}

/*
 * Token-recording helpers for use inside scanner actions.
 *
 * Both macros expect two names to be in scope at the point of use:
 *   token - an int lvalue that receives the token code to return
 *   s     - a pointer to the scanner state (fields token, data, tokstart,
 *           token_name are written/read)
 *
 * ret_tok(TOK)  records TOK as the current token, points s->data at the
 *               start of the matched text and stores the spelling of TOK
 *               (via stringification) as the token name.
 * ret_char(C)   sets `token` to C, stores the first character of the match
 *               in s->token, and names the token "TK_Char".
 *
 * Each macro is wrapped in do { } while (0) so it behaves as a single
 * statement, and each definition is kept on logical lines that do not
 * split the `s->` operator.
 */
#define ret_tok( _tok ) \
	do { \
		token = (_tok); \
		s->token = (_tok); \
		s->data = s->tokstart; \
		s->token_name = #_tok; \
	} while (0)
#define ret_char( _tok ) \
	do { \
		token = (_tok); \
		s->token = *s->tokstart; \
		s->data = s->tokstart; \
		s->token_name = "TK_Char"; \
	} while (0)

SCOPE int scan( Scanner *s )
{
	char *p = s->p;
	char *pe = s->pe;
	int token = TK_NO_TOKEN;

	while ( 1 ) {

		%%{
			machine Scanner;
			access s->;

			newline = '\n' @{s->curline += 1;};
			any_count_line = any | newline;

			# Consume a C comment.
			c_comment := any_count_line* :>> '*/' @{fgoto main;};

			# Consume text delimited by <xml> ... </xml>
			xml  := (any_count_line* -- "</xml>") :>> "</xml>"
			@{
				/* Save p and pe. fbreak does not advance p. */
				s->token = TK_XML;
				s->token_name = "TK_XML";

				s->p = p + 1;
				s->pe = pe;
				s->len = s->p - s->data;
				return TK_XML;
			};

			text_block := (any_count_line* -- '#}') :>> '#}'
			@{
				/* Save p and pe. fbreak does not advance p. */
				s->token = TK_TEXT;
				s->token_name = "TK_TEXT";

				s->p = p + 1;
				s->pe = pe;
				s->len = s->p - s->data;
				return TK_XML;
			};

			main := |*

			newline;

			# Alpha numberic characters or underscore.
			alnum_u = alnum | '_';

			# Alpha charactres or underscore.
			alpha_u = alpha | '_';

			ident = alpha_u alnum_u*;

			# Identifiers
			ident =>
				{ ret_tok( TK_Identifier ); fbreak; };

			# Keypath
			keypath = ident ('.' ident)*;
			keypath => { ret_tok( TK_Keypath ); fbreak; };

			# Keywords
			ident ':' => {
				ret_tok (TK_Keyword); fbreak;
			};

			keypath ':' => {
				ret_tok (TK_Keyword); fbreak;
			};

			# Strings and Text
			"'" ( [^'\\] | /\\./ )* "'" => { ret_tok (TK_String); fbreak; };
			'"' ( [^"\\] | /\\./ )* '"' => { ret_tok (TK_String); fbreak; };

			"<xml>" { fgoto xml; };
			'{#' 	{ fgoto text_block; };

			# Special Digraphs
			">>" @ { ret_tok (TK_SHIFT_RT); fbreak; };
			"<<" @ { ret_tok (TK_SHIFT_LT); fbreak; };
			":>" @ { ret_tok (TK_POST); fbreak; };
			"<:" @ { ret_tok (TK_PRE); fbreak; };

			"<=" => { ret_tok (TK_LE); fbreak; };
			">=" => { ret_tok (TK_GE); fbreak; };
			"!=" => { ret_tok (TK_NE); fbreak; };

			"++" => { ret_tok (TK_Increment); fbreak; };
			"--" => { ret_tok (TK_Decrement); fbreak; };

			# Whitespace
			[ \t\n];

	# Numbers
	digit+ => {
		ret_tok (TK_Integer); fbreak;
	};

	digit+'.' digit+ => {
		ret_tok (TK_Real); fbreak;
	};

	digit{1,3} (',' digit{3})+ => { ret_tok (TK_Integer); fbreak; };

	digit{1,3} (',' digit{3})+ '.' digit+ => { ret_tok (TK_Real);
fbreak; };

	'0x' xdigit+ => { ret_tok (TK_Hex); fbreak; };

        # Describe both c style comments and c++ style comments. The
        # priority bump on tne terminator of the comments brings us
        # out of the extend* which matches everything.
        '//' [^\n]* newline;

	'/*' { fgoto c_comment; };


			# EOF
			0 =>
				{ ret_tok( TK_EOF ); fbreak; };

			# Anything else
			any =>
				{ ret_char( *p ); fbreak; };

			*|;

			write exec;
		}%%

		if ( s->cs == Scanner_error )
			return TK_ERR;

		if ( token != TK_NO_TOKEN ) {
			/* Save p and pe. fbreak does not advance p. */
			s->p = p + 1;
			s->pe = pe;
			s->len = s->p - s->data;
			s->token = token;
			return token;
		}
	}
}

#ifdef TEST

#include <stdlib.h>
#include <stdio.h>
#include <string.h>


/*
 * Drain the scanner, printing one line per token to stdout.
 *
 * Each ordinary token is shown as  parser: NAME(code):line "text".
 * Scanning stops after reporting either end-of-input or an error.
 */
void output(Scanner *ss)
{
	for (;;) {
		int tok = scan (ss);

		if ( tok == TK_EOF ) {
			printf ("parser: EOF\n");
			return;
		}
		if ( tok == TK_ERR ) {
			printf ("parser: ERR\n");
			return;
		}

		/* Token text is written with an explicit length (ss->len bytes). */
		printf ("parser: %s(%d):%d \"", ss->token_name, tok, ss->curline);
		fwrite ( ss->data, 1, ss->len, stdout );
		printf ("\"\n" );
	}
}

#define BUFSIZE 4096

/*
 * Test driver: read stdin into a fixed buffer, NUL-terminate it, and print
 * every token the scanner produces.
 *
 * At most BUFSIZE - 1 bytes are read so that the terminating '\0' always
 * fits inside buf; longer input is truncated. The terminator matters
 * because the scanner treats a 0 byte as end of input.
 *
 * To scan a fixed string instead of stdin, pass it to scan_init_buf(), e.g.
 *	char *input = "(do with:1,345.99 and: \"some string\")";
 *	scan_init_buf(&ss, input);
 */
int main (int argc, char** argv)
{
	Scanner ss;
	char buf[BUFSIZE];
	size_t len;

	(void) argc;
	(void) argv;

	/* Leave one byte free for the terminator written just below. */
	len = fread ( buf, sizeof(char), BUFSIZE - 1, stdin );
	buf[len] = '\0';
	scan_init_buf (&ss, buf);

	output (&ss);
	scan_finalize (&ss);

	return 0;
}

#endif



More information about the ragel-users mailing list