[ragel-users] parser getting gigantic ?
M P
buserror at gmail.com
Thu Sep 2 17:14:02 UTC 2010
I found one problem already,
> string = '"' ((([^"] | '\"')*) >str_init %str_done) '"';
The '*' there needs to be a '**' (longest-match Kleene star). That reduces the size considerably...
It's still >100KB, though. I've been digging a bit, and there are a LOT of
instances of that "W" rule that get inlined, even though the output (and
the value) is just discarded.
I attempted to factor that out a bit by doing:
eatme := space** @{ fret; }
W = '' @{ fcall eatme; }
But that didn't work... I wish we could mark a rule as "discard,
factor" or something...
Michael
On Thu, Sep 2, 2010 at 3:41 PM, M P <buserror at gmail.com> wrote:
> I decided to play with Ragel by making a parser for an extended
> version of JSON.
>
> After falling into every single pitfall possible, I managed to get the
> whole thing working pretty well, but I just had a look at the
> generated code and it reached 600KB total, using -G1 (!)
>
> So, what am I doing wrong? I'm certain there's probably a lot of
> stuff wrong with the way I made the parser, but really, I'm not sure
> what, so having experts eyeball this would be very nice...
>
> There are 2 different json parsers in the file; one is for the string
> constants, one (the main one) is for the language proper...
>
>
> /*
> * IF YOU ARE LOOKING AT A .c FILE, YOU ARE LOOKING AT THE WRONG ONE
> *
> * This file is autogenerated from a .rl source file for 'ragel'
> parser generator.
> */
> #include <stdlib.h>
> #include <string.h>
> #include <stdio.h>
>
> #include "json.h"
>
> %%{
> machine json_str;
> write data;
> }%%
>
> static char * json_append_utf8_glyph(
> char * dst,
> unsigned long inUnicode )
> {
> if (!(inUnicode & ~0x7f)) {
> *dst++ = ((char)inUnicode);
> return dst; // that was easy
> }
> unsigned char *cur = dst;
>
> unsigned long currentMask = ~0x7ff;
> int bits = 6;
> int header = 5;
>
> while ((inUnicode & currentMask) && bits <= 24) {
> currentMask = currentMask << 6;
> bits += 6; header--;
> }
> *cur++ = (0xfe << header) | (unsigned char)(inUnicode >> (bits));
> bits -= 6;
> while (bits >= 0) {
> *cur++ = 0x80 | ((unsigned char)(inUnicode >> bits) & 0x3f);
> bits -= 6;
> }
> return cur;
> }
>
> int json_parse_string(char * str, char *end, char * out)
> {
> char *p = str, *pe = end ? end : str + strlen( str ), *eof = pe;
> int cs;
> out = out ? out : str;
> uint16_t u;
> %%{
> machine json_str;
>
> xxdigit = (
> ([0-9] @{ u = (u << 4) | fc - '0'; }) |
> ([a-f] @{ u = (u << 4) | fc - 'a' + 0xa; }) |
> ([A-F] @{ u = (u << 4) | fc - 'A' + 0xa; })
> );
> utf16 = ( xxdigit{4} ) >{ u = 0; } @{ out = json_append_utf8_glyph(out, u); };
>
> normal = any @{*out++ = fc;};
> escape =
> ('\\' %{ *out++ = '\\'; } ) |
> ('t' %{ *out++ = '\t'; } ) |
> ('b' %{ *out++ = '\b'; } ) |
> ('f' %{ *out++ = '\f'; } ) |
> ('n' %{ *out++ = '\n'; } ) |
> ('r' %{ *out++ = '\r'; } ) |
> ('u' utf16 ) |
> ( normal -- [\\tbfntu] )
> ;
> main := (
> ('\\' escape) |
> ( normal -- '\\' )
> )*;
>
> # Initialize and execute.
> write init;
> write exec;
> }%%
> *out = 0;
>
> return 0;
> }
>
> %%{
> machine json;
> write data;
> }%%
>
> int json_parse( json_driver_t *d, char * str )
> {
> char *p = str, *pe = str + strlen( str ), *eof = pe;
> int cs;
> int stack[32], top = 0;
> int integer_sign; // for integer decode
> char * float_start;
> json_driver_value_t v;
> uint32_t b64;
> int b64_cnt;
>
> %%{
> machine json;
> action obj_field_list_start { d->open_object(d); }
> action obj_field_list_done { d->close_object(d); }
> action obj_value_list_start { d->open_array(d); }
> action obj_value_list_done { d->close_array(d); }
> action obj_create_name { d->set_name(d, &v); }
> action obj_set_flag { if (d->add_flag) d->add_flag(d, &v); }
> action obj_set_string { d->set_value(d, json_driver_type_string, &v); }
> action obj_set_integer { d->set_value(d, json_driver_type_integer, &v); }
> action obj_set_float { d->set_value(d, json_driver_type_float, &v); }
> action obj_set_hex { d->set_value(d, json_driver_type_hex, &v); }
> action obj_set_true { v.u.v_bool = 1; d->set_value(d,
> json_driver_type_bool, &v); }
> action obj_set_false { v.u.v_bool = 0; d->set_value(d,
> json_driver_type_bool, &v); }
> action obj_set_null { d->set_value(d, json_driver_type_null, NULL); }
>
> action obj_start_data { if (d->open_data) d->open_data(d); }
> action obj_flush_data { if (d->add_data) for (int s=16,i = 0;
> i<b64_cnt; i++,s-=8) d->add_data(d, (b64 >> s) & 0xff); }
> action obj_end_data { if (d->close_data) d->close_data(d); }
>
> W = [ \t\n]**;
>
> #
> # quoted or unquoted string
> #
> action str_init { v.u.v_str.start = v.u.v_str.end = fpc; }
> action str_done { v.u.v_str.end = fpc; }
>
> string = '"' ((([^"] | '\"')*) >str_init %str_done) '"';
> ident = ((alpha | '_') (alnum | '_')*) >str_init %str_done;
>
> #
> # negative/positive Integer
> #
> action integer_init { v.u.v_int = 0; integer_sign = 1; }
> action integer_minus { integer_sign = -1; }
> action integer_digit { v.u.v_int = (v.u.v_int * 10) + (fc - '0'); }
> action integer_done { v.u.v_int *= integer_sign; }
>
> integer = (('-' @integer_minus | '+')? (digit+ @integer_digit))
> >integer_init %integer_done;
>
> #
> # hex integer
> #
> xxdigit = (
> ([0-9] @{ v.u.v_int = (v.u.v_int << 4) | fc - '0'; }) |
> ([a-f] @{ v.u.v_int = (v.u.v_int << 4) | fc - 'a' + 0xa; }) |
> ([A-F] @{ v.u.v_int = (v.u.v_int << 4) | fc - 'A' + 0xa; })
> );
> hex = (('-' @integer_minus | '+')?( '0x' xxdigit+))
> >integer_init %integer_done;
>
> #
> # float/double value
> #
> action float_init { float_start = fpc; }
> action float_done { sscanf(float_start, "%lf", &v.u.v_float); }
> #
> # float values
> #
> float = (
> ('-' | '+')? digit* '.' digit+ [fd]?
> ) >float_init %float_done;
>
> #
> # base64 decoder
> #
> base64_char = (
> ([A-Z] @{ b64 = (b64 << 6) | (fc - 'A'); }) |
> ([a-z] @{ b64 = (b64 << 6) | (fc - 'a' + 26 ); }) |
> ([0-9] @{ b64 = (b64 << 6) | (fc - '0' + 52 ); }) |
> ('+' @{ b64 = (b64 << 6) | 62; }) |
> ('/' @{ b64 = (b64 << 6) | 63; })
> );
> base64_pad = '=' @{ b64 = (b64 << 6); };
> base64_four = (
> base64_char base64_char base64_char base64_char
> ) %{ b64_cnt = 3; } %obj_flush_data;
> base64_padder = (
> base64_char base64_char
> (
> (( base64_char base64_pad )
> %{ b64_cnt = 2; } ) |
> (( base64_pad base64_pad )
> %{ b64_cnt = 1; } )
> )
> ) %obj_flush_data;
>
> base64 = ( base64_four** (base64_four | base64_padder) ) >{b64 = 0;}
> %err{ printf("### base64 Error : '%s'\n", p); };
>
> #
> # JSON value, extended
> #
> json_value = (
> (string %obj_set_string) |
> (integer %obj_set_integer) |
> (hex %obj_set_hex ) |
> (float %obj_set_float) |
> ('true' %obj_set_true) |
> ('false' %obj_set_false) |
> ('null' %obj_set_null) |
> ('{' @{ fhold; fcall obj_field_list; } ) |
> ('[' @{ fhold; fcall json_value_list; } ) |
> (('%' (W base64)* W '%') >obj_start_data %obj_end_data)
> );
>
> json_value_list := (
> '[' (
> '' |
> (W json_value (W ',' W json_value)* )
> ) W ','? W ']'
> ) >obj_value_list_start @obj_value_list_done @{ fret; }
> %err{ printf("### Array[%d] Error : '%s'\n", top, p); };
>
> obj_field_flag = ( ident ) %obj_set_flag;
> obj_field_flags = (
> '(' W obj_field_flag (W ',' W obj_field_flag)** ')'
> );
> obj_field = ((string | ident) %obj_create_name) W obj_field_flags? W
> ':' W json_value;
>
> obj_field_list := (
> '{' (
> '' |
> (W obj_field (W ',' W obj_field)** )
> ) W ','? W '}'
> ) >obj_field_list_start @obj_field_list_done @{ fret; }
> %err{ printf("### Object[%d] Error : '%s'\n", top, p); };
>
> main := (
> W json_value
> ) %err{ printf("### JSON Error : '%s'\n", p); };
>
> # Initialize and execute.
> write init;
> write exec;
> }%%
>
> return 0;
> };
>
> #ifdef JSON_TEST_UNIT
> static void d_set_name(struct json_driver_t *d,
> json_driver_value_t * v)
> {
> int l = v->u.v_str.end - v->u.v_str.start;
> printf("\"%*.*s\": ", l, l, v->u.v_str.start);
> }
>
> static void d_open_array(struct json_driver_t *d)
> {
> printf("[");fflush(stdout);
> }
>
> static void d_open_object(struct json_driver_t *d)
> {
> printf("{");fflush(stdout);
> }
>
> static void d_set_value(struct json_driver_t *d,
> int type,
> json_driver_value_t * v)
> {
> switch (type) {
> case json_driver_type_null:
> printf("null, ");
> break;
> case json_driver_type_bool:
> printf("%s, ", v->u.v_bool ? "true" : "false");
> break;
> case json_driver_type_integer:
> printf("%d, ", (int)v->u.v_int);
> break;
> case json_driver_type_hex:
> printf("0x%x, ", (int)v->u.v_int);
> break;
> case json_driver_type_float:
> printf("%f, ", (float)v->u.v_float);
> break;
> case json_driver_type_string: {
> char buf[256];
> json_parse_string(v->u.v_str.start, v->u.v_str.end, buf);
> printf("\"%s\": ", buf);
> } break;
> }
> fflush(stdout);
> }
>
> static void d_close_array(struct json_driver_t *d)
> {
> printf("],");fflush(stdout);
> }
>
> static void d_close_object(struct json_driver_t *d)
> {
> printf("},");fflush(stdout);
> }
>
> static void d_open_data(struct json_driver_t *d)
> {
> printf("%% '");fflush(stdout);
> }
> static void d_add_data(struct json_driver_t *d, uint8_t data)
> {
> printf("%c",data);fflush(stdout);
> }
> static void d_close_data(struct json_driver_t *d)
> {
> printf("' %%,");fflush(stdout);
> }
>
> json_driver_t driver = {
> .set_name = d_set_name,
> .open_array = d_open_array,
> .open_object = d_open_object,
> .set_value = d_set_value,
> .close_array = d_close_array,
> .close_object = d_close_object,
>
> .open_data = d_open_data,
> .add_data = d_add_data,
> .close_data = d_close_data,
> };
>
> int main(int argc, char * argv[])
> {
>
> for (int i = 1; i < argc; i++) {
> printf("### parsing '%s'\n", argv[i]);
> json_parse(&driver, argv[i]);printf("\n");
> }
> return 0;
> }
>
> #endif
>
_______________________________________________
ragel-users mailing list
ragel-users at complang.org
http://www.complang.org/mailman/listinfo/ragel-users
More information about the ragel-users
mailing list