[ragel-users] Newbie question: an extended comma or tab separated (CSV/TSV) Ragel scanner
Erich Ocean
er... at atlasocean.com
Sat Jun 28 16:01:29 UTC 2008
Heiko,
Not sure if this will help, but this is from a PDF 1.4 parsing machine
I wrote years ago:
%% PDFParsingMachine
alphtype unsigned char;
# the whitespace, eol, delimiter, regular, and comment machines
# whitespace in a PDF file includes the NULL character,
# and consecutive whitespace is treated as one
whitespace = /[\t\f\n\r\0 ]/ ;
# eol: a lone CR, a lone LF, or a CR LF pair
eol = /[\r\n]/ | '\r\n' ;
# delimiter: the bracket, brace, angle, slash and percent characters
delimiter = [()<>\[\]{}/%] ;
# regular: any character that is neither whitespace nor a delimiter
regular = any - ( whitespace | delimiter ) ;
# The priority bump on the terminator of the comments brings us
# out of the extend* which matches everything.
# ($0 gives every transition of extend* priority 0; @1 gives the
# transitions that complete eol priority 1, so eol wins the conflict.)
comment = '%' . extend* $0 . eol @1 ;
#
# (Other machines not shown.)
#
# main: any sequence of the tokens listed below; '**' is Ragel's
# longest-match Kleene star. Every name other than whitespace and
# comment refers to a machine that is not part of this excerpt.
main := (
whitespace |
comment |
boolean |
number |
hexString |
name |
literalString |
beginArray |
endArray |
beginDict |
endDict |
pdfNull |
stream |
beginIndirectObject |
endIndirectObject |
indirectObjectReference |
beginXref |
beginTrailer |
beginStartxref |
beginFree |
beginInUse )**;
%%
Best, Erich
On Jun 28, 2008, at 8:52 AM, Heiko wrote:
>
> Dear list members,
>
> I am trying to implement a CSV scanner based on the fantastic Ragel,
> with a few small modifications to the standard:
>
> - it should work for different types of Unix/Mac/Windows line endings
> ( \n, \r, \r\n)
> - it should use both commas and tabs as item separators
> - it should honour quoted values
> - it should collapse multiple empty lines into a single line
> separator ...
>
> I have written the following ragel code (below) to accomplish this,
> but there is a problem that I cannot locate ...
> When the line endings are \r (CR), everything seems to work fine,
> however, if they are \n (LF), the first character of the next field is
> swallowed by the scanner ....
> I am pulling my hair out, and suspect it has to do with ambiguities in
> the definition of the scanner. I am also unsure as to which transition
> action I should choose (currently '@').
>
> Does anyone have any ideas?
>
> Cheers,
> Heiko
>
>
>
>
>
>
>
> //
> // csv_parse.m
> // RagelCsv
> //
> //
>
>
>
> #define HPDEBUG 1
> #import <Foundation/Foundation.h>
>
> // csv_parse: runs the Ragel scanner csv_scan (defined below) over the
> // `len` bytes starting at `input` and collects the fields into rows.
> // Results are written through the out-parameters at the end:
> //   *parsedData - array of row arrays; NSNull stands in for empty fields
> //   *maxcols    - largest [row count] seen when a row was closed
> //   *nlines     - curline-1
> //   *nrecords   - bumped on every Separator and every newline/multiline
> //                 action, i.e. it counts separators plus line endings
> // NOTE(review): the QuotedValue action rewrites bytes of `input` in
> // place, so the caller's buffer is modified.
> // NOTE(review): tempParsedData is created with alloc/init and returned
> // without autorelease; the [row autorelease] below suggests manual
> // retain/release, in which case the caller owns the result - confirm.
> void csv_parse(unsigned char *input, size_t len, NSMutableArray
> **parsedData, long *maxcols, long *nlines, long *nrecords) {
> long tempmaxcols=0, tempnrecords=0;
> int cs=0, act, curline = 1; //ragel variables to keep states
> unsigned char *tokstart = NULL, *tokend = NULL; //ragel variables
> for Scanner
> unsigned char *p = input, *pe = input + len; //ragel variables to
> keep track of position in stream
> NSMutableArray *row=[[NSMutableArray alloc] init]; //Array to hold
> elements from each row/record
> NSMutableArray *tempParsedData=[[NSMutableArray alloc] init]; //
> Array of row arrays
> NSString *coldata; //string that holds entry in field
> // NOTE(review): coldata has no initialiser, yet the Separator and
> // newline/multiline actions send it -length before any value action
> // has necessarily assigned it (e.g. input that starts with a separator).
> // NOTE(review): declared NSMutableString but assigned an NSString; only
> // substringWithRange: is ever sent to it in this function.
> NSMutableString *tempInputString=[NSString stringWithUTF8String:(char
> *)input];
>
> //Discussion
> //On UNIX, text file line-endings are terminated with a newline (n),
> also referred to as a linefeed (LF).
> //On Windows, line-endings are terminated with a combination of a
> carriage return (r) and a newline(n), also referred to as CR/LF.
> //On the Mac Classic, line-endings are terminated with a single
> carriage return (CR). (Mac OS X uses the UNIX convention.)
>
> //A line is delimited by any of these characters, the longest possible
> sequence being preferred to any shorter:
> //U+000D (\r or CR) //U+2028 (Unicode line separator) //U+000A (\n or
> LF) //U+2029 (Unicode paragraph separator)
> // \r\n, in that order (also known as CRLF)
>
>
> //append end of line if not present so we can obtain all records.
> // NOTE(review): nothing in this function appends a line ending; the
> // statement below only autoreleases the initial row array.
> [row autorelease];
>
> %%{
> machine csv_scan;
> alphtype unsigned char;
>
> # newline: a single line terminator (CRLF, LF or CR).
> # NOTE(review): action embedding binds tighter than '|' in Ragel, so the
> # %{ ... } leaving action presumably attaches to ('\r') only - confirm.
> newline =('\r\n') | ('\n') | ('\r') %{
> curline += 1;
> };
> # multiline: a line terminator followed by one or more further
> # terminators, i.e. a line ending plus at least one blank line.
> multiline =(('\r\n') | ('\n') | ('\r')).(('\r\n') | ('\n') | ('\r'))
> + @{
> curline += 1;
> };
>
> ws = ' ';
> Separator = [,\t];
> UnQuotedValue = [^ \t",\r\n].[^"\t,\r\n]*;
> QuotedChar = ( '""' | [^"] | (newline|multiline) );
> QuotedValue = '"' . QuotedChar* . '"';
>
> main := |*
> ws;
> # NOTE(review): multiline and newline use embedded finishing actions
> # (@{ }) while Separator and the value patterns use scanner pattern
> # actions ({ }). Embedded actions run as transitions are taken, before
> # the scanner has settled on the longest match; this is a likely place
> # to look for the reported LF problem - confirm against the Ragel guide.
> multiline @{
> // Close the current row: store the pending field (NSNull when empty),
> // append the row to the result and start a fresh row.
> if ([coldata length]==0)
> coldata=(NSString *)[NSNull null];
> [row addObject:coldata];
> coldata=nil;
> tempnrecords++;
> if(!row) row=[NSMutableArray arrayWithObject:[NSNull null]];
> [tempParsedData addObject:row];
> if ([row count] >tempmaxcols) tempmaxcols=[row count];
> row=[NSMutableArray array];
> if (HPDEBUG) NSLog(@"multiline");
> };
> newline @{
> // Same statements as the multiline action above; only the log text
> // differs.
> if ([coldata length]==0)
> coldata=(NSString *)[NSNull null];
> [row addObject:coldata];
> coldata=nil;
> tempnrecords++;
> if(!row) row=[NSMutableArray arrayWithObject:[NSNull null]];
> [tempParsedData addObject:row];
> if ([row count] >tempmaxcols) tempmaxcols=[row count];
> row=[NSMutableArray array];
> if (HPDEBUG) NSLog(@"newline");
> };
> Separator {
> // End of a field: store the pending value (NSNull when empty) in the
> // current row.
> if ([coldata length]==0)
> coldata=(NSString *)[NSNull null];
> [row addObject:coldata];
> tempnrecords++;
> coldata=nil;
> if (HPDEBUG) NSLog(@"separator");
> };
>
> UnQuotedValue {
> // Walk back from the end of the token to drop trailing spaces, then
> // take the remaining datalen characters as the field value.
> unsigned char ch, *endp;
> int datalen;
> datalen = tokend - tokstart;
> endp = tokend - 1;
> while(datalen>0) {
> ch = *endp--;
> /* if (ch==' ' || ch=='\t') {*/
> if (ch==' ') {
> datalen--;
> } else {
> break;
> }
> }
>
> if (datalen==0) {
> coldata = (NSString *)[NSNull null];
> } else {
> // NOTE(review): tokstart-input and datalen are byte counts, whereas
> // NSRange indexes UTF-16 units; the two agree only for ASCII input.
> coldata=[NSString stringWithString:[tempInputString
> substringWithRange:NSMakeRange((int)(tokstart-input), datalen)]];
> }
> if (HPDEBUG) NSLog(@"Unquoted value: %@",coldata);
> };
> QuotedValue {
> // Copy the bytes between the outer quotes down to tokstart (in place,
> // inside `input`), skipping the byte that follows every '"' so that an
> // escaped "" pair collapses to a single quote.
> unsigned char ch, *start_p, *wptr, *rptr;
> int rest, datalen;
> start_p = wptr = tokstart;
> rptr = tokstart + 1;
> rest = tokend - tokstart - 2;
> datalen = 0;
> while(rest>0) {
> ch = *rptr++;
> if (ch=='"') {
> rptr++;
> rest--;
> }
> *wptr++ = ch;
> datalen++;
> rest--;
> }
> tempInputString=[NSString stringWithUTF8String:(char *)input]; //
> reset tempInputString after messing with chars in input
> coldata=[NSString stringWithString:[tempInputString
> substringWithRange:NSMakeRange((int)(start_p-input), datalen)]];
> if (HPDEBUG) NSLog(@" Quoted value: %@",coldata);
> };
> *|;
> }%%
> %% write data nofinal;
>
>
> // NOTE(review): `write eof` and the tokstart/tokend names look like
> // Ragel 5 conventions; presumably Ragel 6 uses an `eof` pointer and
> // ts/te instead - confirm against the Ragel version in use.
> %%write init;
> %%write exec;
> %%write eof;
>
> // NOTE(review): the flush of the last row is commented out, so a final
> // row that is not terminated by a line ending is dropped here.
> if(row) {
> // [tempParsedData addObject:row];
> // if ([row count] >tempmaxcols) tempmaxcols=[row count];
> // [row autorelease];
> row=nil;
> }
>
> *parsedData=tempParsedData;
>
>
> // Report a scanner error; the partial result is still returned.
> if ( cs == csv_scan_error ) {
> NSLog(@"CSVscan parse error on line %d.", curline);
> }
>
> *nrecords=tempnrecords;
> *nlines=curline-1;
> *maxcols=tempmaxcols;
>
> }
>
>
>
> >
More information about the ragel-users
mailing list