User:Haus/Hanzo/lexer
From Wikipedia, the free encyclopedia
Here is a (somewhat dated) version of the core of a JFlex specification (the Java lexer generator, not C flex) for scanning ship infoboxes in wikitext:
/* ... (user-code section elided in the original listing) */

%%

%{
  /*
   * Nesting depth of "{{" ... "}}" pairs seen since a ship infobox was opened.
   * It is raised when the infobox opener or a nested "{{" is matched, lowered
   * on "}}", and the box is considered finished when it returns to 0.
   */
  private int comment_count = 0;

  /* Text matched by the <NAME> rule; it includes the trailing "=". */
  private String name;

  /*
   * Text of the current parameter value. Set by the <VALUE> rule (rest of the
   * line, including its line terminators) and extended one character at a
   * time by the <SHIPBOX> fallback rule. null while no value has been seen.
   */
  private String value;
%}

%line
%char
%caseless
%unicode
%standalone
//%debug

%state SHIPBOX
%state NAME
%state VALUE

/*
 * Macros. Only WhiteSpace and LineTerminator are referenced by the rules
 * below; the others are kept unchanged because other parts of the
 * specification (not shown here) may rely on them.
 * NOTE(review): \012 is the octal escape for line feed, so
 * NONNEWLINE_WHITE_SPACE_CHAR appears to include a newline despite its
 * name -- confirm before using it.
 */
ALPHA=[A-Za-z]
DIGIT=[0-9]
NONNEWLINE_WHITE_SPACE_CHAR=[\ \t\b\012]
WHITE_SPACE_CHAR=[\n\ \t\b\012]
STRING_TEXT=(\\\"|[^\n\"]|\\{WHITE_SPACE_CHAR}+\\)*
LineTerminator = \r|\n|\r\n
InputCharacter = [^\r\n]
WhiteSpace = {LineTerminator} | [ \t\f]

%%

<YYINITIAL> {

  /* Start of a ship infobox: enter SHIPBOX and open the first brace level. */
  "{{Infobox Ship"[|]*{WhiteSpace} |
  "{{Ship table"[|]*{WhiteSpace}
      {
        yybegin(SHIPBOX);
        comment_count += 1;
        return (1);
      }

  /*
   * Any other text: echo it (a run of non-newline characters plus the
   * newlines that follow) to standard output.
   * NOTE(review): the scanner prefers the longest match, so when the opener
   * is followed by more text on the same line, or by blank lines, this rule
   * matches more input than the opener rules and the infobox is echoed
   * instead of parsed -- confirm whether that is intended.
   */
  [^\n]*[\n]*
      {
        //printlns replaced to preserve UTF-8
        System.out.println(yytext());
        return (100);
      }
}

<SHIPBOX> {

  /* Nested template opener: one level deeper. */
  "{{"
      {
        comment_count = comment_count + 1;
      }

  /* Template closer, optionally preceded by stray pipes. */
  [\|]*"}}"
      {
        comment_count = comment_count - 1;
        Utility.Assert(comment_count >= 0);
        if (comment_count == 0) {
          // Emit the last pending pair only if there is one, using the
          // same guard as the "|" rule below.
          if (name != null && value != null) {
            ShipBox.printnv(name, value);
          }
          ShipBox.printbox();
          // Forget the finished box so its last pair cannot be emitted
          // again by the first "|" of a later infobox.
          name = null;
          value = null;
          yybegin(YYINITIAL);
        }
      }

  /* Parameter separator: emit the previous pair, then read the next name. */
  \|
      {
        if (name != null && value != null) {
          ShipBox.printnv(name, value);
        }
        yybegin(NAME);
      }

  /* Any other single character continues the current value. */
  [^\|]
      {
        // Plain "value += yytext()" would produce a literal "null" prefix
        // while value is still null.
        value = (value == null) ? yytext() : value + yytext();
      }
}

<NAME> {

  /*
   * Everything up to and including the next "=" becomes the name.
   * NOTE(review): [^=] also matches line terminators, so a parameter
   * without "=" makes this rule scan ahead to the next "=" in the input.
   */
  [^=]*"="
      {
        name = yytext();
        yybegin(VALUE);
      }
}

<VALUE> {

  /* Rest of the line, including its terminators, starts a fresh value. */
  [^\n\r]*[\n\r]+
      {
        value = yytext();
        yybegin(SHIPBOX);
      }
}