User:Haus/Hanzo/lexer

From Wikipedia, the free encyclopedia

Here's a (somewhat dated) version of the guts of a JFlex specification for parsing infoboxes:


...
%%
%{
  // Field name captured by the NAME state (null until the first field is seen).
  private String name = null;

  // Field value captured by the VALUE state and extended in the SHIPBOX state
  // (null until the first value is seen).
  private String value = null;

  // Nesting depth of double-brace templates: incremented when a template
  // opens, decremented when one closes. Starts at zero.
  private int comment_count;
%}
// Generate a scanner with its own main() so it can be run directly on a file.
%standalone

// Track the current line and character offset while scanning.
%line
%char

// Match literal text without regard to letter case, over Unicode input.
%caseless
%unicode

// Uncomment to get JFlex's debugging scanner.
//%debug

// Lexical states, in addition to the implicit YYINITIAL.
%state SHIPBOX, NAME, VALUE
 
/* Basic character classes. */
ALPHA=[A-Za-z]
DIGIT=[0-9]

/* Blank characters that do NOT end a line: space, tab, backspace, form feed.
   This class previously contained \012, which is octal for line feed and so
   contradicted the macro's name; form feed (\f) is used instead, matching
   the WhiteSpace macro. */
NONNEWLINE_WHITE_SPACE_CHAR=[\ \t\b\f]

/* The same blanks plus newline. */
WHITE_SPACE_CHAR=[\n\ \t\b\f]

/* Body of a quoted string: an escaped quote, any character other than a
   newline or a quote, or a backslash-delimited run of white space. */
STRING_TEXT=(\\\"|[^\n\"]|\\{WHITE_SPACE_CHAR}+\\)*

/* Line-oriented helpers. */
LineTerminator = \r|\n|\r\n
InputCharacter = [^\r\n]
WhiteSpace     = {LineTerminator} | [ \t\f]
 
%% 
<YYINITIAL> {

/* Start of a ship infobox: either template name, any number of pipes, then
   one white-space item. Enters SHIPBOX at nesting depth one. */
("{{Infobox Ship"|"{{Ship table")[|]*{WhiteSpace} {
    comment_count++;
    yybegin(SHIPBOX);
    return 1;
  }

/* Everything else: one line together with the newlines that follow it is
   echoed unchanged.
   NOTE(review): the scanner prefers the longest match, so this rule wins over
   the infobox rule above whenever the header line carries further text or is
   followed by more than one newline. Confirm that is acceptable for the
   expected input. */
[^\n]*\n* {
    // Echo through println so that UTF-8 text is preserved.
    System.out.println(yytext());
    return 100;
  }
}
 
 
<SHIPBOX> {

/* A nested template opens: go one level deeper. */
"{{" { comment_count++; }

/* A template closes, optionally preceded by pipes. When the depth returns
   to zero the infobox itself has ended. */
[\|]*"}}" {
        comment_count--;
        Utility.Assert(comment_count >= 0);
        if (comment_count == 0) {
                // Emit the last pending pair, using the same null guard as
                // the pipe rule so an empty box does not print a null pair.
                if (name != null && value != null) {
                        ShipBox.printnv(name, value);
                }
                ShipBox.printbox();
                // Clear the pair so it cannot leak into the next infobox.
                name = null;
                value = null;
                yybegin(YYINITIAL);
        }
     }

/* A pipe starts the next field: emit the completed pair, if any, then
   scan the new field's name. */
\|    {
        if (name != null && value != null) {
                ShipBox.printnv(name, value);
                // The pair has been emitted; forget it so it is printed once.
                name = null;
                value = null;
        }
        yybegin(NAME);
}

/* Any other character continues the current value. */
[^\|] {
        // Appending to a null String would yield the literal text "null",
        // so start a fresh value when none exists yet.
        value = (value == null) ? yytext() : value + yytext();
}
}
 
<NAME> {

/* Field name: everything up to and including the next equals sign.
   The matched text, equals sign included, becomes the name; scanning then
   continues with the field's value. */
[^=]*"=" {
        // yytext() already returns a String, so no defensive copy is needed.
        name = yytext();
        yybegin(VALUE);
      }
}
 
<VALUE> {

/* Field value: the rest of the current line plus the line terminators that
   follow it. The matched text, terminators included, becomes the value;
   scanning then returns to SHIPBOX.
   NOTE(review): anything else on the same line, such as a closing double
   brace, is swallowed into the value. Confirm the input never closes the
   box on a value line. */
[^\n\r]*[\n\r]+ {
        // yytext() already returns a String, so no defensive copy is needed.
        value = yytext();
        yybegin(SHIPBOX);
      }
}