Language parsing

Parsing and transpiling


A long switch statement inside a loop can be used to implement a state machine, but it is not advised for parsing, because parsing often needs to keep track of multiple states (in order to restore previous ones). Use a recursive descent parser instead.


Parser examples

Tokenizer + parser

Use gotos: "This is called computed (or assigned) goto and is a GCC extension" (GCC documents the feature as "Labels as Values").




Syntax examples:

foobar : "foo" | "bar";

// (4 * 2 * 11 + 2) - 5
// 29 + 2 * 3 - 99 - (5 + 5 + 2) / 100
expression : <product> (('+' | '-') <product>)*;
product : <value>   (('*' | '/')   <value>)*;
value : /[0-9]+/ | '(' <expression> ')';
maths : /^/ <expression> /$/;

// so c so c so c so c so c so c so c so c so c
// wow c so language such book
adjective : "wow" | "many" | "so" | "such";
noun      : "lisp" | "language" | "c" | "book" | "build";
phrase    : <adjective> <noun>;
doge      : /^/ <phrase>* /$/;

ident     : /[a-zA-Z_][a-zA-Z0-9_]*/ ;
number    : /[0-9]+/ ;
character : /'.'/ ;
string    : /\"(\\\\.|[^\"])*\"/ ;

factor    : '(' <lexp> ')'
          | <number>
          | <character>
          | <string>
          | <ident> '(' <lexp>? (',' <lexp>)* ')'
          | <ident> ;

term      : <factor> (('*' | '/' | '%') <factor>)* ;
lexp      : <term> (('+' | '-') <term>)* ;

stmt      : '{' <stmt>* '}'
          | \"while\" '(' <exp> ')' <stmt>
          | \"if\"    '(' <exp> ')' <stmt>
          | <ident> '=' <lexp> ';'
          | \"print\" '(' <lexp>? ')' ';'
          | \"return\" <lexp>? ';'
          | <ident> '(' <ident>? (',' <ident>)* ')' ';' ;

exp       : <lexp> '>' <lexp>
          | <lexp> '<' <lexp>
          | <lexp> \">=\" <lexp>
          | <lexp> \"<=\" <lexp>
          | <lexp> \"!=\" <lexp>
          | <lexp> \"==\" <lexp> ;

typeident : (\"int\" | \"char\") <ident> ;
decls     : (<typeident> ';')* ;
args      : <typeident>? (',' <typeident>)* ;
body      : '{' <decls> <stmt>* '}' ;
procedure : (\"int\" | \"char\") <ident> '(' <args> ')' <body> ;
main      : \"main\" '(' ')' <body> ;
includes  : (\"#include\" <string>)* ;
smallc    : /^/ <includes> <decls> <procedure>* <main> /$/ ;

node : '(' <node> ',' /foo/ ',' <node> ')' | <leaf>;
leaf : /bar/;
input : /^/ <node> /$/;

number  \"number\"  : /[0-9]+/ ;
symbol  \"symbol\"  : /[a-zA-Z0-9_+\\-*\\/\\\\=<>!&]+/ ;
string  \"string\"  : /\"(\\\\.|[^\"])*\"/ ;
comment             : /;[^\\r\\n]*/ ;
sexpr               : '(' <expr>* ')' ;
qexpr               : '{' <expr>* '}' ;
expr                : <number>  | <symbol> | <string>
                    | <comment> | <sexpr>  | <qexpr> ;
lispy               : /^/ <expr>* /$/ ;

// #line 10 "test"
number        : /[0-9]+/ ;
quoted_string : /\"(\\.|[^\"])*\"/ ;
linepragma    : <line> <number> <quoted_string>;
parser        : /^/ (<linepragma>)* /$/ ;

// [my_func]\n  echo (a b c)\n
qscript        : /^/ (<comment> | <resource>)* /$/ ;
	comment     : '#' /[^\\n]*/ ;
resource       : '[' (<rtype> <rname>) ']' <inner_block> ;
	rtype       : /[*]*/ ;
	rname       : <qstring> ;

inner_block    : (<comment> | <statement>)* ;
	statement   : <function> '(' (<comment> | <parameter> | <block>)* ')'  <seperator> ;
	function    : <qstring> ;
	parameter   : (<statement> | <literal>) ;
		literal  : (<number> | <qstring>) <seperator> ;
	block       : '{' <inner_block> '}' ;
	seperator   : ',' | \"\" ;

qstring        : (<complexstr> | <simplestr>) <qstring>* ;
	simplestr   : /[a-zA-Z0-9_!@#$%^&\\*_+\\-\\.=\\/<>]+/ ;
	complexstr  : (/\"[^\"]*\"/ | /'[^']*'/) ;

number         : (<float> | <int>) ;
	float       : /[-+]?[0-9]+\\.[0-9]+/ ;
	int         : /[-+]?[0-9]+/ ;

