#!/usr/bin/perl
use strict; # strict programming style
sub code_parse($);
sub load_symtable();
#################
## Math module ##
#################
sub math_addNode($$$$);
sub math_insertNode($$);
sub math_collapseTree($);
sub math_lex($);
sub parse_expr($);
###
## GLOBAL VARIABLES
###
my(%SYMTAB); # Global Symbol Table
my %STATE = (
LINE_NUMBER => 1,
STMT_ERROR => 0,
GLOBAL_ERROR => 0,
IF_COUNT => 0,
WHILE_COUNT => 0
);
my(%PARSETABLE) = (
IF => {
IF_BEGIN => 'if',
IF_CONDITION => qr/.+?/s,
IF_CONDITION_END => 'then',
IF_BODY => qr/.*/s,
IF_END => 'endif'
},
IFELSE => {
IF_BEGIN => 'if',
IF_CONDITION => qr/.+?/s,
IF_CONDITION_END => 'then',
IF_BODY => qr/.*/s,
IF_ELSE => 'else',
IF_ELSE_BODY => qr/.*/s,
IF_END => 'endif'
},
WHILE => {
WHILE_BEGIN => 'while',
WHILE_CONDITION => qr/.+?/s,
WHILE_CONDITION_END => 'do',
WHILE_BODY => qr/.*/s,
WHILE_END => 'endwhile'
},
PRINT => {
PRINT => 'print',
EXPRESSION => qr/[^;]+/s
},
ASSIGNMENT => {
IDENTIFIER => qr/[a-z_]\w*/i,
T_EQUALS => '=',
EXPRESSION => qr/[^;]+/s
}
);
load_symtable();
# Grammar definitions - BASIC
my $SPACE = '[\t\n\r\s]';
my $KEYWORDS; # Filled by load_symtable()
my $IDENTIFIER = qr/[a-z_]\w*/i; # Identifier match
my $NUMBER = qr/\d*\.?\d+/i; # Number match, error checking is needed when using this
my $STRICTNUMBER = qr/\d+(?:\.\d+)?/; # Strict number match
my $VALUE = qr/$IDENTIFIER|$NUMBER/; # Value match, this is a number or identifier
my $OPERATOR = qr/$SPACE*[+-\/*]$SPACE*/; # Operators, +-/*
my $BRACKET = qr/\($SPACE*(?:$VALUE|$OPERATOR|[\(\)])+?$SPACE*\)/;
my $EXPR = qr/$SPACE*(?:$VALUE|$BRACKET)+$SPACE*/;
# Grammar definitions -- CONSTRUCTS
# Grammar definitions - STATEMENTS
my $IF_STMT = qr/$PARSETABLE{IF}{IF_BEGIN} $SPACE+ # if
($PARSETABLE{IF}{IF_CONDITION}) $SPACE+ # CONDITION
$PARSETABLE{IF}{IF_CONDITION_END} $SPACE+ # then
($PARSETABLE{IF}{IF_BODY}) $SPACE+ # BODY
$PARSETABLE{IF}{IF_END}/x; # endif
my $IFELSE_STMT = qr/$SPACE*$PARSETABLE{IFELSE}{IF_BEGIN}$SPACE+($PARSETABLE{IFELSE}{IF_CONDITION})$SPACE+$PARSETABLE{IFELSE}{IF_CONDITION_END}$SPACE+($PARSETABLE{IFELSE}{IF_BODY})$SPACE+($PARSETABLE{IFELSE}{IF_ELSE_BODY})$SPACE+$PARSETABLE{IFELSE}{IF_END}/x;
my $WHILE_STMT = qr/$SPACE*$PARSETABLE{WHILE}{WHILE_BEGIN}$SPACE+($PARSETABLE{WHILE}{WHILE_CONDITION})$SPACE+$PARSETABLE{WHILE}{WHILE_CONDITION_END}$SPACE+($PARSETABLE{WHILE}{WHILE_BODY})$SPACE+$PARSETABLE{WHILE}{WHILE_END}/x;
my $PRINT_STMT = qr/$SPACE*$PARSETABLE{PRINT}{PRINT}$SPACE+($PARSETABLE{PRINT}{EXPRESSION})/;
my $ASSIGN_STMT = qr/$SPACE*($PARSETABLE{ASSIGNMENT}{IDENTIFIER})$SPACE*$PARSETABLE{ASSIGNMENT}{T_EQUALS}$SPACE*($PARSETABLE{ASSIGNMENT}{EXPRESSION})/;
my $END_STMT = '(?:;|$)';
my $STMT = qr/$IF_STMT|$WHILE_STMT|$PRINT_STMT|$ASSIGN_STMT/;
my $STMT_LIST = qr/($STMT)$SPACE*(;$SPACE*$STMT)*/;
my %MATHTABLE = (
'+' => { NAME => 'add', VALUE => 1 },
'-' => { NAME => 'sub', VALUE => 2 },
'*' => { NAME => 'mul', VALUE => 3 },
'/' => { NAME => 'div', VALUE => 4 }
);
sub load_symtable()
{
my @keywords;
while (my($name, $values) = each(%PARSETABLE))
{
while (my($type, $value) = each(%$values))
{
if ($value =~ m/^[a-z_]\w+$/)
{
$SYMTAB{$value} = 'keyword';
}
}
}
}
## Function: syntax_error(REASON)
#
## Description:
# This function will throw an error and halt output of the T assembly file as well as halt error output until
# the next statement.
#
sub syntax_error($)
{
print STDERR "% Parse error at line $STATE{LINE_NUMBER}: $_[0]\n" if (!$STATE{STMT_ERROR});
$STATE{STMT_ERROR} = 1;
$STATE{GLOBAL_ERROR} = 1;
}
## Function: tsm_print(DATA)
#
## Description:
# This function prints the assembled T code to the output buffer
# but makes sure there were no previous errors in the code.
#
sub tsm_print($)
{
print $_[0]."\n" if (!$STATE{GLOBAL_ERROR});
}
sub code_parse($)
{
local $_;
my($file, $found_match) = @_; # file is arg1, found_match will not be set
# Open file, read into one big scalar, and close
open(CODE, $file) or die "% Error: could not open file '$file': $!\n";
$_ = join "", <CODE>;
close CODE;
$STATE{LINE_NUMBER} = 1;
parse_stmt($_);
tsm_print("halt");
}
sub parse_stmt($)
{
local($_) = @_;
$STATE{LINE_NUMBER} += tr/\n/\n/;
s/^$SPACE*(.+)$SPACE*$/$1/;
return if ($_ eq '');
$STATE{STMT_ERROR} = 0;
if (m/^$ASSIGN_STMT$/s)
{
$STATE{TYPE} = 'ASSIGNMENT';
if (defined($SYMTAB{$1}) && $SYMTAB{$1} eq 'keyword')
{
syntax_error("Cannot assign value to keyword");
}
parse_expr($2);
tsm_print("lvalue $1");
tsm_print("assign");
$SYMTAB{$1} = 'variable';
}
elsif (m/^$PRINT_STMT$/s)
{
$STATE{TYPE} = 'PRINT_STATEMENT';
parse_expr($1);
tsm_print("print");
}
elsif (m/^$IF_STMT$/s)
{
$STATE{TYPE} = 'IF_BLOCK';
$STATE{IF_COUNT}++;
parse_expr($1);
parse_stmt($2);
}
elsif (m/^$IFELSE_STMT$/s)
{
$STATE{TYPE} = 'IFELSE_BLOCK';
$STATE{IF_COUNT}++;
parse_expr($1);
parse_stmt($2);
parse_stmt($3);
}
elsif (m/^$WHILE_STMT$/s)
{
$STATE{TYPE} = 'WHILE_BLOCK';
$STATE{WHILE_COUNT}++;
print "In while statement with body: $2\n";
my $count = $STATE{WHILE_COUNT};
tsm_print("label __while_begin_".$count);
parse_expr($1);
tsm_print("gofalse __while_end_".$count);
parse_stmt($2);
tsm_print("goto __while_begin_".$count);
tsm_print("label __while_end_".$count);
}
elsif (m/;/ && m/^$STMT_LIST/s)
{
my($stmt, $stmt_list) = ($1, $_);
$stmt_list =~ s/$SPACE*\Q$stmt\E$SPACE*;//;
#print "statement $stmt and list $stmt_list\n";
parse_stmt($stmt) if ($stmt_list !~ m/^$SPACE*$/);
#if ($stmt_list =~ m/^$SPACE*$/)
#{
# m/\S+$/;
# syntax_error("Misplaced statement seperator at '$&'");
#}
#else {
parse_stmt($stmt_list);
#}
}
else
{
m/(.+?)$END_STMT/;
syntax_error("Error at: $1");
}
$STATE{LINE_NUMBER} += tr/\n/\n/ if (!$STATE{STMT_ERROR});
}
##########################################################
# MATH MODULE
##########################################################
## Function: parse_expr(EXPR)
#
## Description: This subroutine basically does nothing but builds a tree from the string input
# and collapses it, returning the answer which is located at the root node after
# the tree has been collapsed. This is just an interface to using the tree subs in
# a more intuitive fashion.
#
sub parse_expr($)
{
my $root = math_lex($_[0]);
math_collapseTree($root);
return $root->{VALUE};
}
## Function: math_addNode(NODE, VALUE, LEFT_VALUE, RIGHT_VALUE)
#
## Description: Creates a node reference with a value which can then be attached
# to a tree. Also makes a left and right child if they are defined.
#
sub math_addNode($$$$)
{
my ($node, $value, $left, $right) = @_;
$node->{VALUE} = $value;
$node->{LEFT}{VALUE} = $left if (defined($left));
$node->{RIGHT}{VALUE} = $right if (defined($right));
return $node;
}
## Function: math_insertNode(NODE, VALUE)
#
## Description: Inserts a node inside a tree by replacing the node with a value and setting
# the old node as a left child of the new one.
#
sub math_insertNode($$)
{
my($node, $value) = @_;
my %tmpnode = %$node;
math_addNode($node, $value, undef, undef);
$node->{LEFT} = \%tmpnode;
}
## Function: math_collapseTree(ROOT)
#
## Description: Resursively travels through the tree starting from root and performs the mathematical
# operations when it encounters a operator node with two number children. It then stores
# this calculated value as the new node value and collapses the children. This recursive
# collapsing creates an end result of the final value being collapsed into the root of the
# tree.
#
sub math_collapseTree($)
{
my($node,$collapse) = @_;
# Recursive traversal to the end of the tree, collapse all left and right children if we have any
math_collapseTree($node->{LEFT}) if ($node->{LEFT});
math_collapseTree($node->{RIGHT}) if ($node->{RIGHT});
return if (!defined($node->{VALUE}));
# If we're here we've reached a bottom node that is non-empty
if ($node->{VALUE} =~ m/^$NUMBER$/)
{
tsm_print("push $node->{VALUE}");
}
elsif ($node->{VALUE} =~ m/^$IDENTIFIER$/)
{
tsm_print("rvalue $node->{VALUE}");
}
elsif ($node->{VALUE} =~ m/^$OPERATOR$/)
{
tsm_print($MATHTABLE{$node->{VALUE}}{NAME});
}
$node = undef;
}
## Function: math_lex(EXPRESSION)
#
## Description: This subroutine will analyze a mathematical expression and bust it up into a binary
# tree based on operator precedence. If an operator has equal or lower precedence than
# the previous operator, it will be inserted above the operator with higher precedence in
# the tree. This will guarantee that a bottom up node traversal will compute the operations
# with highest precedence first.
#
sub math_lex($)
{
local ($_) = @_;
s/\s+//g;
my $root = {};
my $node = $root;
my $prevnode = undef;
my $rest = undef;
my $last;
# Basic syntax check: no input
if (!defined($_))
{
syntax_error("Invalid expression or statement in $STATE{TYPE}.");
return;
}
# Basic syntax check: look for consecutive operators in a row
while (s/($OPERATOR)$OPERATOR+/$1/)
{
syntax_error("Syntax error at term '$&': found ".length($&)." consecutive operators.");
}
# Basic syntax check: We also don't want operators at the beginning or end
while (s/^$OPERATOR// || s/$OPERATOR$//)
{
syntax_error("Syntax error at term '$&': invalid operator at beginning/end of expression");
}
# Basic syntax check: Make sure all opened brackets are closed
if (tr/(/(/ != tr/)/)/)
{
syntax_error("Syntax error in expression: unmatched brackets.");
}
# Main lexical analysis loop
while (!($_ eq ''))
{
if (m/^($EXPR)($OPERATOR)?/)
{
$rest = $'; #'
my($expr, $op) = ($1, $2);
# Error checking
if ($expr =~ m/^$VALUE$/)
{
# Don't allow VALUE followed by VALUE without an operator sitting in between
if ($last && $last =~ m/^$VALUE$/)
{
syntax_error("Ambiguous expression format or invalid symbol/value at '$last$expr'.");
}
if ($expr =~ m/^$IDENTIFIER$/) # Check for identifier
{
# If the symbol is not defined then we have an error
syntax_error("Variable '$expr' used without having been declared.") if (!defined($SYMTAB{$expr}));
# Or if the symbol is a keyword
if (defined($SYMTAB{$expr}) && $SYMTAB{$expr} eq 'keyword')
{
syntax_error("Keyword '$expr' cannot be used in an expression.");
}
}
}
elsif ($expr eq '.')
{
# Number match allows for a single ., but it is not considered an 'invalid floating point format'
syntax_error("Invalid character at '.'");
}
elsif ($expr =~ m/^$NUMBER$/ && $expr =~ m/^\./ || $expr =~ m/\.$/) # Check for number
{
# This number began or ended with '.', which is illegal according to the requirements.
# So we have to generate an error message.
syntax_error("Invalid floating point format at '$expr'.");
}
if (m/^\((.+?)\)/)
{
if ($op)
{
math_addNode($node, $op, undef, undef);
$node->{LEFT} = math_lex($1);
}
else
{
$node->{PARENT}{RIGHT} = math_lex($1);
}
}
elsif ($op) # The match had an operator following the value
{
if (!defined($node->{PARENT}{VALUE})
|| $MATHTABLE{$op}{VALUE} > $MATHTABLE{$node->{PARENT}{VALUE}}{VALUE})
{
# In this case, the operator had higher precendence than the last,
# and therefore must be added below the last node.
math_addNode($node, $op, $expr, undef);
}
else
{
# If the operator has a lower (or equal) precendence than the last operator,
# we must float up the tree until the condition is no longer met. I call this
# the buoyancy effect.
$node->{VALUE} = $expr;
while ($node->{PARENT}{VALUE}
&& $MATHTABLE{$op}{VALUE} <= $MATHTABLE{$node->{PARENT}{VALUE}}{VALUE})
{
$node = $node->{PARENT}; # Rise above parent node...
}
# We can finally add the node, but we have to insert it before the current node
# So we need a special function that will re-add the current node as a left child
math_insertNode($node, $op);
}
$last = $op; # Our last token was an operator
}
else
{
# If all we matched was a number (and the end of the line), stick it into the current
# nodes value. This will most likely take place in a right child, though it is at the
# root if the input is solely a number.
$node->{VALUE} = $expr;
$last = $expr; # Last token was a number
}
}
else
{
# Found no valid tokens (not number, operator or identifier)
syntax_error("Invalid character at '".substr($_,0,1)."'.\n");
}
if ($STATE{STMT_ERROR})
{
$rest = ''; # If an error occured clear the expression and go to the next statement
}
$_ = $rest; # Goto next token
$prevnode = $node; # Keep track of the last node
$node->{RIGHT} = {}; # Create a new node from the current node's right leg
$node = $node->{RIGHT}; # Change current node to right leg
$node->{PARENT} = $prevnode; # Mark the parent node
}
return $root;
}
code_parse($ARGV[0]); Powered by
GeSHi Syntax Highlighting software.
Author of all (other) material unless otherwise specified:
Loren Segal. Copyright 2005.