Viewing file: soen229/tcompile2.pl | Back to directory listing
Author: Loren Segal | Last modified: February 20 2006 07:00 pm | Download

#!/usr/bin/perl
use strict;     # strict programming style
 
sub code_parse($);
sub load_symtable();
#################
## Math module ##
#################
sub math_addNode($$$$);
sub math_insertNode($$);
sub math_collapseTree($);
sub math_lex($);
sub parse_expr($);
 
###
## GLOBAL VARIABLES
###
my(%SYMTAB);                    # Global Symbol Table
my %STATE = (
        LINE_NUMBER => 1,
        STMT_ERROR => 0,
        GLOBAL_ERROR => 0,
        IF_COUNT => 0,
        WHILE_COUNT => 0
);
 
my(%PARSETABLE) = (
  IF => {
    IF_BEGIN            => 'if',
    IF_CONDITION        => qr/.+?/s,
    IF_CONDITION_END    => 'then',
    IF_BODY             => qr/.*/s,
    IF_END              => 'endif'
  },
  IFELSE => {
    IF_BEGIN            => 'if',
    IF_CONDITION        => qr/.+?/s,
    IF_CONDITION_END    => 'then',
    IF_BODY             => qr/.*/s,
    IF_ELSE             => 'else',
    IF_ELSE_BODY        => qr/.*/s,
    IF_END              => 'endif'
  },
  WHILE => {
    WHILE_BEGIN            => 'while',
    WHILE_CONDITION        => qr/.+?/s,
    WHILE_CONDITION_END    => 'do',
    WHILE_BODY             => qr/.*/s,
    WHILE_END              => 'endwhile'
  },
  PRINT => {
    PRINT       => 'print',
    EXPRESSION  => qr/[^;]+/s
  },
  ASSIGNMENT => {
    IDENTIFIER  => qr/[a-z_]\w*/i,
    T_EQUALS    => '=',
    EXPRESSION  => qr/[^;]+/s
  }
);
 
load_symtable();
 
# Grammar definitions - BASIC
my $SPACE = '[\t\n\r\s]';
my $KEYWORDS;                                                        # Filled by load_symtable()
my $IDENTIFIER = qr/[a-z_]\w*/i;                                     # Identifier match
my $NUMBER = qr/\d*\.?\d+/i;                                         # Number match, error checking is needed when using this
my $STRICTNUMBER = qr/\d+(?:\.\d+)?/;                                # Strict number match
my $VALUE = qr/$IDENTIFIER|$NUMBER/;                                 # Value match, this is a number or identifier
my $OPERATOR = qr/$SPACE*[+-\/*]$SPACE*/;                            # Operators, +-/*
my $BRACKET = qr/\($SPACE*(?:$VALUE|$OPERATOR|[\(\)])+?$SPACE*\)/;
my $EXPR = qr/$SPACE*(?:$VALUE|$BRACKET)+$SPACE*/;
 
# Grammar definitions --  CONSTRUCTS
 
 
# Grammar definitions - STATEMENTS
my $IF_STMT = qr/$PARSETABLE{IF}{IF_BEGIN} $SPACE+              # if
                ($PARSETABLE{IF}{IF_CONDITION}) $SPACE+         # CONDITION
                $PARSETABLE{IF}{IF_CONDITION_END} $SPACE+       # then
                ($PARSETABLE{IF}{IF_BODY}) $SPACE+              # BODY
                $PARSETABLE{IF}{IF_END}/x;                      # endif
my $IFELSE_STMT = qr/$SPACE*$PARSETABLE{IFELSE}{IF_BEGIN}$SPACE+($PARSETABLE{IFELSE}{IF_CONDITION})$SPACE+$PARSETABLE{IFELSE}{IF_CONDITION_END}$SPACE+($PARSETABLE{IFELSE}{IF_BODY})$SPACE+($PARSETABLE{IFELSE}{IF_ELSE_BODY})$SPACE+$PARSETABLE{IFELSE}{IF_END}/x;
my $WHILE_STMT = qr/$SPACE*$PARSETABLE{WHILE}{WHILE_BEGIN}$SPACE+($PARSETABLE{WHILE}{WHILE_CONDITION})$SPACE+$PARSETABLE{WHILE}{WHILE_CONDITION_END}$SPACE+($PARSETABLE{WHILE}{WHILE_BODY})$SPACE+$PARSETABLE{WHILE}{WHILE_END}/x;
my $PRINT_STMT = qr/$SPACE*$PARSETABLE{PRINT}{PRINT}$SPACE+($PARSETABLE{PRINT}{EXPRESSION})/;
my $ASSIGN_STMT = qr/$SPACE*($PARSETABLE{ASSIGNMENT}{IDENTIFIER})$SPACE*$PARSETABLE{ASSIGNMENT}{T_EQUALS}$SPACE*($PARSETABLE{ASSIGNMENT}{EXPRESSION})/;
my $END_STMT = '(?:;|$)';
my $STMT = qr/$IF_STMT|$WHILE_STMT|$PRINT_STMT|$ASSIGN_STMT/;
my $STMT_LIST = qr/($STMT)$SPACE*(;$SPACE*$STMT)*/;
 
my %MATHTABLE = (
        '+' => { NAME => 'add', VALUE => 1 },
        '-' => { NAME => 'sub', VALUE => 2 },
        '*' => { NAME => 'mul', VALUE => 3 },
        '/' => { NAME => 'div', VALUE => 4 }
);
 
sub load_symtable()
{
        my @keywords;
        while (my($name, $values) = each(%PARSETABLE))
        {
                while (my($type, $value) = each(%$values))
                {
                        if ($value =~ m/^[a-z_]\w+$/)
                        {
                                $SYMTAB{$value} = 'keyword';
                        }
                }
        }
}
 
## Function: syntax_error(REASON)
#
## Description:
# This function will throw an error and halt output of the T assembly file as well as halt error output until
# the next statement.
#
sub syntax_error($)
{
        print STDERR "% Parse error at line $STATE{LINE_NUMBER}: $_[0]\n" if (!$STATE{STMT_ERROR});
        $STATE{STMT_ERROR} = 1;
        $STATE{GLOBAL_ERROR} = 1;
}
 
## Function: tsm_print(DATA)
#
## Description:
# This function prints the assembled T code to the output buffer
# but makes sure there were no previous errors in the code.
#
sub tsm_print($)
{
        print $_[0]."\n" if (!$STATE{GLOBAL_ERROR});
}
 
sub code_parse($)
{
        local $_;
        my($file, $found_match) = @_;   # file is arg1, found_match will not be set
 
        # Open file, read into one big scalar, and close
        open(CODE, $file) or die "% Error: could not open file '$file': $!\n";
        $_ = join "", <CODE>;
        close CODE;
 
        $STATE{LINE_NUMBER} = 1;
        parse_stmt($_);
        tsm_print("halt");
}
 
sub parse_stmt($)
{
        local($_) = @_;
        $STATE{LINE_NUMBER} += tr/\n/\n/;
        s/^$SPACE*(.+)$SPACE*$/$1/;
        return if ($_ eq '');
        $STATE{STMT_ERROR} = 0;
        if (m/^$ASSIGN_STMT$/s)
        {
                $STATE{TYPE} = 'ASSIGNMENT';
                if (defined($SYMTAB{$1}) && $SYMTAB{$1} eq 'keyword')
                {
                        syntax_error("Cannot assign value to keyword");
                }
                parse_expr($2);
                tsm_print("lvalue $1");
                tsm_print("assign");
                $SYMTAB{$1} = 'variable';
        }
        elsif (m/^$PRINT_STMT$/s)
        {
                $STATE{TYPE} = 'PRINT_STATEMENT';
                parse_expr($1);
                tsm_print("print");
        }
        elsif (m/^$IF_STMT$/s)
        {
                $STATE{TYPE} = 'IF_BLOCK';
                $STATE{IF_COUNT}++;
                parse_expr($1);
                parse_stmt($2);
        }
        elsif (m/^$IFELSE_STMT$/s)
        {
                $STATE{TYPE} = 'IFELSE_BLOCK';
                $STATE{IF_COUNT}++;
                parse_expr($1);
                parse_stmt($2);
                parse_stmt($3);
        }
        elsif (m/^$WHILE_STMT$/s)
        {
                $STATE{TYPE} = 'WHILE_BLOCK';
                $STATE{WHILE_COUNT}++;
                print "In while statement with body: $2\n";
                my $count = $STATE{WHILE_COUNT};
                tsm_print("label __while_begin_".$count);
                parse_expr($1);
                tsm_print("gofalse __while_end_".$count);
                parse_stmt($2);
                tsm_print("goto __while_begin_".$count);
                tsm_print("label __while_end_".$count);
        }
        elsif (m/;/ && m/^$STMT_LIST/s)
        {
                my($stmt, $stmt_list) = ($1, $_);
                $stmt_list =~ s/$SPACE*\Q$stmt\E$SPACE*;//;
                #print "statement $stmt and list $stmt_list\n";
                parse_stmt($stmt) if ($stmt_list !~ m/^$SPACE*$/);
                #if ($stmt_list =~ m/^$SPACE*$/)
                #{
                #        m/\S+$/;
                #        syntax_error("Misplaced statement seperator at '$&'");
                #}
                #else {
                parse_stmt($stmt_list);
                #}
        }
        else
        {
                m/(.+?)$END_STMT/;
                syntax_error("Error at: $1");
        }
 
        $STATE{LINE_NUMBER} += tr/\n/\n/ if (!$STATE{STMT_ERROR});
}
 
##########################################################
# MATH MODULE
##########################################################
 
 
## Function: parse_expr(EXPR)
#
## Description: This subroutine basically does nothing but builds a tree from the string input
#               and collapses it, returning the answer which is located at the root node after
#               the tree has been collapsed. This is just an interface to using the tree subs in
#               a more intuitive fashion.
#
sub parse_expr($)
{
        my $root = math_lex($_[0]);
        math_collapseTree($root);
        return $root->{VALUE};
}
 
## Function: math_addNode(NODE, VALUE, LEFT_VALUE, RIGHT_VALUE)
#
## Description: Creates a node reference with a value which can then be attached
#               to a tree. Also makes a left and right child if they are defined.
#
sub math_addNode($$$$)
{
        my ($node, $value, $left, $right) = @_;
        $node->{VALUE} = $value;
        $node->{LEFT}{VALUE} = $left if (defined($left));
        $node->{RIGHT}{VALUE} = $right if (defined($right));
        return $node;
}
 
## Function: math_insertNode(NODE, VALUE)
#
## Description: Inserts a node inside a tree by replacing the node with a value and setting
#               the old node as a left child of the new one.
#
sub math_insertNode($$)
{
        my($node, $value) = @_;
        my %tmpnode = %$node;
        math_addNode($node, $value, undef, undef);
        $node->{LEFT} = \%tmpnode;
}
 
## Function: math_collapseTree(ROOT)
#
## Description: Resursively travels through the tree starting from root and performs the mathematical
#               operations when it encounters a operator node with two number children. It then stores
#               this calculated value as the new node value and collapses the children. This recursive
#               collapsing creates an end result of the final value being collapsed into the root of the
#               tree.
#
sub math_collapseTree($)
{
        my($node,$collapse) = @_;
        # Recursive traversal to the end of the tree, collapse all left and right children if we have any
        math_collapseTree($node->{LEFT}) if ($node->{LEFT});
        math_collapseTree($node->{RIGHT}) if ($node->{RIGHT});
        return if (!defined($node->{VALUE}));
        # If we're here we've reached a bottom node that is non-empty
        if ($node->{VALUE} =~ m/^$NUMBER$/)
        {
                tsm_print("push $node->{VALUE}");
        }
        elsif ($node->{VALUE} =~ m/^$IDENTIFIER$/)
        {
                tsm_print("rvalue $node->{VALUE}");
        }
        elsif ($node->{VALUE} =~ m/^$OPERATOR$/)
        {
                tsm_print($MATHTABLE{$node->{VALUE}}{NAME});
        }
        $node = undef;
}
 
## Function: math_lex(EXPRESSION)
#
## Description: This subroutine will analyze a mathematical expression and bust it up into a binary
#               tree based on operator precedence. If an operator has equal or lower precedence than
#               the previous operator, it will be inserted above the operator with higher precedence in
#               the tree. This will guarantee that a bottom up node traversal will compute the operations
#               with highest precedence first.
#
sub math_lex($)
{
        local ($_) = @_;
        s/\s+//g;
        my $root = {};
        my $node = $root;
        my $prevnode = undef;
        my $rest = undef;
        my $last;
 
        # Basic syntax check: no input
        if (!defined($_))
        {
                syntax_error("Invalid expression or statement in $STATE{TYPE}.");
                return;
        }
 
        # Basic syntax check: look for consecutive operators in a row
        while (s/($OPERATOR)$OPERATOR+/$1/)
        {
                syntax_error("Syntax error at term '$&': found ".length($&)." consecutive operators.");
        }
        # Basic syntax check: We also don't want operators at the beginning or end
        while (s/^$OPERATOR// || s/$OPERATOR$//)
        {
                syntax_error("Syntax error at term '$&': invalid operator at beginning/end of expression");
        }
        # Basic syntax check: Make sure all opened brackets are closed
        if (tr/(/(/ != tr/)/)/)
        {
                syntax_error("Syntax error in expression: unmatched brackets.");
        }
 
        # Main lexical analysis loop
        while (!($_ eq ''))
        {
                if (m/^($EXPR)($OPERATOR)?/)
                {
                        $rest = $'; #'
                        my($expr, $op) = ($1, $2);
 
                        # Error checking
                        if ($expr =~ m/^$VALUE$/)
                        {
                                # Don't allow VALUE followed by VALUE without an operator sitting in between
                                if ($last && $last =~ m/^$VALUE$/)
                                {
                                        syntax_error("Ambiguous expression format or invalid symbol/value at '$last$expr'.");
                                }
                                if ($expr =~ m/^$IDENTIFIER$/)  # Check for identifier
                                {
                                        # If the symbol is not defined then we have an error
                                        syntax_error("Variable '$expr' used without having been declared.") if (!defined($SYMTAB{$expr}));
                                        # Or if the symbol is a keyword
                                        if (defined($SYMTAB{$expr}) && $SYMTAB{$expr} eq 'keyword')
                                        {
                                                syntax_error("Keyword '$expr' cannot be used in an expression.");
                                        }
                                }
                        }
                        elsif ($expr eq '.')
                        {
                                # Number match allows for a single ., but it is not considered an 'invalid floating point format'
                                syntax_error("Invalid character at '.'");
                        }
                        elsif ($expr =~ m/^$NUMBER$/ && $expr =~ m/^\./ || $expr =~ m/\.$/) # Check for number
                        {
                                # This number began or ended with '.', which is illegal according to the requirements.
                                # So we have to generate an error message.
                                syntax_error("Invalid floating point format at '$expr'.");
                        }
 
                        if (m/^\((.+?)\)/)
                        {
                                if ($op)
                                {
                                        math_addNode($node, $op, undef, undef);
                                        $node->{LEFT} = math_lex($1);
                                }
                                else
                                {
                                        $node->{PARENT}{RIGHT} = math_lex($1);
                                }
                        }
                        elsif ($op) # The match had an operator following the value
                        {
                                if (!defined($node->{PARENT}{VALUE})
                                        || $MATHTABLE{$op}{VALUE} > $MATHTABLE{$node->{PARENT}{VALUE}}{VALUE})
                                {
                                        # In this case, the operator had higher precendence than the last,
                                        # and therefore must be added below the last node.
                                        math_addNode($node, $op, $expr, undef);
                                }
                                else
                                {
                                        # If the operator has a lower (or equal) precendence than the last operator,
                                        # we must float up the tree until the condition is no longer met. I call this
                                        # the buoyancy effect.
                                        $node->{VALUE} = $expr;
                                        while ($node->{PARENT}{VALUE}
                                                && $MATHTABLE{$op}{VALUE} <= $MATHTABLE{$node->{PARENT}{VALUE}}{VALUE})
                                        {
                                                $node = $node->{PARENT};        # Rise above parent node...
                                        }
                                        # We can finally add the node, but we have to insert it before the current node
                                        # So we need a special function that will re-add the current node as a left child
                                        math_insertNode($node, $op);
                                }
                                $last = $op; # Our last token was an operator
                        }
                        else
                        {
                                # If all we matched was a number (and the end of the line), stick it into the current
                                # nodes value. This will most likely take place in a right child, though it is at the
                                # root if the input is solely a number.
                                $node->{VALUE} = $expr;
                                $last = $expr; # Last token was a number
                        }
                }
                else
                {
                        # Found no valid tokens (not number, operator or identifier)
                        syntax_error("Invalid character at '".substr($_,0,1)."'.\n");
                }
                if ($STATE{STMT_ERROR})
                {
                        $rest = '';             # If an error occured clear the expression and go to the next statement
                }
 
                $_ = $rest;                     # Goto next token
                $prevnode = $node;              # Keep track of the last node
                $node->{RIGHT} = {};            # Create a new node from the current node's right leg
                $node = $node->{RIGHT};         # Change current node to right leg
                $node->{PARENT} = $prevnode;    # Mark the parent node
        }
        return $root;
}
 
 
 
code_parse($ARGV[0]);