# Scanner module for ITEC 380
# Uses the Python tokenizer to initially find tokens,
# which are then classified based on the list below.
# Author: Ned Okie
# 11/11/10: Initial version
# 11/18/10: Updated comments and fixed some white space
# 11/18/10: Created checkforspecialtokens().
# 11/18/10: Checked for special tokens and skipped white space in more places

# Client Routines:
#   initialize(filename)
#   currentToken()    # returns current token - see list below
#   currentLexeme()   # returns lexeme of current token
#   advanceToken()    # also advances current lexeme

# TOKEN LIST - all are strings
# Keywords: GET, PRINT, NEWLINE, IF, THEN, ELSE, END, WHILE, LOOP
#
# Identifiers and literals: IDENT, INT_LIT, STRING
#
# Arithmetic operators: MINUS_OP, PLUS_OP, TIMES_OP, DIV_OP
#
# Relational operators: EQ, NE, LT, LE, GT, GE
#
# Punctuation: LEFT_PAREN, RIGHT_PAREN, SEMI
#
# Other: EOF, ERROR

import tokenize
import sys


# CLASSIFY A NAME AS A KEYWORD OR AN IDENTIFIER
def lookupident(lex):
    # Could be implemented with a table
    if lex == 'get':
        thetok = 'GET'
    elif lex == 'print':
        thetok = 'PRINT'
    elif lex == 'newLine':
        thetok = 'NEWLINE'
    elif lex == 'if':
        thetok = 'IF'
    elif lex == 'then':
        thetok = 'THEN'
    elif lex == 'else':
        thetok = 'ELSE'
    elif lex == 'end':
        thetok = 'END'
    elif lex == 'while':
        thetok = 'WHILE'
    elif lex == 'loop':
        thetok = 'LOOP'
    else:
        thetok = 'IDENT'
    return thetok


# IDENTIFY AN OPERATOR
def lookupop(lex):
    # Could be implemented with a table
    if lex == ':':
        thetok = 'COLON'
    elif lex == '-':
        thetok = 'MINUS_OP'
    elif lex == '+':
        thetok = 'PLUS_OP'
    elif lex == '*':
        thetok = 'TIMES_OP'
    elif lex == '/':
        thetok = 'DIV_OP'
    elif lex == ';':
        thetok = 'SEMI'
    elif lex == '(':
        thetok = 'LEFT_PAREN'
    elif lex == ')':
        thetok = 'RIGHT_PAREN'
    elif lex == '=':
        thetok = 'EQ'
    elif lex == '<':
        thetok = 'LT'
    elif lex == '>':
        thetok = 'GT'
    elif lex == '/=':
        thetok = 'NE'
    elif lex == '<=':
        thetok = 'LE'
    elif lex == '>=':
        thetok = 'GE'
    else:
        thetok = 'ERROR'
    return thetok


# CLASSIFY A PYTHON TOKEN AS ONE OF OUR TOKENS
def lookuptok(ptoknum, lex):
    # THE MAGIC NUMBERS FOR PYTHON TOKEN TYPES CAN BE FOUND IN MODULE token.
    # THE TOKENIZER RETURNS EVERY OPERATOR AS OP (51),
    # SO lookupop() MUST DISTINGUISH THEM BY LEXEME.
    # Could be implemented with a table
    if ptoknum == 0:
        thetok = 'EOF'
    elif ptoknum == 1:
        thetok = lookupident(lex)
    elif ptoknum == 2:
        thetok = 'INT_LIT'
    elif ptoknum == 3:
        thetok = 'STRING'
    elif ptoknum in [4, 54]:
        thetok = 'NL'       # [4, 54] == [NEWLINE, NL]
    elif ptoknum in [5, 6]:
        thetok = 'WHITE'    # [5, 6] == [INDENT, DEDENT]
    elif ptoknum == 51:
        thetok = lookupop(lex)  # 51 == OP
    else:
        thetok = "ERROR"    # Nothing else is allowed
    return thetok
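
# The three lookup routines above each note "Could be implemented with
# a table."  A sketch of that alternative is below, kept commented out
# since the module does not use it (the dictionary names are
# illustrative only): each if/elif chain collapses into one dictionary
# lookup with a default.
#
# KEYWORD_TOKENS = {'get': 'GET', 'print': 'PRINT', 'newLine': 'NEWLINE',
#                   'if': 'IF', 'then': 'THEN', 'else': 'ELSE',
#                   'end': 'END', 'while': 'WHILE', 'loop': 'LOOP'}
# OP_TOKENS = {':': 'COLON', '-': 'MINUS_OP', '+': 'PLUS_OP',
#              '*': 'TIMES_OP', '/': 'DIV_OP', ';': 'SEMI',
#              '(': 'LEFT_PAREN', ')': 'RIGHT_PAREN', '=': 'EQ',
#              '<': 'LT', '>': 'GT', '/=': 'NE', '<=': 'LE', '>=': 'GE'}
#
# def lookupident(lex):
#     return KEYWORD_TOKENS.get(lex, 'IDENT')
#
# def lookupop(lex):
#     return OP_TOKENS.get(lex, 'ERROR')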

# IS THE CURRENT PYTHON TOKEN EOF?
def ptokiseof():
    return curptok[0] == 0  # MAGIC NUMBER FOR EOF


# OPEN FILE AND HANDLE POSSIBLE FILE ERROR
def openfile(filename):
    try:
        infile = open(filename)
    except IOError:
        sys.exit("Error opening file " + filename)
    else:
        return infile


# SET UP TOKENIZER AND INITIALIZE CURRENT AND NEXT PYTHON TOKENS
def initptok(filename):
    infile = openfile(filename)
    global g
    g = tokenize.generate_tokens(infile.readline)
    global curptok, nextptok
    curptok = g.next()
    nextptok = g.next()


# MOVE TO NEXT PYTHON TOKEN
def advanceptok():
    global curptok, nextptok
    curptok = nextptok
    if ptokiseof():
        nextptok = curptok
    else:
        nextptok = g.next()
    # SHOULD PROBABLY CALL setcurnexttoklex HERE INSTEAD OF AFTER EACH CALL


# SET UP PYTHON TOKENS AND CURRENT AND NEXT LEXEMES,
# AND SKIP ANY INITIAL WHITE SPACE AND SPECIAL TOKENS
def inittoklex(filename):
    # Could initialize tables here, if I'd used them
    initptok(filename)
    setcurnexttoklex()
    checkforspecialtokens()
    # IF THE CURRENT TOKEN IS WHITE SPACE OR A COMMENT, SKIP IT
    while curtok in ['WHITE', 'NL']:
        advanceptok()
        setcurnexttoklex()
        checkforspecialtokens()


# SET TOKEN AND LEXEME FOR ASSIGNMENT
def setassign():
    global curtok, curlex
    curtok = 'ASSIGN'
    curlex = ':='


# SET TOKEN FOR ERROR; LEAVE LEXEME UNCHANGED
def seterror():
    global curtok
    curtok = 'ERROR'


# SET CURRENT AND NEXT TOKENS AND LEXEMES
# Python tokens are tuples with
#   the magic number for the token type as element [0]
#   and the lexeme as element [1].
# Maintain the current and next tokens so that
# we can check for -- and :=
def setcurnexttoklex():
    global curtok, curlex, nexttok, nextlex
    curlex = curptok[1]
    curtok = lookuptok(curptok[0], curlex)
    nextlex = nextptok[1]
    nexttok = lookuptok(nextptok[0], nextlex)


# IF THE CURRENT TOKEN IS A SPECIAL CASE, HANDLE IT DIFFERENTLY
def checkforspecialtokens():
    # HANDLE MINUS SEPARATELY BECAUSE IT MIGHT START A COMMENT
    if curtok == 'MINUS_OP':
        if nexttok == 'MINUS_OP':
            ## It's a comment. Skip the rest of the line.
            advanceptok()           # These lines are redundant
            setcurnexttoklex()
            # SKIP UNTIL END OF LINE (also stop at EOF, so a comment
            # on the last line of the file cannot loop forever)
            while curtok not in ['NL', 'EOF']:
                advanceptok()
                setcurnexttoklex()
    # HANDLE COLON SEPARATELY BECAUSE IT SHOULD START ASSIGN
    elif curtok == 'COLON':
        if nexttok == 'EQ':     # IT'S ASSIGN
            advanceptok()
            setassign()
        else:                   # DON'T ADVANCE TO NEXT TOKEN
            seterror()


# ADVANCE TO NEXT TOKEN (and associated lexeme),
# SKIPPING ANY WHITE SPACE AND COMMENTS
def advancetoklex():
    # MOVE TO NEXT TOKEN AND SEE IF IT'S A SPECIAL CASE
    advanceptok()
    setcurnexttoklex()
    checkforspecialtokens()
    # IF THE CURRENT TOKEN IS WHITE SPACE OR A COMMENT, SKIP IT
    while curtok in ['WHITE', 'NL']:
        advanceptok()
        setcurnexttoklex()
        checkforspecialtokens()
    # END OF WHILE LOOP:
    #   current token is not white space;
    #   comments and white space have been skipped


# A ROUTINE FOR TESTING
def ptoktest():
    initptok("p1.ad")
    while curptok[0] != 0:
        print curptok,
        print nextptok
        advanceptok()
    print curptok,
    print nextptok


# A ROUTINE FOR TESTING
def maintest():
    filename = sys.argv[1]
    inittoklex(filename)
    while curtok != 'EOF':
        print curtok,
        print curlex
        advancetoklex()
    print 'Last token: ', curtok,
    print curlex

# DON'T CALL THE TESTING ROUTINE IN THE MODULE
# maintest()


# ROUTINES PROVIDED FOR CLIENTS:
def initialize(filename):
    inittoklex(filename)


def currentToken():
    return curtok


def currentLexeme():
    return curlex


def advanceToken():
    advancetoklex()


# MAKE THESE VARIABLES GLOBAL:
curtok = ""
curlex = ""
curptok = ''
nextptok = ''
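
# Example client usage (a sketch only; "p1.ad" is the sample file name
# used by ptoktest() above and may not exist).  Kept commented out, per
# the note above about not calling test routines from the module:
#
# initialize('p1.ad')
# while currentToken() != 'EOF':
#     print currentToken(), currentLexeme()
#     advanceToken()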