# Scanner module for ITEC 380
# Uses the Python tokenizer to initially find tokens,
# which are then classified based on the list below.
# Author: Ned Okie
# 11/11/10: Initial version
# 11/18/10: Updated comments and fixed some white space
# 11/18/10: Created checkforspecialtokens().
# 11/18/10: Checked for special tokens and skipped white space in more places

# Client Routines:
#   initialize(filename)
#   currentToken()    # returns current token - see list below
#   currentLexeme()   # returns lexeme of current token
#   advanceToken()    # also advances current lexeme

# TOKEN LIST - all are strings
# Keywords: GET, PRINT, NEWLINE, IF, THEN, ELSE, END, WHILE, LOOP
#
# Identifiers and literals: IDENT, INT_LIT, STRING
#
# Arithmetic operators: MINUS_OP, PLUS_OP, TIMES_OP, DIV_OP
#
# Relational operators: EQ, NE, LT, LE, GT, GE
#
# Punctuation: LEFT_PAREN, RIGHT_PAREN, SEMI
#
# Other: EOF, ERROR

import tokenize
import sys


# CLASSIFY A NAME AS A KEYWORD OR AN IDENTIFIER
def lookupident(lex):
    # Could be implemented with a table
    if lex == 'get':
        thetok = 'GET'
    elif lex == 'print':
        thetok = 'PRINT'
    elif lex == 'newLine':
        thetok = 'NEWLINE'
    elif lex == 'if':
        thetok = 'IF'
    elif lex == 'then':
        thetok = 'THEN'
    elif lex == 'else':
        thetok = 'ELSE'
    elif lex == 'end':
        thetok = 'END'
    elif lex == 'while':
        thetok = 'WHILE'
    elif lex == 'loop':
        thetok = 'LOOP'
    else:
        thetok = 'IDENT'
    return thetok


# IDENTIFY AN OPERATOR
def lookupop(lex):
    # Could be implemented with a table
    if lex == ':':
        thetok = 'COLON'
    elif lex == '-':
        thetok = 'MINUS_OP'
    elif lex == '+':
        thetok = 'PLUS_OP'
    elif lex == '*':
        thetok = 'TIMES_OP'
    elif lex == '/':
        thetok = 'DIV_OP'
    elif lex == ';':
        thetok = 'SEMI'
    elif lex == '(':
        thetok = 'LEFT_PAREN'
    elif lex == ')':
        thetok = 'RIGHT_PAREN'
    elif lex == '=':
        thetok = 'EQ'
    elif lex == '<':
        thetok = 'LT'
    elif lex == '>':
        thetok = 'GT'
    elif lex == '/=':
        thetok = 'NE'
    elif lex == '<=':
        thetok = 'LE'
    elif lex == '>=':
        thetok = 'GE'
    else:
        thetok = 'ERROR'
    return thetok


# CLASSIFY A PYTHON TOKEN AS ONE OF OUR TOKENS
def lookuptok(ptoknum, lex):
    # THE MAGIC NUMBERS FOR PYTHON TOKEN TYPES CAN BE FOUND IN MODULE token.
    # THE TOKENIZER RETURNS EVERY OPERATOR AS OP (51),
    # SO lookupop() MUST DISTINGUISH THEM BY LEXEME.
    # Could be implemented with a table
    if ptoknum == 0:
        thetok = 'EOF'
    elif ptoknum == 1:
        thetok = lookupident(lex)
    elif ptoknum == 2:
        thetok = 'INT_LIT'
    elif ptoknum == 3:
        thetok = 'STRING'
    elif ptoknum in [4, 54]:
        thetok = 'NL'       # [4, 54] == [NEWLINE, NL]
    elif ptoknum in [5, 6]:
        thetok = 'WHITE'    # [5, 6] == [INDENT, DEDENT]
    elif ptoknum == 51:
        thetok = lookupop(lex)  # 51 == OP
    else:
        thetok = "ERROR"    # Nothing else is allowed
    return thetok
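
# The three lookup routines above each note "Could be implemented with
# a table."  A sketch of that alternative is below, kept commented out
# since the module does not use it (the dictionary names are
# illustrative only): each if/elif chain collapses into one dictionary
# lookup with a default.
#
# KEYWORD_TOKENS = {'get': 'GET', 'print': 'PRINT', 'newLine': 'NEWLINE',
#                   'if': 'IF', 'then': 'THEN', 'else': 'ELSE',
#                   'end': 'END', 'while': 'WHILE', 'loop': 'LOOP'}
# OP_TOKENS = {':': 'COLON', '-': 'MINUS_OP', '+': 'PLUS_OP',
#              '*': 'TIMES_OP', '/': 'DIV_OP', ';': 'SEMI',
#              '(': 'LEFT_PAREN', ')': 'RIGHT_PAREN', '=': 'EQ',
#              '<': 'LT', '>': 'GT', '/=': 'NE', '<=': 'LE', '>=': 'GE'}
#
# def lookupident(lex):
#     return KEYWORD_TOKENS.get(lex, 'IDENT')
#
# def lookupop(lex):
#     return OP_TOKENS.get(lex, 'ERROR')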

# IS THE CURRENT PYTHON TOKEN EOF?
def ptokiseof():
    return curptok[0] == 0  # MAGIC NUMBER FOR EOF


# OPEN FILE AND HANDLE POSSIBLE FILE ERROR
def openfile(filename):
    try:
        infile = open(filename)
    except IOError:
        sys.exit("Error opening file " + filename)
    else:
        return infile


# SET UP TOKENIZER AND INITIALIZE CURRENT AND NEXT PYTHON TOKENS
def initptok(filename):
    infile = openfile(filename)
    global g
    g = tokenize.generate_tokens(infile.readline)
    global curptok, nextptok
    curptok = g.next()
    nextptok = g.next()


# MOVE TO NEXT PYTHON TOKEN
def advanceptok():
    global curptok, nextptok
    curptok = nextptok
    if ptokiseof():
        nextptok = curptok
    else:
        nextptok = g.next()
    # SHOULD PROBABLY CALL setcurnexttoklex HERE INSTEAD OF AFTER EACH CALL


# SET UP PYTHON TOKENS AND CURRENT AND NEXT LEXEMES,
# AND SKIP ANY INITIAL WHITE SPACE AND SPECIAL TOKENS
def inittoklex(filename):
    # Could initialize tables here, if I'd used them
    initptok(filename)
    setcurnexttoklex()
    checkforspecialtokens()
    # IF THE CURRENT TOKEN IS WHITE SPACE OR A COMMENT, SKIP IT
    while curtok in ['WHITE', 'NL']:
        advanceptok()
        setcurnexttoklex()
        checkforspecialtokens()


# SET TOKEN AND LEXEME FOR ASSIGNMENT
def setassign():
    global curtok, curlex
    curtok = 'ASSIGN'
    curlex = ':='


# SET TOKEN FOR ERROR; LEAVE LEXEME UNCHANGED
def seterror():
    global curtok
    curtok = 'ERROR'


# SET CURRENT AND NEXT TOKENS AND LEXEMES
# Python tokens are tuples with
#   the magic number for the token type as element [0]
#   and the lexeme as element [1].
# Maintain the current and next tokens so that
# we can check for -- and :=
def setcurnexttoklex():
    global curtok, curlex, nexttok, nextlex
    curlex = curptok[1]
    curtok = lookuptok(curptok[0], curlex)
    nextlex = nextptok[1]
    nexttok = lookuptok(nextptok[0], nextlex)


# IF THE CURRENT TOKEN IS A SPECIAL CASE, HANDLE IT DIFFERENTLY
def checkforspecialtokens():
    # HANDLE MINUS SEPARATELY BECAUSE IT MIGHT START A COMMENT
    if curtok == 'MINUS_OP':
        if nexttok == 'MINUS_OP':
            ## It's a comment. Skip the rest of the line.
            advanceptok()           # These lines are redundant
            setcurnexttoklex()
            # SKIP UNTIL END OF LINE (also stop at EOF, so a comment
            # on the last line of the file cannot loop forever)
            while curtok not in ['NL', 'EOF']:
                advanceptok()
                setcurnexttoklex()
    # HANDLE COLON SEPARATELY BECAUSE IT SHOULD START ASSIGN
    elif curtok == 'COLON':
        if nexttok == 'EQ':     # IT'S ASSIGN
            advanceptok()
            setassign()
        else:                   # DON'T ADVANCE TO NEXT TOKEN
            seterror()


# ADVANCE TO NEXT TOKEN (and associated lexeme),
# SKIPPING ANY WHITE SPACE AND COMMENTS
def advancetoklex():
    # MOVE TO NEXT TOKEN AND SEE IF IT'S A SPECIAL CASE
    advanceptok()
    setcurnexttoklex()
    checkforspecialtokens()
    # IF THE CURRENT TOKEN IS WHITE SPACE OR A COMMENT, SKIP IT
    while curtok in ['WHITE', 'NL']:
        advanceptok()
        setcurnexttoklex()
        checkforspecialtokens()
    # END OF WHILE LOOP:
    #   current token is not white space;
    #   comments and white space have been skipped


# A ROUTINE FOR TESTING
def ptoktest():
    initptok("p1.ad")
    while curptok[0] != 0:
        print curptok,
        print nextptok
        advanceptok()
    print curptok,
    print nextptok


# A ROUTINE FOR TESTING
def maintest():
    filename = sys.argv[1]
    inittoklex(filename)
    while curtok != 'EOF':
        print curtok,
        print curlex
        advancetoklex()
    print 'Last token: ', curtok,
    print curlex

# DON'T CALL THE TESTING ROUTINE IN THE MODULE
# maintest()


# ROUTINES PROVIDED FOR CLIENTS:
def initialize(filename):
    inittoklex(filename)


def currentToken():
    return curtok


def currentLexeme():
    return curlex


def advanceToken():
    advancetoklex()


# MAKE THESE VARIABLES GLOBAL:
curtok = ""
curlex = ""
curptok = ''
nextptok = ''
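
# Example client usage (a sketch only; "p1.ad" is the sample file name
# used by ptoktest() above and may not exist).  Kept commented out, per
# the note above about not calling test routines from the module:
#
# initialize('p1.ad')
# while currentToken() != 'EOF':
#     print currentToken(), currentLexeme()
#     advanceToken()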