使用python和regex的微型语言编译器

问题描述 投票:0回答:1

你好，Stack Overflow 的各位，希望你们一切都好。我正在做一个微型语言编译器的作业，尝试使用正则表达式来实现，但是输出非常奇怪：首先，我得到了一个名为 “t” 的标识符，而它并没有出现在我的输入中；而且它没有把标识符 “x” 和分号分开。预先感谢您的帮助。

这是我的输入内容

read x;   {input an integer }
     if  0 < x   then     {  don’t compute if x <= 0 }
        fact  := 1;
        repeat 
           fact  := fact *  x;
            x  := x  -  1 
        until  x  =  0;
        write  fact   {  output  factorial of x }
     end 

这是我使用正则表达式的代码

 # -*- coding: utf-8 -*-
"""
Created on Wed May 13 04:11:06 2020

@author: PC
"""

class OwnCompiler (object):
    """Line-oriented toy tokenizer for a TINY-like language.

    Reads the whole source file in ``__init__`` and, on ``compileOutput()``,
    writes one ``TYPE        token`` row per recognized lexeme to
    ``output.txt``.  NOTE(review): matching failures are swallowed by bare
    ``except`` clauses and only reported on stdout, so bugs surface as
    missing or garbled rows rather than errors.
    """
    def __init__ (self,file):
        import re
        # NOTE(review): the file handle is never closed — leaks until GC;
        # prefer a `with open(...)` block.
        self.file=open(file,"r").readlines()
        # Single-lexeme symbols mapped to their token-type names.
        self.symbols = {
                "+":"PLUS_OP",
                "-":"MINUS_OP",
                "*":"MUL_OP",
                "/":"DIV_OP",
                "=":"EQUAL_OP",
                "<":"LESS_OP",
                ">":"GREATER_OP",
                "(":"LEFT_PARENTHESIS",
                ")":"RIGHT_PARENTHESIS",
                ":=":"ASSIGN",
                ";":"SEMICOLON",
                }
        # Captures a {...} comment anywhere on the line.
        self.commentPattern = re.compile(r".*({\n*\s*.*\s*})")
        # Group 1: reserved word; group 2: rest of the line; group 3: the
        # trailing (then)* — NOTE(review): being outside (.*) and starred,
        # group 3 always matches empty, so it is effectively always None.
        self.reservePattern = re.compile(r"\s*(read|write|if|then|else|end|repeat|until)+\s*(.*)(then)*")
        # NOTE(review): getSymbols() uses .group() (the WHOLE match, leading
        # text included), which is why lookups like "x;" miss the dict and
        # print as UNKNOWN instead of splitting "x" from ";".
        self.symbolPattern = re.compile(r".*(\+|\*|-|/|=|<|>|\(|\)|;)")
        # NOTE(review): the greedy leading .* eats all but the last character
        # before (\w+), so an identifier like "fact" is captured as just "t"
        # — this is the stray "t" seen in the output.
        self.identifierSymbol = re.compile(r".*(\w+)\s+(:=)\s+(.*)")

    def compileOutput(self):
        """Tokenize every input line and write the report to output.txt."""
        self.fileWrite=open("output.txt","w")
        self.fileWrite.write("Type        Token\n==================\n")
        for i in self.file :
            print(i)
            # Each pass independently tries: comment, reserved word, assignment.
            self.getComment(i)
            self.getReserveWord(i)
            self.getIdentify(i)
        self.fileWrite.close()#end
    def getComment(self,text):
        # Emit a COMMENT row; the bare except hides lines with no comment
        # (commentPattern.match(...) returns None -> AttributeError).
        try:
            self.fileWrite.write("COMMENT        "+self.commentPattern.match(text).group(1)+"\n")
        except:
            print("NO_COMMENT")
    def getReserveWord(self,text):
        # Emit a RESERVE_WORD row, then tokenize the remainder of the line.
        self.Compiled = self.reservePattern.match(text)
        try:
            self.fileWrite.write("RESERVE_WORD        "+self.Compiled.group(1)+"\n")
            self.getSymbols(self.Compiled.group(2))
            try:
                # group(3) is normally None (see pattern note above), so this
                # concatenation raises TypeError and falls through.
                self.fileWrite.write("RESERVE_WORD        "+self.Compiled.group(3)+"\n")
            except:
                print("NO_RESERVE_WORD2")
        except:
            print("NO_RESERVE_WORD")
    def getSymbols(self,text):
        # NOTE(review): this line sits OUTSIDE the try, so a non-matching
        # text raises AttributeError into the CALLER's except block.
        self.Compiled= self.symbolPattern.match(text)
        self.GOT_TOKEN= self.getTokensSymbols(self.Compiled.group())
        try:
            self.fileWrite.write(self.GOT_TOKEN+"        "+self.Compiled.group()+"\n")
        except:
            print("NO_SYMBOLS")
    def getIdentify(self,text):
        # Emit IDENTIFIER / NUMBER / WORD rows for an assignment line.
        self.Compiled = self.identifierSymbol.match(text)
        try:
            self.fileWrite.write("IDENTIFIER        "+self.Compiled.group(1)+"\n")
            # NOTE(review): getSymbols reassigns self.Compiled, but group(3)
            # below still reads the identifier match only because the loop
            # captured nothing yet — fragile shared state.
            self.getSymbols(text)
            for i in self.Compiled.group(3):
                if i ==" " :
                    continue
                if self.isNumber(i):
                    self.fileWrite.write("NUMBER        ")
                else:
                    self.fileWrite.write("WORD        ")
                self.fileWrite.write(self.Compiled.group(3)+"\n")
        except:
            print("NO_IDENTIFIRES")
    def getTokensSymbols(self,symbol):
        # Map a symbol string to its token name; unknown -> "UNKNOWN".
        try:
            return self.symbols[symbol]
        except:
            print("NOT_DEFINED_IN_SYMBOL_DICT")
            return "UNKNOWN"

    def isNumber(self,text):
         # True if `text` parses as an int (EAFP check).
         try:
             int(text)
             return True
         except:
             return False

if __name__ == "__main__":
    # Run the toy compiler over the sample input; the report lands in output.txt.
    compiler = OwnCompiler("input.txt")
    compiler.compileOutput()

这是我的输出

Type        Token
==================
COMMENT        { Sample program in TINY language – computes factorial }
COMMENT        {input an integer }
RESERVE_WORD        read
UNKNOWN        x;
COMMENT        {  don’t compute if x <= 0 }
RESERVE_WORD        if
UNKNOWN        0 < x   then     {  don’t compute if x <=
IDENTIFIER        t
UNKNOWN                fact  := 1;
RESERVE_WORD        repeat
IDENTIFIER        t
UNKNOWN                   fact  := fact *  x;
IDENTIFIER        x
UNKNOWN                    x  := x  -
RESERVE_WORD        until
UNKNOWN        x  =  0;
COMMENT        {  output  factorial of x }
RESERVE_WORD        write
RESERVE_WORD        end
python regex compiler-construction
1个回答
1
投票

如果您要解析一种语言，就需要一个“词法分析器”（lexer）：它逐个返回独立的词法单元（token），同时跳过空白和注释。例如，可以按照以下思路实现：

import re, collections

# One lexeme: the token class (a named-group name from the master regex)
# and the exact matched text.
Token = collections.namedtuple('Token', ['type','value'])

class Lexer(object):
    """Regex-based tokenizer for the TINY language.

    One master pattern is built from named alternatives; for each match,
    ``match.lastgroup`` names the token type.  WHITESPACE and COMMENT are
    recognized so they can be skipped, and any text no alternative matches
    is surfaced as an ERROR token rather than silently dropped.
    """

    def __init__ (self,file):
        """Compile the master pattern and read the whole source of `file`."""
        # Order matters: RESERVED_WORD must precede IDENTIFIER in the union
        # so keywords are not consumed as identifiers.
        WHITESPACE = r'(?P<WHITESPACE>\s+)'
        COMMENT = r'(?P<COMMENT>{[^}]*})' # no nesting: first '}' ends the comment
        # fix: 'then' was listed twice in the original alternation
        RESERVED_WORD = r'(?P<RESERVED_WORD>\b(?:read|write|if|then|else|end|repeat|until)\b)'
        OPERATOR = r'(?P<OPERATOR>(?:[+*/=<>-]|:=))'
        LPAREN = r'(?P<LPAREN>\()'
        RPAREN = r'(?P<RPAREN>\))'
        IDENTIFIER = r'(?P<IDENTIFIER>[a-z]+)'  # lowercase only, per the sample input
        INTEGER = r'(?P<INTEGER>\d+)'
        SEMICOLON = r'(?P<SEMICOLON>;)'

        self.regex = re.compile('|'.join([WHITESPACE, COMMENT, RESERVED_WORD, OPERATOR, LPAREN, RPAREN, IDENTIFIER, INTEGER, SEMICOLON]))

        with open(file, "r") as f:
            self.text = f.read()


    def generate_tokens(self):
        """Yield Token(type, value) pairs for the loaded source.

        Unmatched spans yield an ERROR token; whitespace and comments are
        filtered out; a final EOF sentinel token is always emitted.
        """
        scanner = self.regex.finditer(self.text)
        last_end = 0
        for m in scanner:
            start = m.start()
            end = m.end()
            if start != last_end:
                # Gap between consecutive matches: unrecognizable text.
                text = self.text[last_end:start]
                yield Token('ERROR', text)
            last_end = end
            token = Token(m.lastgroup, m.group())
            if token.type != 'WHITESPACE' and token.type != 'COMMENT':
                yield token
        yield Token('EOF', '<end-of-file>')
#
if __name__ == "__main__":
    # Tokenize the sample file and print each token on its own line.
    tiny_lexer = Lexer("test.txt")
    for tok in tiny_lexer.generate_tokens():
        print(tok)

打印:

Token(type='RESERVED_WORD', value='read')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value=';')
Token(type='RESERVED_WORD', value='if')
Token(type='INTEGER', value='0')
Token(type='OPERATOR', value='<')
Token(type='IDENTIFIER', value='x')
Token(type='RESERVED_WORD', value='then')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value=':=')
Token(type='INTEGER', value='1')
Token(type='OPERATOR', value=';')
Token(type='RESERVED_WORD', value='repeat')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value=':=')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value='*')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value=';')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value=':=')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value='-')
Token(type='INTEGER', value='1')
Token(type='RESERVED_WORD', value='until')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value='=')
Token(type='INTEGER', value='0')
Token(type='OPERATOR', value=';')
Token(type='RESERVED_WORD', value='write')
Token(type='IDENTIFIER', value='fact')
Token(type='RESERVED_WORD', value='end')
Token(type='EOF', value='<end-of-file>')

使用Lexer的另一种样式,与解析器一起使用可能会更方便:

# Same traversal, written as a for loop: the EOF sentinel is the last token
# the generator yields, so breaking on it drains the stream completely.
lexer = Lexer("input.txt")
for token in lexer.generate_tokens():
    print(token)
    if token.type == 'EOF':
        break
© www.soinside.com 2019 - 2024. All rights reserved.