"""Shift-reduce parser turning a regex string into regex AST nodes.

Grammar supported: literal characters, backslash escapes, grouping with
parentheses, alternation with `|`, and repetition with `*`.  The AST node
constructors (lit / concat / bar / star) come from the project-local
`regex` module.
"""

import enum
import sys

from collections import deque, namedtuple

from regex import Literal, Concatenation, Alternation, Star, lit, concat, bar, star

class token_types(enum.Enum):
    # Kinds of lexical tokens produced by tokenize(); `start` and `end`
    # are synthetic sentinels bracketing the token stream.
    char, left_paren, right_paren, bar, star, start, end = range(7)

# type: a token_types member; position: 0-based index into the source text;
# extra: payload (the literal character for `char` tokens, None otherwise).
Token = namedtuple('Token', ['type', 'position', 'extra'])

class ParseError(Exception):
    """Raised when a regex string cannot be parsed.

    Attributes:
        text: human-readable description of the problem.
        position: 0-based index into the input where the problem occurred.
    """

    def __init__(self, text, position):
        self.text = text
        self.position = position

    def __repr__(self):
        if __name__ == '__main__':
            name = 'ParseError'
        else:
            name = __name__ + '.ParseError'
        # %r (not %s) so the message is quoted and the repr is unambiguous
        # and round-trippable.
        return '%s(%r, %i)' % (name, self.text, self.position)

    def __str__(self):
        return '%i: %s' % (self.position, self.text)

def tokenize(text):
    """Split `text` into a list of Tokens, bracketed by start/end sentinels.

    Raises ParseError if a trailing backslash has nothing to escape.
    """
    tokens = [Token(token_types.start, 0, None)]

    index = 0
    while index < len(text):
        char = text[index]
        token_start = index
        index += 1

        if char == '\\':
            # Handle \ as one token.
            # Since we don't have any \ extensions, just
            # implement \ as passing the next character through literally.
            if index >= len(text):
                raise ParseError('\\ at end of text, should be followed by a character', token_start)

            tokens.append(Token(token_types.char, token_start, text[index]))
            index += 1

        elif char == '(':
            tokens.append(Token(token_types.left_paren, token_start, None))

        elif char == ')':
            tokens.append(Token(token_types.right_paren, token_start, None))

        elif char == '|':
            tokens.append(Token(token_types.bar, token_start, None))

        elif char == '*':
            tokens.append(Token(token_types.star, token_start, None))

        else:
            tokens.append(Token(token_types.char, token_start, char))

    tokens.append(Token(token_types.end, index, None))

    return tokens

def parse(text):
    """Parse a regex string into an AST of Literal/Concatenation/Alternation/Star.

    Implemented as a shift-reduce parser: tokens are shifted onto
    `working_stack` and reduced by the pattern rules below until the stack
    is exactly [start, regex, end].

    Raises ParseError (with a position) on malformed input.
    """
    def test_predicate(predicate, element):
        # A predicate may be a single function or a list of alternatives.
        if not isinstance(predicate, list):
            predicate = [predicate]

        # If any of the funcs in the list pass, consider it a success.
        for f in predicate:
            if f(element):
                return True

        return False

    def match(*predicates):
        """True if the top of working_stack matches the predicate sequence."""
        nonlocal working_stack

        # Predicates can only match stuff on the stack if it's there.
        if len(working_stack) < len(predicates):
            return False

        top_of_stack = working_stack[-len(predicates):]

        for predicate, element in zip(predicates, top_of_stack):
            if not test_predicate(predicate, element):
                return False

        return True

    def pop(n = None):
        """Pop the top element (n is None) or the top n elements as a list."""
        nonlocal working_stack

        if n is None:
            return working_stack.pop()

        else:
            r = working_stack[-n:]
            working_stack = working_stack[:-n]
            return r

    def push(*args):
        nonlocal working_stack
        for arg in args:
            working_stack.append(arg)

    # Predicates classifying stack elements.
    def charp(x):
        return isinstance(x, Token) and x.type == token_types.char

    def lparenp(x):
        return isinstance(x, Token) and x.type == token_types.left_paren

    def rparenp(x):
        return isinstance(x, Token) and x.type == token_types.right_paren

    def barp(x):
        return isinstance(x, Token) and x.type == token_types.bar

    def starp(x):
        return isinstance(x, Token) and x.type == token_types.star

    def startp(x):
        return isinstance(x, Token) and x.type == token_types.start

    def endp(x):
        return isinstance(x, Token) and x.type == token_types.end

    def regexp(x):
        # Any already-reduced AST node counts as a regex.
        return isinstance(x, (Literal, Concatenation, Alternation, Star))

    # deque so shifting with popleft() is O(1); list.pop(0) is O(n) per shift.
    tokens = deque(tokenize(text))
    working_stack = []

    while True:
        # Try all possible reduces
        if match(charp):
            char = pop()
            text = char.extra
            push(lit(text))

        elif match(regexp, regexp, [lparenp, rparenp, barp, endp]):
            # Two adjacent regexes followed by a boundary: concatenate.
            regex1, regex2, tos = pop(3)
            push(concat(regex1, regex2), tos)

        elif match(lparenp, regexp, rparenp):
            # Unwrap a parenthesized group.
            _, regex, _ = pop(3)
            push(regex)

        elif match(lparenp, rparenp):
            # Empty group () is the empty-string literal.
            _, _ = pop(2)
            push(lit(''))

        elif match(regexp, barp, regexp, [lparenp, rparenp, barp, endp]):
            regex1, _, regex2, tos = pop(4)
            push(bar(regex1, regex2), tos)

        elif match([lparenp, startp], barp):
            # A bar with nothing on its left: insert an empty literal.
            a, b = pop(2)
            push(a, lit(''), b)

        elif match(barp, [lparenp, rparenp, barp, endp]):
            # A bar with nothing on its right: insert an empty literal.
            a, b = pop(2)
            push(a, lit(''), b)

        elif match(regexp, starp):
            regex, _ = pop(2)
            push(star(regex))

        elif match(startp, endp):
            # Entirely empty input parses to the empty-string literal.
            a, b = pop(2)
            push(a, lit(''), b)

        elif tokens:
            # Can't reduce but can shift, so let's shift
            push(tokens.popleft())

        elif match(startp, regexp, endp):
            # Can't reduce nor shift, and stack is in the form
            # signalling that the parsing is complete, so let's
            # return the completed regex
            _, regex, _ = pop(3)
            return regex

        else:
            # Can't reduce nor shift
            # As we've already handled the case where we finished
            # parsing, this means something's gone wrong
            break

    # Figure out what's gone wrong
    parens_stack = []
    for i in working_stack:
        if startp(i) or endp(i) or regexp(i):
            continue

        elif lparenp(i):
            parens_stack.append(i)

        elif rparenp(i):
            if len(parens_stack) == 0:
                raise ParseError('Unpaired right parenthesis', i.position)
            else:
                parens_stack.pop()

        elif starp(i):
            raise ParseError('Missing the thing to repeat before a *', i.position)

        elif isinstance(i, Token):
            # Bug fix: ParseError requires a position argument; without it
            # this line raised TypeError instead of the intended ParseError.
            raise ParseError('(Should be impossible!) Unprocessed token of type %s' % i.type.name, i.position)

        else:
            # Non-token junk carries no position; report position 0.
            raise ParseError('(Should be impossible!) Unprocessed junk in parsing stack: %s' % (i,), 0)

    if len(parens_stack) > 0:
        raise ParseError('Unpaired left parenthesis', parens_stack[0].position)

    # Wasn't anything we could figure out
    raise ParseError('(Should be impossible!) Error figuring out the type of parse error', 0)

def main():
    """Read one regex from stdin, parse it, and print its repr and str."""
    try:
        regex = parse(input('regex> '))

        print(repr(regex))
        print(regex)

    except ParseError as err:
        print('%s: Error: %s' % (sys.argv[0], str(err)), file=sys.stderr)

if __name__ == '__main__':
    main()