diff --git a/regex_to_nfa.py b/regex_to_nfa.py new file mode 100644 index 0000000..e3e4cbc --- /dev/null +++ b/regex_to_nfa.py @@ -0,0 +1,105 @@ +from regex import Literal, Concatenation, Alternation, Star, lit, concat, bar, star +from nfa import NFA, prettyprint + +def to_nfa(regex): + def new_state(): + nonlocal state_name_counter, transitions + + state_name = state_name_counter + state_name_counter += 1 + + transitions[state_name] = {} + + return state_name + + def worker(node): + nonlocal transitions + + if type(node) == Literal: + # text + # (start) ---------> (end) + start_state = new_state() + end_state = new_state() + + transitions[start_state][end_state] = lit(node.text) + + return (start_state, end_state) + + elif type(node) == Concatenation: + # (start) → […] → […] → […] + start_state = new_state() + + prev_state = start_state + for element in node.elements: + inner_start, inner_end = worker(element) + + # (prev) → (inner_start) → […] + transitions[prev_state][inner_start] = lit('') + + # Link next element straight to the inner end + # state + prev_state = inner_end + + return (start_state, prev_state) + + elif type(node) == Alternation: + # +-> […] --+ + # | | + # (start) --+-> […] --+-> (end) + # | | + # +-> […] --+ + start_state = new_state() + end_state = new_state() + + for element in node.elements: + inner_start, inner_end = worker(element) + + # (start) → (inner_start) → […] + transitions[start_state][inner_start] = lit('') + + # […] → (inner_end) → (end) + transitions[inner_end][end_state] = lit('') + + return (start_state, end_state) + + elif type(node) == Star: + # +- […] <-+ + # | | + # v | + # (start) --+--> (end) + start_state = new_state() + end_state = new_state() + + inner_start, inner_end = worker(node.element) + + # (start) → (inner_start) → […] + transitions[start_state][inner_start] = lit('') + + # […] → (inner_end) → (start) + transitions[inner_end][start_state] = lit('') + + # (start) → (end) + transitions[start_state][end_state] = lit('') + + return (start_state, end_state) + + else: + raise ValueError('node has to be Literal, Concatenation, Alternation, or Star') + + state_name_counter = 0 + transitions = {} + + start_state, end_state = worker(regex) + + return NFA(start_state, [end_state], transitions) + +def main(): + regex = concat(lit('x'), star(bar(lit('a'), lit('b'))), lit('y')) + print(regex) + + nfa = to_nfa(regex) + + prettyprint(nfa) + +if __name__ == '__main__': + main()