tokin/lexer.py
2025-02-08 17:17:01 +02:00

109 lines
2.5 KiB
Python

import dataclasses
import enum
import re
import unittest
class kinds(enum.Enum):
    """Token categories produced by the lexer.

    `unknown` marks a character no pattern matched; `eof` terminates
    every tokenization.  Values are explicit so they stay stable.
    """
    unknown = 0
    eof = 1
    number = 2
    plus = 3
    minus = 4
    multiply = 5
    divide = 6
    openparen = 7
    closeparen = 8
@dataclasses.dataclass
class Token:
    """A single lexeme: where it starts, the text it covers, and its kind.

    Dataclass equality (field-by-field) is what the unit tests rely on
    when comparing expected and actual token lists.
    """
    index: int  # character offset of the token's first character in the input
    text: str  # exact substring of the input that was matched ('' for eof)
    kind: kinds  # category from the kinds enum
# Recognized token patterns, tried in order at each position.  Each entry
# pairs a pre-compiled regular expression with the kind it produces.
# Note: numbers require a leading digit, so '.2' is not a number here.
table = [
    (re.compile(r'\d+(\.\d+)?'), kinds.number),
    (re.compile(r'\+'), kinds.plus),
    (re.compile(r'-'), kinds.minus),
    (re.compile(r'\*'), kinds.multiply),
    (re.compile(r'/'), kinds.divide),
    (re.compile(r'\('), kinds.openparen),
    (re.compile(r'\)'), kinds.closeparen),
]
def lex(string):
    """Yield Token objects for *string*, terminated by a kinds.eof token.

    Whitespace separates tokens and is otherwise discarded.  A character
    that matches no pattern in `table` becomes a one-character token of
    kind `kinds.unknown`, so the generator never raises on bad input.

    The final eof token carries index == len(string) and empty text.
    """
    index = 0
    size = len(string)
    while index < size:
        # Skip a run of whitespace without emitting anything.
        if string[index].isspace():
            while index < size and string[index].isspace():
                index += 1
            continue
        # Try each pattern at the current position.  Matching in place
        # with regex.match(string, index) avoids copying the remaining
        # tail of the input on every attempt — the previous
        # re.match(regex, string[index:]) form made lexing O(n^2).
        for regex, kind in table:
            match = regex.match(string, index)
            if match is not None:
                end = match.end()
                break
        else:
            # No pattern matched: emit a single unknown character.
            end = index + 1
            kind = kinds.unknown
        yield Token(
            index=index,
            text=string[index:end],
            kind=kind
        )
        index = end
    yield Token(
        index=index,
        text='',
        kind=kinds.eof
    )
class Tests(unittest.TestCase):
    """Unit tests for lex(): whitespace handling, numbers, operators,
    parentheses, unknown characters, and full kind coverage."""
    def assertTokenization(self, string, expected):
        # Helper: fully consume the lex() generator and compare the
        # resulting token list against the expected one.
        tokenized = list(lex(string))
        self.assertEqual(tokenized, expected)
    def test_space(self):
        # Pure whitespace produces only the trailing eof token,
        # positioned past the end of the input.
        self.assertTokenization(' \n\t', [
            Token(3, '', kinds.eof)
        ])
    def test_numbers(self):
        # Integers and decimals lex as numbers; a bare '.' (as in '.2')
        # is not part of the number grammar and lexes as unknown.
        self.assertTokenization('1234567890 3.3 0 .2', [
            Token(0, '1234567890', kinds.number),
            Token(11, '3.3', kinds.number),
            Token(15, '0', kinds.number),
            Token(17, '.', kinds.unknown),
            Token(18, '2', kinds.number),
            Token(19, '', kinds.eof)
        ])
    def test_expression(self):
        # Token indices reflect positions in the original string,
        # including skipped whitespace.
        self.assertTokenization('1+2 * 3', [
            Token(0, '1', kinds.number),
            Token(1, '+', kinds.plus),
            Token(2, '2', kinds.number),
            Token(4, '*', kinds.multiply),
            Token(6, '3', kinds.number),
            Token(7, '', kinds.eof)
        ])
    def test_parentheses(self):
        self.assertTokenization('(1+2) * 3', [
            Token(0, '(', kinds.openparen),
            Token(1, '1', kinds.number),
            Token(2, '+', kinds.plus),
            Token(3, '2', kinds.number),
            Token(4, ')', kinds.closeparen),
            Token(6, '*', kinds.multiply),
            Token(8, '3', kinds.number),
            Token(9, '', kinds.eof)
        ])
    def test_kinds(self):
        # One input exercising every token kind exactly once; the final
        # assertCountEqual proves the enum has no untested members.
        tokenization = [
            Token(0, '0', kinds.number),
            Token(1, '+', kinds.plus),
            Token(2, '-', kinds.minus),
            Token(3, '*', kinds.multiply),
            Token(4, '/', kinds.divide),
            Token(5, '(', kinds.openparen),
            Token(6, ')', kinds.closeparen),
            Token(7, '.', kinds.unknown),
            Token(8, '', kinds.eof)
        ]
        self.assertTokenization('0+-*/().', tokenization)
        self.assertCountEqual(kinds, [i.kind for i in tokenization])