hello-dosdl/compress-dict.py

51 lines
1.3 KiB
Python
Raw Permalink Normal View History

2022-02-22 22:38:06 +00:00
#!/usr/bin/env python
import json
import sys
# Letters a word may consist of; a letter's position in this string is the
# 5-bit code used when words are packed.
alphabet = 'abcdefghijklmnopqrstuvwxyz'

# Command line: SRC EXCLUDE RARESTWORD TARGET
#   srcpath      JSON file holding the candidate words
#   excludepath  JSON file holding words to leave out
#   rarestword   last entry of the exclude list that is still applied
#   targetpath   file the generated tables are written to
# Fail with a usage message instead of a bare IndexError when arguments are
# missing. Extra arguments are ignored, as before.
if len(sys.argv) < 5:
    sys.exit(f'usage: {sys.argv[0]} SRC EXCLUDE RARESTWORD TARGET')
srcpath, excludepath, rarestword, targetpath = sys.argv[1:5]
2022-02-22 22:38:06 +00:00
# Load the candidate words and the exclusion list. JSON interchange text is
# UTF-8, so decode it explicitly rather than relying on the platform's locale
# encoding (which would mis-decode non-ASCII entries on some systems).
with open(srcpath, 'r', encoding='utf-8') as f:
    words = json.load(f)
with open(excludepath, 'r', encoding='utf-8') as f:
    exclude_all = json.load(f)
2022-02-22 22:38:06 +00:00
# Only five-letter entries matter, in the exclusion list as well.
exclude_all = [entry for entry in exclude_all if len(entry) == 5]

# The exclusion list is applied from its start up to and including the first
# occurrence of rarestword. If rarestword never occurs, the whole list applies.
try:
    cutoff = exclude_all.index(rarestword) + 1
except ValueError:
    cutoff = len(exclude_all)
exclude = set(exclude_all[:cutoff])

# Keep the five-letter words that were not excluded.
words = [
    candidate
    for candidate in words
    if len(candidate) == 5 and candidate not in exclude
]
2022-02-22 22:38:06 +00:00
# Split dictionary into per-startletter arrays.
# Each entry is the word's remaining letters packed 5 bits apiece (second
# letter in the lowest bits) and stored as three bytes, low byte first.
arrays = {letter: [] for letter in alphabet}
for word in words:
    # Validate every letter explicitly: an `assert` vanishes under
    # `python -O`, and the old check covered only the first letter, leaving
    # the rest to fail with an uninformative error from str.index().
    if any(letter not in alphabet for letter in word):
        raise ValueError(f'word {word!r} contains characters outside a-z')

    number = 0
    # First letter is implicit: it selects which array the entry goes into.
    for index, letter in enumerate(word[1:]):
        number |= alphabet.index(letter) << (5 * index)

    # Four letters * 5 bits = 20 bits, so the top byte holds at most 4 bits.
    packed = bytes([number & 0xff, (number >> 8) & 0xff, number >> 16])
    arrays[word[0]].append(packed)
# Emit the packed dictionary as assembler-style source text.
with open(targetpath, 'w') as out:
    # One labelled table per start letter, one `db` line per packed word,
    # followed by a blank separator line.
    for first, entries in arrays.items():
        out.write(f'dictionary_{first}:\n')
        out.writelines(
            f'\tdb {", ".join(map(str, entry))}\n' for entry in entries
        )
        out.write('\n')

    # Index table: for every start letter, its table label and entry count.
    out.write('dictionaries:\n')
    out.writelines(
        f'\tdw dictionary_{first}, {len(entries)}\n'
        for first, entries in arrays.items()
    )