We now support the output that you actually wanted!
Who would've thought that shell scripts were awkward for complex situations...
This commit is contained in:
parent
d58a32d07d
commit
a0c6637486
2 changed files with 579 additions and 0 deletions
336
happybot/unicode/unicode.py
Normal file
336
happybot/unicode/unicode.py
Normal file
|
@ -0,0 +1,336 @@
|
||||||
|
#!/usr/bin/env python3
"""IRC Unicode-lookup bot helper.

Answers "unicode"/"char" queries on IRC by searching the line-delimited
JSON database UnicodeDataFull.json (built by the sibling update.py).
I/O happens through per-channel in/out files — presumably an ii-style
IRC client layout (see irc() below); confirm against the deployment.
"""

from subprocess import Popen, PIPE
from os import chdir, environ
from pathlib import Path
import re

# Make sure we're in the correct directory, for module imports and such too.
# The 'basedir' environment variable overrides the default (current dir);
# the database file is opened by relative path, so this matters.
basedir = Path(environ.get('basedir', '.'))
chdir(basedir)
|
||||||
|
|
||||||
|
def cmd(args):
    """Run *args* as a subprocess and yield its stdout line by line.

    Each yielded line has its final byte (the newline) stripped and is
    decoded as UTF-8 with undecodable bytes dropped.  The generator
    finishes when the process closes stdout; on exit (including an
    abandoned generator) the child is terminated and reaped so that
    long-running commands like ``tail -f`` don't linger as zombies.
    """
    proc = Popen(args, stdout=PIPE)
    try:
        # Iterating the pipe yields raw byte lines until EOF.  Decoding
        # with errors='ignore' can never raise, so the original's bare
        # `except: pass` was dead code — and would have silently hidden
        # any real bug (e.g. a broken pipe) as well.
        for raw in proc.stdout:
            yield raw[:-1].decode('utf-8', 'ignore')
    finally:
        proc.stdout.close()
        proc.terminate()  # no-op if the process already exited
        proc.wait()       # reap, avoiding a zombie
|
||||||
|
|
||||||
|
def irc(chan):
    """Follow channel *chan*'s log forever and answer lookup queries.

    Tails the per-channel 'out' file and, for each chat message that
    matches the trigger regex, writes the (at most 4-line) result of
    doit() to the corresponding 'in' file.
    """
    # NOTE(review): 'trigger' is never defined or assigned anywhere in
    # this file — this global declaration looks vestigial.
    global trigger
    server = environ.get('serv', 'irc.libera.chat')
    # Presumably an ii-style layout: <base>/<server>/<channel>/{in,out}
    # — confirm against the IRC client actually in use.
    fdir = '/home/zgrep/offtopiabday/' + server + '/' + chan
    fin = fdir + '/in'
    fout = fdir + '/out'

    # 'tail -n 0 -f': only lines appended from now on, forever.
    for line in cmd(['tail', '-n', '0', '-f', fout]):
        # Expected log format: "<date> <time> <nick> <message...>".
        date, time, nick, line = line.split(' ', 3)
        # Only ordinary chat messages have an angle-bracketed nick;
        # skip joins/parts/server notices.
        if nick[0] != '<' or nick[-1] != '>':
            continue
        nick = nick[1:-1]
        # Triggers: "happybot: unicode ...", "@hatebot, char ...", "!char ...",
        # optionally preceded by flag words like "-8q " (letters 8/q/v/d).
        m = re.match(r'(?i)^(?:@?(?:happy|hate)bot[:,] (?:unicode|char)|!char) ((?:-[8qvd]+ )+)?(.+)$', line)
        if m:
            flags, query = m.groups()
            if not flags:
                flags = ''
            result = doit(flags, query).split('\n')
            # Prefix a zero-width space — presumably so the reply doesn't
            # ping the requester / re-trigger other bots; confirm.
            result = [ f'\u200b{nick}: ' + line for line in result ]
            result = result[:4] # capping at 4 lines max
            result = '\n'.join(result)
            with open(fin, 'w') as fh:
                fh.write(result + '\n')
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
from math import ceil, log
|
||||||
|
import json
|
||||||
|
|
||||||
|
# Copied from: https://www.unicode.org/reports/tr44/#GC_Values_Table
# Tab-separated columns: abbreviation, long name, description.  The
# filter below keeps only the two-letter concrete categories (the
# one-letter groupings like "L" or "C" are dropped), and underscores in
# the long name become spaces for display.
categories = '''
Lu	Uppercase_Letter	an uppercase letter
Ll	Lowercase_Letter	a lowercase letter
Lt	Titlecase_Letter	a digraphic character, with first part uppercase
LC	Cased_Letter	Lu | Ll | Lt
Lm	Modifier_Letter	a modifier letter
Lo	Other_Letter	other letters, including syllables and ideographs
L	Letter	Lu | Ll | Lt | Lm | Lo
Mn	Nonspacing_Mark	a nonspacing combining mark (zero advance width)
Mc	Spacing_Mark	a spacing combining mark (positive advance width)
Me	Enclosing_Mark	an enclosing combining mark
M	Mark	Mn | Mc | Me
Nd	Decimal_Number	a decimal digit
Nl	Letter_Number	a letterlike numeric character
No	Other_Number	a numeric character of other type
N	Number	Nd | Nl | No
Pc	Connector_Punctuation	a connecting punctuation mark, like a tie
Pd	Dash_Punctuation	a dash or hyphen punctuation mark
Ps	Open_Punctuation	an opening punctuation mark (of a pair)
Pe	Close_Punctuation	a closing punctuation mark (of a pair)
Pi	Initial_Punctuation	an initial quotation mark
Pf	Final_Punctuation	a final quotation mark
Po	Other_Punctuation	a punctuation mark of other type
P	Punctuation	Pc | Pd | Ps | Pe | Pi | Pf | Po
Sm	Math_Symbol	a symbol of mathematical use
Sc	Currency_Symbol	a currency sign
Sk	Modifier_Symbol	a non-letterlike modifier symbol
So	Other_Symbol	a symbol of other type
S	Symbol	Sm | Sc | Sk | So
Zs	Space_Separator	a space character (of various non-zero widths)
Zl	Line_Separator	U+2028 LINE SEPARATOR only
Zp	Paragraph_Separator	U+2029 PARAGRAPH SEPARATOR only
Z	Separator	Zs | Zl | Zp
Cc	Control	a C0 or C1 control code
Cf	Format	a format control character
Cs	Surrogate	a surrogate code point
Co	Private_Use	a private-use character
Cn	Unassigned	a reserved unassigned code point or a noncharacter
C	Other	Cc | Cf | Cs | Co | Cn
'''.strip().split('\n')
categories = [ row.split('\t', 2) for row in categories ]
# Final shape: {'Lu': 'Uppercase Letter', 'Ll': 'Lowercase Letter', ...}
categories = { left: right.replace('_', ' ') for left, right, ignore in categories if len(left) == 2 }
|
||||||
|
|
||||||
|
def utf8uni(ordinal):
    """Return the UTF-8 encoding of code point *ordinal* as a hex
    string, e.g. 0x41 for 'A' or 0xe282ac for the euro sign."""
    encoded = chr(ordinal).encode('utf-8')
    # bytes.hex() is always lowercase and zero-padded to whole bytes,
    # matching the manual int.from_bytes/hex/pad dance it replaces.
    return '0x' + encoded.hex()
|
||||||
|
|
||||||
|
def uniuni(ordinal):
    """Return the conventional U+XXXX label for code point *ordinal*
    (uppercase hex, zero-padded to at least four digits)."""
    return 'U+%04X' % ordinal
|
||||||
|
|
||||||
|
# Line-delimited JSON written by update.py: each line is a row
# [ordinal, ordinal_end-or-null, category, combining-flag, [names...]].
UnicodeDataFull = 'UnicodeDataFull.json'

# Numeric tokens recognized inside a query: "U+41", "u0041"/"U0041",
# "0x41", "0b1000001".  Exactly one of the four groups matches.
tokens = re.compile(r'\s*\b(?:U\+([0-9A-Fa-f]+)|[Uu]([0-9A-F]{4,6})|0x([0-9a-f]+)|0b([01]+))\b\s*')
# Database-row stand-ins (minus the leading ordinal) for code points
# that were asked for but not found in the file.
invalid = [None, "Cn", False, ["<invalid>"]]
unknown = [None, "Cn", False, ["<unknown>"]]
|
||||||
|
|
||||||
|
def doit(flags, query):
    """Resolve a Unicode lookup *query* and return a display string.

    flags: string of option letters —
        'q' quiet (terse one-line output, capped near the IRC limit),
        'v' verbose (all known names, long category),
        'd' decode: treat the query as literal characters,
        '8' show UTF-8 byte hex instead of U+XXXX labels.
    query: literal text (decode mode) or a mix of name fragments and
        numeric tokens (U+XXXX / uXXXX / 0x.. / 0b..).

    Returns the joined result lines ('\\n'-joined, or ' '-joined when
    quiet).  Reads UnicodeDataFull.json from the current directory.
    """
    quiet = 'q' in flags
    verbose = 'v' in flags
    decode = 'd' in flags
    utf8 = '8' in flags

    # Code-point formatter selected once up front.
    if utf8:
        unif = utf8uni
    else:
        unif = uniuni

    # ordinal (negated for ranges) -> database row, filled during the scan.
    cache = dict()

    # Very short queries are almost certainly literal characters.
    if len(query) <= 2:
        decode = True

    if decode:
        # Literal mode: every character of the query becomes an ordinal.
        search = list(map(ord, query))

    else:
        # Tokenize: numeric tokens become ints, everything between them
        # becomes name-fragment strings.  `merge` glues a failed numeric
        # token back onto the surrounding text.
        index, merge = 0, False
        search = []
        for match in tokens.finditer(query):
            missed = query[index:match.start()]
            if missed:
                if merge:
                    search[-1] += missed
                else:
                    search.append(missed)
            index = match.end()
            merge = False
            uni1, uni2, hexa, bina = match.groups()
            uni = uni1 or uni2
            if uni:
                # U+XXXX / uXXXX: taken as a code point directly.
                search.append(int(uni, 16))
            elif hexa:
                # 0x...: interpreted as UTF-8 bytes, not a code point.
                try:
                    # NOTE(review): 'error' is not a standard error-handler
                    # name ('strict' is); an invalid handler only raises
                    # when decoding actually hits an error, and the bare
                    # except below swallows it either way.
                    byt = int(hexa, 16).to_bytes(ceil(len(hexa)/2), 'big').decode('utf-8', 'error')
                    search.extend(map(ord, byt))
                except:
                    # Not valid UTF-8: keep the token as literal text.
                    # NOTE(review): search[-1] IndexErrors if this is the
                    # first token — confirm whether that can happen.
                    if isinstance(search[-1], str):
                        search[-1] += '0x' + hexa
                    else:
                        search.append('0x' + hexa)
                    merge = True
            elif bina:
                # 0b...: same treatment as hex, bits -> UTF-8 bytes.
                try:
                    byt = int(bina, 2).to_bytes(ceil(len(bina)/8), 'big').decode('utf-8', 'error')
                    search.extend(map(ord, byt))
                except:
                    if isinstance(search[-1], str):
                        search[-1] += '0b' + bina
                    else:
                        search.append('0b' + bina)
                    merge = True
        # Trailing text after the last numeric token.
        missed = query[index:]
        if missed:
            if merge:
                search[-1] += missed
            else:
                search.append(missed)

    # results[i] collects the ordinals matched by search item i.
    results = [[] for _ in range(len(search))]
    numbers = defaultdict(list)   # ordinal -> positions in `search`
    strings = defaultdict(list)   # lowercased fragment -> positions
    for i, elem in enumerate(search):
        if isinstance(elem, int):
            numbers[elem].append(i)
        elif isinstance(elem, str):
            strings[elem.lower()].append(i)
    # Sorted descending so the *smallest* wanted ordinal sits at the end
    # and can be popped as the (ordinal-sorted) file is scanned.
    numbers = list(sorted(numbers.items(), reverse=True))

    # The actual searching.
    filled = set()  # indices of `search` that found at least one hit
    with open(UnicodeDataFull, 'r') as fh:
        for line in fh:
            # row = [ordinal, ordinal_end-or-null, category, combining, names]
            row = json.loads(line)
            if numbers:
                if row[0] == numbers[-1][0]:
                    # Exact ordinal match.
                    cache[row[0]] = row
                    for index in numbers[-1][1]:
                        filled.add(index)
                        results[index].append(row[0])
                    numbers.pop()
                elif row[1]:
                    # Range row: consume every wanted ordinal inside it.
                    while numbers and row[0] <= numbers[-1][0] <= row[1]:
                        cache[numbers[-1][0]] = row
                        for index in numbers[-1][1]:
                            filled.add(index)
                            results[index].append(numbers[-1][0])
                        numbers.pop()
            elif not strings:
                # Nothing left to look for; stop scanning the file.
                break
            # Substring search over every name of this row.
            for string, indices in strings.items():
                if any(string in name.lower() for name in row[4]):
                    num = row[0]
                    # Ranges are cached under the negated ordinal so
                    # get_output() can tell them apart.
                    if row[1]:
                        num = -num
                    cache[num] = row
                    for index in indices:
                        filled.add(index)
                        results[index].append(num)

    # Fallback pass: search items that matched nothing.
    missing = set(range(len(search))) - filled
    numbers = defaultdict(list)
    indices = set()
    for i in missing:
        elem = search[i]
        if isinstance(elem, int):
            # Unknown explicit code point: label it <invalid>.
            cache[elem] = [elem] + invalid
            results[i].append(elem)
        elif isinstance(elem, str):
            # Name fragment with no hit: decode it character by
            # character instead, position (i, j) per character.
            results[i] = [None] * len(elem)
            for j, c in enumerate(elem):
                numbers[ord(c)].append((i, j))
                indices.add((i, j))
    numbers = list(sorted(numbers.items(), reverse=True))

    if indices:
        # Decoding what we have left, just some numbers.
        # Second scan of the file, same pop-from-the-end technique.
        with open(UnicodeDataFull, 'r') as fh:
            for line in fh:
                row = json.loads(line)
                if numbers:
                    if row[0] == numbers[-1][0]:
                        cache[row[0]] = row
                        for i, j in numbers[-1][1]:
                            indices.remove((i, j))
                            results[i][j] = row[0]
                        numbers.pop()
                    elif row[1]:
                        while numbers and row[0] <= numbers[-1][0] <= row[1]:
                            cache[numbers[-1][0]] = row
                            for i, j in numbers[-1][1]:
                                indices.remove((i, j))
                                results[i][j] = numbers[-1][0]
                            numbers.pop()
                else:
                    break

        # Characters still unresolved after both scans: label <unknown>.
        for i, j in indices:
            num = ord(search[i][j])
            cache[num] = [num] + unknown
            results[i][j] = num

        if len(search) == 1:
            # This means we've fallen back on decoding our single input as a string.
            # Setting this lets us display output differently, hopefully more usefully.
            decode = True

    # Flatten per-item hit lists into one ordinal sequence.
    results = [r for inner in results for r in inner]

    # Pick output format: quiet = bare codes/chars on one line,
    # verbose = all names + long category, default = primary name.
    if quiet and not verbose:
        if decode:
            fmt = '{code}'
        else:
            fmt = '{char}'
        join = ' '
    elif verbose and not quiet:
        fmt = '{code} [{long_category}] {names}: {char}'
        join = '\n'
    else:
        fmt = '{code} [{category}] {name}: {char}'
        join = '\n'

    def get_output(results):
        # Render each cached row through `fmt`; negative ordinals mark
        # range rows (cached under -ordinal above).
        output = []

        for num in results:
            is_range = False
            if num < 0:
                is_range = True
            range_start, range_end, category, compose, names = cache[num]
            if is_range:
                # U+2028-style spans; combining marks get a dotted
                # circle (U+25CC) to hang off of.
                if compose:
                    char = '\u25cc' + chr(range_start) + '..\u25cc' + chr(range_end)
                else:
                    char = chr(range_start) + '..' + chr(range_end)
                code = unif(range_start) + '..' + unif(range_end)
            else:
                char = chr(num)
                if compose:
                    char = '\u25cc' + char
                code = unif(num)
            output.append(fmt.format(
                code=code,
                name=names[0],
                names=', '.join(names),
                char=char,
                category=category,
                long_category=categories[category],
            ))

        return join.join(output)

    if quiet:
        # Quiet output must fit an IRC line: binary-search the largest
        # prefix of `results` whose rendering stays under ~450 bytes.
        output = get_output(results)
        output8 = output.encode('utf-8')
        if len(output8) > 470:
            cut = len(results) // 2
            clen = cut
            tried_okay = set()
            for i in range(ceil(log(len(results), 2)) + 1):
                output8 = (get_output(results[:cut]) + ' ...').encode('utf-8')
                clen //= 2
                if len(output8) < 450:
                    tried_okay.add(cut)
                    cut += clen
                else:
                    cut -= clen
            output = get_output(results[:max(tried_okay)]) + ' ...'
        return output
    else:
        return get_output(results)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    from sys import argv

    # One argument: run as the IRC bot for that channel.
    # Two arguments: one-shot command-line lookup (flags, then query).
    if len(argv) == 2:
        irc(argv[1])
    elif len(argv) == 3:
        print(doit(argv[1], argv[2]))
    else:
        print('Usage:', argv[0], '#channel')
        print(' or:', argv[0], '[qvd]*', 'query')
        exit(1)
|
243
happybot/unicode/update.py
Normal file
243
happybot/unicode/update.py
Normal file
|
@ -0,0 +1,243 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from urllib.request import urlretrieve
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Union, Set
|
||||||
|
import json
|
||||||
|
|
||||||
|
# Copied from: https://www.unicode.org/reports/tr44/#GC_Values_Table
# Tab-separated columns: abbreviation, long name, description; only the
# two-letter concrete categories survive the filter below.
# NOTE(review): this table appears unused in this file (main() never
# reads `categories`) — it is duplicated from unicode.py; confirm
# before removing.
categories = '''
Lu	Uppercase_Letter	an uppercase letter
Ll	Lowercase_Letter	a lowercase letter
Lt	Titlecase_Letter	a digraphic character, with first part uppercase
LC	Cased_Letter	Lu | Ll | Lt
Lm	Modifier_Letter	a modifier letter
Lo	Other_Letter	other letters, including syllables and ideographs
L	Letter	Lu | Ll | Lt | Lm | Lo
Mn	Nonspacing_Mark	a nonspacing combining mark (zero advance width)
Mc	Spacing_Mark	a spacing combining mark (positive advance width)
Me	Enclosing_Mark	an enclosing combining mark
M	Mark	Mn | Mc | Me
Nd	Decimal_Number	a decimal digit
Nl	Letter_Number	a letterlike numeric character
No	Other_Number	a numeric character of other type
N	Number	Nd | Nl | No
Pc	Connector_Punctuation	a connecting punctuation mark, like a tie
Pd	Dash_Punctuation	a dash or hyphen punctuation mark
Ps	Open_Punctuation	an opening punctuation mark (of a pair)
Pe	Close_Punctuation	a closing punctuation mark (of a pair)
Pi	Initial_Punctuation	an initial quotation mark
Pf	Final_Punctuation	a final quotation mark
Po	Other_Punctuation	a punctuation mark of other type
P	Punctuation	Pc | Pd | Ps | Pe | Pi | Pf | Po
Sm	Math_Symbol	a symbol of mathematical use
Sc	Currency_Symbol	a currency sign
Sk	Modifier_Symbol	a non-letterlike modifier symbol
So	Other_Symbol	a symbol of other type
S	Symbol	Sm | Sc | Sk | So
Zs	Space_Separator	a space character (of various non-zero widths)
Zl	Line_Separator	U+2028 LINE SEPARATOR only
Zp	Paragraph_Separator	U+2029 PARAGRAPH SEPARATOR only
Z	Separator	Zs | Zl | Zp
Cc	Control	a C0 or C1 control code
Cf	Format	a format control character
Cs	Surrogate	a surrogate code point
Co	Private_Use	a private-use character
Cn	Unassigned	a reserved unassigned code point or a noncharacter
C	Other	Cc | Cf | Cs | Co | Cn
'''.strip().split('\n')
categories = [ row.split('\t', 2) for row in categories ]
categories = { left: right.replace('_', ' ') for left, right, ignore in categories if len(left) == 2 }
|
||||||
|
|
||||||
|
# Friendly display names for the C0 control characters and DEL, which
# have no name in UnicodeData.txt (their Name field is "<control>").
# main() substitutes these for the primary name.
custom_names = {
    0x00: "NUL '\\0' (null character)",
    0x01: "SOH (start of heading)",
    0x02: "STX (start of text)",
    0x03: "ETX (end of text)",
    0x04: "EOT (end of transmission)",
    0x05: "ENQ (enquiry)",
    0x06: "ACK (acknowledge)",
    0x07: "BEL '\\a' (bell)",
    0x08: "BS '\\b' (backspace)",
    0x09: "HT '\\t' (horizontal tab)",
    0x0A: "LF '\\n' (new line)",
    0x0B: "VT '\\v' (vertical tab)",
    0x0C: "FF '\\f' (form feed)",
    0x0D: "CR '\\r' (carriage return)",
    0x0E: "SO (shift out)",
    0x0F: "SI (shift in)",
    0x10: "DLE (data link escape)",
    0x11: "DC1 (device control 1)",
    0x12: "DC2 (device control 2)",
    0x13: "DC3 (device control 3)",
    0x14: "DC4 (device control 4)",
    0x15: "NAK (negative acknowledge)",
    0x16: "SYN (synchronous idle)",
    0x17: "ETB (end of transmission block)",
    0x18: "CAN (cancel)",
    0x19: "EM (end of medium)",
    0x1A: "SUB (substitute)",
    0x1B: "ESC (escape)",
    0x1C: "FS (file separator)",
    0x1D: "GS (group separator)",
    0x1E: "RS (record separator)",
    0x1F: "US (unit separator)",
    0x7F: "DEL (delete)",
}
|
||||||
|
|
||||||
|
@dataclass
class Character:
    """One row of the database being built: a single code point or
    (when ordinal_end is set) an inclusive range of code points."""
    ordinal: int                    # code point, or range start
    ordinal_end: Union[None, int]   # inclusive range end, or None for a single char
    category: str                   # two-letter general category, e.g. 'Lu'
    names: Set[str]                 # all known names/aliases
    name: str                       # primary display name
    combining: int                  # canonical combining class from UnicodeData.txt
    in_range: Union[bool, str]      # False, or the range's name for range rows

    def __repr__(self):
        return f'<U+{uni(self.ordinal)}>'
|
||||||
|
|
||||||
|
def uni(ordinal):
    """Return *ordinal* as lowercase hex, zero-padded to 4+ digits
    (no 'U+' prefix — callers add their own)."""
    return format(ordinal, '04x')
|
||||||
|
|
||||||
|
def main(update_uni, update_names):
    """Build UnicodeDataFull.json from UnicodeData.txt and NamesList.txt.

    update_uni / update_names: when True, (re)download the respective
    source file from unicode.org first; otherwise use the local copy.
    Writes one JSON array per line:
    [ordinal, ordinal_end-or-null, category, combining-flag, [names...]].
    """
    UnicodeData = 'UnicodeData.txt'
    NamesList = 'NamesList.txt'

    if update_uni:
        UnicodeData, _ = urlretrieve('http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt', UnicodeData)
    if update_names:
        NamesList, _ = urlretrieve('http://www.unicode.org/Public/UCD/latest/ucd/NamesList.txt', NamesList)

    database = dict()   # ordinal -> Character
    ranges = dict()     # range name -> [First Character, Last Character]

    # UnicodeData.txt is described at: http://www.unicode.org/L2/L1999/UnicodeData.html
    # And with more recent additions at: https://www.unicode.org/reports/tr44/#UCD_Files
    with open(UnicodeData, 'r') as fh:
        for line in fh:
            # Fifteen semicolon-separated fields per line.
            row = line.rstrip().split(';')
            code, name, category, combining,\
            bidirectional, decomposition, decimal,\
            digit, numeric, mirrored, old_name,\
            iso_comment, upper, lower, title = row

            ordinal = int(code, 16)
            combining = int(combining)
            names = set()

            # The Unicode 1.0 name, when present, is kept as an alias.
            if old_name:
                names.add(old_name)

            firstlast = None
            if name == '<control>':
                # Control characters have no real name; use the 1.0 one.
                name = old_name
            elif name[0] == '<' and name[-1] == '>':
                # "<CJK Ideograph, First>" / "<..., Last>" range sentinels.
                name = name[1:-1]
                name, firstlast = name.split(', ', 1)
            else:
                names.add(name)

            # Prefer our friendlier labels for C0 controls and DEL.
            if ordinal in custom_names:
                name = custom_names[ordinal]

            char = Character(
                ordinal=ordinal,
                ordinal_end=None,
                category=category,
                name=name,
                names=names,
                in_range=False,
                combining=combining,
            )

            if firstlast and name not in ranges:
                ranges[name] = [None, None]

            # Range sentinels are paired up in `ranges`; ordinary rows
            # go straight into the database.
            if firstlast == 'First':
                ranges[name][0] = char
            elif firstlast == 'Last':
                ranges[name][1] = char
            else:
                database[char.ordinal] = char

    # TODO: What's something nicer than this?
    # Collapse each First/Last sentinel pair into one range Character.
    for range_name, (start, end) in ranges.items():
        name = f'({range_name} [U+{uni(start.ordinal)}..U+{uni(end.ordinal)}])'
        start.name = name
        end.name = name
        # Both ends of a range are expected to agree on these.
        assert start.category == end.category
        assert start.combining == end.combining
        char = Character(
            ordinal=start.ordinal,
            ordinal_end=end.ordinal,
            category=start.category,
            name=name,
            names={name},
            in_range=range_name,
            combining=start.combining,
        )
        database[char.ordinal] = char

    # NamesList.txt is described at: https://www.unicode.org/Public/UCD/latest/ucd/NamesList.html
    # But I sort-of guessed and hoped for the best.
    with open(NamesList, 'r') as fh:
        char = None  # Character the current indented lines attach to
        for line in fh:
            line = line.rstrip()
            # Skip comment (';') and section ('@') lines.
            # NOTE(review): a fully blank line would IndexError here —
            # presumably NamesList.txt never rstrip()s to empty; confirm.
            if line[0] in ';@':
                continue
            if line[0] == '\t':
                # Indented "\t= alias" lines add names to the current char.
                if line[1] == '=':
                    char.names.add(line[3:])
            else:
                # "XXXX\tNAME" line starts a new character entry.
                char, name = line.split('\t', 1)
                ordinal = int(char, 16)
                if ordinal in database:
                    char = database[ordinal]
                    char.names.add(name)
                else:
                    # Known to NamesList but not UnicodeData: record it
                    # as unassigned ('Cn').
                    char = Character(
                        ordinal=ordinal,
                        ordinal_end=None,
                        category='Cn',
                        name=name,
                        names={name},
                        in_range=False,
                        combining=0,
                    )
                    database[ordinal] = char

    # Drop useless aliases: the literal '<control>' and "(1.0)"-suffixed
    # legacy names.
    for char in database.values():
        char.names -= {'<control>'}
        char.names -= { n for n in char.names if n.endswith(' (1.0)') }

    # TODO: Add Nami.txt short names in parens.

    # Emit line-delimited JSON, sorted by ordinal (unicode.py's scan
    # relies on this ordering).
    with open('UnicodeDataFull.json', 'w') as fh:
        for o in sorted(database.keys()):
            char = database[o]
            # Combining flag derived from the Mark categories (Mn/Mc/Me).
            combining = char.category[0] == 'M'
            # Primary name first, remaining aliases sorted after it.
            names = [char.name] + list(sorted(char.names - {char.name}))
            row = [char.ordinal, char.ordinal_end, char.category, combining, names]
            fh.write(json.dumps(row) + '\n')
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    import sys

    # Simple hand-rolled flag parsing: any unrecognized argument
    # (including -h/--help) prints usage and exits.
    update_uni = False
    update_names = False
    for arg in sys.argv[1:]:
        if arg in ('-u', '--unicode-data'):
            update_uni = True
        elif arg in ('-l', '--names-list'):
            update_names = True
        else:
            print('Usage:', sys.argv[0], '[-n]')
            print('  ', '-u, --unicode-data: Download UnicodeData.txt.')
            print('  ', '-l, --names-list: Download NamesList.txt.')
            print('  ', '-h, --help: Show this help.')
            exit(1)

    main(update_uni=update_uni, update_names=update_names)
|
Loading…
Reference in a new issue