We now support the output that you actually wanted!
Who would've thought that shell scripts were awkward for complex situations...
This commit is contained in:
parent
d58a32d07d
commit
a0c6637486
2 changed files with 579 additions and 0 deletions
336
happybot/unicode/unicode.py
Normal file
336
happybot/unicode/unicode.py
Normal file
|
@ -0,0 +1,336 @@
|
||||||
|
#!/usr/bin/env python3
"""IRC Unicode-lookup bot helper.

Answers "unicode"/"char" queries on IRC by searching the line-delimited
JSON database UnicodeDataFull.json (built by the sibling update.py).
I/O happens through per-channel in/out files — presumably an ii-style
IRC client layout (see irc() below); confirm against the deployment.
"""

from subprocess import Popen, PIPE
from os import chdir, environ
from pathlib import Path
import re

# Make sure we're in the correct directory, for module imports and such too.
# The 'basedir' environment variable overrides the default (current dir);
# the database file is opened by relative path, so this matters.
basedir = Path(environ.get('basedir', '.'))
chdir(basedir)
|
||||||
|
|
||||||
|
def cmd(args):
    """Run *args* as a subprocess and yield its stdout line by line.

    Each yielded line has its final byte (the newline) stripped and is
    decoded as UTF-8 with undecodable bytes dropped.  The generator
    finishes when the process closes stdout; on exit (including an
    abandoned generator) the child is terminated and reaped so that
    long-running commands like ``tail -f`` don't linger as zombies.
    """
    proc = Popen(args, stdout=PIPE)
    try:
        # Iterating the pipe yields raw byte lines until EOF.  Decoding
        # with errors='ignore' can never raise, so the original's bare
        # `except: pass` was dead code — and would have silently hidden
        # any real bug (e.g. a broken pipe) as well.
        for raw in proc.stdout:
            yield raw[:-1].decode('utf-8', 'ignore')
    finally:
        proc.stdout.close()
        proc.terminate()  # no-op if the process already exited
        proc.wait()       # reap, avoiding a zombie
|
||||||
|
|
||||||
|
def irc(chan):
    """Follow channel *chan*'s log forever and answer lookup queries.

    Tails the per-channel 'out' file and, for each chat message that
    matches the trigger regex, writes the (at most 4-line) result of
    doit() to the corresponding 'in' file.
    """
    # NOTE(review): 'trigger' is never defined or assigned anywhere in
    # this file — this global declaration looks vestigial.
    global trigger
    server = environ.get('serv', 'irc.libera.chat')
    # Presumably an ii-style layout: <base>/<server>/<channel>/{in,out}
    # — confirm against the IRC client actually in use.
    fdir = '/home/zgrep/offtopiabday/' + server + '/' + chan
    fin = fdir + '/in'
    fout = fdir + '/out'

    # 'tail -n 0 -f': only lines appended from now on, forever.
    for line in cmd(['tail', '-n', '0', '-f', fout]):
        # Expected log format: "<date> <time> <nick> <message...>".
        date, time, nick, line = line.split(' ', 3)
        # Only ordinary chat messages have an angle-bracketed nick;
        # skip joins/parts/server notices.
        if nick[0] != '<' or nick[-1] != '>':
            continue
        nick = nick[1:-1]
        # Triggers: "happybot: unicode ...", "@hatebot, char ...", "!char ...",
        # optionally preceded by flag words like "-8q " (letters 8/q/v/d).
        m = re.match(r'(?i)^(?:@?(?:happy|hate)bot[:,] (?:unicode|char)|!char) ((?:-[8qvd]+ )+)?(.+)$', line)
        if m:
            flags, query = m.groups()
            if not flags:
                flags = ''
            result = doit(flags, query).split('\n')
            # Prefix a zero-width space — presumably so the reply doesn't
            # ping the requester / re-trigger other bots; confirm.
            result = [ f'\u200b{nick}: ' + line for line in result ]
            result = result[:4] # capping at 4 lines max
            result = '\n'.join(result)
            with open(fin, 'w') as fh:
                fh.write(result + '\n')
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
from math import ceil, log
|
||||||
|
import json
|
||||||
|
|
||||||
|
# Copied from: https://www.unicode.org/reports/tr44/#GC_Values_Table
# Tab-separated columns: abbreviation, long name, description.  The
# filter below keeps only the two-letter concrete categories (the
# one-letter groupings like "L" or "C" are dropped), and underscores in
# the long name become spaces for display.
categories = '''
Lu	Uppercase_Letter	an uppercase letter
Ll	Lowercase_Letter	a lowercase letter
Lt	Titlecase_Letter	a digraphic character, with first part uppercase
LC	Cased_Letter	Lu | Ll | Lt
Lm	Modifier_Letter	a modifier letter
Lo	Other_Letter	other letters, including syllables and ideographs
L	Letter	Lu | Ll | Lt | Lm | Lo
Mn	Nonspacing_Mark	a nonspacing combining mark (zero advance width)
Mc	Spacing_Mark	a spacing combining mark (positive advance width)
Me	Enclosing_Mark	an enclosing combining mark
M	Mark	Mn | Mc | Me
Nd	Decimal_Number	a decimal digit
Nl	Letter_Number	a letterlike numeric character
No	Other_Number	a numeric character of other type
N	Number	Nd | Nl | No
Pc	Connector_Punctuation	a connecting punctuation mark, like a tie
Pd	Dash_Punctuation	a dash or hyphen punctuation mark
Ps	Open_Punctuation	an opening punctuation mark (of a pair)
Pe	Close_Punctuation	a closing punctuation mark (of a pair)
Pi	Initial_Punctuation	an initial quotation mark
Pf	Final_Punctuation	a final quotation mark
Po	Other_Punctuation	a punctuation mark of other type
P	Punctuation	Pc | Pd | Ps | Pe | Pi | Pf | Po
Sm	Math_Symbol	a symbol of mathematical use
Sc	Currency_Symbol	a currency sign
Sk	Modifier_Symbol	a non-letterlike modifier symbol
So	Other_Symbol	a symbol of other type
S	Symbol	Sm | Sc | Sk | So
Zs	Space_Separator	a space character (of various non-zero widths)
Zl	Line_Separator	U+2028 LINE SEPARATOR only
Zp	Paragraph_Separator	U+2029 PARAGRAPH SEPARATOR only
Z	Separator	Zs | Zl | Zp
Cc	Control	a C0 or C1 control code
Cf	Format	a format control character
Cs	Surrogate	a surrogate code point
Co	Private_Use	a private-use character
Cn	Unassigned	a reserved unassigned code point or a noncharacter
C	Other	Cc | Cf | Cs | Co | Cn
'''.strip().split('\n')
categories = [ row.split('\t', 2) for row in categories ]
# Final shape: {'Lu': 'Uppercase Letter', 'Ll': 'Lowercase Letter', ...}
categories = { left: right.replace('_', ' ') for left, right, ignore in categories if len(left) == 2 }
|
||||||
|
|
||||||
|
def utf8uni(ordinal):
    """Return the UTF-8 encoding of code point *ordinal* as a hex
    string, e.g. 0x41 for 'A' or 0xe282ac for the euro sign."""
    encoded = chr(ordinal).encode('utf-8')
    # bytes.hex() is always lowercase and zero-padded to whole bytes,
    # matching the manual int.from_bytes/hex/pad dance it replaces.
    return '0x' + encoded.hex()
|
||||||
|
|
||||||
|
def uniuni(ordinal):
    """Return the conventional U+XXXX label for code point *ordinal*
    (uppercase hex, zero-padded to at least four digits)."""
    return 'U+%04X' % ordinal
|
||||||
|
|
||||||
|
# Line-delimited JSON written by update.py: each line is a row
# [ordinal, ordinal_end-or-null, category, combining-flag, [names...]].
UnicodeDataFull = 'UnicodeDataFull.json'

# Numeric tokens recognized inside a query: "U+41", "u0041"/"U0041",
# "0x41", "0b1000001".  Exactly one of the four groups matches.
tokens = re.compile(r'\s*\b(?:U\+([0-9A-Fa-f]+)|[Uu]([0-9A-F]{4,6})|0x([0-9a-f]+)|0b([01]+))\b\s*')
# Database-row stand-ins (minus the leading ordinal) for code points
# that were asked for but not found in the file.
invalid = [None, "Cn", False, ["<invalid>"]]
unknown = [None, "Cn", False, ["<unknown>"]]
|
||||||
|
|
||||||
|
def doit(flags, query):
    """Resolve a Unicode lookup *query* and return a display string.

    flags: string of option letters —
        'q' quiet (terse one-line output, capped near the IRC limit),
        'v' verbose (all known names, long category),
        'd' decode: treat the query as literal characters,
        '8' show UTF-8 byte hex instead of U+XXXX labels.
    query: literal text (decode mode) or a mix of name fragments and
        numeric tokens (U+XXXX / uXXXX / 0x.. / 0b..).

    Returns the joined result lines ('\\n'-joined, or ' '-joined when
    quiet).  Reads UnicodeDataFull.json from the current directory.
    """
    quiet = 'q' in flags
    verbose = 'v' in flags
    decode = 'd' in flags
    utf8 = '8' in flags

    # Code-point formatter selected once up front.
    if utf8:
        unif = utf8uni
    else:
        unif = uniuni

    # ordinal (negated for ranges) -> database row, filled during the scan.
    cache = dict()

    # Very short queries are almost certainly literal characters.
    if len(query) <= 2:
        decode = True

    if decode:
        # Literal mode: every character of the query becomes an ordinal.
        search = list(map(ord, query))

    else:
        # Tokenize: numeric tokens become ints, everything between them
        # becomes name-fragment strings.  `merge` glues a failed numeric
        # token back onto the surrounding text.
        index, merge = 0, False
        search = []
        for match in tokens.finditer(query):
            missed = query[index:match.start()]
            if missed:
                if merge:
                    search[-1] += missed
                else:
                    search.append(missed)
            index = match.end()
            merge = False
            uni1, uni2, hexa, bina = match.groups()
            uni = uni1 or uni2
            if uni:
                # U+XXXX / uXXXX: taken as a code point directly.
                search.append(int(uni, 16))
            elif hexa:
                # 0x...: interpreted as UTF-8 bytes, not a code point.
                try:
                    # NOTE(review): 'error' is not a standard error-handler
                    # name ('strict' is); an invalid handler only raises
                    # when decoding actually hits an error, and the bare
                    # except below swallows it either way.
                    byt = int(hexa, 16).to_bytes(ceil(len(hexa)/2), 'big').decode('utf-8', 'error')
                    search.extend(map(ord, byt))
                except:
                    # Not valid UTF-8: keep the token as literal text.
                    # NOTE(review): search[-1] IndexErrors if this is the
                    # first token — confirm whether that can happen.
                    if isinstance(search[-1], str):
                        search[-1] += '0x' + hexa
                    else:
                        search.append('0x' + hexa)
                    merge = True
            elif bina:
                # 0b...: same treatment as hex, bits -> UTF-8 bytes.
                try:
                    byt = int(bina, 2).to_bytes(ceil(len(bina)/8), 'big').decode('utf-8', 'error')
                    search.extend(map(ord, byt))
                except:
                    if isinstance(search[-1], str):
                        search[-1] += '0b' + bina
                    else:
                        search.append('0b' + bina)
                    merge = True
        # Trailing text after the last numeric token.
        missed = query[index:]
        if missed:
            if merge:
                search[-1] += missed
            else:
                search.append(missed)

    # results[i] collects the ordinals matched by search item i.
    results = [[] for _ in range(len(search))]
    numbers = defaultdict(list)   # ordinal -> positions in `search`
    strings = defaultdict(list)   # lowercased fragment -> positions
    for i, elem in enumerate(search):
        if isinstance(elem, int):
            numbers[elem].append(i)
        elif isinstance(elem, str):
            strings[elem.lower()].append(i)
    # Sorted descending so the *smallest* wanted ordinal sits at the end
    # and can be popped as the (ordinal-sorted) file is scanned.
    numbers = list(sorted(numbers.items(), reverse=True))

    # The actual searching.
    filled = set()  # indices of `search` that found at least one hit
    with open(UnicodeDataFull, 'r') as fh:
        for line in fh:
            # row = [ordinal, ordinal_end-or-null, category, combining, names]
            row = json.loads(line)
            if numbers:
                if row[0] == numbers[-1][0]:
                    # Exact ordinal match.
                    cache[row[0]] = row
                    for index in numbers[-1][1]:
                        filled.add(index)
                        results[index].append(row[0])
                    numbers.pop()
                elif row[1]:
                    # Range row: consume every wanted ordinal inside it.
                    while numbers and row[0] <= numbers[-1][0] <= row[1]:
                        cache[numbers[-1][0]] = row
                        for index in numbers[-1][1]:
                            filled.add(index)
                            results[index].append(numbers[-1][0])
                        numbers.pop()
            elif not strings:
                # Nothing left to look for; stop scanning the file.
                break
            # Substring search over every name of this row.
            for string, indices in strings.items():
                if any(string in name.lower() for name in row[4]):
                    num = row[0]
                    # Ranges are cached under the negated ordinal so
                    # get_output() can tell them apart.
                    if row[1]:
                        num = -num
                    cache[num] = row
                    for index in indices:
                        filled.add(index)
                        results[index].append(num)

    # Fallback pass: search items that matched nothing.
    missing = set(range(len(search))) - filled
    numbers = defaultdict(list)
    indices = set()
    for i in missing:
        elem = search[i]
        if isinstance(elem, int):
            # Unknown explicit code point: label it <invalid>.
            cache[elem] = [elem] + invalid
            results[i].append(elem)
        elif isinstance(elem, str):
            # Name fragment with no hit: decode it character by
            # character instead, position (i, j) per character.
            results[i] = [None] * len(elem)
            for j, c in enumerate(elem):
                numbers[ord(c)].append((i, j))
                indices.add((i, j))
    numbers = list(sorted(numbers.items(), reverse=True))

    if indices:
        # Decoding what we have left, just some numbers.
        # Second scan of the file, same pop-from-the-end technique.
        with open(UnicodeDataFull, 'r') as fh:
            for line in fh:
                row = json.loads(line)
                if numbers:
                    if row[0] == numbers[-1][0]:
                        cache[row[0]] = row
                        for i, j in numbers[-1][1]:
                            indices.remove((i, j))
                            results[i][j] = row[0]
                        numbers.pop()
                    elif row[1]:
                        while numbers and row[0] <= numbers[-1][0] <= row[1]:
                            cache[numbers[-1][0]] = row
                            for i, j in numbers[-1][1]:
                                indices.remove((i, j))
                                results[i][j] = numbers[-1][0]
                            numbers.pop()
                else:
                    break

        # Characters still unresolved after both scans: label <unknown>.
        for i, j in indices:
            num = ord(search[i][j])
            cache[num] = [num] + unknown
            results[i][j] = num

        if len(search) == 1:
            # This means we've fallen back on decoding our single input as a string.
            # Setting this lets us display output differently, hopefully more usefully.
            decode = True

    # Flatten per-item hit lists into one ordinal sequence.
    results = [r for inner in results for r in inner]

    # Pick output format: quiet = bare codes/chars on one line,
    # verbose = all names + long category, default = primary name.
    if quiet and not verbose:
        if decode:
            fmt = '{code}'
        else:
            fmt = '{char}'
        join = ' '
    elif verbose and not quiet:
        fmt = '{code} [{long_category}] {names}: {char}'
        join = '\n'
    else:
        fmt = '{code} [{category}] {name}: {char}'
        join = '\n'

    def get_output(results):
        # Render each cached row through `fmt`; negative ordinals mark
        # range rows (cached under -ordinal above).
        output = []

        for num in results:
            is_range = False
            if num < 0:
                is_range = True
            range_start, range_end, category, compose, names = cache[num]
            if is_range:
                # U+2028-style spans; combining marks get a dotted
                # circle (U+25CC) to hang off of.
                if compose:
                    char = '\u25cc' + chr(range_start) + '..\u25cc' + chr(range_end)
                else:
                    char = chr(range_start) + '..' + chr(range_end)
                code = unif(range_start) + '..' + unif(range_end)
            else:
                char = chr(num)
                if compose:
                    char = '\u25cc' + char
                code = unif(num)
            output.append(fmt.format(
                code=code,
                name=names[0],
                names=', '.join(names),
                char=char,
                category=category,
                long_category=categories[category],
            ))

        return join.join(output)

    if quiet:
        # Quiet output must fit an IRC line: binary-search the largest
        # prefix of `results` whose rendering stays under ~450 bytes.
        output = get_output(results)
        output8 = output.encode('utf-8')
        if len(output8) > 470:
            cut = len(results) // 2
            clen = cut
            tried_okay = set()
            for i in range(ceil(log(len(results), 2)) + 1):
                output8 = (get_output(results[:cut]) + ' ...').encode('utf-8')
                clen //= 2
                if len(output8) < 450:
                    tried_okay.add(cut)
                    cut += clen
                else:
                    cut -= clen
            output = get_output(results[:max(tried_okay)]) + ' ...'
        return output
    else:
        return get_output(results)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    from sys import argv

    # One argument: run as the IRC bot for that channel.
    # Two arguments: one-shot command-line lookup (flags, then query).
    if len(argv) == 2:
        irc(argv[1])
    elif len(argv) == 3:
        print(doit(argv[1], argv[2]))
    else:
        print('Usage:', argv[0], '#channel')
        print(' or:', argv[0], '[qvd]*', 'query')
        exit(1)
|
243
happybot/unicode/update.py
Normal file
243
happybot/unicode/update.py
Normal file
|
@ -0,0 +1,243 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from urllib.request import urlretrieve
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Union, Set
|
||||||
|
import json
|
||||||
|
|
||||||
|
# Copied from: https://www.unicode.org/reports/tr44/#GC_Values_Table
# Tab-separated columns: abbreviation, long name, description; only the
# two-letter concrete categories survive the filter below.
# NOTE(review): this table appears unused in this file (main() never
# reads `categories`) — it is duplicated from unicode.py; confirm
# before removing.
categories = '''
Lu	Uppercase_Letter	an uppercase letter
Ll	Lowercase_Letter	a lowercase letter
Lt	Titlecase_Letter	a digraphic character, with first part uppercase
LC	Cased_Letter	Lu | Ll | Lt
Lm	Modifier_Letter	a modifier letter
Lo	Other_Letter	other letters, including syllables and ideographs
L	Letter	Lu | Ll | Lt | Lm | Lo
Mn	Nonspacing_Mark	a nonspacing combining mark (zero advance width)
Mc	Spacing_Mark	a spacing combining mark (positive advance width)
Me	Enclosing_Mark	an enclosing combining mark
M	Mark	Mn | Mc | Me
Nd	Decimal_Number	a decimal digit
Nl	Letter_Number	a letterlike numeric character
No	Other_Number	a numeric character of other type
N	Number	Nd | Nl | No
Pc	Connector_Punctuation	a connecting punctuation mark, like a tie
Pd	Dash_Punctuation	a dash or hyphen punctuation mark
Ps	Open_Punctuation	an opening punctuation mark (of a pair)
Pe	Close_Punctuation	a closing punctuation mark (of a pair)
Pi	Initial_Punctuation	an initial quotation mark
Pf	Final_Punctuation	a final quotation mark
Po	Other_Punctuation	a punctuation mark of other type
P	Punctuation	Pc | Pd | Ps | Pe | Pi | Pf | Po
Sm	Math_Symbol	a symbol of mathematical use
Sc	Currency_Symbol	a currency sign
Sk	Modifier_Symbol	a non-letterlike modifier symbol
So	Other_Symbol	a symbol of other type
S	Symbol	Sm | Sc | Sk | So
Zs	Space_Separator	a space character (of various non-zero widths)
Zl	Line_Separator	U+2028 LINE SEPARATOR only
Zp	Paragraph_Separator	U+2029 PARAGRAPH SEPARATOR only
Z	Separator	Zs | Zl | Zp
Cc	Control	a C0 or C1 control code
Cf	Format	a format control character
Cs	Surrogate	a surrogate code point
Co	Private_Use	a private-use character
Cn	Unassigned	a reserved unassigned code point or a noncharacter
C	Other	Cc | Cf | Cs | Co | Cn
'''.strip().split('\n')
categories = [ row.split('\t', 2) for row in categories ]
categories = { left: right.replace('_', ' ') for left, right, ignore in categories if len(left) == 2 }
|
||||||
|
|
||||||
|
# Friendly display names for the C0 control characters and DEL, which
# have no name in UnicodeData.txt (their Name field is "<control>").
# main() substitutes these for the primary name.
custom_names = {
    0x00: "NUL '\\0' (null character)",
    0x01: "SOH (start of heading)",
    0x02: "STX (start of text)",
    0x03: "ETX (end of text)",
    0x04: "EOT (end of transmission)",
    0x05: "ENQ (enquiry)",
    0x06: "ACK (acknowledge)",
    0x07: "BEL '\\a' (bell)",
    0x08: "BS '\\b' (backspace)",
    0x09: "HT '\\t' (horizontal tab)",
    0x0A: "LF '\\n' (new line)",
    0x0B: "VT '\\v' (vertical tab)",
    0x0C: "FF '\\f' (form feed)",
    0x0D: "CR '\\r' (carriage return)",
    0x0E: "SO (shift out)",
    0x0F: "SI (shift in)",
    0x10: "DLE (data link escape)",
    0x11: "DC1 (device control 1)",
    0x12: "DC2 (device control 2)",
    0x13: "DC3 (device control 3)",
    0x14: "DC4 (device control 4)",
    0x15: "NAK (negative acknowledge)",
    0x16: "SYN (synchronous idle)",
    0x17: "ETB (end of transmission block)",
    0x18: "CAN (cancel)",
    0x19: "EM (end of medium)",
    0x1A: "SUB (substitute)",
    0x1B: "ESC (escape)",
    0x1C: "FS (file separator)",
    0x1D: "GS (group separator)",
    0x1E: "RS (record separator)",
    0x1F: "US (unit separator)",
    0x7F: "DEL (delete)",
}
|
||||||
|
|
||||||
|
@dataclass
class Character:
    """One row of the database being built: a single code point or
    (when ordinal_end is set) an inclusive range of code points."""
    ordinal: int                    # code point, or range start
    ordinal_end: Union[None, int]   # inclusive range end, or None for a single char
    category: str                   # two-letter general category, e.g. 'Lu'
    names: Set[str]                 # all known names/aliases
    name: str                       # primary display name
    combining: int                  # canonical combining class from UnicodeData.txt
    in_range: Union[bool, str]      # False, or the range's name for range rows

    def __repr__(self):
        return f'<U+{uni(self.ordinal)}>'
|
||||||
|
|
||||||
|
def uni(ordinal):
    """Return *ordinal* as lowercase hex, zero-padded to 4+ digits
    (no 'U+' prefix — callers add their own)."""
    return format(ordinal, '04x')
|
||||||
|
|
||||||
|
def main(update_uni, update_names):
    """Build UnicodeDataFull.json from UnicodeData.txt and NamesList.txt.

    update_uni / update_names: when True, (re)download the respective
    source file from unicode.org first; otherwise use the local copy.
    Writes one JSON array per line:
    [ordinal, ordinal_end-or-null, category, combining-flag, [names...]].
    """
    UnicodeData = 'UnicodeData.txt'
    NamesList = 'NamesList.txt'

    if update_uni:
        UnicodeData, _ = urlretrieve('http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt', UnicodeData)
    if update_names:
        NamesList, _ = urlretrieve('http://www.unicode.org/Public/UCD/latest/ucd/NamesList.txt', NamesList)

    database = dict()   # ordinal -> Character
    ranges = dict()     # range name -> [First Character, Last Character]

    # UnicodeData.txt is described at: http://www.unicode.org/L2/L1999/UnicodeData.html
    # And with more recent additions at: https://www.unicode.org/reports/tr44/#UCD_Files
    with open(UnicodeData, 'r') as fh:
        for line in fh:
            # Fifteen semicolon-separated fields per line.
            row = line.rstrip().split(';')
            code, name, category, combining,\
            bidirectional, decomposition, decimal,\
            digit, numeric, mirrored, old_name,\
            iso_comment, upper, lower, title = row

            ordinal = int(code, 16)
            combining = int(combining)
            names = set()

            # The Unicode 1.0 name, when present, is kept as an alias.
            if old_name:
                names.add(old_name)

            firstlast = None
            if name == '<control>':
                # Control characters have no real name; use the 1.0 one.
                name = old_name
            elif name[0] == '<' and name[-1] == '>':
                # "<CJK Ideograph, First>" / "<..., Last>" range sentinels.
                name = name[1:-1]
                name, firstlast = name.split(', ', 1)
            else:
                names.add(name)

            # Prefer our friendlier labels for C0 controls and DEL.
            if ordinal in custom_names:
                name = custom_names[ordinal]

            char = Character(
                ordinal=ordinal,
                ordinal_end=None,
                category=category,
                name=name,
                names=names,
                in_range=False,
                combining=combining,
            )

            if firstlast and name not in ranges:
                ranges[name] = [None, None]

            # Range sentinels are paired up in `ranges`; ordinary rows
            # go straight into the database.
            if firstlast == 'First':
                ranges[name][0] = char
            elif firstlast == 'Last':
                ranges[name][1] = char
            else:
                database[char.ordinal] = char

    # TODO: What's something nicer than this?
    # Collapse each First/Last sentinel pair into one range Character.
    for range_name, (start, end) in ranges.items():
        name = f'({range_name} [U+{uni(start.ordinal)}..U+{uni(end.ordinal)}])'
        start.name = name
        end.name = name
        # Both ends of a range are expected to agree on these.
        assert start.category == end.category
        assert start.combining == end.combining
        char = Character(
            ordinal=start.ordinal,
            ordinal_end=end.ordinal,
            category=start.category,
            name=name,
            names={name},
            in_range=range_name,
            combining=start.combining,
        )
        database[char.ordinal] = char

    # NamesList.txt is described at: https://www.unicode.org/Public/UCD/latest/ucd/NamesList.html
    # But I sort-of guessed and hoped for the best.
    with open(NamesList, 'r') as fh:
        char = None  # Character the current indented lines attach to
        for line in fh:
            line = line.rstrip()
            # Skip comment (';') and section ('@') lines.
            # NOTE(review): a fully blank line would IndexError here —
            # presumably NamesList.txt never rstrip()s to empty; confirm.
            if line[0] in ';@':
                continue
            if line[0] == '\t':
                # Indented "\t= alias" lines add names to the current char.
                if line[1] == '=':
                    char.names.add(line[3:])
            else:
                # "XXXX\tNAME" line starts a new character entry.
                char, name = line.split('\t', 1)
                ordinal = int(char, 16)
                if ordinal in database:
                    char = database[ordinal]
                    char.names.add(name)
                else:
                    # Known to NamesList but not UnicodeData: record it
                    # as unassigned ('Cn').
                    char = Character(
                        ordinal=ordinal,
                        ordinal_end=None,
                        category='Cn',
                        name=name,
                        names={name},
                        in_range=False,
                        combining=0,
                    )
                    database[ordinal] = char

    # Drop useless aliases: the literal '<control>' and "(1.0)"-suffixed
    # legacy names.
    for char in database.values():
        char.names -= {'<control>'}
        char.names -= { n for n in char.names if n.endswith(' (1.0)') }

    # TODO: Add Nami.txt short names in parens.

    # Emit line-delimited JSON, sorted by ordinal (unicode.py's scan
    # relies on this ordering).
    with open('UnicodeDataFull.json', 'w') as fh:
        for o in sorted(database.keys()):
            char = database[o]
            # Combining flag derived from the Mark categories (Mn/Mc/Me).
            combining = char.category[0] == 'M'
            # Primary name first, remaining aliases sorted after it.
            names = [char.name] + list(sorted(char.names - {char.name}))
            row = [char.ordinal, char.ordinal_end, char.category, combining, names]
            fh.write(json.dumps(row) + '\n')
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    import sys

    # Simple hand-rolled flag parsing: any unrecognized argument
    # (including -h/--help) prints usage and exits.
    update_uni = False
    update_names = False
    for arg in sys.argv[1:]:
        if arg in ('-u', '--unicode-data'):
            update_uni = True
        elif arg in ('-l', '--names-list'):
            update_names = True
        else:
            print('Usage:', sys.argv[0], '[-n]')
            print('  ', '-u, --unicode-data: Download UnicodeData.txt.')
            print('  ', '-l, --names-list: Download NamesList.txt.')
            print('  ', '-h, --help: Show this help.')
            exit(1)

    main(update_uni=update_uni, update_names=update_names)
|
Loading…
Reference in a new issue