Start on the actual titlebot code
This commit is contained in:
parent
87d9a1237b
commit
5bd2ae5410
1 changed files with 96 additions and 1 deletions
97
botcmd.py
97
botcmd.py
|
@ -1,3 +1,6 @@
|
||||||
|
import html
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
# initialize(*, config)
|
# initialize(*, config)
|
||||||
# Called to initialize the IRC bot
|
# Called to initialize the IRC bot
|
||||||
# Runs before even logger is brought up, and blocks further bringup until it's done
|
# Runs before even logger is brought up, and blocks further bringup until it's done
|
||||||
|
@ -20,6 +23,80 @@ def on_connect(*, irc):
|
||||||
def on_quit(*, irc):
|
def on_quit(*, irc):
|
||||||
...
|
...
|
||||||
|
|
||||||
|
def find_urls(message):
    """Return the http:// and https:// URLs found in *message*, in order.

    message is a str. A URL runs from its scheme up to (not including) the
    first of: a space, a '>', an unbalanced ')', or a '.' / ',' that is
    followed by a space. Trailing '.' and ',' are also dropped from the
    result, so a URL that ends a sentence at the very end of the message is
    not returned with its punctuation attached.
    """
    urls = []
    index = 0
    while True:
        # Scan for "http" as the common subset of http:// and https://
        start = message.find('http', index)
        # End if there are no longer urls to find
        if start == -1:
            break

        if not message.startswith(('http://', 'https://'), start):
            # Just a word containing "http"; resume right after this hit
            index = start + 1
            continue

        # Looks like we found a URL, scan for its end
        index = start
        parens = 0
        while index < len(message):
            char = message[index]
            # Since browsers don't urlencode parens nowadays, try to avoid
            # breaking those URLs while allowing (https://example.com) to
            # work as well
            if char == '(':
                parens += 1
            elif char == ')':
                if parens == 0:
                    break
                parens -= 1
            # Some people punctuate their URLs
            elif message[index:index + 2] in ('. ', ', '):
                break
            elif char in (' ', '>'):
                break

            index += 1

        # Punctuation right before the terminator (or the end of the
        # message) is sentence punctuation, not part of the URL
        urls.append(message[start:index].rstrip('.,'))

    return urls
|
||||||
|
|
||||||
|
def extract_title(page_data):
    """Return the text of the first <title> element in *page_data*.

    page_data is a bytestring holding (a prefix of) an HTML page. Tag names
    are matched case-insensitively. Returns None when there is no <title>
    tag. When the closing </title> is missing (for example because the page
    was truncated), everything after <title> is used. The result is decoded
    as UTF-8 with undecodable bytes replaced, and HTML entities are
    unescaped.
    """
    # bytes.lower() only touches ASCII letters and keeps the length, so
    # offsets found in the lowered copy are valid in page_data too
    lowered = page_data.lower()

    # Find the <title> tag
    open_tag = lowered.find(b'<title>')
    if open_tag == -1:
        return None
    title_start = open_tag + len(b'<title>')

    # Find the </title> tag, looking only after the opening tag so that a
    # stray earlier "</title>" cannot produce an empty title
    title_end = lowered.find(b'</title>', title_start)
    if title_end == -1:
        title_end = len(page_data)

    raw_title = page_data[title_start:title_end]
    return html.unescape(raw_title.decode('utf-8', errors='replace'))
|
||||||
|
|
||||||
# handle_message(*, prefix, message, nick, channel, irc)
|
# handle_message(*, prefix, message, nick, channel, irc)
|
||||||
# Called for PRIVMSGs.
|
# Called for PRIVMSGs.
|
||||||
# prefix is the prefix at the start of the message, without the leading ':'
|
# prefix is the prefix at the start of the message, without the leading ':'
|
||||||
|
@ -29,7 +106,25 @@ def on_quit(*, irc):
|
||||||
# irc is the IRC API object
|
# irc is the IRC API object
|
||||||
# All strings are bytestrings
|
# All strings are bytestrings
|
||||||
def handle_message(*, prefix, message, nick, channel, irc):
|
def handle_message(*, prefix, message, nick, channel, irc):
    """Fetch and print the page title of up to three URLs in *message*.

    message is a bytestring; prefix, nick, channel and irc are not used
    here. URLs that cannot be fetched, or whose response is not text/html,
    are skipped and do not count against the limit of three.
    """
    # Local import: only needed for the exception types caught below
    import http.client

    # Undecodable bytes are replaced rather than raising, so a message that
    # is not valid UTF-8 cannot crash the handler
    urls = find_urls(message.decode('utf-8', errors='replace'))

    # Don't titlebot >3 urls
    possible_titles_left = 3
    for url in urls:
        if possible_titles_left == 0:
            break

        try:
            with urllib.request.urlopen(url, timeout=1) as response:
                if response.info().get_content_type() != 'text/html':
                    continue
                # First 4KB of a page should be enough for any <title>
                first_kb = response.read(4 * 1024)
        except (IOError, ValueError, http.client.HTTPException):
            # IOError covers network failures and timeouts. ValueError
            # (which includes UnicodeEncodeError) and HTTPException cover
            # malformed URLs and broken responses, which are under the
            # control of whoever wrote the message.
            continue

        title = extract_title(first_kb)
        print(title)  # debug output
        possible_titles_left -= 1
|
||||||
|
|
||||||
|
|
||||||
# handle_nonmessage(*, prefix, command, arguments, irc)
|
# handle_nonmessage(*, prefix, command, arguments, irc)
|
||||||
# Called for all other commands than PINGs and PRIVMSGs.
|
# Called for all other commands than PINGs and PRIVMSGs.
|
||||||
|
|
Loading…
Reference in a new issue