import select
import time
import re
-import urllib.request
-import http.client
-import html
-import html.parser
+import requests
+import bs4
# Defaults, may be overwritten by command line arguments.
SERVER = "irc.freenode.net"
NICKNAME = USERNAME
-class HTMLParser(html.parser.HTMLParser):
- def __init__(self, html, tag):
- super().__init__()
- self._tag = ""
- self.data = ""
- self.feed(html)
- def handle_starttag(self, tag, attrs):
- if self.data == "":
- self._tag = tag
- def handle_endtag(self, tag):
- self._tag = ""
- def handle_data(self, data):
- if self._tag != "":
- self.data = data
-
-
class ExceptionForRestart(Exception):
pass
matches = re.findall("(https?://[^\s>]+)", msg)
for i in range(len(matches)):
url = matches[i]
- request = urllib.request.Request(url, headers={
- "User-Agent": "plomlombot"
- })
try:
- webpage = urllib.request.urlopen(request, timeout=15)
- except (urllib.error.HTTPError, urllib.error.URLError,
- UnicodeError, http.client.BadStatusLine) as error:
+ # With timeout=15 a slow server raises requests.exceptions.Timeout;
+ # a read timeout is not a ConnectionError, so it is caught explicitly.
+ r = requests.get(url, timeout=15)
+ except (requests.exceptions.TooManyRedirects,
+ requests.exceptions.ConnectionError,
+ requests.exceptions.Timeout,
+ requests.exceptions.InvalidURL,
+ requests.exceptions.InvalidSchema) as error:
notice("TROUBLE FOLLOWING URL: " + str(error))
continue
- charset = webpage.info().get_content_charset()
- if not charset:
- notice("TROUBLE READING PAGE TITLE: no charset in header")
- continue
- content_type = webpage.info().get_content_type()
- if content_type not in ('text/html', 'text/xml',
- 'application/xhtml+xml'):
- notice("TROUBLE READING PAGE TITLE: bad content type "
- + content_type)
- continue
- content = webpage.read().decode(charset)
- title = HTMLParser(content, "title").data
- title = html.unescape(title)
- notice("PAGE TITLE FOR URL: " + title)
+ # Name the parser explicitly so parsing does not depend on which
+ # optional parser libraries happen to be installed.
+ title = bs4.BeautifulSoup(r.text, "html.parser").title
+ # title.string is None for an empty <title> or one containing nested
+ # tags; guard against it instead of calling .strip() on None.
+ if title and title.string:
+ notice("PAGE TITLE FOR URL: " + title.string.strip())
+ else:
+ notice("PAGE HAS NO TITLE TAG")
sender = ""
for rune in tokens[0]: