X-Git-Url: https://plomlompom.com/repos/foo.html?a=blobdiff_plain;f=plomlombot.py;h=767d1f71a156c5a744c8d80ed08802ad97af0127;hb=269fcae158d408bf2629cf4a1157943831a6933a;hp=f34d97a821378c5e092cf064a78a2137d82350c7;hpb=f093ae1177a3f7193b061febb74a249fc2e7a389;p=plomlombot-irc.git
diff --git a/plomlombot.py b/plomlombot.py
index f34d97a..767d1f7 100755
--- a/plomlombot.py
+++ b/plomlombot.py
@@ -6,10 +6,8 @@ import datetime
import select
import time
import re
-import urllib.request
-import http.client
-import html
-import html.parser
+import requests
+import bs4
# Defaults, may be overwritten by command line arguments.
SERVER = "irc.freenode.net"
@@ -19,22 +17,6 @@ USERNAME = "plomlombot"
NICKNAME = USERNAME
-class HTMLParser(html.parser.HTMLParser):
- def __init__(self, html, tag):
- super().__init__()
- self._tag = ""
- self.data = ""
- self.feed(html)
- def handle_starttag(self, tag, attrs):
- if self.data == "":
- self._tag = tag
- def handle_endtag(self, tag):
- self._tag = ""
- def handle_data(self, data):
- if self._tag != "":
- self.data = data
-
-
class ExceptionForRestart(Exception):
pass
@@ -128,29 +110,19 @@ def lineparser_loop(io, nickname):
matches = re.findall("(https?://[^\s>]+)", msg)
for i in range(len(matches)):
url = matches[i]
- request = urllib.request.Request(url, headers={
- "User-Agent": "plomlombot"
- })
try:
- webpage = urllib.request.urlopen(request, timeout=15)
- except (urllib.error.HTTPError, urllib.error.URLError,
- UnicodeError, http.client.BadStatusLine) as error:
+ r = requests.get(url, timeout=15)
+ except (requests.exceptions.TooManyRedirects,
+ requests.exceptions.ConnectionError,
+ requests.exceptions.InvalidURL,
+ requests.exceptions.InvalidSchema) as error:
notice("TROUBLE FOLLOWING URL: " + str(error))
continue
- charset = webpage.info().get_content_charset()
- if not charset:
- notice("TROUBLE READING PAGE TITLE: no charset in header")
- continue
- content_type = webpage.info().get_content_type()
- if content_type not in ('text/html', 'text/xml',
- 'application/xhtml+xml'):
- notice("TROUBLE READING PAGE TITLE: bad content type "
- + content_type)
- continue
- content = webpage.read().decode(charset)
- title = HTMLParser(content, "title").data
- title = html.unescape(title)
- notice("PAGE TITLE FOR URL: " + title)
+ title = bs4.BeautifulSoup(r.text).title
+ if title:
+ notice("PAGE TITLE FOR URL: " + title.string)
+ else:
+ notice("PAGE HAS NO TITLE TAG")
sender = ""
for rune in tokens[0]: