home · contact · privacy
Use requests library to simplify page retrieval code.
[plomlombot-irc.git] / plomlombot.py
index 9a8a85329ffafb3c34642fa75def9b55f57422ac..f7c7e32899704b705baf8b8ca49ea16c0fd7ccb3 100755 (executable)
@@ -6,8 +6,9 @@ import datetime
 import select
 import time
 import re
-import urllib.request
+import requests
 import html
+import html.parser
 
 # Defaults, may be overwritten by command line arguments.
 SERVER = "irc.freenode.net"
@@ -16,9 +17,28 @@ TIMEOUT = 240
 USERNAME = "plomlombot"
 NICKNAME = USERNAME
 
+
+class HTMLParser(html.parser.HTMLParser):
+    def __init__(self, html, tag):
+        super().__init__()
+        self._tag_to_check = tag
+        self._tag = ""
+        self.data = ""
+        self.feed(html)
+    def handle_starttag(self, tag, attrs):
+        if self.data == "" and tag == self._tag_to_check:
+            self._tag = tag
+    def handle_endtag(self, tag):
+        self._tag = ""
+    def handle_data(self, data):
+        if self._tag != "":
+            self.data = data
+
+
 class ExceptionForRestart(Exception):
     pass
 
+
 class IO:
 
     def __init__(self, server, port, timeout):
@@ -44,7 +64,7 @@ class IO:
         if len(msg.encode("utf-8")) > 510:
             print("NOT SENT LINE TO SERVER (too long): " + msg)
         print("LINE TO SERVER: "
-            + str(datetime.datetime.now()) + ": " + msg)
+              + str(datetime.datetime.now()) + ": " + msg)
         msg = msg + "\r\n"
         msg_len = len(msg)
         total_sent_len = 0
@@ -64,11 +84,15 @@ class IO:
                 self._pingtest(send_ping)
                 return None
             self.last_pong = time.time()
-            received_runes = self.socket.recv(1024).decode("UTF-8")
+            received_bytes = self.socket.recv(1024)
+            try:
+                received_runes = received_bytes.decode("UTF-8")
+            except UnicodeDecodeError:
+                received_runes = received_bytes.decode("latin1")
             if len(received_runes) == 0:
                 print("SOCKET CONNECTION BROKEN")
                 raise ExceptionForRestart
-            self.rune_buffer += received_runes 
+            self.rune_buffer += received_runes
             lines_split = str.split(self.rune_buffer, "\r\n")
             self.line_buffer += lines_split[:-1]
             self.rune_buffer = lines_split[-1]
@@ -79,9 +103,10 @@ class IO:
         line = self._recv_line_wrapped(send_ping)
         if line:
             print("LINE FROM SERVER " + str(datetime.datetime.now()) + ": " +
-            line)
+                  line)
         return line
 
+
 def init_session(server, port, timeout, nickname, username, channel):
     print("CONNECTING TO " + server)
     io = IO(server, port, timeout)
@@ -90,33 +115,30 @@ def init_session(server, port, timeout, nickname, username, channel):
     io.send_line("JOIN " + channel)
     return io
 
+
 def lineparser_loop(io, nickname):
 
     def act_on_privmsg(tokens):
 
         def url_check(msg):
-            matches = re.findall("(https?://[^\s]+)", msg)
+
+            def notice(msg):
+                io.send_line("NOTICE " + target + " :" + msg)
+
+            matches = re.findall("(https?://[^\s>]+)", msg)
             for i in range(len(matches)):
                 url = matches[i]
                 try:
-                    webpage = urllib.request.urlopen(url, timeout=15)
-                except urllib.error.HTTPError as error:
-                    print("TROUBLE FOLLOWING URL: " + str(error))
+                    r = requests.get(url, timeout=15)
+                except (requests.exceptions.TooManyRedirects,
+                        requests.exceptions.ConnectionError,
+                        requests.exceptions.InvalidSchema) as error:
+                    notice("TROUBLE FOLLOWING URL: " + str(error))
                     continue
-                charset = webpage.info().get_content_charset()
-                if not charset:
-                    charset="utf-8"
-                content_type = webpage.info().get_content_type()
-                if not content_type in ('text/html', 'text/xml',
-                        'application/xhtml+xml'):
-                    print("TROUBLE INTERPRETING URL: bad content type "
-                            + content_type)
-                    continue
-                content = webpage.read().decode(charset)
-                title = str(content).split('<title>')[1].split('</title>')[0]
+                content = r.text
+                title = HTMLParser(content, "title").data
                 title = html.unescape(title)
-                io.send_line("PRIVMSG " + target + " :page title for url: "
-                    + title)
+                notice("PAGE TITLE FOR URL: " + title)
 
         sender = ""
         for rune in tokens[0]:
@@ -136,7 +158,7 @@ def lineparser_loop(io, nickname):
         msg = str.join(" ", tokens[3:])[1:]
         url_check(msg)
 
-    while 1:
+    while True:
         line = io.recv_line()
         if not line:
             continue
@@ -147,34 +169,35 @@ def lineparser_loop(io, nickname):
             if tokens[0] == "PING":
                 io.send_line("PONG " + tokens[1])
 
+
 def parse_command_line_arguments():
     parser = argparse.ArgumentParser()
     parser.add_argument("-s, --server", action="store", dest="server",
-            default=SERVER,
-            help="server or server net to connect to (default: " + SERVER +
-            ")")
+                        default=SERVER,
+                        help="server or server net to connect to (default: "
+                        + SERVER + ")")
     parser.add_argument("-p, --port", action="store", dest="port", type=int,
-            default=PORT, help="port to connect to (default : " + str(PORT) +
-            ")")
+                        default=PORT, help="port to connect to (default : "
+                        + str(PORT) + ")")
     parser.add_argument("-t, --timeout", action="store", dest="timeout",
-            type=int, default=TIMEOUT,
-            help="timeout in seconds after which to attempt reconnect " +
-            "(default: " + str(TIMEOUT) + ")")
+                        type=int, default=TIMEOUT,
+                        help="timeout in seconds after which to attempt " +
+                        "reconnect (default: " + str(TIMEOUT) + ")")
     parser.add_argument("-u, --username", action="store", dest="username",
-            default=USERNAME, help="username to use (default: " + USERNAME +
-            ")")
+                        default=USERNAME, help="username to use (default: "
+                        + USERNAME + ")")
     parser.add_argument("-n, --nickname", action="store", dest="nickname",
-            default=NICKNAME, help="nickname to use (default: " + NICKNAME +
-            ")")
+                        default=NICKNAME, help="nickname to use (default: "
+                        + NICKNAME + ")")
     parser.add_argument("CHANNEL", action="store", help="channel to join")
     opts, unknown = parser.parse_known_args()
     return opts
 
 opts = parse_command_line_arguments()
-while 1:
+while True:
     try:
         io = init_session(opts.server, opts.port, opts.timeout, opts.nickname,
-                opts.username, opts.CHANNEL)
+                          opts.username, opts.CHANNEL)
         lineparser_loop(io, opts.nickname)
     except ExceptionForRestart:
         io.socket.close()