home · contact · privacy
b5db868f7bcba32a561a9f63fdcfe9dfd0f091ac
[plomlombot-irc.git] / plomlombot.py
1 #!/usr/bin/python3
2
3 import argparse
4 import socket
5 import datetime
6 import select
7 import time
8 import re
9 import requests
10 import bs4
11 import random
12 import hashlib
13 import os
14 import signal
15 import plomsearch
16 import irclog
17
# Defaults, may be overwritten by command line arguments.
SERVER = "irc.freenode.net"  # host name of the IRC server to connect to
PORT = 6667  # TCP port on that server
TIMEOUT = 240  # seconds without server traffic before a restart is forced
USERNAME = "plomlombot"  # sent in the USER command on session start
NICKNAME = USERNAME  # sent in the NICK command on session start
TWTFILE = ""  # path of the file the "twt" command appends to
# Base directory under which per-server / per-channel data is stored.
DBDIR = os.path.expanduser("~/plomlombot_db")
26
27
def write_to_file(path, mode, text):
    """Write text to the file at path, opened with the given mode.

    Args:
        path: file system path of the file to write.
        mode: mode string handed to open(), e.g. "w" or "a".
        text: string to write.

    The file is closed again even if the write itself raises.
    """
    with open(path, mode) as f:
        f.write(text)
32
33
class ExceptionForRestart(Exception):
    """Raised when the server connection is unusable and must be rebuilt."""
36
37
class Line:
    """A raw IRC protocol line together with a few parsed-out parts.

    Attributes:
        line: the raw line exactly as passed in.
        tokens: the line split on single space characters.
        sender: the first token without its leading ":" and cut off at the
            first "!" or "@"; "" if the first token does not start with ":".
        receiver: the third token cut off at the first "!" or "@", with all
            ":" characters removed; "" if there are fewer than three tokens.
    """

    def __init__(self, line):
        self.line = line
        self.tokens = line.split(" ")
        self.sender = ""
        # startswith() instead of indexing: the first token is empty for an
        # empty line or a line starting with a space.
        prefix = self.tokens[0]
        if prefix.startswith(":"):
            self.sender = self._name_part(prefix[1:])
        self.receiver = ""
        if len(self.tokens) > 2:
            self.receiver = self._name_part(self.tokens[2]).replace(":", "")

    @staticmethod
    def _name_part(token):
        """Return token up to (not including) its first "!" or "@"."""
        for i, rune in enumerate(token):
            if rune in {"!", "@"}:
                return token[:i]
        return token
56
57
class IO:
    """Line-oriented connection to an IRC server over a non-blocking socket.

    Attributes:
        timeout: seconds without any server traffic after which the
            connection is considered dead.
        socket: the connected socket.
        line_buffer: complete received lines not yet handed out.
        byte_buffer: received bytes that do not yet form a complete line.
        last_pong: time.time() of the last moment data was received.
        servername: first token (minus its first character) of the first
            line the server sent; used as the argument of PING.
    """

    def __init__(self, server, port, timeout):
        """Connect to server:port and read the first line the server sends.

        Raises:
            ExceptionForRestart: if connecting times out or the server does
                not send a first line within the timeout.
        """
        self.timeout = timeout
        self.socket = socket.socket()
        try:
            self.socket.connect((server, port))
        except TimeoutError:
            self.socket.close()
            raise ExceptionForRestart
        self.socket.setblocking(0)
        self.line_buffer = []
        self.byte_buffer = b""
        self.last_pong = time.time()
        try:
            # recv_line() returns None when its select() wait runs out; keep
            # waiting until a line arrives or _pingtest() gives up.
            first_line = None
            while first_line is None:
                first_line = self.recv_line(send_ping=False)
        except ExceptionForRestart:
            self.socket.close()
            raise
        self.servername = first_line.split(" ")[0][1:]

    def _pingtest(self, send_ping=True):
        """Raise ExceptionForRestart if the server was silent for too long.

        Otherwise, if send_ping is set, send a PING to provoke traffic.
        """
        if self.last_pong + self.timeout < time.time():
            print("SERVER NOT ANSWERING")
            raise ExceptionForRestart
        if send_ping:
            self.send_line("PING " + self.servername)

    def send_line(self, msg):
        """Send msg to the server as a single line.

        Carriage returns and newlines inside msg are replaced by spaces. A
        message longer than 510 bytes in UTF-8 is reported and dropped.

        Raises:
            ExceptionForRestart: if the connection turns out to be broken.
        """
        msg = msg.replace("\r", " ").replace("\n", " ")
        if len(msg.encode("utf-8")) > 510:
            print("NOT SENT LINE TO SERVER (too long): " + msg)
            return
        print("LINE TO SERVER: "
              + str(datetime.datetime.now()) + ": " + msg)
        # socket.send() counts bytes, so track progress on the encoded data
        # rather than on the character string.
        data = (msg + "\r\n").encode("utf-8")
        total_sent_len = 0
        while total_sent_len < len(data):
            try:
                sent_len = self.socket.send(data[total_sent_len:])
            except BlockingIOError:
                # Send buffer of the non-blocking socket is full; wait until
                # it is writable again, then retry.
                select.select([], [self.socket], [], self.timeout)
                continue
            except ConnectionError:
                sent_len = 0
            if sent_len == 0:
                print("SOCKET CONNECTION BROKEN")
                raise ExceptionForRestart
            total_sent_len += sent_len

    @staticmethod
    def _decode(raw_line):
        """Decode one received line as UTF-8, falling back to latin1."""
        try:
            return raw_line.decode("UTF-8")
        except UnicodeDecodeError:
            return raw_line.decode("latin1")

    def _recv_line_wrapped(self, send_ping=True):
        """Return the next complete line, or None if none arrived in time.

        Waits up to half the timeout for data; when that wait runs out,
        _pingtest() is called and None is returned.

        Raises:
            ExceptionForRestart: if the connection is broken or the server
                has been silent for longer than the timeout.
        """
        if self.line_buffer:
            return self.line_buffer.pop(0)
        while True:
            ready = select.select([self.socket], [], [], int(self.timeout / 2))
            if not ready[0]:
                self._pingtest(send_ping)
                return None
            self.last_pong = time.time()
            try:
                received_bytes = self.socket.recv(1024)
            except ConnectionError:
                received_bytes = b""
            if not received_bytes:
                print("SOCKET CONNECTION BROKEN")
                raise ExceptionForRestart
            # Split on the byte level and decode only complete lines, so a
            # multi-byte character cut in half by the 1024 byte read limit
            # does not force the latin1 fallback.
            self.byte_buffer += received_bytes
            raw_lines = self.byte_buffer.split(b"\r\n")
            self.byte_buffer = raw_lines.pop()
            self.line_buffer += [self._decode(raw) for raw in raw_lines]
            if self.line_buffer:
                return self.line_buffer.pop(0)

    def recv_line(self, send_ping=True):
        """Like _recv_line_wrapped(), but also prints any received line."""
        line = self._recv_line_wrapped(send_ping)
        if line:
            print("LINE FROM SERVER " + str(datetime.datetime.now()) + ": " +
                  line)
        return line
127
128
def handle_command(command, argument, notice, target, session):
    """Run the bot command named by command.

    Args:
        command: command name without the leading "!"; "addquote", "quote",
            "markov" and "twt" are known, anything else is ignored.
        argument: rest of the message after the command name, "" if none.
        notice: callable taking one string; every reply goes through it.
        target: name written into the header line of a new quotes file.
        session: object providing the paths quotesfile, markovfile and
            twtfile, and the list users_in_chan.
    """

    def addquote():
        """Append argument to the quotes file and report its number."""
        if not os.access(session.quotesfile, os.F_OK):
            with open(session.quotesfile, "w") as quotesfile:
                quotesfile.write("QUOTES FOR " + target + ":\n")
        with open(session.quotesfile, "a") as quotesfile:
            quotesfile.write(argument + "\n")
        with open(session.quotesfile, "r") as quotesfile:
            lines = quotesfile.readlines()
        # The first line is the header, so it does not count as a quote.
        notice("added quote #" + str(len(lines) - 1))

    def quote():
        """Reply with a random quote, a quote by number, or search hits."""

        def show_help():
            notice("syntax: !quote [int] OR !quote search QUERY")
            notice("QUERY may be a boolean grouping of quoted or unquoted " +
                   "search terms, examples:")
            notice("!quote search foo")
            notice("!quote search foo AND (bar OR NOT baz)")
            notice("!quote search \"foo\\\"bar\" AND ('NOT\"' AND \"'foo'\"" +
                   " OR 'bar\\'baz')")

        tokens = argument.split(" ") if argument else []
        # isdecimal() rather than isdigit(): the latter also accepts
        # characters such as superscript digits that int() rejects.
        if (len(tokens) > 1 and tokens[0] != "search") or \
            (len(tokens) == 1 and
                (tokens[0] == "search" or not tokens[0].isdecimal())):
            show_help()
            return
        if not os.access(session.quotesfile, os.F_OK):
            notice("no quotes available")
            return
        with open(session.quotesfile, "r") as quotesfile:
            lines = quotesfile.readlines()[1:]  # drop the header line
        if len(tokens) == 1:
            i = int(tokens[0])
            if i == 0 or i > len(lines):
                notice("there's no quote of that index")
                return
            i = i - 1
        elif len(tokens) > 1:
            query = str.join(" ", tokens[1:])
            try:
                results = plomsearch.search(query, lines)
            except plomsearch.LogicParserError as err:
                notice("failed query parsing: " + str(err))
                return
            if len(results) == 0:
                notice("no quotes matching query")
            else:
                if len(results) > 3:
                    notice("showing 3 of " + str(len(results)) + " quotes")
                for result in results[:3]:
                    notice("quote #" + str(result[0] + 1) + ": "
                           + result[1].rstrip("\n"))
            return
        else:
            if not lines:
                # File holds nothing but the header; randrange(0) would
                # raise ValueError.
                notice("no quotes available")
                return
            i = random.randrange(len(lines))
        notice("quote #" + str(i + 1) + ": " + lines[i].rstrip("\n"))

    def markov():
        """Reply with text randomly chained together from the markov feed."""
        select_length = 2
        sentence_end_markers = ".!?)("
        url_escape = "\nURL"
        url_starts = ["http://", "https://", "<http://", "<https://"]
        malkovich = "malkovich"

        def pick_next(snippet, selections):
            """Return a token that follows snippet in some selection.

            Prefers selections matching as many trailing snippet tokens as
            possible; falls back to any selection if nothing matches.
            """
            for i in range(select_length, 0, -1):
                usable_selections = [selection for selection in selections
                                     if selection[-(i + 1):-1] == snippet[-i:]]
                if usable_selections:
                    return random.choice(usable_selections)[select_length]
            return random.choice(selections)[select_length]

        if not os.access(session.markovfile, os.F_OK):
            notice("not enough text to markov")
            return

        # Lowercase incoming lines, ensure they end in a sentence end mark.
        with open(session.markovfile, "r") as file:
            lines = file.readlines()
        tokens = []
        for line in lines:
            line = line.lower().replace("\n", "")
            if not line:
                continue  # nothing to tokenize, and line[-1] would fail
            if line[-1] not in sentence_end_markers:
                line += "."
            tokens += line.split()
        if len(tokens) <= select_length:
            notice("not enough text to markov")
            return

        # Replace URLs with escape string for now, so that the Markov selector
        # won't see them as different strings. Stash replaced URLs in urls.
        urls = []
        for i, token in enumerate(tokens):
            for url_start in url_starts:
                if token.startswith(url_start):
                    length = len(token)
                    if url_start[0] == "<":
                        closing = token.find(">")
                        if closing >= 0:
                            length = closing + 1
                    urls.append(token[:length])
                    tokens[i] = url_escape + token[length:]
                    break

        # Collect every run of select_length + 1 consecutive tokens.
        selections = [tokens[i:i + select_length + 1]
                      for i in range(len(tokens) - select_length)]

        # Start the snippet with the beginning of a sentence, if possible:
        # take the tokens following a token that ends a sentence.
        snippet = [""] * select_length
        random.shuffle(selections)
        for selection in selections:
            if selection[0][-1] in sentence_end_markers:
                snippet = selection[1:select_length + 1]
                break

        # Repeatedly find a continuation token for the snippet. Replace
        # present users' names with malkovich.
        msg = ""
        while True:
            new_end = pick_next(snippet, selections)
            for name in session.users_in_chan:
                # An empty name would match every token.
                if name and new_end[:len(name)] == name.lower():
                    new_end = malkovich + new_end[len(name):]
                    break
            if len(msg) + len(new_end) > 200:
                break
            msg += new_end + " "
            snippet = snippet[1:] + [new_end]

        # Replace occurences of url escape string with random choice from urls.
        while url_escape in msg:
            msg = msg.replace(url_escape, random.choice(urls), 1)

        # More meaningful ways to randomly end sentences.
        notice(msg + malkovich + ".")

    def twt():
        """Append a timestamped line holding argument to the twt file."""
        from datetime import datetime
        try:
            # Mode "a" creates the file if it does not exist yet.
            with open(session.twtfile, "a") as twtfile:
                twtfile.write(datetime.utcnow().isoformat() + "\t" +
                              argument + "\n")
        except OSError as err:
            notice("can't access or create twt file: " + str(err))
            return
        notice("wrote twt.")

    handlers = {
        "addquote": addquote,
        "quote": quote,
        "markov": markov,
        "twt": twt,
    }
    handler = handlers.get(command)
    if handler is not None:
        handler()
327
328
def handle_url(url, notice, show_url=False):
    """Fetch url and report the title of the page found there.

    Args:
        url: the URL to fetch.
        notice: callable taking one string; used for all output.
        show_url: if set, the reported title is prefixed with the URL.

    Returns:
        False if fetching failed (the reason is reported via notice), else
        True.

    The fetch is aborted via SIGALRM after 15 seconds, and responses larger
    than 10000000 bytes are rejected.
    """

    def mobile_twitter_hack(url):
        """Handle the twitter.com equivalent of a mobile.twitter.com status.

        Returns True if url was such a status URL, else False.
        """
        m = re.search(
            r'https?://mobile\.twitter\.com/([^/]+)/status/([^?/]+)', url)
        if m is None:
            return False
        url = 'https://twitter.com/' + m.group(1) + '/status/' + m.group(2)
        handle_url(url, notice, True)
        return True

    class TimeOut(Exception):
        pass

    def timeout_handler(ignore1, ignore2):
        raise TimeOut("timeout")

    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(15)
    r = None
    try:
        r = requests.get(url, headers={'User-Agent': 'plomlombot'},
                         stream=True)
        r.raw.decode_content = True
        # Read one byte more than allowed to detect oversized responses.
        text = r.raw.read(10000000+1)
        if len(text) > 10000000:
            raise ValueError('Too large a response')
    except (requests.exceptions.RequestException,
            TimeOut,
            UnicodeError,
            ValueError) as error:
        notice("trouble following url: " + str(error))
        return False
    finally:
        # Always cancel the alarm, so it cannot fire later in unrelated
        # code, and release the streamed connection.
        signal.alarm(0)
        if r is not None:
            r.close()
    if mobile_twitter_hack(url):
        return True
    title = bs4.BeautifulSoup(text, "html5lib").title
    if title and title.string:
        prefix = "page title: "
        if show_url:
            prefix = "page title for <" + url + ">: "
        notice(prefix + title.string.strip())
    else:
        notice("page has no title tag")
    return True
378
379
class Session:
    """The bot's presence in one channel over an established connection.

    Attributes:
        io: connection object providing send_line() and recv_line().
        nickname, username, channel: identity and channel used on the server.
        users_in_chan: nicks currently believed to be in the channel.
        twtfile: path of the file the "twt" command writes to.
        dbdir: base directory for this server's data.
        rmlogs: maximum age in seconds of files in logdir; 0 disables
            deletion.
        chandir: directory below dbdir named by the MD5 hex digest of the
            channel name.
        rawlogdir, logdir: log directories below chandir.
        markovfile, quotesfile: data files below chandir.
    """

    def __init__(self, io, username, nickname, channel, twtfile, dbdir, rmlogs):
        """Register with the server, join channel, set up data directories."""
        self.io = io
        self.nickname = nickname
        self.username = username
        self.channel = channel
        self.users_in_chan = []
        self.twtfile = twtfile
        self.dbdir = dbdir
        self.rmlogs = rmlogs
        self.io.send_line("NICK " + self.nickname)
        self.io.send_line("USER " + self.username + " 0 * : ")
        self.io.send_line("JOIN " + self.channel)
        hash_channel = hashlib.md5(self.channel.encode("utf-8")).hexdigest()
        self.chandir = self.dbdir + "/" + hash_channel + "/"
        self.rawlogdir = self.chandir + "raw_logs/"
        self.logdir = self.chandir + "logs/"
        os.makedirs(self.logdir, exist_ok=True)
        os.makedirs(self.rawlogdir, exist_ok=True)
        self.markovfile = self.chandir + "markovfeed"
        self.quotesfile = self.chandir + "quotes"

    def loop(self):
        """Receive, log and react to server lines; runs until an exception."""

        def log(line):
            """Write line to the raw log and, if formattable, to the log.

            A plain string is treated as a line sent by the bot itself and
            gets a sender prefix built from the bot's nick and user name.
            """
            if isinstance(line, str):
                line = Line(":" + self.nickname + "!~" + self.username +
                            "@localhost" + " " + line)
            now = datetime.datetime.utcnow()
            form = "%Y-%m-%d %H:%M:%S UTC\t"
            day_file = now.strftime("%Y-%m-%d") + ".txt"
            write_to_file(self.rawlogdir + day_file,
                          "a", now.strftime(form) + " " + line.line + "\n")
            to_log = irclog.format_logline(line, self.channel)
            if to_log is not None:
                write_to_file(self.logdir + day_file,
                              "a", now.strftime(form) + " " + to_log + "\n")

        def forget_user(nick):
            """Remove nick from users_in_chan if it is listed there."""
            if nick in self.users_in_chan:
                self.users_in_chan.remove(nick)

        def remember_user(nick):
            """Add nick to users_in_chan unless empty or already listed."""
            if nick and nick not in self.users_in_chan:
                self.users_in_chan.append(nick)

        def handle_privmsg(line):
            """React to URLs and "!" commands in a PRIVMSG line.

            Messages that are not commands are appended to the markov feed.
            """

            def notice(msg):
                line = "NOTICE " + target + " :" + msg
                self.io.send_line(line)
                log(line)

            # Answer in the channel unless the bot was addressed directly.
            target = line.sender
            if line.receiver != self.nickname:
                target = line.receiver
            msg = str.join(" ", line.tokens[3:])[1:]
            if not msg:
                return
            matches = re.findall(r"(https?://[^\s>]+)", msg)
            url_count = 0
            for url in matches:
                if handle_url(url, notice):
                    url_count += 1
                    if url_count == 3:
                        notice("maximum number of urls to parse per message "
                               "reached")
                        break
            if msg.startswith("!"):
                tokens = msg[1:].split()
                if tokens:  # a bare "!" names no command
                    argument = str.join(" ", tokens[1:])
                    handle_command(tokens[0], argument, notice, target, self)
                return
            write_to_file(self.markovfile, "a", msg + "\n")

        now = datetime.datetime.utcnow()
        write_to_file(self.logdir + now.strftime("%Y-%m-%d") + ".txt", "a",
                      "-----------------------\n")
        while True:
            if self.rmlogs > 0:
                for f in os.listdir(self.logdir):
                    f = os.path.join(self.logdir, f)
                    if os.path.isfile(f) and \
                            os.stat(f).st_mtime < time.time() - self.rmlogs:
                        os.remove(f)
            line = self.io.recv_line()
            if not line:
                continue
            line = Line(line)
            log(line)
            if len(line.tokens) <= 1:
                continue
            if line.tokens[0] == "PING":
                self.io.send_line("PONG " + line.tokens[1])
            elif line.tokens[1] == "PRIVMSG":
                handle_privmsg(line)
            elif line.tokens[1] == "353":
                # Names reply: names start at the sixth token, the first
                # one carrying a leading ":".
                names = line.tokens[5:]
                if names:
                    names[0] = names[0][1:]
                for name in names:
                    # Drop channel status prefixes in front of the nick.
                    remember_user(name.lstrip("~&@%+"))
            elif line.tokens[1] == "JOIN" and line.sender != self.nickname:
                remember_user(line.sender)
            elif line.tokens[1] in {"PART", "QUIT"}:
                forget_user(line.sender)
            elif line.tokens[1] == "KICK" and len(line.tokens) > 3:
                # The kicked nick follows the channel name.
                forget_user(line.tokens[3])
            elif line.tokens[1] == "NICK":
                forget_user(line.sender)
                remember_user(line.receiver)
480
481
def parse_command_line_arguments(argv=None):
    """Parse the bot's command line options.

    Args:
        argv: list of argument strings to parse; None (the default) makes
            argparse read them from sys.argv.

    Returns:
        The argparse namespace with the attributes server, port, timeout,
        username, nickname, twtfile, dbdir, rmlogs and CHANNEL. Arguments
        not recognized are ignored.
    """
    parser = argparse.ArgumentParser()
    # Short and long flag must be separate strings; a single "-s, --server"
    # string registers one oddly named option and no "--server" at all.
    parser.add_argument("-s", "--server", action="store", dest="server",
                        default=SERVER,
                        help="server or server net to connect to (default: "
                        + SERVER + ")")
    parser.add_argument("-p", "--port", action="store", dest="port", type=int,
                        default=PORT, help="port to connect to (default : "
                        + str(PORT) + ")")
    parser.add_argument("-w", "--wait", action="store", dest="timeout",
                        type=int, default=TIMEOUT,
                        help="timeout in seconds after which to attempt "
                        "reconnect (default: " + str(TIMEOUT) + ")")
    parser.add_argument("-u", "--username", action="store", dest="username",
                        default=USERNAME, help="username to use (default: "
                        + USERNAME + ")")
    parser.add_argument("-n", "--nickname", action="store", dest="nickname",
                        default=NICKNAME, help="nickname to use (default: "
                        + NICKNAME + ")")
    parser.add_argument("-t", "--twtxtfile", action="store", dest="twtfile",
                        default=TWTFILE, help="twtxt file to use (default: "
                        + TWTFILE + ")")
    parser.add_argument("-d", "--dbdir", action="store", dest="dbdir",
                        default=DBDIR, help="directory to store DB files in")
    parser.add_argument("-r", "--rmlogs", action="store", dest="rmlogs",
                        type=int, default=0,
                        help="maximum age in seconds for logfiles in logs/ "
                        "(0 means: never delete, and is default)")
    parser.add_argument("CHANNEL", action="store", help="channel to join")
    opts, unknown = parser.parse_known_args(argv)
    return opts
513
514
# Connect and run a session; on ExceptionForRestart, drop the connection and
# start over with a fresh one.
opts = parse_command_line_arguments()
while True:
    # Reset on every round, so the except branch never sees an unbound name
    # or the connection of a previous round when IO() itself fails.
    io = None
    try:
        io = IO(opts.server, opts.port, opts.timeout)
        # Keep each server's data in its own directory below the DB dir.
        hash_server = hashlib.md5(opts.server.encode("utf-8")).hexdigest()
        dbdir = opts.dbdir + "/" + hash_server
        session = Session(io, opts.username, opts.nickname, opts.CHANNEL,
                          opts.twtfile, dbdir, opts.rmlogs)
        session.loop()
    except ExceptionForRestart:
        if io is not None:
            io.socket.close()
        continue