home
·
contact
·
privacy
projects
/
plomlombot-irc.git
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
73d956f
)
Use proper HTML parsing for page title retrieval.
author
Christian Heller
<c.heller@plomlompom.de>
Tue, 19 Jan 2016 00:01:38 +0000
(
01:01
+0100)
committer
Christian Heller
<c.heller@plomlompom.de>
Tue, 19 Jan 2016 00:01:38 +0000
(
01:01
+0100)
plomlombot.py
patch
|
blob
|
history
diff --git
a/plomlombot.py
b/plomlombot.py
index 8ac8f01c00cf0cc4e00ad6e4a664a743926ac2f6..f34d97a821378c5e092cf064a78a2137d82350c7 100755
(executable)
--- a/
plomlombot.py
+++ b/
plomlombot.py
@@
-9,6
+9,7
@@
import re
import urllib.request
import http.client
import html
import urllib.request
import http.client
import html
+import html.parser
# Defaults, may be overwritten by command line arguments.
SERVER = "irc.freenode.net"
# Defaults, may be overwritten by command line arguments.
SERVER = "irc.freenode.net"
@@
-18,6
+19,22
@@
USERNAME = "plomlombot"
NICKNAME = USERNAME
NICKNAME = USERNAME
class HTMLParser(html.parser.HTMLParser):
    """Extract the text content of the first `tag` element in a document.

    The document is parsed immediately on construction; afterwards the
    collected text is available as the `data` attribute.  `data` is the
    empty string if the document contains no such element.
    """

    def __init__(self, html, tag):
        """Parse `html` and collect the text of its first `tag` element.

        html -- the markup to parse, as a str
        tag  -- name of the element whose text is wanted (e.g. "title");
                matched case-insensitively, since the base parser reports
                tag names in lower case
        """
        super().__init__()
        self._target = tag.lower()
        self._inside = False    # currently between <tag> and </tag>?
        self._done = False      # first matching element fully consumed?
        self.data = ""
        self.feed(html)
        # Flush any text the base parser is still buffering at end of input.
        self.close()

    def handle_starttag(self, tag, attrs):
        """Start collecting when the first target element opens."""
        if tag == self._target and not self._done:
            self._inside = True

    def handle_endtag(self, tag):
        """Stop collecting once the target element closes."""
        # Only the matching end tag ends collection; closing tags of other
        # elements (e.g. </script> before <title>) must not affect state.
        if tag == self._target and self._inside:
            self._inside = False
            self._done = True

    def handle_data(self, data):
        """Accumulate text that appears inside the target element."""
        # Text may arrive in several chunks, so append instead of overwrite.
        if self._inside:
            self.data += data
+
+
class ExceptionForRestart(Exception):
pass
class ExceptionForRestart(Exception):
pass
@@
-131,7
+148,7
@@
def lineparser_loop(io, nickname):
+ content_type)
continue
content = webpage.read().decode(charset)
+ content_type)
continue
content = webpage.read().decode(charset)
- title =
str(content).split('<title>')[1].split('</title>')[0]
+ title =
HTMLParser(content, "title").data
title = html.unescape(title)
notice("PAGE TITLE FOR URL: " + title)
title = html.unescape(title)
notice("PAGE TITLE FOR URL: " + title)