#!/usr/bin/env python3
"""Rewrite one saved Pleroma HTML page in place as a static archive page.

Processing steps, applied to the file given with ``-f``/``--file``:

1. Drop ``<link rel="alternate">`` elements in the head, all comments and
   all ``<form>`` elements.
2. On every ``<img>``: remove ``src`` unless it starts with
   ``https://status.plomlompom.com/``; remove ``alt`` and ``title`` unless
   they start with ``../``.
3. In every activity ``<div>`` whose author name is not "plomlompom",
   replace the content with a removal notice.
4. Insert a short archive header at the top of ``<body>``.
"""
# Provenance: git commit 4511a2b60887c56bd12a2640e3a2e532491c39f4
#   From: Christian Heller <c.heller@plomlompom.de>
#   Date: Sat, 23 Sep 2023 22:00:52 +0200
#   Subject: [PATCH] Add example pleroma archival script.
#
# use with `find status.plomlompom.com -type f -name "*.html" -exec ./archive_plomroma.py -f {} \;`
import argparse

# Import the submodules explicitly: a bare `import lxml` does not make
# `lxml.etree` available on its own.
from lxml import etree, html

OWN_SITE_PREFIX = 'https://status.plomlompom.com/'
RELATIVE_PREFIX = '../'

# Nodes matching these expressions are removed from the page entirely.
REMOVED_NODE_XPATHS = (
    '/html/head/link[@rel="alternate"]',
    '//comment()',
    '//form',
)

# Activity divs that contain an author block whose name is not "plomlompom".
FOREIGN_ACTIVITY_XPATH = '//div[has-class(., "activity") and .//div[has-class(., "p-author")] and .//bdi[has-class(., "p-name") and string()!="plomlompom"]]'

REMOVAL_NOTICE = "[Removed foreign content for static archive, follow permalink on date to see original.]"

ARCHIVE_HEADER = """
<p style="text-align: right;"><a href="https://plomlompom.com/contact.html">contact</a> / <a href="https://plomlompom.com/privacy.html">privacy</a></p>
<p>plomroma (archived): This site is a static archive of a Pleroma instance formerly hosted by me, to preserve my own messages from that time. Foreign content has been removed, but may still be available via links.</p>
<hr />
"""


def print_tree(node, level=0):
    """Debugging aid: print the element tree below *node*, one tag per line.

    Each line shows the tag, its ``id`` and ``class`` attributes (when set)
    and the stripped leading text of the element, indented by *level*.
    """
    node_id = node.get("id")
    classes = node.get("class")
    text = (node.text or "").strip()
    attributes_info = []
    if node_id:
        attributes_info.append(f"id='{node_id}'")
    if classes:
        attributes_info.append(f"class='{classes}'")
    attr_str = " ".join(attributes_info)
    print(" " * level + f"<{node.tag} {attr_str}>", end="")
    if text:
        print(f" -> {text}")
    else:
        print()
    for child in node:
        print_tree(child, level + 1)


def has_class(context, element, class_name):
    """XPath extension function: is *class_name* one of the node's classes?

    *element* is the node-set passed from the XPath expression; only its
    first node is inspected.  An empty node-set yields ``False`` instead of
    raising ``IndexError``.
    """
    if not element:
        return False
    return class_name in element[0].get('class', '').split()


# Make has_class() callable as `has-class(node, "name")` in XPath expressions
# (registered in the default, un-prefixed function namespace).
etree.FunctionNamespace(None)['has-class'] = has_class


def _drop_nodes(tree, expression):
    """Remove every node matching *expression*, keeping its tail text."""
    for node in tree.xpath(expression):
        # Nodes next to the root element (e.g. a comment before <html>) have
        # no parent to be removed from; leave them alone instead of crashing.
        if node.getparent() is None:
            continue
        # drop_tree() re-attaches the tail text to the preceding node, whereas
        # parent.remove(node) would silently discard it.
        node.drop_tree()


def _strip_foreign_img_attributes(tree):
    """Remove img attributes that do not carry the expected prefix."""
    expected_prefixes = (
        ('src', OWN_SITE_PREFIX),
        ('alt', RELATIVE_PREFIX),
        ('title', RELATIVE_PREFIX),
    )
    for img in tree.xpath('//img'):
        for attribute, prefix in expected_prefixes:
            value = img.get(attribute)
            # Missing or empty attributes are left untouched.
            if value and not value.startswith(prefix):
                img.attrib.pop(attribute, None)


def _blank_foreign_activities(tree):
    """Replace the content of activities by other authors with a notice."""
    for activity_div in tree.xpath(FOREIGN_ACTIVITY_XPATH):
        # NOTE(review): `has-class` without parentheses is an XPath name test
        # for a child element called "has-class", not a call to the registered
        # function, so this probably never matches.  Kept unchanged because
        # the intended class name cannot be determined from this file.
        for detail in activity_div.xpath('.//details[./div[has-class]]'):
            new_div = etree.Element("div")
            new_div.text = REMOVAL_NOTICE
            detail.getparent().replace(detail, new_div)
        content_divs = activity_div.xpath('.//div[has-class(., "e-content") or has-class(., "activity-content")]')
        for content_div in content_divs:
            # clear() drops children, text, tail and attributes alike.
            content_div.clear()
            content_div.text = REMOVAL_NOTICE


def archive_file(path):
    """Apply all archival rewrites to the HTML file at *path*, in place."""
    with open(path, "r", encoding="utf-8") as source:
        tree = html.fromstring(source.read())

    for expression in REMOVED_NODE_XPATHS:
        _drop_nodes(tree, expression)
    _strip_foreign_img_attributes(tree)
    _blank_foreign_activities(tree)
    tree.body.insert(0, html.fromstring(ARCHIVE_HEADER))

    # For debugging, call print_tree(tree) here to inspect the result.

    # Serialize before opening the target so that a serialization error
    # cannot leave a truncated file behind.
    serialized = html.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
    with open(path, "w", encoding="utf-8") as target:
        target.write(serialized)


def main():
    """Parse the command line and process the requested file."""
    parser = argparse.ArgumentParser(description="archive plom's self-hosted pleroma feed")
    parser.add_argument("-f", "--file", dest="file", required=True, help="HTML file to process")
    args = parser.parse_args()
    print("processing", args.file)
    archive_file(args.file)
    print("done")


if __name__ == "__main__":
    main()