From: Christian Heller <c.heller@plomlompom.de>
Date: Sat, 23 Sep 2023 20:00:52 +0000 (+0200)
Subject: Add example pleroma archival script.
X-Git-Url: https://plomlompom.com/repos/%7B%7B%20web_path%20%7D%7D/%7B%7Bprefix%7D%7D/%7B%7Bdb.prefix%7D%7D/%7B%7Bdb.prefix%7D%7D/calendar?a=commitdiff_plain;h=4511a2b60887c56bd12a2640e3a2e532491c39f4;p=config

Add example pleroma archival script.
---

diff --git a/archive_plomroma.py b/archive_plomroma.py
new file mode 100755
index 0000000..0ad89b7
--- /dev/null
+++ b/archive_plomroma.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+import lxml
+import argparse
+# use with `find status.plomlompom.com -type f -name "*.html" -exec ./archive_plomroma.py -f {} \;`
+
+cli = argparse.ArgumentParser(description="archive plom's self-hosted pleroma feed")
+cli.add_argument("-f", "--file", required=True, help="HTML file to process")  # stored as args.file
+args = cli.parse_args()  # -f/--file is mandatory (required=True)
+print("processing", args.file)
+
+def print_tree(node, level=0):
+    """Debug helper: print node and its descendants as an indented outline.
+
+    Each line shows the tag plus any id/class attributes; stripped leading
+    text, if present, follows after " -> ". `level` is the nesting depth
+    (two spaces of indent per level).
+    """
+    shown = []
+    for attr_name in ("id", "class"):
+        attr_value = node.get(attr_name)
+        if attr_value:
+            shown.append(f"{attr_name}='{attr_value}'")
+    line = "  " * level + f"<{node.tag} {' '.join(shown)}>"
+    stripped = (node.text or "").strip()
+    # An element without text still gets its own (newline-terminated) line.
+    print(f"{line} -> {stripped}" if stripped else line)
+    for child in node:
+        print_tree(child, level + 1)
+
+with open(args.file, "r", encoding="utf-8") as file:
+    content = file.read()
+from lxml import html
+tree = html.fromstring(content)
+
+# Strip <link rel="alternate"> elements in <head>, all comments and all forms from the page.
+# drop_tree() rather than getparent().remove(): remove() also discards the node's tail text
+# (the text that follows it inside the parent), while drop_tree() keeps that text in the tree.
+for xpath_expr in ('/html/head/link[@rel="alternate"]', '//comment()', '//form'):
+    for node in tree.xpath(xpath_expr):
+        # A node matched outside the root element (e.g. a comment before <html>) has no parent
+        # to be detached from; leave it alone instead of crashing on None.
+        if node.getparent() is not None:
+            node.drop_tree()
+
+
+def has_class(context, element, class_name):
+    """XPath extension: True if the first node of node-set `element` has CSS class `class_name` (False if empty)."""
+    return bool(element) and class_name in element[0].get('class', '').split()
+ns = lxml.etree.FunctionNamespace(None)
+ns['has-class'] = has_class  # makes has_class() callable from XPath as has-class()
+# Activity divs containing a p-author div and a p-name <bdi> whose text is not "plomlompom".
+matching_divs = tree.xpath('//div[has-class(., "activity") and .//div[has-class(., "p-author")] and .//bdi[has-class(., "p-name") and string()!="plomlompom"]]')
+for img in tree.xpath('//img'):
+    src = img.get('src')
+    if not src or src.startswith('https://status.plomlompom.com/'):
+        continue  # no src, or hosted under status.plomlompom.com: leave the image untouched
+    del img.attrib['src']
+    # For these foreign images, also drop alt/title unless the value starts with "../".
+    for attr_name in ('alt', 'title'):
+        attr_value = img.get(attr_name)
+        if attr_value and not attr_value.startswith('../'):
+            del img.attrib[attr_name]
+removal_notice = "[Removed foreign content for static archive, follow permalink on date to see original.]"
+for activity_div in matching_divs:
+    # NOTE(review): "has-class" without "(...)" is an XPath child-element name test, not a call to
+    # the extension function, so this predicate looks like it can never match HTML; confirm intent.
+    for detail in activity_div.xpath('.//details[./div[has-class]]'):
+        new_div = lxml.etree.Element("div")
+        new_div.text = removal_notice
+        detail.getparent().replace(detail, new_div)
+    for foreign in activity_div.xpath('.//div[has-class(., "e-content") or has-class(., "activity-content")]'):
+        foreign.clear()
+        foreign.text = removal_notice
+
+header = """
+<p style="text-align: right;"><a href="https://plomlompom.com/contact.html">contact</a> / <a href="https://plomlompom.com/privacy.html">privacy</a></p>
+<p>plomroma (archived): This site is a static archive of a Pleroma instance formerly hosted by me, to preserve my own messages from that time. Foreign content has been removed, but may still be available via links.</p>
+<hr />
+"""
+tree.body.insert(0, html.fromstring(header))
+# print_tree(tree)
+# Serialising the bare root element omits the DOCTYPE; pass the parsed one (if any) back in so it survives the rewrite.
+doctype = tree.getroottree().docinfo.doctype or None
+with open(args.file, "w", encoding="utf-8") as file:
+    file.write(html.tostring(tree, pretty_print=True, encoding="utf-8", doctype=doctype).decode("utf-8"))
+print("done")