# Use with: `find status.plomlompom.com -type f -name "*.html" -exec ./archive_plomroma.py -f {} \;`
# Each file passed via -f is rewritten in place.
# Command-line interface: exactly one HTML file is handled per invocation.
parser = argparse.ArgumentParser(
    description="archive plom's self-hosted pleroma feed",
)
# argparse derives the attribute name "file" from the long option itself.
parser.add_argument(
    "-f",
    "--file",
    required=True,
    help="HTML file to process",
)
args = parser.parse_args()
print(f"processing {args.file}")
def print_tree(node, level=0):
    """Print an indented outline of an element tree (debugging helper).

    One line is written per element: ``level`` spaces of indentation, then
    ``<tag attrs>`` where ``attrs`` lists the ``id`` and ``class`` attributes
    that are present, then the element's own stripped text (if any).
    Children are printed recursively, one level deeper.

    Args:
        node: Element exposing ``tag``, ``get()``, ``text`` and iteration
            over its children.
        level: Current nesting depth; controls the indentation width.
    """
    tag = node.tag
    node_id = node.get("id")
    classes = node.get("class")
    text = (node.text or "").strip()

    attributes_info = []
    if node_id:
        attributes_info.append(f"id='{node_id}'")
    if classes:
        attributes_info.append(f"class='{classes}'")
    attr_str = " ".join(attributes_info)

    # The tag is printed without a newline so the text can follow on the
    # same line.
    print(" " * level + f"<{tag} {attr_str}>", end="")
    # NOTE(review): the statements that print `text` and iterate the children
    # were missing from SOURCE; this tail is reconstructed from the names the
    # surviving lines use -- confirm the exact text format.
    print(f" {text}" if text else "")
    for child in node:
        print_tree(child, level + 1)
# Read the whole page as UTF-8 and parse it into an lxml.html tree.
with open(args.file, "r", encoding="utf-8") as file:
    # This binding was missing from SOURCE although the parse call below
    # depends on it.
    content = file.read()

tree = html.fromstring(content)
def _drop_nodes(nodes):
    """Detach every node in *nodes* from its parent, keeping trailing text.

    ``parent.remove(node)`` would also throw away ``node.tail`` (the text
    that follows the node inside its parent); ``drop_tree()`` re-attaches that
    text to the preceding sibling or the parent. Nodes without a parent
    (e.g. a comment outside the root element) are skipped instead of raising.
    """
    for node in nodes:
        if node.getparent() is not None:
            node.drop_tree()


# Strip parts that make no sense in a static archive: feed links in <head>,
# HTML comments, and forms.
atom_links = tree.xpath('/html/head/link[@rel="alternate"]')
_drop_nodes(atom_links)
comments = tree.xpath('//comment()')
_drop_nodes(comments)
forms = tree.xpath('//form')
_drop_nodes(forms)
def has_class(context, element, class_name):
    """XPath extension function: does the first node carry a CSS class?

    The ``class`` attribute is split on whitespace and compared token-wise,
    so ``"activity"`` does not match ``class="activity-content"``.

    Args:
        context: Evaluation context supplied by the XPath engine (unused).
        element: Node-set argument as a sequence; only its first item is
            inspected.
        class_name: Class token to look for.

    Returns:
        True if ``class_name`` is one of the class tokens of the first node;
        False otherwise, including for an empty node-set or a node without a
        ``class`` attribute.
    """
    # An empty node-set (e.g. has-class(./missing, "x")) has no class at all.
    if not element:
        return False
    classes = (element[0].get('class') or '').split()
    return class_name in classes
# Make has_class() callable from XPath expressions as the un-prefixed
# function has-class().
ns = lxml.etree.FunctionNamespace(None)
ns['has-class'] = has_class

# Activities by other authors: a div of class "activity" that contains a div
# of class "p-author" and a bdi of class "p-name" whose text is not
# "plomlompom". The two descendant conditions are independent of each other.
matching_divs = tree.xpath(
    '//div[has-class(., "activity")'
    ' and .//div[has-class(., "p-author")]'
    ' and .//bdi[has-class(., "p-name") and string()!="plomlompom"]]'
)
# Attribute name -> prefix its value must start with to be kept on an <img>.
# NOTE(review): presumably the '../' prefix on alt/title marks values that
# point into the archive itself -- confirm against the archived markup.
_ALLOWED_IMG_ATTR_PREFIXES = (
    ('src', 'https://status.plomlompom.com/'),
    ('alt', '../'),
    ('title', '../'),
)

# Drop every non-empty src/alt/title attribute whose value does not start with
# its allowed prefix; missing or empty attributes are left alone. (The loop
# header and the src/alt lookups were missing from SOURCE and are restored
# here in the same shape as the surviving `title` check.)
imgs = tree.xpath('//img')
for img in imgs:
    for attr_name, allowed_prefix in _ALLOWED_IMG_ATTR_PREFIXES:
        value = img.get(attr_name)
        if value and not value.startswith(allowed_prefix):
            img.attrib.pop(attr_name, None)
# Placeholder text substituted for anything authored by someone else.
removal_notice = "[Removed foreign content for static archive, follow permalink on date to see original.]"

for activity_div in matching_divs:
    # NOTE(review): `has-class` without an argument list is an XPath *element
    # name test* (a child element literally named "has-class"), not a call to
    # the has-class() function, so this predicate likely never matches.
    # The intended class name is not visible in SOURCE; expression left as is.
    details = activity_div.xpath('.//details[./div[has-class]]')
    for detail in details:
        new_div = lxml.etree.Element("div")
        new_div.text = removal_notice
        # replace() leaves the tail text on the detached node; carry it over
        # so text following the <details> element is not lost.
        new_div.tail = detail.tail
        detail.getparent().replace(detail, new_div)

    e_contents = activity_div.xpath('.//div[has-class(., "e-content") or has-class(., "activity-content")]')
    for content in e_contents:
        # NOTE(review): one statement was missing from SOURCE at this point.
        # Removing the child elements is assumed, since setting .text alone
        # would leave the foreign markup in place -- confirm.
        del content[:]
        content.text = removal_notice
# Banner placed at the very top of <body> on every archived page.
# NOTE(review): the assignment and string delimiters were missing from SOURCE;
# only the two <p> lines are original. lxml.html.fromstring() is expected to
# wrap the two sibling paragraphs in a single container element -- confirm.
header = """
<p style="text-align: right;"><a href="https://plomlompom.com/contact.html">contact</a> / <a href="https://plomlompom.com/privacy.html">privacy</a></p>
<p>plomroma (archived): This site is a static archive of a Pleroma instance formerly hosted by me, to preserve my own messages from that time. Foreign content has been removed, but may still be available via links.</p>
"""
tree.body.insert(0, html.fromstring(header))
# Serialize before opening the file: the output path is the input path, and
# opening it in "w" mode truncates it. Doing the serialization first means a
# failure there cannot leave the original page empty.
serialized = html.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
with open(args.file, "w", encoding="utf-8") as file:
    file.write(serialized)