plomlompom.com Git - config/blob - archive_plomroma.py

   1 #!/usr/bin/env python3
   2 import lxml
   3 import argparse
   4 # use with `find status.plomlompom.com -type f -name "*.html" -exec ./archive_plomroma.py -f {} \;`
   5
   6 parser = argparse.ArgumentParser(description="archive plom's self-hosted pleroma feed")
   7 parser.add_argument("-f", "--file", dest="file", required=True, help="HTML file to process")
   8 args = parser.parse_args()
   9 print("processing", args.file)
  10
  11 def print_tree(node, level=0):
  12     tag = node.tag
  13     id = node.get("id")
  14     classes = node.get("class")
  15     text = (node.text or "").strip()
  16     attributes_info = []
  17     if id:
  18         attributes_info.append(f"id='{id}'")
  19     if classes:
  20         attributes_info.append(f"class='{classes}'")
  21     attr_str = " ".join(attributes_info)
  22     print("  " * level + f"<{tag} {attr_str}>", end="")
  23     if text:
  24         print(f" -> {text}")
  25     else:
  26         print()
  27     for child in node:
  28         print_tree(child, level + 1)
  29
  30 with open(args.file, "r", encoding="utf-8") as file:
  31     content = file.read()
  32 from lxml import html
  33 tree = html.fromstring(content)
  34
  35 atom_links = tree.xpath('/html/head/link[@rel="alternate"]')
  36 for atom_link in atom_links:
  37     atom_link.getparent().remove(atom_link)
  38 comments = tree.xpath('//comment()')
  39 for comment in comments:
  40     comment.getparent().remove(comment)
  41 forms = tree.xpath('//form')
  42 for form in forms:
  43     form.getparent().remove(form)
  44
  45
  46 def has_class(context, element, class_name):
  47     classes = element[0].get('class', '').split()
  48     return class_name in classes
  49 ns = lxml.etree.FunctionNamespace(None)
  50 ns['has-class'] = has_class
  51 matching_divs = tree.xpath('//div[has-class(., "activity") and .//div[has-class(., "p-author")] and .//bdi[has-class(., "p-name") and string()!="plomlompom"]]')
  52 imgs = tree.xpath('//img')
  53 for img in imgs:
  54    src = img.get('src')
  55    if src and not src.startswith('https://status.plomlompom.com/'):
  56        img.attrib.pop('src', None)
  57        alt = img.get('alt')
  58        if alt and not alt.startswith('../'):
  59            img.attrib.pop('alt', None)
  60        title = img.get('title')
  61        if title and not title.startswith('../'):
  62            img.attrib.pop('title', None)
  63 removal_notice = "[Removed foreign content for static archive, follow permalink on date to see original.]"
  64 for activity_div in matching_divs:
  65     details = activity_div.xpath('.//details[./div[has-class]]')
  66     for detail in details:
  67         new_div = lxml.etree.Element("div")
  68         new_div.text = removal_notice
  69         detail.getparent().replace(detail, new_div)
  70     e_contents = activity_div.xpath('.//div[has-class(., "e-content") or has-class(., "activity-content")]')
  71     for content in e_contents:
  72         content.clear()
  73         content.text = removal_notice
  74
  75 header = """
  76 <p style="text-align: right;"><a href="https://plomlompom.com/contact.html">contact</a> / <a href="https://plomlompom.com/privacy.html">privacy</a></p>
  77 <p>plomroma (archived): This site is a static archive of a Pleroma instance formerly hosted by me, to preserve my own messages from that time. Foreign content has been removed, but may still be available via links.</p>
  78 <hr />
  79 """
  80 tree.body.insert(0, html.fromstring(header))
  81
  82 # print_tree(tree)
  83 with open(args.file, "w", encoding="utf-8") as file:
  84     file.write(html.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8"))
  85
  86 print("done")