From b029a5a7afb40f700d3762f59a331fd2dcc5c83b Mon Sep 17 00:00:00 2001
From: Christian Heller <c.heller@plomlompom.de>
Date: Sun, 20 Oct 2024 15:27:36 +0200
Subject: [PATCH 1/1] Initial commit.

---
 requirements.txt       |   3 +
 templates/index.tmpl   |  23 ++++
 templates/results.tmpl |  24 ++++
 ytplom.py              | 261 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 311 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 templates/index.tmpl
 create mode 100644 templates/results.tmpl
 create mode 100755 ytplom.py

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..0b70b2b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+google-api-python-client==2.149.0
+Jinja2==3.1.4
+yt-dlp==2024.10.7
diff --git a/templates/index.tmpl b/templates/index.tmpl
new file mode 100644
index 0000000..346b543
--- /dev/null
+++ b/templates/index.tmpl
@@ -0,0 +1,23 @@
+<html>
+<meta charset="UTF-8">
+<body>
+<p>quota: {{quota_count}}/100000</p>
+<form action="" method="POST">
+<input name="query" />
+</form>
+<table>
+<tr>
+<th>retrieved at</th>
+<th>DLs</th>
+<th>query</th>
+</tr>
+{% for query in queries %}
+<tr>
+<td>{{query.retrieved_at[:19]}}</td>
+<td style="text-align: right;">{{query.downloads}}</td>
+<td><a href="/query/{{query.id}}">{{query.text}}</a></td>
+</tr>
+{% endfor %}
+</table>
+</body>
+</html>
diff --git a/templates/results.tmpl b/templates/results.tmpl
new file mode 100644
index 0000000..c6cbec7
--- /dev/null
+++ b/templates/results.tmpl
@@ -0,0 +1,24 @@
+<html>
+<meta charset="UTF-8">
+<body>
+<p>quota: {{quota_count}}/100000 · <a href="/">index</a><br />
+query: {{query["text"]}}</p>
+<table>
+{% for result in query["results"] %}
+<tr>
+<td>
+<a href="{{youtube_prefix}}{{result.id}}"><img src="/thumbnails/{{result.id}}.jpg" /></a>
+</td>
+<td>
+{{result.definition}}<br />
+{{result.duration}}<br />
+<a href="/dl/{{result.id}}?from_query={{query_id}}">{% if result.available %}[loaded]{% else %}[LOAD]{% endif %}</a>
+</td>
+<td>
+<b>{{result.title}}</b> · {{result.description}}
+</td>
+</tr>
+{% endfor %}
+</table>
+</body>
+</html>
diff --git a/ytplom.py b/ytplom.py
new file mode 100755
index 0000000..1dbd147
--- /dev/null
+++ b/ytplom.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python3
+from os import environ, makedirs, scandir, remove as os_remove
+from os.path import (isdir, exists as path_exists, join as path_join, splitext,
+                     basename)
+from time import sleep
+from json import load as json_load, dump as json_dump
+from datetime import datetime, timedelta
+from threading import Thread
+from http.server import HTTPServer, BaseHTTPRequestHandler
+from urllib.parse import urlparse, parse_qs
+from urllib.request import urlretrieve
+from hashlib import md5
+
+from jinja2 import Template
+from yt_dlp import YoutubeDL
+import googleapiclient.discovery
+
+API_KEY = environ.get('GOOGLE_API_KEY')  # None when the variable is unset
+
+HTTP_PORT = 8083
+PATH_QUOTA_LOG = 'quota_log.json'  # JSON object: timestamp -> quota cost
+PATH_DIR_DOWNLOADS = 'downloads'
+PATH_DIR_THUMBNAILS = 'thumbnails'
+PATH_DIR_REQUESTS_CACHE = 'cache_googleapi'  # one JSON file per search query
+PATH_DIR_TEMPLATES = 'templates'
+NAME_DIR_TEMP = 'temp'  # used relative to PATH_DIR_DOWNLOADS
+NAME_TEMPLATE_INDEX = 'index.tmpl'
+NAME_TEMPLATE_RESULTS = 'results.tmpl'
+
+PATH_DIR_TEMP = path_join(PATH_DIR_DOWNLOADS, NAME_DIR_TEMP)
+EXPECTED_DIRS = [PATH_DIR_DOWNLOADS, PATH_DIR_TEMP, PATH_DIR_THUMBNAILS,
+                 PATH_DIR_REQUESTS_CACHE]  # created at startup if missing
+PATH_TEMPLATE_INDEX = path_join(PATH_DIR_TEMPLATES, NAME_TEMPLATE_INDEX)
+TIMESTAMP_FMT = '%Y-%m-%d %H:%M:%S.%f'  # format of the quota log's keys
+YOUTUBE_URL_PREFIX = 'https://www.youtube.com/watch?v='
+
+QUOTA_COST_YOUTUBE_SEARCH = 100  # logged once per search().list call
+QUOTA_COST_YOUTUBE_DETAILS = 1  # logged once per videos().list call
+
+to_download = []  # queue of video IDs: filled by do_GET, drained by thread
+
+
+def ensure_expected_dirs_and_files():  # startup check of on-disk state
+    for dir_name in EXPECTED_DIRS:
+        if not path_exists(dir_name):
+            print(f'creating expected directory: {dir_name}')
+            makedirs(dir_name)
+        elif not isdir(dir_name):  # path is taken by something else
+            msg = f'at expected directory path {dir_name} found non-directory'
+            raise Exception(msg)
+    if not path_exists(PATH_QUOTA_LOG):
+        with open(PATH_QUOTA_LOG, 'w', encoding='utf8') as f:
+            f.write('{}')  # empty JSON object, i.e. no quota logged yet
+    else:
+        try:
+            read_quota_log()  # just to check if we can
+        except Exception as e:
+            print(f'Trouble reading quota log file at {PATH_QUOTA_LOG}:')
+            raise e  # re-raised after printing the hint above
+
+
+def clean_unfinished_downloads():  # removes files directly in PATH_DIR_TEMP
+    for e in [e for e in scandir(PATH_DIR_TEMP) if e.is_file()]:  # no dirs
+        print(f'removing unfinished download: {e.path}')
+        os_remove(e.path)
+
+
+def run_server():  # blocks until Ctrl-C, then closes the server socket
+    server = HTTPServer(('localhost', HTTP_PORT), TaskHandler)
+    print(f'running at port {HTTP_PORT}')
+    try:
+        server.serve_forever()
+    except KeyboardInterrupt:  # first Ctrl-C only stops serving requests
+        print('aborted due to keyboard interrupt; '
+              'repeat to end download thread too')
+    server.server_close()
+
+
+def read_quota_log():
+    """Return quota log entries (timestamp -> cost) not older than one day."""
+    with open(PATH_QUOTA_LOG, 'r', encoding='utf8') as f:
+        entries = json_load(f)
+    # The cutoff is taken once; keys are parsed with TIMESTAMP_FMT, and an
+    # entry exactly one day old still counts as recent.
+    cutoff = datetime.now() - timedelta(days=1)
+    recent = {stamp: cost for stamp, cost in entries.items()
+              if datetime.strptime(stamp, TIMESTAMP_FMT) >= cutoff}
+    return recent
+
+
+def update_quota_log(now, cost):  # now: TIMESTAMP_FMT string used as key
+    quota_log = read_quota_log()  # re-read drops entries older than a day
+    quota_log[now] = quota_log.get(now, 0) + cost  # same-stamp costs add up
+    with open(PATH_QUOTA_LOG, 'w', encoding='utf8') as f:
+        json_dump(quota_log, f)
+
+
+def download_thread():
+    """Poll to_download forever; a failed download must not end the loop."""
+    while True:
+        sleep(0.5)
+        if not to_download:
+            continue
+        url = f'{YOUTUBE_URL_PREFIX}{to_download.pop(0)}'
+        params = {'ignoreerrors': True,  # yt-dlp reports errors, won't raise
+                  'paths': {'home': PATH_DIR_DOWNLOADS, 'temp': NAME_DIR_TEMP}}
+        with YoutubeDL(params) as ydl:
+            ydl.download([url])
+
+
+class TaskHandler(BaseHTTPRequestHandler):
+    """Web UI handler: search form, cached results, thumbnails, downloads."""
+
+    def _send_http(self, content=None, headers=None, code=200):
+        """Send status code, (name, value) header tuples, optional body."""
+        self.send_response(code)
+        for name, value in headers or []:
+            self.send_header(name, value)
+        self.end_headers()
+        if content is not None:
+            self.wfile.write(content)
+
+    def do_POST(self):
+        """Search YouTube for the posted query, cache results, redirect."""
+        length = int(self.headers['content-length'])
+        postvars = parse_qs(self.rfile.read(length).decode())
+        query = postvars.get('query', [''])[0]
+        if not query:  # parse_qs drops blank fields; nothing to search
+            self._send_http(headers=[('Location', '/')], code=302)
+            return
+        youtube = googleapiclient.discovery.build('youtube', 'v3',
+                                                  developerKey=API_KEY)
+        now = datetime.now().strftime(TIMESTAMP_FMT)
+
+        update_quota_log(now, QUOTA_COST_YOUTUBE_SEARCH)
+        request = youtube.search().list(part='snippet', maxResults=25, q=query,
+                                        safeSearch='none', type='video')
+        to_save = {'text': query, 'retrieved_at': now, 'results': []}
+        for item in request.execute()['items']:
+            video_id = item['id']['videoId']
+            snippet = item['snippet']
+            to_save['results'] += [{'id': video_id,
+                                    'title': snippet['title'],
+                                    'description': snippet['description'],
+                                    'published_at': snippet['publishedAt']}]
+            thumbnail_url = snippet['thumbnails']['default']['url']
+            store_at = path_join(PATH_DIR_THUMBNAILS, f'{video_id}.jpg')
+            urlretrieve(thumbnail_url, store_at)
+
+        by_id = {result['id']: result for result in to_save['results']}
+        if by_id:  # videos().list needs at least one id
+            update_quota_log(now, QUOTA_COST_YOUTUBE_DETAILS)
+            request = youtube.videos().list(id=','.join(by_id),
+                                            part='contentDetails')
+            # match by id, not position: the API may leave out some videos
+            for detailed in request.execute()['items']:
+                result = by_id[detailed['id']]
+                result['duration'] = detailed['contentDetails']['duration']
+                result['definition'] = detailed['contentDetails']['definition']
+
+        md5sum = md5(query.encode()).hexdigest()
+        path = path_join(PATH_DIR_REQUESTS_CACHE, f'{md5sum}.json')
+        with open(path, 'w', encoding='utf8') as f:
+            json_dump(to_save, f)
+        self._send_http(headers=[('Location', f'/query/{md5sum}')], code=302)
+
+    @staticmethod
+    def _format_duration(duration):
+        """Render an ISO 8601 duration like 'PT1H5M3S' as 'HH:MM:SS'."""
+        if not duration:  # no details were stored for this video
+            return '?'
+        date_units = (('Y', 31557600), ('M', 2592000), ('D', 86400))
+        time_units = (('H', 3600), ('M', 60), ('S', 1))
+        date_part, _, time_part = duration.partition('T')  # 'P0D' has no 'T'
+        seconds = 0
+        for remains, units in ((date_part[1:], date_units),
+                               (time_part, time_units)):
+            for dur_char, len_seconds in units:
+                if dur_char in remains:
+                    dur_str, remains = remains.split(dur_char)
+                    seconds += int(dur_str) * len_seconds
+        minutes, seconds = divmod(seconds, 60)
+        hours, minutes = divmod(minutes, 60)
+        return f'{hours:02}:{minutes:02}:{seconds:02}'
+
+    def do_GET(self):
+        """Serve a thumbnail, serve or queue a download, or render a page."""
+        parsed_url = urlparse(self.path)
+        toks_url = parsed_url.path.split('/')
+        page = toks_url[1]
+
+        if 'thumbnails' == page:
+            filename = toks_url[2]
+            with open(path_join(PATH_DIR_THUMBNAILS, filename), 'rb') as f:
+                img = f.read()
+            self._send_http(img, [('Content-type', 'image/jpeg')])
+            return
+
+        downloaded = {}  # video id (text in last '[...]' of name) -> path
+        for e in [e for e in scandir(PATH_DIR_DOWNLOADS) if e.is_file()]:
+            before_ext, _ = splitext(e.path)
+            id_ = before_ext.split('[')[-1].split(']')[0]
+            downloaded[id_] = e.path
+
+        if 'dl' == page:
+            video_id = toks_url[2]
+            if video_id in downloaded:
+                with open(downloaded[video_id], 'rb') as f:
+                    video = f.read()
+                self._send_http(content=video)
+                return
+            to_download.append(video_id)
+            params = parse_qs(parsed_url.query)
+            query_id = params.get('from_query', [''])[0]
+            redir_path = f'/query/{query_id}' if query_id else '/'
+            self._send_http(headers=[('Location', redir_path)], code=302)
+            return
+
+        kwargs = {'quota_count': sum(read_quota_log().values())}
+        if 'query' == page:
+            tmpl_name = NAME_TEMPLATE_RESULTS
+            kwargs['youtube_prefix'] = YOUTUBE_URL_PREFIX
+            query_id = toks_url[2]
+            kwargs['query_id'] = query_id
+            path = path_join(PATH_DIR_REQUESTS_CACHE, f'{query_id}.json')
+            with open(path, 'r', encoding='utf8') as f:
+                query = json_load(f)
+            for result in query['results']:
+                result['available'] = result['id'] in downloaded
+                result['duration'] = self._format_duration(
+                        result.get('duration', ''))
+                result['definition'] = result.get('definition', '').upper()
+            kwargs['query'] = query
+        else:
+            tmpl_name = NAME_TEMPLATE_INDEX
+            queries = []
+            for file in [f for f in scandir(PATH_DIR_REQUESTS_CACHE)
+                         if f.is_file()]:
+                id_, _ = splitext(basename(file.path))
+                with open(file.path, 'r', encoding='utf8') as f:
+                    query = json_load(f)
+                query['id'] = id_
+                for result in query['results']:
+                    result['available'] = result['id'] in downloaded
+                query['downloads'] = len([result for result in query['results']
+                                          if result['available']])
+                queries += [query]
+            queries.sort(key=lambda q: q['retrieved_at'], reverse=True)
+            kwargs['queries'] = queries
+        path = path_join(PATH_DIR_TEMPLATES, tmpl_name)
+        with open(path, 'r', encoding='utf8') as f:
+            html = Template(f.read()).render(**kwargs)
+        self._send_http(bytes(html, 'utf8'))
+
+
+if __name__ == '__main__':
+    to_download = []  # rebinds the module-level queue to a fresh empty list
+    ensure_expected_dirs_and_files()
+    clean_unfinished_downloads()
+    Thread(target=download_thread, daemon=False).start()  # non-daemon
+    run_server()  # blocks; the download thread keeps running afterwards
-- 
2.30.2