From: Christian Heller Date: Sun, 20 Oct 2024 13:27:36 +0000 (+0200) Subject: Initial commit. X-Git-Url: https://plomlompom.com/repos/%7B%7B%20web_path%20%7D%7D/cards/%7B%7Bcard_id%7D%7D/process?a=commitdiff_plain;h=b029a5a7afb40f700d3762f59a331fd2dcc5c83b;p=ytplom Initial commit. --- b029a5a7afb40f700d3762f59a331fd2dcc5c83b diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..0b70b2b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +google-api-python-client==2.149.0 +Jinja2==3.1.4 +yt-dlp==2024.10.7 diff --git a/templates/index.tmpl b/templates/index.tmpl new file mode 100644 index 0000000..346b543 --- /dev/null +++ b/templates/index.tmpl @@ -0,0 +1,23 @@ + + + +

quota: {{quota_count}}/100000

+
+ +
+ + + + + + +{% for query in queries %} + + + + + +{% endfor %} +
retrieved atDLsquery
{{query.retrieved_at[:19]}}{{query.downloads}}{{query.text}}
+ + diff --git a/templates/results.tmpl b/templates/results.tmpl new file mode 100644 index 0000000..c6cbec7 --- /dev/null +++ b/templates/results.tmpl @@ -0,0 +1,24 @@ + + + +

quota: {{quota_count}}/100000 · index
+query: {{query["text"]}}

+ +{% for result in query["results"] %} + + + + + + +{% endfor %} +
+ + +{{result.definition}}
+{{result.duration}}
+{% if result.available %}[loaded]{% else %}[LOAD]{% endif %}
+{{result.title}} · {{result.description}} +
#!/usr/bin/env python3
# ytplom.py, reconstructed from the flattened hunk
# "diff --git a/ytplom.py b/ytplom.py" (new file mode 100755).
"""Small local web app to search YouTube and queue video downloads.

Search queries are POSTed to the server, answered via the YouTube Data API,
and cached as JSON files; result pages offer links that queue downloads,
which a background thread then performs via yt-dlp.
"""
from os import environ, makedirs, scandir, remove as os_remove
from os.path import (isdir, exists as path_exists, join as path_join, splitext,
                     basename)
from time import sleep
from json import load as json_load, dump as json_dump
from datetime import datetime, timedelta
from threading import Thread
from http.server import HTTPServer, BaseHTTPRequestHandler
from urllib.parse import urlparse, parse_qs
from urllib.request import urlretrieve
from hashlib import md5

from jinja2 import Template
from yt_dlp import YoutubeDL
import googleapiclient.discovery

API_KEY = environ.get('GOOGLE_API_KEY')

HTTP_PORT = 8083
PATH_QUOTA_LOG = 'quota_log.json'
PATH_DIR_DOWNLOADS = 'downloads'
PATH_DIR_THUMBNAILS = 'thumbnails'
PATH_DIR_REQUESTS_CACHE = 'cache_googleapi'
PATH_DIR_TEMPLATES = 'templates'
NAME_DIR_TEMP = 'temp'
NAME_TEMPLATE_INDEX = 'index.tmpl'
NAME_TEMPLATE_RESULTS = 'results.tmpl'

PATH_DIR_TEMP = path_join(PATH_DIR_DOWNLOADS, NAME_DIR_TEMP)
EXPECTED_DIRS = [PATH_DIR_DOWNLOADS, PATH_DIR_TEMP, PATH_DIR_THUMBNAILS,
                 PATH_DIR_REQUESTS_CACHE]
PATH_TEMPLATE_INDEX = path_join(PATH_DIR_TEMPLATES, NAME_TEMPLATE_INDEX)
TIMESTAMP_FMT = '%Y-%m-%d %H:%M:%S.%f'
YOUTUBE_URL_PREFIX = 'https://www.youtube.com/watch?v='

QUOTA_COST_YOUTUBE_SEARCH = 100
QUOTA_COST_YOUTUBE_DETAILS = 1

# Seconds per ISO 8601 duration designator, in the order they may appear
# before and after the 'T' separator; integer-valued so sums stay integers.
_DATE_UNIT_SECONDS = (('Y', int(60 * 60 * 24 * 365.25)),
                      ('M', 60 * 60 * 24 * 30),
                      ('W', 60 * 60 * 24 * 7),
                      ('D', 60 * 60 * 24))
_TIME_UNIT_SECONDS = (('H', 60 * 60),
                      ('M', 60),
                      ('S', 1))

# Queue of video IDs waiting to be fetched by download_thread().
to_download = []


def ensure_expected_dirs_and_files():
    """Create missing working directories and the quota log file.

    Raises:
        Exception: if a path in EXPECTED_DIRS exists but is not a directory;
            any error from reading an existing quota log is re-raised after
            printing a hint.
    """
    for dir_name in EXPECTED_DIRS:
        if not path_exists(dir_name):
            print(f'creating expected directory: {dir_name}')
            makedirs(dir_name)
        elif not isdir(dir_name):
            msg = f'at expected directory path {dir_name} found non-directory'
            raise Exception(msg)
    if not path_exists(PATH_QUOTA_LOG):
        with open(PATH_QUOTA_LOG, 'w', encoding='utf8') as f:
            f.write('{}')
    else:
        try:
            read_quota_log()  # just to check if we can
        except Exception:
            print(f'Trouble reading quota log file at {PATH_QUOTA_LOG}:')
            raise


def clean_unfinished_downloads():
    """Delete regular files left over in the temporary download directory."""
    # is_file must be *called*; the bare method object is always truthy and
    # would let directories through to os_remove.
    for e in [e for e in scandir(PATH_DIR_TEMP) if e.is_file()]:
        print(f'removing unfinished download: {e.path}')
        os_remove(e.path)


def run_server():
    """Serve HTTP requests on localhost:HTTP_PORT until interrupted."""
    server = HTTPServer(('localhost', HTTP_PORT), TaskHandler)
    print(f'running at port {HTTP_PORT}')
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print('aborted due to keyboard interrupt; '
              'repeat to end download thread too')
    finally:
        server.server_close()


def read_quota_log():
    """Return quota log entries from the last 24 hours.

    Returns:
        dict mapping timestamp strings (TIMESTAMP_FMT) to quota amounts;
        entries older than one day are left out.
    """
    with open(PATH_QUOTA_LOG, 'r', encoding='utf8') as f:
        log = json_load(f)
    ret = {}
    now = datetime.now()
    for time, amount in log.items():
        then = datetime.strptime(time, TIMESTAMP_FMT)
        if then >= now - timedelta(days=1):
            ret[time] = amount
    return ret


def update_quota_log(now, cost):
    """Add cost to the quota log entry keyed by timestamp string now.

    The file is rewritten from read_quota_log()'s output, so entries older
    than one day are dropped on each update.
    """
    quota_log = read_quota_log()
    quota_log[now] = quota_log.get(now, 0) + cost
    with open(PATH_QUOTA_LOG, 'w', encoding='utf8') as f:
        json_dump(quota_log, f)


def download_thread():
    """Poll the to_download queue forever, fetching each video via yt-dlp."""
    while True:
        sleep(0.5)
        try:
            video_id = to_download.pop(0)
        except IndexError:
            continue
        url = f'{YOUTUBE_URL_PREFIX}{video_id}'
        params = {'paths': {'home': PATH_DIR_DOWNLOADS, 'temp': NAME_DIR_TEMP}}
        try:
            with YoutubeDL(params) as ydl:
                ydl.download([url])
        except Exception as e:  # thread boundary: report, keep worker alive
            # Without this, one failed download would end the thread and
            # leave every later queue entry unprocessed.
            print(f'download of {video_id} failed: {e}')


def _format_duration(iso_duration):
    """Turn an ISO 8601 duration such as 'PT1H2M5S' into 'HH:MM:SS'.

    Hours are not capped, so durations of a day or more show as e.g.
    '25:00:00'. Raises ValueError on non-integer component values.
    """
    # partition() tolerates durations without a time part (e.g. 'P1D').
    date_part, _, time_part = iso_duration.partition('T')
    seconds = 0
    for remains, units in ((date_part[1:], _DATE_UNIT_SECONDS),  # drop 'P'
                           (time_part, _TIME_UNIT_SECONDS)):
        for dur_char, len_seconds in units:
            if dur_char in remains:
                dur_str, remains = remains.split(dur_char, 1)
                seconds += int(dur_str) * len_seconds
    hours, rest = divmod(seconds, 60 * 60)
    minutes, secs = divmod(rest, 60)
    return f'{hours:02}:{minutes:02}:{secs:02}'


def _scan_downloaded():
    """Map video IDs to paths of files directly inside PATH_DIR_DOWNLOADS.

    The ID is taken from the last '[...]' group of the file name before its
    extension.
    """
    downloaded = {}
    for e in scandir(PATH_DIR_DOWNLOADS):
        if not e.is_file():  # skips e.g. the temp sub-directory
            continue
        before_ext, _ = splitext(e.path)
        id_ = before_ext.split('[')[-1].split(']')[0]
        downloaded[id_] = e.path
    return downloaded


class TaskHandler(BaseHTTPRequestHandler):
    """HTTP handler for search submission, result pages, and downloads."""

    def _send_http(self, content=None, headers=None, code=200):
        """Send status code, the given (name, value) headers, and content.

        content, if not None, must be bytes.
        """
        headers = headers if headers else []
        self.send_response(code)
        for header_tuple in headers:
            self.send_header(header_tuple[0], header_tuple[1])
        self.end_headers()
        if content is not None:
            self.wfile.write(content)

    def do_POST(self):
        """Run a YouTube search for the posted 'query' and cache the result.

        Stores thumbnails, logs quota costs, writes the result set to a JSON
        file named after the query's MD5 sum, then redirects to its page.
        """
        length = int(self.headers.get('content-length', 0))
        postvars = parse_qs(self.rfile.read(length).decode())
        if not postvars.get('query'):
            self._send_http(b'missing query', code=400)
            return
        query = postvars['query'][0]
        youtube = googleapiclient.discovery.build('youtube', 'v3',
                                                  developerKey=API_KEY)
        now = datetime.now().strftime(TIMESTAMP_FMT)

        update_quota_log(now, QUOTA_COST_YOUTUBE_SEARCH)
        request = youtube.search().list(part='snippet', maxResults=25, q=query,
                                        safeSearch='none', type='video')
        response = request.execute()
        to_save = {'text': query, 'retrieved_at': now, 'results': []}
        ids_for_details = []
        for item in response['items']:
            video_id = item['id']['videoId']
            ids_for_details += [video_id]
            snippet = item['snippet']
            to_save['results'] += [{'id': video_id,
                                    'title': snippet['title'],
                                    'description': snippet['description'],
                                    'published_at': snippet['publishedAt'],
                                    }]
            thumbnail_url = item['snippet']['thumbnails']['default']['url']
            store_at = path_join(PATH_DIR_THUMBNAILS, f'{video_id}.jpg')
            urlretrieve(thumbnail_url, store_at)

        if ids_for_details:  # nothing to look up (or pay for) otherwise
            update_quota_log(now, QUOTA_COST_YOUTUBE_DETAILS)
            # The API's part name is camel-cased, matching the
            # 'contentDetails' key read from the response below.
            request = youtube.videos().list(id=','.join(ids_for_details),
                                            part='contentDetails')
            details = request.execute()
            # Match by ID rather than position: the details response is not
            # assumed to contain every requested video in request order.
            details_by_id = {detailed['id']: detailed['contentDetails']
                             for detailed in details['items']}
            for item in to_save['results']:
                content_details = details_by_id.get(item['id'])
                if content_details is None:
                    continue
                item['duration'] = content_details['duration']
                item['definition'] = content_details['definition']

        md5sum = md5(query.encode()).hexdigest()
        path = path_join(PATH_DIR_REQUESTS_CACHE, f'{md5sum}.json')
        with open(path, 'w', encoding='utf8') as f:
            json_dump(to_save, f)
        self._send_http(headers=[('Location', f'/query/{md5sum}')], code=302)

    def do_GET(self):
        """Serve thumbnails, downloads, a query's results, or the index.

        Routes on the first URL path segment: 'thumbnails/<file>',
        'dl/<video_id>' (serve if downloaded, else queue and redirect),
        'query/<query_id>', and anything else as the index page.
        """
        parsed_url = urlparse(self.path)
        toks_url = parsed_url.path.split('/')
        page = toks_url[1] if len(toks_url) > 1 else ''
        arg = toks_url[2] if len(toks_url) > 2 else ''

        if 'thumbnails' == page:
            try:
                with open(path_join(PATH_DIR_THUMBNAILS, arg), 'rb') as f:
                    img = f.read()
            except OSError:
                self._send_http(b'not found', code=404)
                return
            self._send_http(img, [('Content-type', 'image/jpeg')])
            return

        downloaded = _scan_downloaded()

        if 'dl' == page:
            video_id = arg
            if not video_id:
                self._send_http(b'missing video id', code=400)
                return
            if video_id in downloaded:
                with open(downloaded[video_id], 'rb') as f:
                    video = f.read()
                self._send_http(content=video)
                return
            if video_id not in to_download:  # avoid queueing twice
                to_download.append(video_id)
            params = parse_qs(parsed_url.query)
            query_id = params.get('from_query', [''])[0]
            redir_path = f'/query/{query_id}' if query_id else '/'
            self._send_http(headers=[('Location', redir_path)], code=302)
            return

        kwargs = {'quota_count': sum(read_quota_log().values())}
        if 'query' == page:
            tmpl_name = NAME_TEMPLATE_RESULTS
            kwargs['youtube_prefix'] = YOUTUBE_URL_PREFIX
            query_id = arg
            kwargs['query_id'] = query_id
            path = path_join(PATH_DIR_REQUESTS_CACHE, f'{query_id}.json')
            try:
                with open(path, 'r', encoding='utf8') as f:
                    query = json_load(f)
            except OSError:
                self._send_http(b'not found', code=404)
                return
            for result in query['results']:
                result['available'] = result['id'] in downloaded
                # Results may lack details if the API returned none for them.
                if 'duration' in result:
                    result['duration'] = _format_duration(result['duration'])
                else:
                    result['duration'] = '?'
                result['definition'] = result.get('definition', '?').upper()
            kwargs['query'] = query
        else:
            tmpl_name = NAME_TEMPLATE_INDEX
            queries = []
            for file in scandir(PATH_DIR_REQUESTS_CACHE):
                if not file.is_file():
                    continue
                id_, _ = splitext(basename(file.path))
                with open(file.path, 'r', encoding='utf8') as f:
                    query = json_load(f)
                query['id'] = id_
                for result in query['results']:
                    result['available'] = result['id'] in downloaded
                query['downloads'] = len([result for result in query['results']
                                          if result['available']])
                queries += [query]
            queries.sort(key=lambda q: q['retrieved_at'], reverse=True)
            kwargs['queries'] = queries
        path = path_join(PATH_DIR_TEMPLATES, tmpl_name)
        with open(path, 'r', encoding='utf8') as f:
            # NOTE(review): Template() is built without autoescaping while
            # rendering user-entered query text and API-provided titles;
            # confirm whether escaping is needed before exposing this beyond
            # localhost.
            tmpl = Template(f.read())
        html = tmpl.render(**kwargs)
        self._send_http(bytes(html, 'utf8'),
                        [('Content-type', 'text/html; charset=utf-8')])


if __name__ == '__main__':
    ensure_expected_dirs_and_files()
    clean_unfinished_downloads()
    Thread(target=download_thread, daemon=False).start()
    run_server()