From: Christian Heller <c.heller@plomlompom.de>
Date: Sun, 1 Dec 2024 07:07:59 +0000 (+0100)
Subject: To files table, add sha512 checksum field.
X-Git-Url: https://plomlompom.com/repos/%7B%7Bprefix%7D%7D/%7B%7Bdb.prefix%7D%7D/%7B%7B%20web_path%20%7D%7D/static/edit?a=commitdiff_plain;h=3b30254cdec658814a7e59b18f790103a59e136f;p=ytplom

To files table, add sha512 checksum field.
---

diff --git a/install.sh b/install.sh
index 18b764b..8c0c6d7 100755
--- a/install.sh
+++ b/install.sh
@@ -7,7 +7,7 @@ NAME_EXECUTABLE=ytplom
 
 mkdir -p "${PATH_APP_SHARE}" "${PATH_LOCAL_BIN}"
 
-rm -f ${PATH_APP_SHARE}/migrations/*
+rm -rf ${PATH_APP_SHARE}/migrations/*
 
 cp -r ./src/* "${PATH_APP_SHARE}/"
 cp "${NAME_EXECUTABLE}" "${PATH_LOCAL_BIN}/"
diff --git a/src/migrate.py b/src/migrate.py
index 85f4af4..fc63965 100755
--- a/src/migrate.py
+++ b/src/migrate.py
@@ -1,10 +1,15 @@
 #!/usr/bin/env python3
 """Script to migrate DB to most recent schema."""
+from importlib.util import spec_from_file_location, module_from_spec
+from pathlib import Path
 from sys import exit as sys_exit
-from sqlite3 import connect as sql_connect
 from ytplom.misc import (
         EXPECTED_DB_VERSION, PATH_DB, PATH_DB_SCHEMA, PATH_MIGRATIONS,
-        SQL_DB_VERSION, HandledException, get_db_version)
+        SQL_DB_VERSION, get_db_version, DbConn, HandledException, SqlText)
+
+
+_SUFFIX_PY = '.py'
+_SUFFIX_SQL = '.sql'
 
 
 def main() -> None:
@@ -19,26 +24,46 @@ def main() -> None:
                 f'{EXPECTED_DB_VERSION}.')
     print(f'Trying to migrate from DB version {start_version} to '
           f'{EXPECTED_DB_VERSION} …')
-    needed = [n+1 for n in range(start_version, EXPECTED_DB_VERSION)]
-    migrations = {}
+    migrations: dict[int, list[Path]] = {
+            n+1: [] for n in range(start_version, EXPECTED_DB_VERSION)}
     for path in [p for p in PATH_MIGRATIONS.iterdir()
                  if p.is_file() and p != PATH_DB_SCHEMA]:
         toks = path.name.split('_')
         try:
             version = int(toks[0])
+            if path.suffix not in {_SUFFIX_PY, _SUFFIX_SQL}:
+                raise ValueError
         except ValueError as e:
             msg = f'Found illegal migration path {path}, aborting.'
             raise HandledException(msg) from e
-        if version in needed:
-            migrations[version] = path
-    missing = [n for n in needed if n not in migrations]
+        if version in migrations:
+            migrations[version] += [path]
+    missing = [n for n in migrations.keys() if not migrations[n]]
     if missing:
         raise HandledException(f'Needed migrations missing: {missing}')
-    with sql_connect(PATH_DB) as conn:
-        for version_number, migration_path in migrations.items():
-            print(f'Applying migration {version_number}: {migration_path}')
-            conn.executescript(migration_path.read_text(encoding='utf8'))
-            conn.execute(f'{SQL_DB_VERSION} = {version_number}')
+    with DbConn(check_version=False) as conn:
+        for version, migration_paths in migrations.items():
+            sorted_paths = sorted(migration_paths)
+            msg_apply_prefix = f'Applying migration {version}: '
+            for path in [p for p in sorted_paths if _SUFFIX_SQL == p.suffix]:
+                print(f'{msg_apply_prefix}{path}')
+                sql = SqlText(path.read_text(encoding='utf8'))
+                conn.exec(sql)
+            for path in [p for p in sorted_paths if _SUFFIX_PY == p.suffix]:
+                spec = spec_from_file_location(str(path), path)
+                assert spec is not None
+                assert spec.loader is not None
+                module = module_from_spec(spec)
+                assert module is not None
+                spec.loader.exec_module(module)
+                if hasattr(module, 'migrate'):
+                    print(f'{msg_apply_prefix}{path}')
+                    module.migrate(conn)
+                else:
+                    raise HandledException(
+                        f'Suspected migration file {path} missing migrate().')
+        conn.exec(SqlText(f'{SQL_DB_VERSION} = {version}'))
+        conn.commit()
 
 
 if __name__ == '__main__':
diff --git a/src/migrations/2_add_files_sha512.py b/src/migrations/2_add_files_sha512.py
new file mode 100644
index 0000000..329286f
--- /dev/null
+++ b/src/migrations/2_add_files_sha512.py
@@ -0,0 +1,28 @@
+from hashlib import file_digest
+from ytplom.misc import DbConn, HandledException, HashStr, SqlText, VideoFile
+
+
+_LEGIT_YES = 'YES!'
+
+
+def migrate(conn: DbConn) -> None:
+    file_entries = VideoFile.get_all(conn)
+    missing = [f for f in file_entries if not f.present]
+    if missing:
+        print('WARNING: Cannot find files to following paths')
+        for f in missing:
+            print(f.full_path)
+        reply = input(
+                'WARNING: To continue migration, will have to delete above '
+                f'rows from DB. To continue, type (exactly) "{_LEGIT_YES}": ')
+        if "YES!" != reply:
+            raise HandledException('Migration aborted!')
+        for f in missing:
+            conn.exec(SqlText('DELETE FROM files WHERE rel_path = ?'),
+                      (str(f.rel_path),))
+    for file in VideoFile.get_all(conn):
+        print(f'Calculating digest for: {file.rel_path}')
+        with open(file.full_path, 'rb') as x:
+            file.sha512_digest = HashStr(
+                    file_digest(x, 'sha512').hexdigest())
+        file.save(conn)
diff --git a/src/migrations/2_add_files_sha512.sql b/src/migrations/2_add_files_sha512.sql
new file mode 100644
index 0000000..36d99e1
--- /dev/null
+++ b/src/migrations/2_add_files_sha512.sql
@@ -0,0 +1 @@
+ALTER TABLE files ADD COLUMN sha512_digest TEXT NOT NULL DEFAULT "";
diff --git a/src/migrations/init_1.sql b/src/migrations/init_1.sql
deleted file mode 100644
index 6d90d23..0000000
--- a/src/migrations/init_1.sql
+++ /dev/null
@@ -1,32 +0,0 @@
-CREATE TABLE yt_queries (
-  id TEXT PRIMARY KEY,
-  text TEXT NOT NULL,
-  retrieved_at TEXT NOT NULL
-);
-CREATE TABLE yt_videos (
-  id TEXT PRIMARY KEY,
-  title TEXT NOT NULL,
-  description TEXT NOT NULL,
-  published_at TEXT NOT NULL,
-  duration TEXT NOT NULL,
-  definition TEXT NOT NULL
-);
-CREATE TABLE yt_query_results (
-  query_id TEXT NOT NULL,
-  video_id TEXT NOT NULL,
-  PRIMARY KEY (query_id, video_id),
-  FOREIGN KEY (query_id) REFERENCES yt_queries(id),
-  FOREIGN KEY (video_id) REFERENCES yt_videos(id)
-);
-CREATE TABLE quota_costs (
-  id TEXT PRIMARY KEY,
-  timestamp TEXT NOT NULL,
-  cost INT NOT NULL
-);
-CREATE TABLE files (
-  rel_path TEXT PRIMARY KEY,
-  yt_id TEXT NOT NULL DEFAULT "",
-  flags INTEGER NOT NULL DEFAULT 0,
-  last_update TEXT NOT NULL DEFAULT "2000-01-01 12:00:00.123456",
-  FOREIGN KEY (yt_id) REFERENCES yt_videos(id)
-);
diff --git a/src/migrations/init_2.sql b/src/migrations/init_2.sql
new file mode 100644
index 0000000..aaa866b
--- /dev/null
+++ b/src/migrations/init_2.sql
@@ -0,0 +1,33 @@
+CREATE TABLE yt_queries (
+  id TEXT PRIMARY KEY,
+  text TEXT NOT NULL,
+  retrieved_at TEXT NOT NULL
+);
+CREATE TABLE yt_videos (
+  id TEXT PRIMARY KEY,
+  title TEXT NOT NULL,
+  description TEXT NOT NULL,
+  published_at TEXT NOT NULL,
+  duration TEXT NOT NULL,
+  definition TEXT NOT NULL
+);
+CREATE TABLE yt_query_results (
+  query_id TEXT NOT NULL,
+  video_id TEXT NOT NULL,
+  PRIMARY KEY (query_id, video_id),
+  FOREIGN KEY (query_id) REFERENCES yt_queries(id),
+  FOREIGN KEY (video_id) REFERENCES yt_videos(id)
+);
+CREATE TABLE quota_costs (
+  id TEXT PRIMARY KEY,
+  timestamp TEXT NOT NULL,
+  cost INT NOT NULL
+);
+CREATE TABLE files (
+  rel_path TEXT PRIMARY KEY,
+  yt_id TEXT NOT NULL DEFAULT "",
+  flags INTEGER NOT NULL DEFAULT 0,
+  last_update TEXT NOT NULL DEFAULT "2000-01-01 12:00:00.123456",
+  sha512_digest TEXT NOT NULL DEFAULT "",
+  FOREIGN KEY (yt_id) REFERENCES yt_videos(id)
+);
diff --git a/src/ytplom/misc.py b/src/ytplom/misc.py
index 3fd8218..512c755 100644
--- a/src/ytplom/misc.py
+++ b/src/ytplom/misc.py
@@ -4,6 +4,7 @@
 from typing import Any, Literal, NewType, Optional, Self, TypeAlias
 from os import chdir, environ
 from base64 import urlsafe_b64encode, urlsafe_b64decode
+from hashlib import file_digest
 from random import shuffle
 from time import time, sleep
 from datetime import datetime, timedelta
@@ -36,6 +37,7 @@ ProseText = NewType('ProseText', str)
 SqlText = NewType('SqlText', str)
 FlagName = NewType('FlagName', str)
 FlagsInt = NewType('FlagsInt', int)
+HashStr = NewType('HashStr', str)
 AmountDownloads = NewType('AmountDownloads', int)
 PlayerUpdateId = NewType('PlayerUpdateId', str)
 B64Str = NewType('B64Str', str)
@@ -66,7 +68,7 @@ QUOTA_COST_YOUTUBE_SEARCH = QuotaCost(100)
 QUOTA_COST_YOUTUBE_DETAILS = QuotaCost(1)
 
 # database stuff
-EXPECTED_DB_VERSION = 1
+EXPECTED_DB_VERSION = 2
 SQL_DB_VERSION = SqlText('PRAGMA user_version')
 PATH_MIGRATIONS = PATH_APP_DATA.joinpath('migrations')
 PATH_DB_SCHEMA = PATH_MIGRATIONS.joinpath(f'init_{EXPECTED_DB_VERSION}.sql')
@@ -130,7 +132,10 @@ class Config:
 class DbConn:
     """Wrapped sqlite3.Connection."""
 
-    def __init__(self, path: Path = PATH_DB) -> None:
+    def __init__(self,
+                 path: Path = PATH_DB,
+                 check_version: bool = True
+                 ) -> None:
         self._path = path
         if not self._path.is_file():
             if self._path.exists():
@@ -143,11 +148,12 @@ class DbConn:
             with sql_connect(self._path) as conn:
                 conn.executescript(PATH_DB_SCHEMA.read_text(encoding='utf8'))
                 conn.execute(f'{SQL_DB_VERSION} = {EXPECTED_DB_VERSION}')
-        cur_version = get_db_version(self._path)
-        if cur_version != EXPECTED_DB_VERSION:
-            raise HandledException(
-                    f'wrong database version {cur_version}, expected: '
-                    f'{EXPECTED_DB_VERSION} – run "migrate"?')
+        if check_version:
+            cur_version = get_db_version(self._path)
+            if cur_version != EXPECTED_DB_VERSION:
+                raise HandledException(
+                        f'wrong database version {cur_version}, expected: '
+                        f'{EXPECTED_DB_VERSION} – run "migrate"?')
         self._conn = sql_connect(self._path)
 
     def __enter__(self) -> Self:
@@ -361,7 +367,7 @@ class VideoFile(DbData):
     """Collects data about downloaded files."""
     id_name = 'rel_path'
     _table_name = 'files'
-    _cols = ('rel_path', 'yt_id', 'flags', 'last_update')
+    _cols = ('rel_path', 'yt_id', 'flags', 'last_update', 'sha512_digest')
     last_update: DatetimeStr
     rel_path: Path
 
@@ -369,7 +375,8 @@ class VideoFile(DbData):
                  rel_path: Path,
                  yt_id: YoutubeId,
                  flags: FlagsInt = FlagsInt(0),
-                 last_update: Optional[DatetimeStr] = None
+                 last_update: Optional[DatetimeStr] = None,
+                 sha512_digest: Optional[HashStr] = None
                  ) -> None:
         self.rel_path = rel_path
         self.yt_id = yt_id
@@ -378,6 +385,12 @@ class VideoFile(DbData):
             self._renew_last_update()
         else:
             self.last_update = last_update
+        if sha512_digest is None:
+            with self.full_path.open('rb') as f:
+                self.sha512_digest = HashStr(
+                        file_digest(f, 'sha512').hexdigest())
+        else:
+            self.sha512_digest = sha512_digest
 
     def _renew_last_update(self):
         self.last_update = DatetimeStr(datetime.now().strftime(TIMESTAMP_FMT))
@@ -679,8 +692,8 @@ class DownloadsManager:
             for path in [p for p in Path('.').iterdir()
                          if p.is_file() and p not in known_paths]:
                 yt_id = self._id_from_filename(path)
+                print(f'SYNC: new file {path}, saving to YT ID "{yt_id}".')
                 file = VideoFile(path, yt_id)
-                print(f'SYNC: new file {path}, saving with YT ID "{yt_id}".')
                 file.save(conn)
             self._files = VideoFile.get_all(conn)
             for file in self._files: