From 17cf4888e2a440fb97456c8b5fd771dc5d5d9eae Mon Sep 17 00:00:00 2001
From: Christian Heller <c.heller@plomlompom.de>
Date: Sat, 4 Jan 2025 22:12:16 +0100
Subject: [PATCH] Add table schema validation.

---
 src/migrations/0_init.sql                   | 41 +++++++------
 src/migrations/1_add_files_last_updated.sql |  2 +-
 src/migrations/2_add_files_sha512.sql       |  2 +-
 src/migrations/3_files_redo.sql             | 20 +++----
 src/migrations/4_add_files_sha512_blob.sql  |  2 +-
 src/migrations/5_files_redo.sql             | 12 ++--
 src/migrations/6_add_files_tags.sql         |  2 +-
 src/migrations/new_init.sql                 | 46 +++++++--------
 src/ytplom/db.py                            | 64 ++++++++++++++++++---
 src/ytplom/migrations.py                    |  2 +-
 10 files changed, 120 insertions(+), 73 deletions(-)

diff --git a/src/migrations/0_init.sql b/src/migrations/0_init.sql
index 5b45769..24c4a3f 100644
--- a/src/migrations/0_init.sql
+++ b/src/migrations/0_init.sql
@@ -1,9 +1,27 @@
-CREATE TABLE yt_queries (
+CREATE TABLE "files" (
+  rel_path TEXT PRIMARY KEY,
+  yt_id TEXT NOT NULL DEFAULT "",
+  flags INTEGER NOT NULL DEFAULT 0,
+  FOREIGN KEY (yt_id) REFERENCES "yt_videos"(id)
+);
+CREATE TABLE "quota_costs" (
+  id TEXT PRIMARY KEY,
+  timestamp TEXT NOT NULL,
+  cost INT NOT NULL
+);
+CREATE TABLE "yt_queries" (
   id TEXT PRIMARY KEY,
   text TEXT NOT NULL,
   retrieved_at TEXT NOT NULL
 );
-CREATE TABLE yt_videos (
+CREATE TABLE "yt_query_results" (
+  query_id TEXT NOT NULL,
+  video_id TEXT NOT NULL,
+  PRIMARY KEY (query_id, video_id),
+  FOREIGN KEY (query_id) REFERENCES "yt_queries"(id),
+  FOREIGN KEY (video_id) REFERENCES "yt_videos"(id)
+);
+CREATE TABLE "yt_videos" (
   id TEXT PRIMARY KEY,
   title TEXT NOT NULL,
   description TEXT NOT NULL,
@@ -11,22 +29,3 @@ CREATE TABLE yt_videos (
   duration TEXT NOT NULL,
   definition TEXT NOT NULL
 );
-CREATE TABLE yt_query_results (
-  query_id TEXT NOT NULL,
-  video_id TEXT NOT NULL,
-  PRIMARY KEY (query_id, video_id),
-  FOREIGN KEY (query_id) REFERENCES yt_queries(id),
-  FOREIGN KEY (video_id) REFERENCES yt_videos(id)
-);
-CREATE TABLE quota_costs (
-  id TEXT PRIMARY KEY,
-  timestamp TEXT NOT NULL,
-  cost INT NOT NULL
-);
-CREATE TABLE files (
-  rel_path TEXT PRIMARY KEY,
-  yt_id TEXT NOT NULL DEFAULT "",
-  flags INTEGER NOT NULL DEFAULT 0,
-  FOREIGN KEY (yt_id) REFERENCES yt_videos(id)
-);
-
diff --git a/src/migrations/1_add_files_last_updated.sql b/src/migrations/1_add_files_last_updated.sql
index 678d843..4d2427e 100644
--- a/src/migrations/1_add_files_last_updated.sql
+++ b/src/migrations/1_add_files_last_updated.sql
@@ -1 +1 @@
-ALTER TABLE files ADD COLUMN last_update TEXT NOT NULL DEFAULT "2000-01-01 12:00:00.123456";
+ALTER TABLE "files" ADD COLUMN last_update TEXT NOT NULL DEFAULT "2000-01-01 12:00:00.123456";
diff --git a/src/migrations/2_add_files_sha512.sql b/src/migrations/2_add_files_sha512.sql
index 36d99e1..33f08d8 100644
--- a/src/migrations/2_add_files_sha512.sql
+++ b/src/migrations/2_add_files_sha512.sql
@@ -1 +1 @@
-ALTER TABLE files ADD COLUMN sha512_digest TEXT NOT NULL DEFAULT "";
+ALTER TABLE "files" ADD COLUMN sha512_digest TEXT NOT NULL DEFAULT "";
diff --git a/src/migrations/3_files_redo.sql b/src/migrations/3_files_redo.sql
index bf485bb..9c0ff87 100644
--- a/src/migrations/3_files_redo.sql
+++ b/src/migrations/3_files_redo.sql
@@ -1,32 +1,32 @@
-CREATE TEMPORARY TABLE files_backup (
+CREATE TEMPORARY TABLE "files_backup" (
   rel_path TEXT PRIMARY KEY,
   yt_id TEXT NOT NULL DEFAULT "",
   flags INTEGER NOT NULL DEFAULT 0,
   last_update TEXT NOT NULL DEFAULT "2000-01-01 12:00:00.123456",
   sha512_digest TEXT NOT NULL,
-  FOREIGN KEY (yt_id) REFERENCES yt_videos(id)
+  FOREIGN KEY (yt_id) REFERENCES "yt_videos"(id)
 );
-INSERT INTO files_backup SELECT
+INSERT INTO "files_backup" SELECT
   rel_path,
   yt_id,
   flags,
   last_update,
   sha512_digest
-FROM files;
-DROP TABLE files;
-CREATE TABLE files (
+FROM "files";
+DROP TABLE "files";
+CREATE TABLE "files" (
   sha512_digest TEXT PRIMARY KEY,
   rel_path TEXT NOT NULL,
   flags INTEGER NOT NULL DEFAULT 0,
   yt_id TEXT,
   last_update TEXT NOT NULL,
-  FOREIGN KEY (yt_id) REFERENCES yt_videos(id)
+  FOREIGN KEY (yt_id) REFERENCES "yt_videos"(id)
 );
-INSERT INTO files SELECT
+INSERT INTO "files" SELECT
   sha512_digest,
   rel_path,
   flags,
   yt_id,
   last_update
-FROM files_backup;
-DROP TABLE files_backup;
+FROM "files_backup";
+DROP TABLE "files_backup";
diff --git a/src/migrations/4_add_files_sha512_blob.sql b/src/migrations/4_add_files_sha512_blob.sql
index c382740..4ed2299 100644
--- a/src/migrations/4_add_files_sha512_blob.sql
+++ b/src/migrations/4_add_files_sha512_blob.sql
@@ -1 +1 @@
-ALTER TABLE files ADD COLUMN sha512_blob BLOB;
+ALTER TABLE "files" ADD COLUMN sha512_blob BLOB;
diff --git a/src/migrations/5_files_redo.sql b/src/migrations/5_files_redo.sql
index a9c30fc..44bbb92 100644
--- a/src/migrations/5_files_redo.sql
+++ b/src/migrations/5_files_redo.sql
@@ -1,17 +1,17 @@
-CREATE TABLE files_new (
+ALTER TABLE "files" RENAME TO "files_old";
+CREATE TABLE "files" (
   digest BLOB PRIMARY KEY,
   rel_path TEXT NOT NULL,
   flags INTEGER NOT NULL DEFAULT 0,
   yt_id TEXT,
   last_update TEXT NOT NULL,
-  FOREIGN KEY (yt_id) REFERENCES yt_videos(id)
+  FOREIGN KEY (yt_id) REFERENCES "yt_videos"(id)
 );
-INSERT INTO files_new SELECT
+INSERT INTO "files" SELECT
   sha512_blob,
   rel_path,
   flags,
   yt_id,
   last_update
-FROM files;
-DROP TABLE files;
-ALTER TABLE files_new RENAME TO files;
+FROM "files_old";
+DROP TABLE "files_old";
diff --git a/src/migrations/6_add_files_tags.sql b/src/migrations/6_add_files_tags.sql
index 7d70c18..4077830 100644
--- a/src/migrations/6_add_files_tags.sql
+++ b/src/migrations/6_add_files_tags.sql
@@ -1 +1 @@
-ALTER TABLE files ADD COLUMN tags TEXT NOT NULL DEFAULT "";
+ALTER TABLE "files" ADD COLUMN tags TEXT NOT NULL DEFAULT "";
diff --git a/src/migrations/new_init.sql b/src/migrations/new_init.sql
index ad11670..ae5e871 100644
--- a/src/migrations/new_init.sql
+++ b/src/migrations/new_init.sql
@@ -1,9 +1,30 @@
-CREATE TABLE yt_queries (
+CREATE TABLE "files" (
+  digest BLOB PRIMARY KEY,
+  rel_path TEXT NOT NULL,
+  flags INTEGER NOT NULL DEFAULT 0,
+  yt_id TEXT,
+  last_update TEXT NOT NULL,
+  tags TEXT NOT NULL DEFAULT "",
+  FOREIGN KEY (yt_id) REFERENCES "yt_videos"(id)
+);
+CREATE TABLE "quota_costs" (
+  id TEXT PRIMARY KEY,
+  timestamp TEXT NOT NULL,
+  cost INT NOT NULL
+);
+CREATE TABLE "yt_queries" (
   id TEXT PRIMARY KEY,
   text TEXT NOT NULL,
   retrieved_at TEXT NOT NULL
 );
-CREATE TABLE yt_videos (
+CREATE TABLE "yt_query_results" (
+  query_id TEXT NOT NULL,
+  video_id TEXT NOT NULL,
+  PRIMARY KEY (query_id, video_id),
+  FOREIGN KEY (query_id) REFERENCES "yt_queries"(id),
+  FOREIGN KEY (video_id) REFERENCES "yt_videos"(id)
+);
+CREATE TABLE "yt_videos" (
   id TEXT PRIMARY KEY,
   title TEXT NOT NULL,
   description TEXT NOT NULL,
@@ -11,24 +32,3 @@ CREATE TABLE yt_videos (
   duration TEXT NOT NULL,
   definition TEXT NOT NULL
 );
-CREATE TABLE yt_query_results (
-  query_id TEXT NOT NULL,
-  video_id TEXT NOT NULL,
-  PRIMARY KEY (query_id, video_id),
-  FOREIGN KEY (query_id) REFERENCES yt_queries(id),
-  FOREIGN KEY (video_id) REFERENCES yt_videos(id)
-);
-CREATE TABLE quota_costs (
-  id TEXT PRIMARY KEY,
-  timestamp TEXT NOT NULL,
-  cost INT NOT NULL
-);
-CREATE TABLE files (
-  digest BLOB PRIMARY KEY,
-  rel_path TEXT NOT NULL,
-  flags INTEGER NOT NULL DEFAULT 0,
-  yt_id TEXT,
-  last_update TEXT NOT NULL,
-  tags TEXT NOT NULL DEFAULT "",
-  FOREIGN KEY (yt_id) REFERENCES yt_videos(id)
-);
diff --git a/src/ytplom/db.py b/src/ytplom/db.py
index e30e2ad..51e94ac 100644
--- a/src/ytplom/db.py
+++ b/src/ytplom/db.py
@@ -2,6 +2,7 @@
 
 # included libs
 from base64 import urlsafe_b64decode, urlsafe_b64encode
+from difflib import Differ
 from hashlib import file_digest
 from pathlib import Path
 from sqlite3 import (connect as sql_connect, Connection as SqlConnection,
@@ -63,16 +64,63 @@ class DbFile:
 
     def __init__(self,
                  path: Path = PATH_DB,
-                 expected_version: int = EXPECTED_DB_VERSION
+                 version_to_validate: int = EXPECTED_DB_VERSION
                  ) -> None:
         self._path = path
-        if not path.is_file():
-            raise HandledException(f'no DB file at {path}')
-        if expected_version >= 0:
-            user_version = self._get_user_version()
-            if user_version != expected_version:
-                raise HandledException(
-                    f'wrong DB version {user_version} (!= {expected_version})')
+        if not self._path.is_file():
+            raise HandledException(f'no DB file at {self._path}')
+
+        if version_to_validate < 0:
+            return
+        # ensure version
+        if (user_version := self._get_user_version()) != version_to_validate:
+            raise HandledException(
+                f'wrong DB version {user_version} (!= {version_to_validate})')
+
+        # ensure schema
+        with sql_connect(self._path) as conn:
+            schema_rows = [
+                    r[0] for r in
+                    conn.execute('SELECT sql FROM sqlite_master ORDER BY sql')
+                    if r[0]]
+        schema_rows_normed = []
+        indent = '  '
+        for row in schema_rows:
+            row_normed = []
+            for subrow in row.split('\n'):
+                subrow = subrow.rstrip()
+                in_parentheses = 0
+                split_at = []
+                for i, c in enumerate(subrow):
+                    if '(' == c:
+                        in_parentheses += 1
+                    elif ')' == c:
+                        in_parentheses -= 1
+                    elif ',' == c and 0 == in_parentheses:
+                        split_at += [i + 1]
+                prev_split = 0
+                for i in split_at:
+                    segment = subrow[prev_split:i].strip()
+                    if len(segment) > 0:
+                        row_normed += [f'{indent}{segment}']
+                    prev_split = i
+                segment = subrow[prev_split:].strip()
+                if len(segment) > 0:
+                    row_normed += [f'{indent}{segment}']
+            row_normed[0] = row_normed[0].lstrip()
+            row_normed[-1] = row_normed[-1].lstrip()
+            if row_normed[-1] != ')' and row_normed[-3][-1] != ',':
+                row_normed[-3] = row_normed[-3] + ','
+                row_normed[-2:] = [indent + row_normed[-1][:-1]] + [')']
+            schema_rows_normed += ['\n'.join(row_normed)]
+        retrieved_schema = ';\n'.join(schema_rows_normed) + ';'
+        with open(_PATH_DB_SCHEMA, 'r', encoding='utf-8') as f:
+            stored_schema = f.read().rstrip()
+        if stored_schema != retrieved_schema:
+            diff_msg = Differ().compare(retrieved_schema.splitlines(),
+                                        stored_schema.splitlines())
+            raise HandledException('DB has wrong tables schema. Diff:\n'
+                                   + '\n'.join(diff_msg))
 
     def _get_user_version(self) -> int:
         with sql_connect(self._path) as conn:
diff --git a/src/ytplom/migrations.py b/src/ytplom/migrations.py
index cb038ef..5cacc95 100644
--- a/src/ytplom/migrations.py
+++ b/src/ytplom/migrations.py
@@ -68,4 +68,4 @@ _MIGRATIONS: MigrationsList = [
 
 def migrate():
     """Migrate DB file at expected default path to most recent version."""
-    DbFile(expected_version=-1).migrate(_MIGRATIONS)
+    DbFile(version_to_validate=-1).migrate(_MIGRATIONS)
-- 
2.30.2