#!/usr/bin/env python3 """Compare SQLite and PostgreSQL row counts after import.""" from __future__ import annotations import argparse import json import sqlite3 import sys from pathlib import Path from psycopg import sql REPO_ROOT = Path(__file__).resolve().parents[2] if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) from app.db.postgres_connection import connect # noqa: E402 from scripts.postgres.import_from_sqlite import ( # noqa: E402 DEFAULT_SCHEDULER_SQLITE_PATH, DEFAULT_SQLITE_PATH, EXCLUDED_TABLES, ) KEY_TABLES = [ "recommendation", "price_tracking", "screening_log", "coin_state", "cron_run_log", "review_log", "app_user", "user_subscription", "event_news", "sentiment_events", "onchain_events", "onchain_raw_events", "llm_insights", "system_reset_log", "scheduler_job_config", "scheduler_runtime_status", "scheduler_manual_trigger", ] def _sqlite_tables(conn: sqlite3.Connection) -> set[str]: rows = conn.execute( "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'" ).fetchall() return {row["name"] for row in rows if row["name"] not in EXCLUDED_TABLES} def _postgres_tables(conn) -> set[str]: rows = conn.execute( """ SELECT table_name FROM information_schema.tables WHERE table_schema='public' AND table_type='BASE TABLE' """ ).fetchall() return {row[0] for row in rows if row[0] not in EXCLUDED_TABLES} def _sqlite_count(conn: sqlite3.Connection, table: str) -> int: return int(conn.execute(f'SELECT COUNT(*) AS n FROM "{table}"').fetchone()["n"]) def _sqlite_max_id(conn: sqlite3.Connection, table: str) -> int | None: cols = [row["name"] for row in conn.execute(f"PRAGMA table_info({table})").fetchall()] if "id" not in cols: return None value = conn.execute(f'SELECT MAX(id) AS max_id FROM "{table}"').fetchone()["max_id"] return int(value) if value is not None else None def _postgres_count(conn, table: str) -> int: return int(conn.execute(sql.SQL("SELECT COUNT(*) FROM {table}").format(table=sql.Identifier(table))).fetchone()[0]) def _postgres_max_id(conn, table: str) -> int | None: has_id = conn.execute( """ SELECT 1 FROM information_schema.columns WHERE table_schema='public' AND table_name=%s AND column_name='id' """, (table,), ).fetchone() if not has_id: return None value = conn.execute(sql.SQL("SELECT MAX(id) FROM {table}").format(table=sql.Identifier(table))).fetchone()[0] return int(value) if value is not None else None def _collect_sqlite_sources(sqlite_path: Path, scheduler_sqlite_path: Path | None) -> list[tuple[str, Path]]: if not sqlite_path.exists(): raise FileNotFoundError(f"SQLite database not found: {sqlite_path}") sources = [("main", sqlite_path)] if scheduler_sqlite_path and scheduler_sqlite_path.exists(): sources.append(("scheduler", scheduler_sqlite_path)) return sources def validate( sqlite_path: Path, database_url: str | None = None, *, scheduler_sqlite_path: Path | None = None, all_tables: bool = False, ) -> dict: sources = _collect_sqlite_sources(sqlite_path, scheduler_sqlite_path) sqlite_conns = [] try: sqlite_by_table = {} for source, path in sources: conn = sqlite3.connect(str(path)) conn.row_factory = sqlite3.Row sqlite_conns.append(conn) for table in _sqlite_tables(conn): sqlite_by_table[table] = (source, conn) with connect(database_url) as pg_conn: sqlite_tables = set(sqlite_by_table) pg_tables = _postgres_tables(pg_conn) table_names = sorted(sqlite_tables & pg_tables) if all_tables else [t for t in KEY_TABLES if t in sqlite_tables and t in pg_tables] tables = [] ok = True for table in table_names: source, sqlite_conn = sqlite_by_table[table] sqlite_count = _sqlite_count(sqlite_conn, table) pg_count = _postgres_count(pg_conn, table) sqlite_max_id = _sqlite_max_id(sqlite_conn, table) pg_max_id = _postgres_max_id(pg_conn, table) table_ok = sqlite_count == pg_count and sqlite_max_id == pg_max_id ok = ok and table_ok tables.append( { "table": table, "source": source, "sqlite_count": sqlite_count, "postgres_count": pg_count, "sqlite_max_id": sqlite_max_id, "postgres_max_id": pg_max_id, "ok": table_ok, } ) return { "ok": ok, "checked_tables": len(tables), "sqlite_only_tables": sorted(sqlite_tables - pg_tables), "postgres_only_tables": sorted(pg_tables - sqlite_tables), "tables": tables, } finally: for conn in sqlite_conns: conn.close() def main() -> int: parser = argparse.ArgumentParser(description="Validate AlphaX SQLite -> PostgreSQL import.") parser.add_argument("--sqlite-path", type=Path, default=DEFAULT_SQLITE_PATH) parser.add_argument("--scheduler-sqlite-path", type=Path, default=DEFAULT_SCHEDULER_SQLITE_PATH) parser.add_argument("--database-url", default=None, help="Override DATABASE_URL.") parser.add_argument("--all-tables", action="store_true") parser.add_argument("--json", action="store_true", help="Print full JSON report.") args = parser.parse_args() report = validate( args.sqlite_path, args.database_url, scheduler_sqlite_path=args.scheduler_sqlite_path, all_tables=args.all_tables, ) if args.json: print(json.dumps(report, ensure_ascii=False, indent=2)) else: status = "PASS" if report["ok"] else "FAIL" print(f"[validate] {status}: checked {report['checked_tables']} table(s)") for item in report["tables"]: mark = "OK" if item["ok"] else "DIFF" print( f"[validate] {mark} {item['table']} ({item['source']}): " f"count {item['sqlite_count']} -> {item['postgres_count']}, " f"max_id {item['sqlite_max_id']} -> {item['postgres_max_id']}" ) if report["sqlite_only_tables"]: print(f"[validate] sqlite-only tables: {', '.join(report['sqlite_only_tables'])}") if report["postgres_only_tables"]: print(f"[validate] postgres-only tables: {', '.join(report['postgres_only_tables'])}") return 0 if report["ok"] else 1 if __name__ == "__main__": raise SystemExit(main())