# alphax/app/db/llm_insights.py

"""Cached LLM insight storage helpers."""

import hashlib
import json
from datetime import datetime

from app.db.schema import get_conn


_MOJIBAKE_MARKERS = ("Ã", "Â", "ç", "è", "é", "å", "æ", "ä", "ï¼", "ã")


def _looks_mojibake(value):
    text = str(value or "")
    if not text:
        return False
    return any(marker in text for marker in _MOJIBAKE_MARKERS)


def repair_mojibake_text(value):
    """Repair common UTF-8-as-latin1 mojibake from model/provider responses."""
    if not isinstance(value, str) or not _looks_mojibake(value):
        return value
    try:
        repaired = value.encode("latin1").decode("utf-8")
    except Exception:
        repaired = _repair_mixed_mojibake_text(value)
        return repaired if repaired != value else value
    # Only accept the repair when it produces visible CJK text or common CJK punctuation.
    if any("\u4e00" <= ch <= "\u9fff" for ch in repaired) or any(ch in repaired for ch in "，。；：！？（）《》"):
        return repaired
    return value


def _repair_mixed_mojibake_text(value):
    """Repair strings that contain normal CJK text plus mojibake fragments."""
    text = str(value or "")
    separators = (": ", "：", " - ", "，", "。")
    for sep in separators:
        if sep not in text:
            continue
        left, right = text.split(sep, 1)
        fixed_right = repair_mojibake_text(right)
        if fixed_right != right:
            return left + sep + fixed_right
    return value


def repair_mojibake_json(value):
    """Recursively repair mojibake in every string of a JSON-like structure.

    Strings are passed through ``repair_mojibake_text``; lists and dicts are
    rebuilt as new containers with their items (and string dict keys)
    repaired. Any other value is returned as-is.
    """
    if isinstance(value, str):
        return repair_mojibake_text(value)
    if isinstance(value, list):
        return list(map(repair_mojibake_json, value))
    if isinstance(value, dict):
        cleaned = {}
        for key, item in value.items():
            new_key = repair_mojibake_text(key) if isinstance(key, str) else key
            cleaned[new_key] = repair_mojibake_json(item)
        return cleaned
    return value


def compute_input_hash(payload):
    """Return a deterministic SHA-256 hex digest for a structured LLM input payload.

    The payload (or ``{}`` when it is falsy) is serialised as compact JSON
    with sorted keys, so key order does not affect the digest. Values that
    JSON cannot encode are stringified via ``default=str``.
    """
    canonical = json.dumps(
        payload or {},
        sort_keys=True,
        ensure_ascii=False,
        default=str,
        separators=(",", ":"),
    )
    digest = hashlib.sha256()
    digest.update(canonical.encode("utf-8"))
    return digest.hexdigest()


def _load_content(row):
    """Convert a DB row to a dict and attach its decoded JSON payloads.

    ``content_json`` is decoded into ``content`` and ``input_json`` into
    ``input``, each passed through ``repair_mojibake_json``. Decoding is
    best-effort: a missing, empty or unparsable column yields ``{}``.
    """
    record = dict(row)
    for field, column in (("content", "content_json"), ("input", "input_json")):
        try:
            decoded = repair_mojibake_json(json.loads(record.get(column) or "{}"))
        except Exception:
            # Deliberately tolerant: a corrupt payload must not break reads.
            decoded = {}
        record[field] = decoded
    return record


def get_cached_insight(target_type, target_id, insight_type, input_hash=None, success_only=True):
    """Return the most recently updated insight for a target, or None.

    Args:
        target_type, target_id, insight_type: Identify the insight; each is
            compared as a string.
        input_hash: When truthy, only a row with this exact input hash matches.
        success_only: When true, only rows with ``status='success'`` match.

    Returns:
        The row as a dict with decoded ``content`` and ``input`` entries (see
        ``_load_content``), or None when nothing matches.
    """
    where = "target_type=? AND target_id=? AND insight_type=?"
    params = [str(target_type), str(target_id), str(insight_type)]
    if input_hash:
        where += " AND input_hash=?"
        params.append(str(input_hash))
    if success_only:
        where += " AND status='success'"
    conn = get_conn()
    try:
        # `where` is assembled from fixed fragments only; all values are bound.
        row = conn.execute(
            f"""
        SELECT * FROM llm_insights
        WHERE {where}
        ORDER BY updated_at DESC, id DESC
        LIMIT 1
        """,
            tuple(params),
        ).fetchone()
    finally:
        # Release the connection even when the query raises.
        conn.close()
    return _load_content(row) if row else None


def get_any_insight(target_type, target_id, insight_type, input_hash):
    """Return the newest insight for this target and input hash, whatever its status."""
    lookup = {"input_hash": input_hash, "success_only": False}
    return get_cached_insight(target_type, target_id, insight_type, **lookup)


def upsert_insight(
    target_type,
    target_id,
    insight_type,
    prompt_version,
    input_hash,
    status,
    input_payload=None,
    content=None,
    error="",
    model="",
):
    """Insert or update the insight for (target_type, target_id, insight_type, input_hash).

    On conflict every column except the key columns and ``created_at`` is
    overwritten. ``input_payload`` and ``content`` are mojibake-repaired and
    stored as JSON (non-serialisable values are stringified); ``error`` is
    truncated to 2000 characters.

    Returns:
        The stored row as a dict with decoded ``content`` and ``input``
        entries, or None if it cannot be read back.
    """
    now = datetime.now().isoformat()
    input_json = json.dumps(repair_mojibake_json(input_payload or {}), ensure_ascii=False, default=str)
    content_json = json.dumps(repair_mojibake_json(content or {}), ensure_ascii=False, default=str)
    key = (str(target_type), str(target_id), str(insight_type), str(input_hash))
    conn = get_conn()
    try:
        conn.execute(
            """
        INSERT INTO llm_insights (
            target_type, target_id, insight_type, prompt_version, input_hash,
            status, input_json, content_json, error, model, created_at, updated_at
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(target_type, target_id, insight_type, input_hash) DO UPDATE SET
            prompt_version=excluded.prompt_version,
            status=excluded.status,
            input_json=excluded.input_json,
            content_json=excluded.content_json,
            error=excluded.error,
            model=excluded.model,
            updated_at=excluded.updated_at
        """,
            (
                key[0],
                key[1],
                key[2],
                str(prompt_version),
                key[3],
                str(status),
                input_json,
                content_json,
                str(error or "")[:2000],
                str(model or ""),
                now,
                now,
            ),
        )
        conn.commit()
        row = conn.execute(
            """
        SELECT * FROM llm_insights
        WHERE target_type=? AND target_id=? AND insight_type=? AND input_hash=?
        """,
            key,
        ).fetchone()
    finally:
        # Release the connection even when the write or read-back raises.
        conn.close()
    return _load_content(row) if row else None


def get_insights_for_targets(target_type, target_ids, insight_type):
    """Return the newest successful insight for each of several targets.

    Args:
        target_type: Target type shared by all ids; compared as a string.
        target_ids: Iterable of target ids. Falsy ids (None, ``""``, 0) and
            ids that are blank after stripping are ignored.
        insight_type: Insight type to fetch; compared as a string.

    Returns:
        Dict mapping ``str(target_id)`` to the most recently updated
        ``status='success'`` row (decoded via ``_load_content``). Targets
        without such a row are absent; an empty id list yields ``{}``.
    """
    ids = [str(x) for x in (target_ids or []) if str(x or "").strip()]
    if not ids:
        return {}
    placeholders = ",".join(["?"] * len(ids))
    conn = get_conn()
    try:
        # Only "?" placeholders are interpolated; every id is a bound parameter.
        rows = conn.execute(
            f"""
        SELECT * FROM llm_insights
        WHERE target_type=? AND insight_type=? AND status='success'
          AND target_id IN ({placeholders})
        ORDER BY updated_at DESC, id DESC
        """,
            tuple([str(target_type), str(insight_type)] + ids),
        ).fetchall()
    finally:
        # Release the connection even when the query raises.
        conn.close()
    result = {}
    for row in rows:
        item = _load_content(row)
        # Rows arrive newest first, so the first row seen per target wins.
        result.setdefault(str(item.get("target_id")), item)
    return result


def get_latest_insight_by_type(target_type, insight_type, success_only=True):
    """Return the most recently updated insight of a type across all targets.

    Args:
        target_type, insight_type: Compared as strings.
        success_only: When true, only rows with ``status='success'`` match.

    Returns:
        The row as a dict with decoded ``content`` and ``input`` entries, or
        None when nothing matches.
    """
    status_clause = "AND status='success'" if success_only else ""
    conn = get_conn()
    try:
        # `status_clause` is one of two fixed strings; values are bound.
        row = conn.execute(
            f"""
        SELECT * FROM llm_insights
        WHERE target_type=? AND insight_type=? {status_clause}
        ORDER BY updated_at DESC, id DESC
        LIMIT 1
        """,
            (str(target_type), str(insight_type)),
        ).fetchone()
    finally:
        # Release the connection even when the query raises.
        conn.close()
    return _load_content(row) if row else None


def list_llm_insights(limit=50, offset=0, target_type="", status="", insight_type=""):
    """Return one page of stored insights, newest first, with paging metadata.

    Args:
        limit: Page size, clamped to 1..100; unparsable values fall back to 50.
        offset: Rows to skip, floored at 0; unparsable values fall back to 0.
        target_type, status, insight_type: Optional equality filters; a falsy
            value disables the filter.

    Returns:
        Dict with ``items`` (rows decoded via ``_load_content``), ``total``
        (rows matching the filters), the effective ``limit`` and ``offset``,
        and ``has_more``.
    """
    try:
        limit = min(100, max(1, int(limit or 50)))
    except (TypeError, ValueError, OverflowError):
        limit = 50
    try:
        offset = max(0, int(offset or 0))
    except (TypeError, ValueError, OverflowError):
        offset = 0
    # Column names are fixed literals; only the values come from the caller.
    filters = (("target_type", target_type), ("status", status), ("insight_type", insight_type))
    where = [f"{column}=?" for column, wanted in filters if wanted]
    params = [str(wanted) for _, wanted in filters if wanted]
    clause = ("WHERE " + " AND ".join(where)) if where else ""
    conn = get_conn()
    try:
        total = conn.execute(f"SELECT COUNT(*) FROM llm_insights {clause}", tuple(params)).fetchone()[0]
        rows = conn.execute(
            f"""
        SELECT * FROM llm_insights
        {clause}
        ORDER BY updated_at DESC, id DESC
        LIMIT ? OFFSET ?
        """,
            tuple(params + [limit, offset]),
        ).fetchall()
    finally:
        # Release the connection even when either query raises.
        conn.close()
    total = int(total or 0)
    return {
        "items": [_load_content(row) for row in rows],
        "total": total,
        "limit": limit,
        "offset": offset,
        "has_more": offset + len(rows) < total,
    }


def get_llm_insight_by_id(insight_id):
    """Return the insight row with this primary key, or None when absent.

    A falsy *insight_id* is looked up as 0. A value that ``int()`` cannot
    convert raises the same ``ValueError``/``TypeError`` as before, but now
    before any connection is opened, so nothing is leaked.
    """
    row_id = int(insight_id or 0)
    conn = get_conn()
    try:
        row = conn.execute("SELECT * FROM llm_insights WHERE id=?", (row_id,)).fetchone()
    finally:
        # Release the connection even when the query raises.
        conn.close()
    return _load_content(row) if row else None


# Public API of this module. Underscore-prefixed helpers (_looks_mojibake,
# _repair_mixed_mojibake_text, _load_content) are intentionally not exported.
__all__ = [
    "compute_input_hash",
    "get_any_insight",
    "get_cached_insight",
    "get_insights_for_targets",
    "get_latest_insight_by_type",
    "get_llm_insight_by_id",
    "list_llm_insights",
    "repair_mojibake_json",
    "repair_mojibake_text",
    "upsert_insight",
]