alphax/app/db/llm_insights.py
2026-05-16 14:52:10 +08:00

265 lines
8.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Cached LLM insight storage helpers."""
import hashlib
import json
from datetime import datetime
from app.db.schema import get_conn
# Substrings treated as signs of mojibake; `_looks_mojibake` reports a string
# as suspicious when it contains any one of them.
_MOJIBAKE_MARKERS = ("Ã", "Â", "ç", "è", "é", "å", "æ", "ä", "ï¼", "ã")
def _looks_mojibake(value):
text = str(value or "")
if not text:
return False
return any(marker in text for marker in _MOJIBAKE_MARKERS)
def repair_mojibake_text(value):
    """Repair common UTF-8-as-latin1 mojibake from model/provider responses.

    Args:
        value: Any object. Non-strings, and strings without a known mojibake
            marker, are returned unchanged.

    Returns:
        The repaired string when the latin1 -> UTF-8 round trip succeeds and
        the result contains CJK text or one of the accepted punctuation
        characters; the result of the mixed-text fallback when the round trip
        cannot be performed; otherwise ``value`` itself.
    """
    if not isinstance(value, str) or not _looks_mojibake(value):
        return value
    try:
        repaired = value.encode("latin1").decode("utf-8")
    except UnicodeError:
        # encode/decode only fail with UnicodeEncodeError/UnicodeDecodeError,
        # both subclasses of UnicodeError. The string is not pure mojibake, so
        # fall back to repairing only the part after a separator; the fallback
        # returns the original value when it changes nothing.
        return _repair_mixed_mojibake_text(value)
    # Only accept the repair when it produces visible CJK text or common CJK punctuation.
    # NOTE(review): several characters in the punctuation set below are plain
    # ASCII; if full-width forms were intended, confirm against the original
    # file encoding.
    has_cjk = any("\u4e00" <= ch <= "\u9fff" for ch in repaired)
    if has_cjk or any(ch in repaired for ch in ",。;:!?()《》"):
        return repaired
    return value
def _repair_mixed_mojibake_text(value):
"""Repair strings that contain normal CJK text plus mojibake fragments."""
text = str(value or "")
separators = (": ", "", " - ", "", "")
for sep in separators:
if sep not in text:
continue
left, right = text.split(sep, 1)
fixed_right = repair_mojibake_text(right)
if fixed_right != right:
return left + sep + fixed_right
return value
def repair_mojibake_json(value):
    """Recursively repair mojibake in every string of a JSON-like structure.

    Dict keys and values, list items and bare strings are repaired; any other
    value is returned as-is.
    """
    if isinstance(value, str):
        return repair_mojibake_text(value)
    if isinstance(value, list):
        return [repair_mojibake_json(element) for element in value]
    if isinstance(value, dict):
        fixed = {}
        for key, inner in value.items():
            new_key = repair_mojibake_text(key) if isinstance(key, str) else key
            fixed[new_key] = repair_mojibake_json(inner)
        return fixed
    return value
def compute_input_hash(payload):
    """Return a stable SHA-256 hex digest for a structured LLM input payload.

    The payload is serialised as compact JSON with sorted keys, so key order
    does not affect the digest. A falsy payload hashes like an empty dict, and
    values JSON cannot encode are stringified with ``str``.
    """
    canonical = json.dumps(
        payload or {},
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ":"),
        default=str,
    )
    digest = hashlib.sha256()
    digest.update(canonical.encode("utf-8"))
    return digest.hexdigest()
def _load_content(row):
item = dict(row)
try:
item["content"] = repair_mojibake_json(json.loads(item.get("content_json") or "{}"))
except Exception:
item["content"] = {}
try:
item["input"] = repair_mojibake_json(json.loads(item.get("input_json") or "{}"))
except Exception:
item["input"] = {}
return item
def get_cached_insight(target_type, target_id, insight_type, input_hash=None, success_only=True):
    """Return the most recently updated insight for a target, or None.

    Args:
        target_type: Target kind; compared as a string.
        target_id: Target identifier; compared as a string.
        insight_type: Insight kind; compared as a string.
        input_hash: When truthy, only rows with this input hash match.
        success_only: When true, only rows with ``status='success'`` match.

    Returns:
        The matching row as a dict decoded by ``_load_content``, or None when
        no row matches.
    """
    # The WHERE clause is assembled from fixed fragments only; every
    # caller-supplied value is passed as a bound parameter.
    where = "target_type=%s AND target_id=%s AND insight_type=%s"
    params = [str(target_type), str(target_id), str(insight_type)]
    if input_hash:
        where += " AND input_hash=%s"
        params.append(str(input_hash))
    if success_only:
        where += " AND status='success'"
    conn = get_conn()
    try:
        row = conn.execute(
            f"""
            SELECT * FROM llm_insights
            WHERE {where}
            ORDER BY updated_at DESC, id DESC
            LIMIT 1
            """,
            tuple(params),
        ).fetchone()
    finally:
        # Close the connection even when the query raises.
        conn.close()
    return _load_content(row) if row else None
def get_any_insight(target_type, target_id, insight_type, input_hash):
    """Return the latest insight for this exact input hash, whatever its status."""
    return get_cached_insight(
        target_type,
        target_id,
        insight_type,
        input_hash=input_hash,
        success_only=False,
    )
def upsert_insight(
    target_type,
    target_id,
    insight_type,
    prompt_version,
    input_hash,
    status,
    input_payload=None,
    content=None,
    error="",
    model="",
):
    """Insert or update the insight row for a target/type/input-hash key.

    A row with the same ``(target_type, target_id, insight_type, input_hash)``
    is updated in place; its ``created_at`` is left untouched while
    ``updated_at`` is refreshed.

    Args:
        target_type, target_id, insight_type, input_hash: Key columns, stored
            as strings.
        prompt_version: Prompt version label, stored as a string.
        status: Status label, stored as a string.
        input_payload: JSON-serialisable input; falsy values are stored as ``{}``.
        content: JSON-serialisable result; falsy values are stored as ``{}``.
        error: Error text, truncated to 2000 characters.
        model: Model name.

    Returns:
        The stored row as a dict decoded by ``_load_content``, or None if it
        cannot be read back.
    """
    now = datetime.now().isoformat()
    # Strings are repaired before serialising so the stored JSON is clean.
    input_json = json.dumps(repair_mojibake_json(input_payload or {}), ensure_ascii=False, default=str)
    content_json = json.dumps(repair_mojibake_json(content or {}), ensure_ascii=False, default=str)
    key = (str(target_type), str(target_id), str(insight_type), str(input_hash))
    conn = get_conn()
    try:
        conn.execute(
            """
            INSERT INTO llm_insights (
                target_type, target_id, insight_type, prompt_version, input_hash,
                status, input_json, content_json, error, model, created_at, updated_at
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT(target_type, target_id, insight_type, input_hash) DO UPDATE SET
                prompt_version=excluded.prompt_version,
                status=excluded.status,
                input_json=excluded.input_json,
                content_json=excluded.content_json,
                error=excluded.error,
                model=excluded.model,
                updated_at=excluded.updated_at
            """,
            (
                key[0],
                key[1],
                key[2],
                str(prompt_version),
                key[3],
                str(status),
                input_json,
                content_json,
                str(error or "")[:2000],
                str(model or ""),
                now,
                now,
            ),
        )
        conn.commit()
        row = conn.execute(
            """
            SELECT * FROM llm_insights
            WHERE target_type=%s AND target_id=%s AND insight_type=%s AND input_hash=%s
            """,
            key,
        ).fetchone()
    finally:
        # Close the connection even when the write or the read-back raises.
        conn.close()
    return _load_content(row) if row else None
def get_insights_for_targets(target_type, target_ids, insight_type):
    """Return the latest successful insight for each of several targets.

    Args:
        target_type: Target kind; compared as a string.
        target_ids: Iterable of target ids; falsy and blank entries are
            dropped. May be None.
        insight_type: Insight kind; compared as a string.

    Returns:
        A dict mapping each target id (as a string) to its most recently
        updated successful row, decoded by ``_load_content``. Targets without
        a matching row are absent; an empty id list yields ``{}`` without
        touching the database.
    """
    ids = [str(x) for x in (target_ids or []) if str(x or "").strip()]
    if not ids:
        return {}
    # Only "%s" placeholders are interpolated; the ids are bound parameters.
    placeholders = ",".join(["%s"] * len(ids))
    conn = get_conn()
    try:
        rows = conn.execute(
            f"""
            SELECT * FROM llm_insights
            WHERE target_type=%s AND insight_type=%s AND status='success'
              AND target_id IN ({placeholders})
            ORDER BY updated_at DESC, id DESC
            """,
            tuple([str(target_type), str(insight_type)] + ids),
        ).fetchall()
    finally:
        # Close the connection even when the query raises.
        conn.close()
    result = {}
    for row in rows:
        item = _load_content(row)
        # Rows arrive newest first, so the first row seen per target wins.
        result.setdefault(str(item.get("target_id")), item)
    return result
def get_latest_insight_by_type(target_type, insight_type, success_only=True):
    """Return the most recently updated insight of a type across all targets.

    Args:
        target_type: Target kind; compared as a string.
        insight_type: Insight kind; compared as a string.
        success_only: When true, only rows with ``status='success'`` match.

    Returns:
        The row as a dict decoded by ``_load_content``, or None when no row
        matches.
    """
    # The optional clause is a fixed fragment, never caller-supplied text.
    status_clause = "AND status='success'" if success_only else ""
    conn = get_conn()
    try:
        row = conn.execute(
            f"""
            SELECT * FROM llm_insights
            WHERE target_type=%s AND insight_type=%s {status_clause}
            ORDER BY updated_at DESC, id DESC
            LIMIT 1
            """,
            (str(target_type), str(insight_type)),
        ).fetchone()
    finally:
        # Close the connection even when the query raises.
        conn.close()
    return _load_content(row) if row else None
def list_llm_insights(limit=50, offset=0, target_type="", status="", insight_type=""):
    """Return one page of insight rows, newest first, with paging metadata.

    Args:
        limit: Page size, clamped to 1..100; falsy or non-numeric values fall
            back to 50.
        offset: Rows to skip, clamped to >= 0; non-numeric values fall back to 0.
        target_type: Optional exact-match filter; ignored when falsy.
        status: Optional exact-match filter; ignored when falsy.
        insight_type: Optional exact-match filter; ignored when falsy.

    Returns:
        A dict with ``items`` (rows decoded by ``_load_content``), ``total``
        (count of rows matching the filters), the effective ``limit`` and
        ``offset``, and ``has_more``.
    """
    # int() signals bad input with TypeError/ValueError, and OverflowError for
    # infinite floats; anything else should not be silently swallowed.
    try:
        limit = min(100, max(1, int(limit or 50)))
    except (TypeError, ValueError, OverflowError):
        limit = 50
    try:
        offset = max(0, int(offset or 0))
    except (TypeError, ValueError, OverflowError):
        offset = 0
    # Column names are fixed literals; only the values are bound parameters.
    filters = (
        ("target_type", target_type),
        ("status", status),
        ("insight_type", insight_type),
    )
    where = [f"{column}=%s" for column, wanted in filters if wanted]
    params = [str(wanted) for _, wanted in filters if wanted]
    clause = ("WHERE " + " AND ".join(where)) if where else ""
    conn = get_conn()
    try:
        total = conn.execute(f"SELECT COUNT(*) FROM llm_insights {clause}", tuple(params)).fetchone()[0]
        rows = conn.execute(
            f"""
            SELECT * FROM llm_insights
            {clause}
            ORDER BY updated_at DESC, id DESC
            LIMIT %s OFFSET %s
            """,
            tuple(params + [limit, offset]),
        ).fetchall()
    finally:
        # Close the connection even when one of the queries raises.
        conn.close()
    total = int(total or 0)
    return {
        "items": [_load_content(row) for row in rows],
        "total": total,
        "limit": limit,
        "offset": offset,
        "has_more": offset + len(rows) < total,
    }
def get_llm_insight_by_id(insight_id):
    """Return the insight row with the given primary key, or None.

    Args:
        insight_id: Row id; converted with ``int``, falsy values become 0.

    Returns:
        The row as a dict decoded by ``_load_content``, or None when no row
        has that id.

    Raises:
        ValueError, TypeError: If ``insight_id`` cannot be converted to int.
    """
    conn = get_conn()
    try:
        row = conn.execute(
            "SELECT * FROM llm_insights WHERE id=%s",
            (int(insight_id or 0),),
        ).fetchone()
    finally:
        # Close the connection even when the id conversion or query raises.
        conn.close()
    return _load_content(row) if row else None
# Public API of this module; underscore-prefixed helpers are left out.
__all__ = [
"compute_input_hash",
"get_any_insight",
"get_cached_insight",
"get_insights_for_targets",
"get_latest_insight_by_type",
"get_llm_insight_by_id",
"list_llm_insights",
"repair_mojibake_json",
"repair_mojibake_text",
"upsert_insight",
]