265 lines
8.0 KiB
Python
265 lines
8.0 KiB
Python
"""Cached LLM insight storage helpers."""
|
||
|
||
import hashlib
|
||
import json
|
||
from datetime import datetime
|
||
|
||
from app.db.schema import get_conn
|
||
|
||
|
||
# Substrings that commonly appear when UTF-8 bytes were decoded as Latin-1.
_MOJIBAKE_MARKERS = ("Ã", "Â", "ç", "è", "é", "å", "æ", "ä", "ï¼", "ã")

# Separators after which a mojibake fragment is searched for, in priority order.
_MIXED_SEPARATORS = (": ", ":", " - ", ",", "。")

# A repaired string is only trusted when it contains a CJK ideograph or one of
# these punctuation characters.
_REPAIR_PUNCTUATION = ",。;:!?()《》"


def _looks_mojibake(value):
    """Return True when ``str(value or "")`` contains a known mojibake marker."""
    text = str(value or "")
    return any(marker in text for marker in _MOJIBAKE_MARKERS)


def _decode_latin1_as_utf8(text):
    """Re-encode ``text`` as Latin-1 and decode the bytes as UTF-8.

    Returns:
        The re-decoded string, or ``None`` when ``text`` is not Latin-1
        encodable or the resulting bytes are not valid UTF-8.
    """
    try:
        return text.encode("latin1").decode("utf-8")
    except UnicodeError:
        return None


def _is_trusted_repair(repaired):
    """Return True when ``repaired`` has a CJK ideograph or trusted punctuation."""
    return any("\u4e00" <= ch <= "\u9fff" for ch in repaired) or any(
        ch in repaired for ch in _REPAIR_PUNCTUATION
    )


def repair_mojibake_text(value):
    """Repair common UTF-8-as-latin1 mojibake from model/provider responses.

    Args:
        value: Any object. Non-strings and strings without a mojibake marker
            are returned unchanged.

    Returns:
        The repaired string when the Latin-1 -> UTF-8 round trip succeeds and
        yields CJK text or trusted punctuation; the original ``value`` when the
        round trip succeeds but the result is not trusted. When the round trip
        fails on the whole string, only a repairable tail following a
        separator is repaired (see ``_repair_mixed_mojibake_text``).
    """
    if not isinstance(value, str) or not _looks_mojibake(value):
        return value
    repaired = _decode_latin1_as_utf8(value)
    if repaired is None:
        # The whole string cannot be round-tripped; try a mojibake tail only.
        return _repair_mixed_mojibake_text(value)
    return repaired if _is_trusted_repair(repaired) else value


def _find_mixed_repair_cut(text):
    """Return the offset at which the repairable tail of ``text`` begins.

    A tail may only start right after a separator occurrence. For a given
    start offset the tail is tried as a whole first; when that round trip
    fails, the search continues behind the first occurrence of each separator
    (in ``_MIXED_SEPARATORS`` order) inside that tail. A tail that round-trips
    but is not trusted ends the search along that path.

    The offsets are resolved right-to-left with one result cached per offset,
    so the work is bounded by the number of separator occurrences and no
    recursion is involved.

    Returns:
        The cut offset, or ``None`` when no tail can be repaired.
    """
    occurrences = {}
    for sep in _MIXED_SEPARATORS:
        starts = []
        index = text.find(sep)
        while index != -1:
            starts.append(index)
            index = text.find(sep, index + 1)  # +1 keeps overlapping matches
        occurrences[sep] = starts

    # Every offset directly behind a separator is a potential tail start.
    candidates = sorted(
        {start + len(sep) for sep, starts in occurrences.items() for start in starts},
        reverse=True,
    )
    if not candidates:
        return None

    # A tail starting after the last marker cannot look like mojibake.
    marker_limit = max(text.rfind(marker) for marker in _MOJIBAKE_MARKERS)
    # A tail that still contains a character above U+00FF cannot be encoded as
    # Latin-1, so the round trip is known to fail without attempting it.
    last_wide = next(
        (i for i in range(len(text) - 1, -1, -1) if text[i] > "\xff"), -1
    )

    # cursor[sep] indexes the first occurrence of ``sep`` at or after the
    # offset currently being resolved; it only moves left as offsets shrink.
    cursor = {sep: len(starts) for sep, starts in occurrences.items()}
    cut_at = {}

    def first_repaired_child(pos):
        for sep in _MIXED_SEPARATORS:
            starts = occurrences[sep]
            k = cursor[sep]
            while k > 0 and starts[k - 1] >= pos:
                k -= 1
            cursor[sep] = k
            if k < len(starts):
                cut = cut_at.get(starts[k] + len(sep))
                if cut is not None:
                    return cut
        return None

    for pos in candidates:
        cut = None
        if pos <= marker_limit:
            repaired = _decode_latin1_as_utf8(text[pos:]) if pos > last_wide else None
            if repaired is None:
                cut = first_repaired_child(pos)
            elif _is_trusted_repair(repaired):
                cut = pos
        cut_at[pos] = cut
    return first_repaired_child(0)


def _repair_mixed_mojibake_text(value):
    """Repair strings that contain normal CJK text plus mojibake fragments.

    Only a tail of the string that starts right after a separator is repaired;
    everything before the cut is kept verbatim.

    Returns:
        The partially repaired string, or ``value`` itself when no tail can be
        repaired.
    """
    text = str(value or "")
    cut = _find_mixed_repair_cut(text)
    if cut is None:
        return value
    return text[:cut] + _decode_latin1_as_utf8(text[cut:])
|
||
|
||
|
||
def repair_mojibake_json(value):
    """Recursively run ``repair_mojibake_text`` over a JSON-like structure.

    Strings are repaired, lists are rebuilt element by element, and dicts are
    rebuilt with both their string keys and their values repaired. Any other
    value (numbers, ``None``, tuples, ...) is returned untouched.
    """
    if isinstance(value, str):
        return repair_mojibake_text(value)
    if isinstance(value, list):
        return list(map(repair_mojibake_json, value))
    if isinstance(value, dict):
        cleaned = {}
        for key, item in value.items():
            fixed_key = repair_mojibake_text(key) if isinstance(key, str) else key
            cleaned[fixed_key] = repair_mojibake_json(item)
        return cleaned
    return value
|
||
|
||
|
||
def compute_input_hash(payload):
    """Return the SHA-256 hex digest of a canonical JSON dump of ``payload``.

    Falsy payloads are hashed as ``{}``. Keys are sorted and compact
    separators are used so that equal payloads always produce the same digest;
    values JSON cannot encode are stringified with ``str``.
    """
    canonical = json.dumps(
        payload if payload else {},
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ":"),
        default=str,
    )
    digest = hashlib.sha256()
    digest.update(canonical.encode("utf-8"))
    return digest.hexdigest()
|
||
|
||
|
||
def _load_content(row):
    """Convert a result row into a dict with decoded ``content`` and ``input``.

    ``content_json`` and ``input_json`` are parsed as JSON (a missing or empty
    column is treated as ``"{}"``) and passed through ``repair_mojibake_json``.
    The results are stored under ``content`` and ``input``; the raw columns
    stay in the returned dict.
    """
    record = dict(row)
    for target_key, source_key in (("content", "content_json"), ("input", "input_json")):
        try:
            decoded = json.loads(record.get(source_key) or "{}")
            record[target_key] = repair_mojibake_json(decoded)
        except Exception:
            # Best effort: an unreadable column must not break the whole row.
            record[target_key] = {}
    return record
|
||
|
||
|
||
def get_cached_insight(target_type, target_id, insight_type, input_hash=None, success_only=True):
    """Return the most recently updated insight row for a target, or ``None``.

    Args:
        target_type: Target kind; compared as ``str(target_type)``.
        target_id: Target identifier; compared as ``str(target_id)``.
        insight_type: Insight kind; compared as ``str(insight_type)``.
        input_hash: When truthy, only rows with this input hash match.
        success_only: When true, only rows with ``status='success'`` match.

    Returns:
        The newest matching row (ordered by ``updated_at`` then ``id``,
        descending) as processed by ``_load_content``, or ``None`` when
        nothing matches.
    """
    where = "target_type=? AND target_id=? AND insight_type=?"
    params = [str(target_type), str(target_id), str(insight_type)]
    if input_hash:
        where += " AND input_hash=?"
        params.append(str(input_hash))
    if success_only:
        where += " AND status='success'"

    conn = get_conn()
    try:
        # Only fixed SQL fragments are interpolated; values stay parameterized.
        row = conn.execute(
            f"""
            SELECT * FROM llm_insights
            WHERE {where}
            ORDER BY updated_at DESC, id DESC
            LIMIT 1
            """,
            tuple(params),
        ).fetchone()
    finally:
        # Release the connection even when the query raises.
        conn.close()
    return _load_content(row) if row else None
|
||
|
||
|
||
def get_any_insight(target_type, target_id, insight_type, input_hash):
    """Return the newest insight for the target regardless of its status.

    Thin wrapper around ``get_cached_insight`` with ``success_only=False``.
    """
    return get_cached_insight(
        target_type,
        target_id,
        insight_type,
        input_hash=input_hash,
        success_only=False,
    )
|
||
|
||
|
||
def upsert_insight(
    target_type,
    target_id,
    insight_type,
    prompt_version,
    input_hash,
    status,
    input_payload=None,
    content=None,
    error="",
    model="",
):
    """Insert or update the insight row for a target/input combination.

    The row is keyed by ``(target_type, target_id, insight_type, input_hash)``.
    On conflict every column except the key and ``created_at`` is overwritten.

    Args:
        target_type, target_id, insight_type, input_hash: Row key; each is
            stored as ``str(...)``.
        prompt_version: Stored as ``str(prompt_version)``.
        status: Stored as ``str(status)``.
        input_payload: JSON-serializable input; falsy values are stored as
            ``{}``. Passed through ``repair_mojibake_json`` before dumping.
        content: JSON-serializable result; handled like ``input_payload``.
        error: Error text, truncated to 2000 characters.
        model: Model name; falsy values are stored as an empty string.

    Returns:
        The stored row as processed by ``_load_content``, or ``None`` if it
        cannot be read back.
    """
    now = datetime.now().isoformat()
    input_json = json.dumps(repair_mojibake_json(input_payload or {}), ensure_ascii=False, default=str)
    content_json = json.dumps(repair_mojibake_json(content or {}), ensure_ascii=False, default=str)
    key = (str(target_type), str(target_id), str(insight_type), str(input_hash))

    conn = get_conn()
    try:
        conn.execute(
            """
            INSERT INTO llm_insights (
                target_type, target_id, insight_type, prompt_version, input_hash,
                status, input_json, content_json, error, model, created_at, updated_at
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(target_type, target_id, insight_type, input_hash) DO UPDATE SET
                prompt_version=excluded.prompt_version,
                status=excluded.status,
                input_json=excluded.input_json,
                content_json=excluded.content_json,
                error=excluded.error,
                model=excluded.model,
                updated_at=excluded.updated_at
            """,
            (
                key[0],
                key[1],
                key[2],
                str(prompt_version),
                key[3],
                str(status),
                input_json,
                content_json,
                str(error or "")[:2000],
                str(model or ""),
                now,
                now,
            ),
        )
        conn.commit()
        row = conn.execute(
            """
            SELECT * FROM llm_insights
            WHERE target_type=? AND target_id=? AND insight_type=? AND input_hash=?
            """,
            key,
        ).fetchone()
    finally:
        # Release the connection even when the write or read-back raises.
        conn.close()
    return _load_content(row) if row else None
|
||
|
||
|
||
def get_insights_for_targets(target_type, target_ids, insight_type):
    """Return the newest successful insight for each of several targets.

    Args:
        target_type: Target kind; compared as ``str(target_type)``.
        target_ids: Iterable of target identifiers. Falsy or blank entries are
            skipped; the rest are compared as strings.
        insight_type: Insight kind; compared as ``str(insight_type)``.

    Returns:
        A dict mapping ``str(target_id)`` to its most recently updated
        ``status='success'`` row (as processed by ``_load_content``). Targets
        without such a row are absent. An empty dict is returned when no
        usable id was supplied.
    """
    ids = [str(x) for x in (target_ids or []) if str(x or "").strip()]
    if not ids:
        return {}
    placeholders = ",".join(["?"] * len(ids))

    conn = get_conn()
    try:
        # Only "?" placeholders are interpolated; ids stay parameterized.
        rows = conn.execute(
            f"""
            SELECT * FROM llm_insights
            WHERE target_type=? AND insight_type=? AND status='success'
              AND target_id IN ({placeholders})
            ORDER BY updated_at DESC, id DESC
            """,
            tuple([str(target_type), str(insight_type)] + ids),
        ).fetchall()
    finally:
        # Release the connection even when the query raises.
        conn.close()

    result = {}
    for row in rows:
        item = _load_content(row)
        # Rows arrive newest first, so the first row seen per target wins.
        result.setdefault(str(item.get("target_id")), item)
    return result
|
||
|
||
|
||
def get_latest_insight_by_type(target_type, insight_type, success_only=True):
    """Return the most recently updated insight of a given type, or ``None``.

    Args:
        target_type: Target kind; compared as ``str(target_type)``.
        insight_type: Insight kind; compared as ``str(insight_type)``.
        success_only: When true, only rows with ``status='success'`` match.

    Returns:
        The newest matching row (ordered by ``updated_at`` then ``id``,
        descending) as processed by ``_load_content``, or ``None``.
    """
    status_clause = "AND status='success'" if success_only else ""
    conn = get_conn()
    try:
        # Only a fixed SQL fragment is interpolated; values stay parameterized.
        row = conn.execute(
            f"""
            SELECT * FROM llm_insights
            WHERE target_type=? AND insight_type=? {status_clause}
            ORDER BY updated_at DESC, id DESC
            LIMIT 1
            """,
            (str(target_type), str(insight_type)),
        ).fetchone()
    finally:
        # Release the connection even when the query raises.
        conn.close()
    return _load_content(row) if row else None
|
||
|
||
|
||
def list_llm_insights(limit=50, offset=0, target_type="", status="", insight_type=""):
    """Return one page of insight rows, newest first, with paging metadata.

    Args:
        limit: Page size; coerced to an int and clamped to 1..100. Falsy or
            unparsable values fall back to 50.
        offset: Number of rows to skip; coerced to an int and floored at 0.
            Unparsable values fall back to 0.
        target_type: Optional exact-match filter (ignored when falsy).
        status: Optional exact-match filter (ignored when falsy).
        insight_type: Optional exact-match filter (ignored when falsy).

    Returns:
        A dict with ``items`` (rows processed by ``_load_content``), ``total``
        (count of all matching rows), the effective ``limit`` and ``offset``,
        and ``has_more`` (true when rows remain after this page).
    """
    try:
        limit = min(100, max(1, int(limit or 50)))
    except (TypeError, ValueError, OverflowError):
        limit = 50
    try:
        offset = max(0, int(offset or 0))
    except (TypeError, ValueError, OverflowError):
        offset = 0

    where = []
    params = []
    if target_type:
        where.append("target_type=?")
        params.append(str(target_type))
    if status:
        where.append("status=?")
        params.append(str(status))
    if insight_type:
        where.append("insight_type=?")
        params.append(str(insight_type))
    clause = ("WHERE " + " AND ".join(where)) if where else ""

    conn = get_conn()
    try:
        # Only fixed SQL fragments are interpolated; values stay parameterized.
        total = conn.execute(f"SELECT COUNT(*) FROM llm_insights {clause}", tuple(params)).fetchone()[0]
        rows = conn.execute(
            f"""
            SELECT * FROM llm_insights
            {clause}
            ORDER BY updated_at DESC, id DESC
            LIMIT ? OFFSET ?
            """,
            tuple(params + [limit, offset]),
        ).fetchall()
    finally:
        # Release the connection even when one of the queries raises.
        conn.close()

    return {
        "items": [_load_content(row) for row in rows],
        "total": int(total or 0),
        "limit": limit,
        "offset": offset,
        "has_more": offset + len(rows) < int(total or 0),
    }
|
||
|
||
|
||
def get_llm_insight_by_id(insight_id):
    """Return the insight row with the given primary key, or ``None``.

    Args:
        insight_id: Row id; coerced with ``int(insight_id or 0)``, so falsy
            values look up id 0.

    Returns:
        The row as processed by ``_load_content``, or ``None`` when no row has
        that id.

    Raises:
        ValueError, TypeError: When ``insight_id`` cannot be converted to int.
    """
    # Convert before opening the connection so a bad id cannot leak it.
    row_id = int(insight_id or 0)
    conn = get_conn()
    try:
        row = conn.execute("SELECT * FROM llm_insights WHERE id=?", (row_id,)).fetchone()
    finally:
        # Release the connection even when the query raises.
        conn.close()
    return _load_content(row) if row else None
|
||
|
||
|
||
# Public API of this module, kept in alphabetical order.
__all__ = [
    "compute_input_hash",
    "get_any_insight",
    "get_cached_insight",
    "get_insights_for_targets",
    "get_latest_insight_by_type",
    "get_llm_insight_by_id",
    "list_llm_insights",
    "repair_mojibake_json",
    "repair_mojibake_text",
    "upsert_insight",
]
|