"""Cached LLM insight storage helpers.

The module has two parts:

* text helpers that undo UTF-8-read-as-latin-1 mojibake in strings and
  JSON-like structures, plus a stable hash for input payloads;
* read/write helpers for the ``llm_insights`` table, all of which obtain a
  connection from ``get_conn()`` and close it before returning, including
  when a statement raises.
"""

import hashlib
import json
from datetime import datetime

from app.db.schema import get_conn

# Substrings whose presence makes a string a candidate for mojibake repair.
_MOJIBAKE_MARKERS = ("Ã", "Â", "ç", "è", "é", "å", "æ", "ä", "ï¼", "ã")

# A successful latin-1 -> UTF-8 round trip is only accepted when the result
# contains a CJK ideograph (U+4E00..U+9FFF) or one of these characters.
# NOTE(review): several of these are ASCII punctuation marks; confirm whether
# the full-width forms were intended here.
_REPAIR_EVIDENCE_PUNCTUATION = ",。;:!?()《》"

# Separators tried, in this order, when only the tail of a string is garbled.
_MIXED_TEXT_SEPARATORS = (": ", ":", " - ", ",", "。")


def _looks_mojibake(value):
    """Return True when ``str(value or "")`` contains any mojibake marker."""
    text = str(value or "")
    return any(marker in text for marker in _MOJIBAKE_MARKERS)


def repair_mojibake_text(value):
    """Repair common UTF-8-as-latin1 mojibake from model/provider responses.

    Non-strings and strings without any marker are returned unchanged.
    Otherwise the whole string is re-encoded as latin-1 and decoded as UTF-8;
    the result is kept only if it contains a CJK ideograph or one of the
    characters in ``_REPAIR_EVIDENCE_PUNCTUATION``.  If that round trip fails,
    the string is split on known separators and the repair is retried on the
    part to the right of the separator.

    Returns:
        The repaired string, or ``value`` itself when no repair is accepted.
    """
    return _repair_text(value, set())


def _repair_text(value, unrepairable_lengths):
    """Implement ``repair_mojibake_text`` with a per-call memo.

    ``unrepairable_lengths`` holds the lengths of suffixes already proven
    unrepairable.  Every string reached recursively is a suffix of the same
    top-level string, so its length identifies it uniquely.  Without the memo
    an unrepairable text containing several separator kinds is re-examined
    along every separator path, which grows exponentially.
    """
    if not isinstance(value, str) or not _looks_mojibake(value):
        return value
    if len(value) in unrepairable_lengths:
        return value
    try:
        repaired = value.encode("latin1").decode("utf-8")
    except UnicodeError:
        # Either the text has characters outside latin-1 (encode fails) or the
        # bytes are not valid UTF-8 (decode fails): try repairing only a tail.
        result = _repair_mixed_mojibake_text(value, unrepairable_lengths)
    else:
        if any("\u4e00" <= ch <= "\u9fff" for ch in repaired) or any(
            ch in repaired for ch in _REPAIR_EVIDENCE_PUNCTUATION
        ):
            result = repaired
        else:
            result = value
    if result == value:
        unrepairable_lengths.add(len(value))
    return result


def _repair_mixed_mojibake_text(value, _unrepairable_lengths=None):
    """Repair strings that contain normal CJK text plus mojibake fragments.

    For each separator in ``_MIXED_TEXT_SEPARATORS`` that occurs in the text,
    the text is split at the first occurrence and the right-hand part is
    repaired.  The first separator whose right-hand part changes wins.

    Args:
        value: The text to repair.
        _unrepairable_lengths: Internal memo shared with ``_repair_text``;
            callers outside this module should leave it unset.

    Returns:
        The left part, separator and repaired right part joined together, or
        ``value`` unchanged when no split yields a repair.
    """
    memo = set() if _unrepairable_lengths is None else _unrepairable_lengths
    text = str(value or "")
    for sep in _MIXED_TEXT_SEPARATORS:
        left, found, right = text.partition(sep)
        if not found:
            continue
        fixed_right = _repair_text(right, memo)
        if fixed_right != right:
            return left + sep + fixed_right
    return value


def repair_mojibake_json(value):
    """Recursively apply ``repair_mojibake_text`` to a JSON-like structure.

    Dict keys (when they are strings) and values, list items and bare strings
    are repaired; every other value is returned as is.  Dicts and lists are
    rebuilt rather than modified in place.
    """
    if isinstance(value, dict):
        return {
            (repair_mojibake_text(key) if isinstance(key, str) else key): repair_mojibake_json(item)
            for key, item in value.items()
        }
    if isinstance(value, list):
        return [repair_mojibake_json(item) for item in value]
    if isinstance(value, str):
        return repair_mojibake_text(value)
    return value


def compute_input_hash(payload):
    """Stable hash for structured LLM input payloads.

    The payload (or ``{}`` when it is falsy) is serialized as compact JSON
    with sorted keys, non-JSON values stringified via ``str``, and the UTF-8
    bytes are hashed with SHA-256.

    Returns:
        The hex digest as a string.
    """
    raw = json.dumps(
        payload or {},
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ":"),
        default=str,
    )
    return hashlib.sha256(raw.encode("utf-8")).hexdigest()


def _decode_json_column(raw):
    """Parse one stored JSON column, returning ``{}`` on any failure.

    Decoding is deliberately best-effort so that one malformed row cannot
    break a whole listing.
    """
    try:
        return repair_mojibake_json(json.loads(raw or "{}"))
    except Exception:
        return {}


def _load_content(row):
    """Convert a result row to a dict and attach the decoded JSON columns.

    Adds ``content`` (from ``content_json``) and ``input`` (from
    ``input_json``) to a dict copy of ``row``.
    """
    item = dict(row)
    item["content"] = _decode_json_column(item.get("content_json"))
    item["input"] = _decode_json_column(item.get("input_json"))
    return item


def _fetch_one(sql, params):
    """Run one statement on a fresh connection and return its first row.

    The connection is closed even when the statement raises.
    """
    conn = get_conn()
    try:
        return conn.execute(sql, params).fetchone()
    finally:
        conn.close()


def _fetch_all(sql, params):
    """Run one statement on a fresh connection and return all of its rows.

    The connection is closed even when the statement raises.
    """
    conn = get_conn()
    try:
        return conn.execute(sql, params).fetchall()
    finally:
        conn.close()


def get_cached_insight(target_type, target_id, insight_type, input_hash=None, success_only=True):
    """Return the most recently updated insight for a target, or None.

    Args:
        target_type, target_id, insight_type: Identify the insight; each is
            converted with ``str`` before being bound.
        input_hash: When truthy, only rows with this input hash match.
        success_only: When true, only rows with ``status='success'`` match.

    Returns:
        The row as a dict with decoded ``content`` and ``input`` entries, or
        None when nothing matches.
    """
    conditions = ["target_type=%s", "target_id=%s", "insight_type=%s"]
    params = [str(target_type), str(target_id), str(insight_type)]
    if input_hash:
        conditions.append("input_hash=%s")
        params.append(str(input_hash))
    if success_only:
        conditions.append("status='success'")
    # Only fixed fragments are interpolated; every value is a bound parameter.
    where = " AND ".join(conditions)
    row = _fetch_one(
        f"""
        SELECT * FROM llm_insights
        WHERE {where}
        ORDER BY updated_at DESC, id DESC
        LIMIT 1
        """,
        tuple(params),
    )
    return _load_content(row) if row else None


def get_any_insight(target_type, target_id, insight_type, input_hash):
    """Like ``get_cached_insight`` but without the ``status='success'`` filter."""
    return get_cached_insight(
        target_type,
        target_id,
        insight_type,
        input_hash=input_hash,
        success_only=False,
    )


def upsert_insight(
    target_type,
    target_id,
    insight_type,
    prompt_version,
    input_hash,
    status,
    input_payload=None,
    content=None,
    error="",
    model="",
):
    """Insert or update the insight row for one target/type/input hash.

    The row is keyed by ``(target_type, target_id, insight_type,
    input_hash)``.  On conflict every column except the key and
    ``created_at`` is overwritten.  ``input_payload`` and ``content`` are
    passed through ``repair_mojibake_json`` and stored as JSON; ``error`` is
    truncated to 2000 characters.

    Returns:
        The stored row as loaded by ``_load_content``, or None if it cannot
        be read back.
    """
    now = datetime.now().isoformat()
    input_json = json.dumps(repair_mojibake_json(input_payload or {}), ensure_ascii=False, default=str)
    content_json = json.dumps(repair_mojibake_json(content or {}), ensure_ascii=False, default=str)
    key = (str(target_type), str(target_id), str(insight_type), str(input_hash))

    conn = get_conn()
    try:
        conn.execute(
            """
            INSERT INTO llm_insights (
                target_type, target_id, insight_type, prompt_version, input_hash,
                status, input_json, content_json, error, model, created_at, updated_at
            )
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT(target_type, target_id, insight_type, input_hash)
            DO UPDATE SET
                prompt_version=excluded.prompt_version,
                status=excluded.status,
                input_json=excluded.input_json,
                content_json=excluded.content_json,
                error=excluded.error,
                model=excluded.model,
                updated_at=excluded.updated_at
            """,
            (
                key[0],
                key[1],
                key[2],
                str(prompt_version),
                key[3],
                str(status),
                input_json,
                content_json,
                str(error or "")[:2000],
                str(model or ""),
                now,
                now,
            ),
        )
        conn.commit()
        row = conn.execute(
            """
            SELECT * FROM llm_insights
            WHERE target_type=%s AND target_id=%s AND insight_type=%s AND input_hash=%s
            """,
            key,
        ).fetchone()
    finally:
        # Close on every path so a failing statement cannot leak the connection.
        conn.close()
    return _load_content(row) if row else None


def get_insights_for_targets(target_type, target_ids, insight_type):
    """Return the newest successful insight for each of several targets.

    Args:
        target_type, insight_type: Converted with ``str`` before binding.
        target_ids: Iterable of ids.  Falsy entries (including ``0``) and
            entries that are blank after ``str`` are dropped.

    Returns:
        A dict mapping ``str(target_id)`` to the loaded row with the latest
        ``updated_at`` (ties broken by highest ``id``).  Targets without a
        successful insight are absent; an empty id list yields ``{}``
        without touching the database.
    """
    ids = [str(x) for x in (target_ids or []) if str(x or "").strip()]
    if not ids:
        return {}
    placeholders = ",".join(["%s"] * len(ids))
    rows = _fetch_all(
        f"""
        SELECT * FROM llm_insights
        WHERE target_type=%s AND insight_type=%s AND status='success'
          AND target_id IN ({placeholders})
        ORDER BY updated_at DESC, id DESC
        """,
        tuple([str(target_type), str(insight_type)] + ids),
    )
    result = {}
    for row in rows:
        item = _load_content(row)
        # Rows arrive newest first, so the first row seen per target wins.
        result.setdefault(str(item.get("target_id")), item)
    return result


def get_latest_insight_by_type(target_type, insight_type, success_only=True):
    """Return the most recently updated insight of a type across all targets.

    Returns:
        The loaded row, or None when nothing matches.  With
        ``success_only`` true only rows with ``status='success'`` match.
    """
    status_clause = "AND status='success'" if success_only else ""
    row = _fetch_one(
        f"""
        SELECT * FROM llm_insights
        WHERE target_type=%s AND insight_type=%s {status_clause}
        ORDER BY updated_at DESC, id DESC
        LIMIT 1
        """,
        (str(target_type), str(insight_type)),
    )
    return _load_content(row) if row else None


def list_llm_insights(limit=50, offset=0, target_type="", status="", insight_type=""):
    """Return one page of insights, newest first, with paging metadata.

    Args:
        limit: Page size; clamped to 1..100, and 50 when falsy or not
            convertible to int.
        offset: Rows to skip; clamped to >= 0, and 0 when falsy or not
            convertible to int.
        target_type, status, insight_type: Optional equality filters; an
            empty value disables the filter.

    Returns:
        A dict with ``items`` (loaded rows), ``total`` (matching row count),
        the effective ``limit`` and ``offset``, and ``has_more``.
    """
    try:
        limit = min(100, max(1, int(limit or 50)))
    except (TypeError, ValueError, OverflowError):
        limit = 50
    try:
        offset = max(0, int(offset or 0))
    except (TypeError, ValueError, OverflowError):
        offset = 0

    where = []
    params = []
    for column, wanted in (
        ("target_type", target_type),
        ("status", status),
        ("insight_type", insight_type),
    ):
        if wanted:
            where.append(f"{column}=%s")
            params.append(str(wanted))
    clause = ("WHERE " + " AND ".join(where)) if where else ""

    conn = get_conn()
    try:
        total = conn.execute(
            f"SELECT COUNT(*) FROM llm_insights {clause}",
            tuple(params),
        ).fetchone()[0]
        rows = conn.execute(
            f"""
            SELECT * FROM llm_insights
            {clause}
            ORDER BY updated_at DESC, id DESC
            LIMIT %s OFFSET %s
            """,
            tuple(params + [limit, offset]),
        ).fetchall()
    finally:
        # Close on every path so a failing statement cannot leak the connection.
        conn.close()

    total_count = int(total or 0)
    return {
        "items": [_load_content(row) for row in rows],
        "total": total_count,
        "limit": limit,
        "offset": offset,
        "has_more": offset + len(rows) < total_count,
    }


def get_llm_insight_by_id(insight_id):
    """Return the insight with the given primary key, or None.

    A falsy ``insight_id`` is looked up as id 0.

    Raises:
        ValueError, TypeError: If ``insight_id`` cannot be converted by
            ``int``.  The conversion happens before a connection is opened.
    """
    row_id = int(insight_id or 0)
    row = _fetch_one("SELECT * FROM llm_insights WHERE id=%s", (row_id,))
    return _load_content(row) if row else None


__all__ = [
    "compute_input_hash",
    "get_any_insight",
    "get_cached_insight",
    "get_insights_for_targets",
    "get_latest_insight_by_type",
    "get_llm_insight_by_id",
    "list_llm_insights",
    "repair_mojibake_json",
    "repair_mojibake_text",
    "upsert_insight",
]