astock-agent/backend/app/catalyst/service.py
2026-05-14 17:02:13 +08:00

302 lines
11 KiB
Python

"""催化事件存储与主题分数聚合。"""
from __future__ import annotations
import hashlib
import logging
from datetime import datetime, timedelta
from sqlalchemy import text
from app.catalyst.mapper import analyze_catalyst
from app.catalyst.models import CatalystAnalysis, CatalystInput, ThemeCatalystScore
from app.db import tables
from app.db.database import get_db
from app.news.models import NewsItem
logger = logging.getLogger(__name__)
async def ingest_catalyst(item: CatalystInput, use_llm: bool = True) -> CatalystAnalysis:
    """Analyze one catalyst input, persist the result, and return the analysis.

    Args:
        item: Raw catalyst input passed to ``analyze_catalyst``.
        use_llm: Forwarded unchanged to ``analyze_catalyst``.

    Returns:
        The analysis produced by ``analyze_catalyst``. The row id returned by
        ``save_catalyst`` is discarded here.
    """
    outcome = await analyze_catalyst(item, use_llm=use_llm)
    # Persist before returning; the stored row id is not needed by this entry point.
    await save_catalyst(outcome)
    return outcome
async def ingest_catalyst_with_id(item: CatalystInput, use_llm: bool = True) -> tuple[CatalystAnalysis, int]:
    """Analyze one catalyst input, persist it, and return the analysis with its row id.

    Args:
        item: Raw catalyst input passed to ``analyze_catalyst``.
        use_llm: Forwarded unchanged to ``analyze_catalyst``.

    Returns:
        A ``(analysis, catalyst_id)`` pair, where ``catalyst_id`` is whatever
        ``save_catalyst`` returned for the analysis.
    """
    outcome = await analyze_catalyst(item, use_llm=use_llm)
    stored_id = await save_catalyst(outcome)
    return (outcome, stored_id)
async def save_catalyst(analysis: CatalystAnalysis) -> int:
    """Persist a catalyst analysis plus its theme links and return the catalyst id.

    When ``analysis.url`` is truthy and ``catalysts`` already holds a row with
    the same ``source`` and ``url``, nothing is written and the id of the
    newest matching row is returned instead.

    Args:
        analysis: The analysis to store; its ``themes`` (if any) become rows
            in the theme/catalyst link table.

    Returns:
        The id of the existing or newly inserted ``catalysts`` row.
    """
    async with get_db() as session:
        # Dedup on (source, url); analyses without a URL are always inserted.
        if analysis.url:
            lookup = await session.execute(
                text(
                    "SELECT id FROM catalysts WHERE source = :source AND url = :url "
                    "ORDER BY id DESC LIMIT 1"
                ),
                {"source": analysis.source, "url": analysis.url},
            )
            found = lookup.fetchone()
            if found:
                return int(found._mapping["id"])

        insert_stmt = tables.catalysts_table.insert().values(
            title=analysis.title,
            summary=analysis.summary,
            source=analysis.source,
            url=analysis.url,
            published_at=analysis.published_at,
            catalyst_type=analysis.catalyst_type,
            strength=analysis.strength,
            freshness=analysis.freshness,
            confidence=analysis.confidence,
            raw_text=analysis.raw_text,
            llm_reason=analysis.llm_reason,
            is_active=True,
        )
        inserted = await session.execute(insert_stmt)
        new_id = int(inserted.inserted_primary_key[0])

        if analysis.themes:
            # One link row per theme, all pointing at the catalyst just inserted.
            link_rows = []
            for link in analysis.themes:
                link_rows.append(
                    {
                        "catalyst_id": new_id,
                        "theme_id": link.theme_id,
                        "theme_name": link.theme_name,
                        "relevance": link.relevance,
                        "reason": link.reason,
                    }
                )
            await session.execute(tables.theme_catalysts_table.insert(), link_rows)

        # Catalyst row and theme links are committed together.
        await session.commit()
    return new_id
async def ingest_news_items(items: list[NewsItem]) -> dict:
    """Save raw news items and analyze the newly inserted ones.

    Each item is keyed with ``build_news_dedup_key``; items whose key already
    exists in ``news_items`` are counted as duplicates and skipped. New rows
    are stored with status ``pending``. If anything was inserted,
    ``analyze_pending_news`` is then run with ``limit`` equal to the number of
    inserted rows.

    Args:
        items: News items to store.

    Returns:
        A dict with the counters ``fetched``, ``inserted``, ``duplicates``,
        ``analyzed`` and ``failed``.
    """
    new_rows = 0
    dupes = 0
    ok_count = 0
    error_count = 0

    lookup_sql = text("SELECT id FROM news_items WHERE dedup_key = :dedup_key LIMIT 1")
    async with get_db() as session:
        for news in items:
            key = build_news_dedup_key(news)
            hit = (await session.execute(lookup_sql, {"dedup_key": key})).fetchone()
            if hit:
                dupes += 1
                continue

            insert_stmt = tables.news_items_table.insert().values(
                title=news.title,
                content=news.content,
                summary=news.summary,
                source=news.source,
                url=news.url,
                published_at=news.published_at,
                dedup_key=key,
                status="pending",
            )
            await session.execute(insert_stmt)
            new_rows += 1
        await session.commit()

    if new_rows:
        # The analysis step selects pending rows itself; it is only told how many to take.
        outcome = await analyze_pending_news(limit=new_rows)
        ok_count += int(outcome.get("analyzed", 0))
        error_count += int(outcome.get("failed", 0))

    summary = {
        "fetched": len(items),
        "inserted": new_rows,
        "duplicates": dupes,
        "analyzed": ok_count,
        "failed": error_count,
    }
    return summary
async def analyze_pending_news(limit: int = 50, use_llm: bool = True) -> dict:
    """Run catalyst analysis over pending news items, newest first.

    Up to ``limit`` rows with status ``pending`` are loaded. Each row is
    either marked ``skipped`` (when ``_looks_market_relevant`` rejects it),
    analyzed via ``ingest_catalyst_with_id`` and marked ``analyzed``, or
    marked ``failed`` with the truncated error text when any step raises.

    Rows are processed independently: a failure on one row, including a
    failure to record its ``failed`` status, is logged and does not stop the
    remaining rows from being processed.

    Args:
        limit: Maximum number of pending rows to load.
        use_llm: Forwarded to ``ingest_catalyst_with_id``.

    Returns:
        A dict with the counters ``analyzed``, ``skipped`` and ``failed``.
    """
    async with get_db() as db:
        result = await db.execute(
            text(
                "SELECT * FROM news_items "
                "WHERE status = 'pending' "
                "ORDER BY COALESCE(published_at, created_at) DESC, id DESC "
                "LIMIT :limit"
            ),
            {"limit": limit},
        )
        # Copy rows into plain dicts so they stay usable after the session closes.
        rows = [dict(row._mapping) for row in result.fetchall()]

    analyzed = 0
    skipped = 0
    failed = 0
    for row in rows:
        news_id = row["id"]
        title = row.get("title") or ""
        # Fall back to the summary when the item has no body text.
        content = row.get("content") or row.get("summary") or ""
        try:
            if not _looks_market_relevant(title, content):
                await _mark_news_item(news_id, status="skipped", error="")
                skipped += 1
                continue
            _, catalyst_id = await ingest_catalyst_with_id(
                CatalystInput(
                    title=title,
                    content=content,
                    source=row.get("source") or "news",
                    url=row.get("url") or "",
                    published_at=row.get("published_at"),
                ),
                use_llm=use_llm,
            )
            await _mark_news_item(
                news_id,
                status="analyzed",
                catalyst_id=catalyst_id,
                error="",
            )
            analyzed += 1
        except Exception as e:
            failed += 1
            # exc_info keeps the traceback; the message alone rarely identifies the failing step.
            logger.warning(
                "新闻催化分析失败 id=%s title=%s error=%s",
                news_id,
                title,
                e,
                exc_info=True,
            )
            try:
                await _mark_news_item(news_id, status="failed", error=str(e)[:500])
            except Exception:
                # Recording the failure is best-effort: the row keeps its
                # current status and the rest of the batch still runs.
                logger.exception("新闻失败状态写入失败 id=%s", news_id)
    return {"analyzed": analyzed, "skipped": skipped, "failed": failed}
def build_news_dedup_key(item: NewsItem) -> str:
    """Return a stable SHA-256 hex key identifying a news item for deduplication.

    The key is derived from ``item.source`` and ``item.url`` (falling back to
    ``item.title`` when the URL is falsy). Before hashing, the text is
    stripped, lower-cased, and reduced to alphanumeric characters plus the
    punctuation ``.:/_-|``.

    Args:
        item: The news item to key.

    Returns:
        The 64-character hexadecimal SHA-256 digest of the normalized text.
    """
    identity = item.url or item.title
    raw_key = f"{item.source}|{identity}".strip()
    kept_punctuation = ".:/_-|"
    kept_chars = [
        ch.lower()
        for ch in raw_key
        if ch.isalnum() or ch in kept_punctuation
    ]
    digest = hashlib.sha256("".join(kept_chars).encode("utf-8"))
    return digest.hexdigest()
async def _mark_news_item(
    news_id: int,
    status: str,
    catalyst_id: int | None = None,
    error: str = "",
) -> None:
    """Record the analysis outcome on a single ``news_items`` row.

    Sets ``status``, ``catalyst_id``, ``error`` and ``analyzed_at`` (the
    current ``datetime.now()``) on the row whose id is ``news_id``, then
    commits.

    Args:
        news_id: Id of the ``news_items`` row to update.
        status: New status value to store.
        catalyst_id: Linked catalyst id; ``None`` is written as-is.
        error: Error text to store; empty string by default.
    """
    update_sql = text(
        "UPDATE news_items SET status = :status, catalyst_id = :catalyst_id, "
        "error = :error, analyzed_at = :analyzed_at WHERE id = :id"
    )
    async with get_db() as session:
        bind_values = dict(
            status=status,
            catalyst_id=catalyst_id,
            error=error,
            analyzed_at=datetime.now(),
            id=news_id,
        )
        await session.execute(update_sql, bind_values)
        await session.commit()
# Keywords whose presence marks a news item as worth sending to catalyst analysis.
# Built once at import time instead of on every call.
_MARKET_KEYWORDS = (
    "A股", "股市", "上市公司", "证券", "券商", "板块", "行业", "概念",
    "政策", "国务院", "发改委", "工信部", "央行", "证监会", "交易所",
    "业绩", "订单", "签约", "并购", "重组", "回购", "定增", "涨停",
    "资金", "主力", "北向", "算力", "人工智能", "半导体", "新能源",
    "机器人", "低空", "医药", "消费", "军工", "地产", "有色",
)
# Case-folded once so that matching ignores letter case (e.g. "a股" vs "A股").
_MARKET_KEYWORDS_FOLDED = tuple(keyword.casefold() for keyword in _MARKET_KEYWORDS)


def _looks_market_relevant(title: str, content: str) -> bool:
    """Return True when the title or content mentions any market keyword.

    Matching is a plain substring test over ``title`` and ``content`` joined
    by a space, and it ignores letter case so that spellings such as "a股"
    are treated the same as "A股".

    Args:
        title: News title.
        content: News body or summary text.

    Returns:
        True if at least one keyword occurs in the combined text.
    """
    haystack = f"{title} {content}".casefold()
    return any(keyword in haystack for keyword in _MARKET_KEYWORDS_FOLDED)
async def get_recent_catalysts(limit: int = 30, hours: int = 72) -> list[dict]:
    """Return recent active catalysts, newest first, with their themes flattened.

    A catalyst is included when it is active and its ``published_at`` (or
    ``created_at`` when that is NULL) falls within the last ``hours`` hours,
    measured from ``datetime.now()``.

    Args:
        limit: Maximum number of catalysts to return.
        hours: Size of the look-back window in hours.

    Returns:
        One dict per catalyst row. The extra ``themes`` key holds the linked
        themes as comma-separated ``name:relevance`` pairs built by the query.
    """
    cutoff = datetime.now() - timedelta(hours=hours)
    query = text(
        "SELECT c.*, "
        "GROUP_CONCAT(tc.theme_name || ':' || ROUND(tc.relevance, 0), ',') AS themes "
        "FROM catalysts c "
        "LEFT JOIN theme_catalysts tc ON tc.catalyst_id = c.id "
        "WHERE c.is_active = 1 AND COALESCE(c.published_at, c.created_at) >= :since "
        "GROUP BY c.id "
        "ORDER BY COALESCE(c.published_at, c.created_at) DESC, c.id DESC "
        "LIMIT :limit"
    )
    async with get_db() as session:
        cursor = await session.execute(query, {"since": cutoff, "limit": limit})
        records = cursor.mappings().all()
    catalysts = []
    for record in records:
        catalysts.append(dict(record))
    return catalysts
async def get_recent_news_items(limit: int = 50, hours: int = 24, status: str | None = None) -> list[dict]:
    """Return recent news items, newest first, optionally filtered by status.

    An item is included when its ``published_at`` (or ``created_at`` when that
    is NULL) falls within the last ``hours`` hours, measured from
    ``datetime.now()``.

    Args:
        limit: Maximum number of rows to return.
        hours: Size of the look-back window in hours.
        status: When truthy, only rows with this exact status are returned.

    Returns:
        One dict per row with the columns selected by the query.
    """
    cutoff = datetime.now() - timedelta(hours=hours)
    where_clause = "COALESCE(published_at, created_at) >= :since"
    bind_values = {"since": cutoff, "limit": limit}
    if status:
        # Only the clause text is concatenated; the value itself stays a bound parameter.
        where_clause += " AND status = :status"
        bind_values["status"] = status

    sql = (
        "SELECT id, title, source, url, published_at, status, catalyst_id, error, created_at, analyzed_at "
        "FROM news_items WHERE "
        f"{where_clause}"
        " ORDER BY COALESCE(published_at, created_at) DESC, id DESC LIMIT :limit"
    )
    async with get_db() as session:
        cursor = await session.execute(text(sql), bind_values)
        return [dict(record) for record in cursor.mappings().all()]
async def build_theme_catalyst_scores(hours: int = 72, limit: int = 20) -> list[ThemeCatalystScore]:
    """Aggregate recent active catalysts into one score per theme.

    For every theme linked to an active catalyst inside the look-back window,
    the query sums a weighted score per catalyst
    (strength 0.45, freshness 0.25, confidence 0.15, relevance 0.15).
    The reported ``catalyst_score`` is that sum divided by the number of
    catalysts, capped at 100 and rounded to one decimal.

    Rows come back ordered by the summed raw score (not by the averaged
    ``catalyst_score``), and at most ``limit`` themes are returned.

    Args:
        hours: Size of the look-back window in hours, from ``datetime.now()``.
        limit: Maximum number of themes to return.

    Returns:
        One ``ThemeCatalystScore`` per theme, each carrying up to three
        reason snippets.
    """
    # Must match the separator literal passed to GROUP_CONCAT in the query below.
    reason_separator = " | "
    since = datetime.now() - timedelta(hours=hours)
    async with get_db() as db:
        rows = (
            await db.execute(
                text(
                    "SELECT tc.theme_id, tc.theme_name, "
                    "COUNT(*) AS catalyst_count, "
                    "SUM((c.strength * 0.45 + c.freshness * 0.25 + c.confidence * 0.15 + tc.relevance * 0.15)) AS raw_score, "
                    "GROUP_CONCAT(SUBSTR(COALESCE(tc.reason, c.summary, c.title), 1, 60), ' | ') AS reasons "
                    "FROM theme_catalysts tc "
                    "JOIN catalysts c ON c.id = tc.catalyst_id "
                    "WHERE c.is_active = 1 AND COALESCE(c.published_at, c.created_at) >= :since "
                    "GROUP BY tc.theme_id, tc.theme_name "
                    "ORDER BY raw_score DESC "
                    "LIMIT :limit"
                ),
                {"since": since, "limit": limit},
            )
        ).mappings().all()

    scores = []
    for row in rows:
        raw = float(row.get("raw_score") or 0)
        count = int(row.get("catalyst_count") or 0)
        # Average per catalyst; max(..., 1) guards against division by zero.
        normalized = min(raw / max(count, 1), 100)
        # Split on the exact separator used by the query, so a reason that
        # itself contains a "|" character is not broken into fragments.
        snippets = (
            part.strip()
            for part in str(row.get("reasons") or "").split(reason_separator)
        )
        reasons = [part for part in snippets if part][:3]
        scores.append(ThemeCatalystScore(
            theme_id=row["theme_id"],
            theme_name=row["theme_name"],
            catalyst_score=round(normalized, 1),
            catalyst_count=count,
            top_reasons=reasons,
            generated_by="catalyst_layer",
        ))
    return scores