190 lines
5.9 KiB
Python
190 lines
5.9 KiB
Python
"""多源新闻采集器。"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import email.utils
|
|
import html
|
|
import logging
|
|
import re
|
|
import xml.etree.ElementTree as ET
|
|
from datetime import datetime, timedelta
|
|
|
|
import httpx
|
|
|
|
from app.config import settings
|
|
from app.data.tushare_client import tushare_client
|
|
from app.news.models import NewsItem
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Rotating index into the configured Tushare source list; it is advanced by
# _select_tushare_sources_for_run() on every call.
_tushare_source_cursor = 0

# Headers attached to the HTTP client used for RSS requests.
RSS_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36"
    ),
}
|
|
|
|
|
|
async def collect_news_sources(
    lookback_hours: int | None = None,
    limit_per_source: int | None = None,
) -> list[NewsItem]:
    """Collect news from every configured source and de-duplicate the result.

    The Tushare sources selected for this run are polled first, then the
    configured RSS feeds. Each source is isolated: a failure is logged as a
    warning and the remaining sources are still collected.

    Args:
        lookback_hours: How many hours back to look. A falsy value (``None``
            or ``0``) falls back to ``settings.news_fetch_lookback_hours``.
        limit_per_source: Maximum items per source. A falsy value falls back
            to ``settings.news_fetch_limit_per_source``.

    Returns:
        The items of all sources that succeeded, passed through
        ``_dedup_in_memory``.
    """
    if not lookback_hours:
        lookback_hours = settings.news_fetch_lookback_hours
    if not limit_per_source:
        limit_per_source = settings.news_fetch_limit_per_source

    collected: list[NewsItem] = []

    for tushare_source in _select_tushare_sources_for_run():
        try:
            batch = await _collect_tushare_news(tushare_source, lookback_hours, limit_per_source)
            collected.extend(batch)
        except Exception as exc:
            # Deliberately broad: one broken source must not abort the run.
            logger.warning("Tushare 新闻源采集失败 source=%s error=%s", tushare_source, exc)

    feeds = _parse_rss_sources(settings.news_rss_sources)
    if not feeds:
        return _dedup_in_memory(collected)

    # A single HTTP client is shared by all feeds of this run.
    async with httpx.AsyncClient(headers=RSS_HEADERS, timeout=10.0, follow_redirects=True) as client:
        for feed_name, feed_url in feeds:
            try:
                batch = await _collect_rss(client, feed_name, feed_url, lookback_hours, limit_per_source)
                collected.extend(batch)
            except Exception as exc:
                # Same isolation rule as for the Tushare sources above.
                logger.warning("RSS 新闻源采集失败 source=%s url=%s error=%s", feed_name, feed_url, exc)

    return _dedup_in_memory(collected)
|
|
|
|
|
|
async def _collect_tushare_news(source: str, lookback_hours: int, limit: int) -> list[NewsItem]:
    """Fetch news for one Tushare source and convert the rows to ``NewsItem``.

    Args:
        source: Source identifier passed through to ``tushare_client.get_news``.
        lookback_hours: Length in hours of the query window, which ends now.
        limit: Row limit handed to the client; it also caps the returned list.

    Returns:
        At most ``limit`` items. Rows whose cleaned title is shorter than
        ``settings.news_min_title_length`` are skipped.
    """
    window = timedelta(hours=lookback_hours)
    # NOTE(review): get_news is called without ``await`` — presumably a
    # synchronous client call; confirm it is acceptable inside a coroutine.
    frame = tushare_client.get_news(
        source=source,
        start_time=datetime.now() - window,
        end_time=datetime.now(),
        limit=limit,
    )
    if frame.empty:
        return []

    source_label = f"tushare:{source}"
    collected: list[NewsItem] = []
    for _, record in frame.iterrows():
        headline = _clean_text(record.get("title", ""))
        if len(headline) < settings.news_min_title_length:
            continue

        body = _clean_text(record.get("content", ""))
        digest = _clean_text(record.get("summary", ""))
        link = str(record.get("url", "") or "")
        # The timestamp is looked up under several column names; the first
        # truthy one wins.
        raw_time = record.get("datetime") or record.get("time") or record.get("publish_time")

        collected.append(
            NewsItem(
                title=headline,
                content=body,
                summary=digest,
                source=source_label,
                url=link,
                published_at=_parse_datetime(raw_time),
            )
        )

    return collected[:limit]
|
|
|
|
|
|
async def _collect_rss(
    client: httpx.AsyncClient,
    source: str,
    url: str,
    lookback_hours: int,
    limit: int,
) -> list[NewsItem]:
    """Download one RSS feed and convert its recent ``<item>`` entries.

    Args:
        client: HTTP client used for the GET request.
        source: Feed name; stored on each item as ``rss:<source>``.
        url: Feed URL.
        lookback_hours: Entries with a parsed date older than this many hours
            are skipped. Entries without a parseable date are kept.
        limit: Maximum number of items to return.

    Returns:
        Up to ``limit`` items in feed order.

    Errors from the request, the status check and the XML parsing are not
    handled here; they propagate to the caller.
    """
    response = await client.get(url)
    response.raise_for_status()
    # NOTE(review): the feed body is untrusted input and the standard-library
    # XML parser is documented as not hardened against maliciously
    # constructed data — consider a hardened parser.
    feed_root = ET.fromstring(response.content)
    oldest_allowed = datetime.now() - timedelta(hours=lookback_hours)
    source_label = f"rss:{source}"

    collected: list[NewsItem] = []
    # Only the first 2 * limit entries are inspected, which leaves some room
    # for entries that get filtered out below.
    for entry in feed_root.findall(".//item")[: limit * 2]:
        headline = _clean_text(_xml_text(entry, "title"))
        if len(headline) < settings.news_min_title_length:
            continue

        published = _parse_datetime(_xml_text(entry, "pubDate"))
        if published and published < oldest_allowed:
            continue

        description = _clean_text(_xml_text(entry, "description"))
        collected.append(
            NewsItem(
                title=headline,
                content=description,
                summary=description[:240],
                source=source_label,
                url=_clean_text(_xml_text(entry, "link")),
                published_at=published,
            )
        )
        if len(collected) >= limit:
            break

    return collected
|
|
|
|
|
|
def _split_csv(value: str) -> list[str]:
    """Split a comma-separated string into trimmed, non-empty parts.

    A falsy ``value`` (``None`` or an empty string) yields an empty list.
    """
    if not value:
        return []
    trimmed = (part.strip() for part in value.split(","))
    return [part for part in trimmed if part]
|
|
|
|
|
|
def _parse_rss_sources(value: str) -> list[tuple[str, str]]:
    """Parse ``name|url`` pairs out of a comma-separated setting string.

    Chunks without a ``|`` separator, or whose name or URL is empty after
    trimming, are ignored. Only the first ``|`` of a chunk separates the two
    parts, so the URL part may itself contain ``|``.

    Returns:
        ``(name, url)`` tuples in the order they appear in ``value``.
    """
    feeds: list[tuple[str, str]] = []
    for chunk in _split_csv(value):
        label, separator, address = chunk.partition("|")
        if not separator:
            continue
        label, address = label.strip(), address.strip()
        if label and address:
            feeds.append((label, address))
    return feeds
|
|
|
|
|
|
def _select_tushare_sources_for_run() -> list[str]:
    """Pick the Tushare sources to poll in this run, rotating through the list.

    Free / low-privilege Tushare accounts are usually limited to one news
    call per minute, so only a small number of sources is taken per run.

    The number of sources is ``settings.news_tushare_sources_per_run``
    (1 when falsy), clamped to the range 1..number of configured sources.
    As a side effect the module-level ``_tushare_source_cursor`` is advanced
    so the next call continues where this one stopped, wrapping around.

    Returns:
        The selected source names, or an empty list when none are configured.
    """
    global _tushare_source_cursor

    configured = _split_csv(settings.news_tushare_sources)
    total = len(configured)
    if total == 0:
        return []

    requested = int(settings.news_tushare_sources_per_run or 1)
    batch_size = max(1, min(requested, total))

    start = _tushare_source_cursor
    chosen = [configured[(start + step) % total] for step in range(batch_size)]
    _tushare_source_cursor = (start + batch_size) % total
    return chosen
|
|
|
|
|
|
def _xml_text(item: ET.Element, tag: str) -> str:
    """Return the text of the first ``tag`` child of ``item``.

    An empty string is returned when the child is missing or has no text.
    """
    text = item.findtext(tag)
    return text or ""
|
|
|
|
|
|
def _clean_text(value) -> str:
    """Normalize a raw field value into plain single-line text.

    HTML entities are decoded, anything shaped like a tag (``<...>``) is
    replaced by a space, runs of whitespace are collapsed to one space and
    the result is trimmed.

    Args:
        value: Any object. Falsy values and float NaN are treated as missing.

    Returns:
        The cleaned text, or an empty string for a missing value.
    """
    # NaN is the only float that is not equal to itself. It is truthy, so
    # without this guard a missing numeric cell would be rendered as the
    # literal text "nan". (Presumably this is how empty DataFrame cells
    # arrive here — verify against the Tushare rows.)
    if isinstance(value, float) and value != value:
        return ""

    text = html.unescape(str(value or ""))
    # Entities are decoded first so that escaped markup (``&lt;p&gt;``) is
    # stripped as well.
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
|
|
|
|
|
|
def _parse_datetime(value) -> datetime | None:
    """Parse a timestamp into a naive ``datetime``; return ``None`` on failure.

    Accepted inputs:
        * ``None`` or a blank string -> ``None``.
        * A ``datetime`` instance -> returned with ``tzinfo`` dropped
          (no conversion is applied).
        * Text starting with ``YYYY-MM-DD HH:MM:SS``, ``YYYY-MM-DD HH:MM``,
          ``YYYYMMDDHHMMSS`` or ``YYYYMMDD``; anything after that prefix
          (e.g. fractional seconds) is ignored.
        * An RFC 2822 date such as an RSS ``pubDate``. When it carries a UTC
          offset it is converted to the local time zone before ``tzinfo`` is
          dropped, so the result is comparable with ``datetime.now()``.

    Args:
        value: The raw timestamp; non-string values are converted with ``str``.

    Returns:
        The parsed naive ``datetime``, or ``None`` when nothing matched.
    """
    if value is None:
        return None
    if isinstance(value, datetime):
        return value.replace(tzinfo=None)

    text = str(value).strip()
    if not text:
        return None

    # Each format is paired with the length of the text it renders to. The
    # format string itself is shorter ("%Y" is two characters for four
    # digits), so slicing the input by len(fmt) would cut off the tail of a
    # valid timestamp and make it fail or parse to the wrong time.
    layouts = (
        ("%Y-%m-%d %H:%M:%S", 19),
        ("%Y-%m-%d %H:%M", 16),
        ("%Y%m%d%H%M%S", 14),
        ("%Y%m%d", 8),
    )
    for fmt, width in layouts:
        try:
            return datetime.strptime(text[:width], fmt)
        except ValueError:
            continue

    try:
        parsed = email.utils.parsedate_to_datetime(text)
    except (TypeError, ValueError, OverflowError):
        return None

    if parsed.tzinfo is not None:
        try:
            # Convert the instant to local time instead of merely discarding
            # the offset.
            parsed = parsed.astimezone()
        except (OverflowError, OSError, ValueError):
            # Outside the range the platform can convert: keep the feed's own
            # wall-clock time rather than losing the timestamp.
            pass
    return parsed.replace(tzinfo=None)
|
|
|
|
|
|
def _dedup_in_memory(items: list[NewsItem]) -> list[NewsItem]:
    """Drop items whose normalized title was already seen, keeping order.

    The comparison key is the lower-cased title with all non-word characters
    removed, truncated to 80 characters. The first item for each key is kept;
    items whose key is empty are dropped.
    """
    # Dicts preserve insertion order, so the values come back in first-seen
    # order.
    first_by_key: dict[str, NewsItem] = {}
    for candidate in items:
        fingerprint = re.sub(r"\W+", "", candidate.title.lower())[:80]
        if fingerprint and fingerprint not in first_by_key:
            first_by_key[fingerprint] = candidate
    return list(first_by_key.values())
|