"""
新闻获取模块 - 从 RSS 源获取新闻
"""
|
||
import asyncio
import hashlib
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from email.utils import parsedate_to_datetime
from typing import Any, Dict, List, Optional

import feedparser
import httpx
from bs4 import BeautifulSoup

from app.utils.logger import logger
from app.news_agent.sources import get_enabled_sources
|
||
|
||
|
||
@dataclass
class NewsItem:
    """A single normalized news article pulled from an RSS source."""
    title: str
    content: str
    url: str
    source: str
    category: str
    published_at: Optional[datetime]
    crawled_at: datetime
    content_hash: str
    author: Optional[str] = None
    tags: Optional[List[str]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict; datetimes become ISO-8601 strings."""
        # Publication time is optional, so convert it separately.
        published = self.published_at.isoformat() if self.published_at is not None else None
        return dict(
            title=self.title,
            content=self.content,
            url=self.url,
            source=self.source,
            category=self.category,
            published_at=published,
            crawled_at=self.crawled_at.isoformat(),
            content_hash=self.content_hash,
            author=self.author,
            tags=self.tags,
        )
|
||
|
||
|
||
class NewsFetcher:
    """Fetches news from the enabled RSS sources.

    Owns a shared httpx.AsyncClient; call close() when done with the
    fetcher to release the connection pool.
    """

    def __init__(self):
        # Enabled source configs (dicts with 'name', 'url', 'category').
        self.sources = get_enabled_sources()
        # Browser-like UA avoids naive bot blocking on some sites.
        self.client = httpx.AsyncClient(
            timeout=30.0,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
        )

    async def close(self):
        """Close the underlying HTTP client."""
        await self.client.aclose()

    def _generate_content_hash(self, title: str, content: str) -> str:
        """Return a SHA-256 hex digest of title+content, used for dedup."""
        combined = f"{title}{content}"
        return hashlib.sha256(combined.encode()).hexdigest()

    def _clean_html(self, html: str) -> str:
        """Strip markup from *html* and return collapsed plain text.

        Scripts/styles are removed, whitespace is normalized to single
        spaces, and the result is truncated to 5000 characters.
        """
        if not html:
            return ""

        soup = BeautifulSoup(html, 'html.parser')

        # Drop non-content elements before extracting text.
        for script in soup(['script', 'style']):
            script.decompose()

        text = soup.get_text()

        # Collapse runs of whitespace into single spaces.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(chunk for chunk in chunks if chunk)

        return text[:5000]  # cap length

    def _parse_rss_date(self, date_str: str) -> Optional[datetime]:
        """Parse an RSS/Atom date string into a naive UTC datetime.

        Tries RFC 2822 first (the RSS convention), then ISO-8601 (common
        in Atom feeds). Returns None when the string cannot be parsed.
        """
        if not date_str:
            return None

        # RFC 2822, e.g. "Mon, 06 Sep 2021 12:00:00 GMT".
        # (The previous implementation ran feedparser.parse() on the date
        # string — that function parses *feeds*, not dates, so it never
        # produced a usable result.)
        try:
            dt = parsedate_to_datetime(date_str)
        except (TypeError, ValueError):
            dt = None

        if dt is None:
            # ISO-8601 fallback; fromisoformat() before 3.11 rejects 'Z'.
            try:
                dt = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
            except ValueError as e:
                logger.debug(f"日期解析失败: {date_str}, 错误: {e}")
                return None

        # Normalize to naive UTC to match the module's other timestamps.
        if dt.tzinfo is not None:
            dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
        return dt

    async def fetch_rss_feed(self, source: Dict[str, Any]) -> List[NewsItem]:
        """
        Fetch one RSS source and convert its entries to NewsItems.

        Args:
            source: source config with 'name', 'url' and 'category' keys.

        Returns:
            Items published within the last 24 hours (undated items pass).
        """
        items: List[NewsItem] = []

        try:
            logger.debug(f"正在获取 {source['name']} 的 RSS...")

            # feedparser.parse() performs blocking network I/O; run it in
            # a worker thread so the asyncio.gather in fetch_all_news gets
            # real concurrency instead of serializing the event loop.
            feed = await asyncio.to_thread(feedparser.parse, source['url'])

            if feed.bozo:  # malformed feed — parse best-effort, but warn
                logger.warning(f"{source['name']} RSS 解析警告: {feed.bozo_exception}")

            for entry in feed.entries[:50]:  # at most 50 entries per fetch
                try:
                    title = entry.get('title', '')

                    # Prefer full content, then summary, then description.
                    content = ''
                    if hasattr(entry, 'content'):
                        content = entry.content[0].value if entry.content else ''
                    elif hasattr(entry, 'summary'):
                        content = entry.summary
                    elif hasattr(entry, 'description'):
                        content = entry.description

                    content = self._clean_html(content)

                    url = entry.get('link', '')
                    author = entry.get('author', None)

                    tags = []
                    if hasattr(entry, 'tags'):
                        tags = [tag.term for tag in entry.tags]

                    # *_parsed can be present but None when feedparser could
                    # not understand the date; a plain hasattr() check then
                    # crashes on datetime(*None[:6]) and drops the entry.
                    published_at = None
                    time_struct = (entry.get('published_parsed')
                                   or entry.get('updated_parsed'))
                    if time_struct:
                        published_at = datetime(*time_struct[:6])

                    # Skip anything older than 24 hours (dates are naive UTC).
                    if published_at:
                        if datetime.utcnow() - published_at > timedelta(hours=24):
                            continue

                    content_hash = self._generate_content_hash(title, content)

                    items.append(NewsItem(
                        title=title,
                        content=content,
                        url=url,
                        source=source['name'],
                        category=source['category'],
                        published_at=published_at,
                        crawled_at=datetime.utcnow(),
                        content_hash=content_hash,
                        author=author,
                        tags=tags if tags else None
                    ))

                except Exception as e:
                    # One bad entry should not abort the whole feed.
                    logger.debug(f"解析新闻条目失败: {e}")
                    continue

            logger.info(f"从 {source['name']} 获取到 {len(items)} 条新闻")

        except Exception as e:
            logger.error(f"获取 {source['name']} 失败: {e}")

        return items

    async def fetch_all_news(self, category: Optional[str] = None) -> List[NewsItem]:
        """
        Fetch news from every enabled source, concurrently.

        Args:
            category: optional filter ('crypto', 'stock', None for all).

        Returns:
            Combined list of NewsItems from all sources.
        """
        sources = get_enabled_sources(category)

        if not sources:
            logger.warning("没有启用的新闻源")
            return []

        logger.info(f"开始从 {len(sources)} 个新闻源获取新闻...")

        # Fan out to all sources at once; per-source exceptions are
        # collected rather than cancelling the whole batch.
        tasks = [self.fetch_rss_feed(source) for source in sources]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        all_items: List[NewsItem] = []
        for result in results:
            if isinstance(result, Exception):
                logger.error(f"获取新闻时出错: {result}")
                continue
            all_items.extend(result)

        logger.info(f"总共获取到 {len(all_items)} 条新闻")

        return all_items

    async def fetch_single_url(self, url: str, source: str = "manual") -> Optional[NewsItem]:
        """
        Fetch and extract a single article by URL.

        Args:
            url: article URL.
            source: source label to attach to the resulting item.

        Returns:
            A NewsItem, or None when the request or extraction fails.
        """
        try:
            response = await self.client.get(url)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Best-effort title: first <h1>, else <title>, else the URL.
            title_tag = soup.find(['h1', 'title'])
            title = title_tag.get_text().strip() if title_tag else url

            # Naive body extraction — whole-page text; site-specific
            # selectors would be needed for higher quality.
            content = self._clean_html(response.text)

            content_hash = self._generate_content_hash(title, content)

            return NewsItem(
                title=title,
                content=content,
                url=url,
                source=source,
                category="manual",
                published_at=datetime.utcnow(),
                crawled_at=datetime.utcnow(),
                content_hash=content_hash
            )

        except Exception as e:
            logger.error(f"获取 URL {url} 失败: {e}")
            return None
|