"""
News fetching module - retrieves news items from RSS sources.
"""
import asyncio
import hashlib
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from email.utils import parsedate_to_datetime
from typing import Dict, List, Any, Optional

import feedparser
import httpx
from bs4 import BeautifulSoup

from app.utils.logger import logger
from app.news_agent.sources import get_enabled_sources


def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Equivalent to the deprecated ``datetime.utcnow()``. The result is kept
    naive on purpose: the timestamps built from feedparser's ``*_parsed``
    struct_time values are naive as well, and naive/aware datetimes cannot
    be subtracted from each other.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


@dataclass
class NewsItem:
    """A single news item.

    ``published_at`` and ``crawled_at`` are naive datetimes; this module
    fills them with UTC values. ``content_hash`` is the SHA-256 hex digest
    of title + content and is intended for de-duplication.
    """
    title: str
    content: str
    url: str
    source: str
    category: str
    published_at: Optional[datetime]
    crawled_at: datetime
    content_hash: str
    author: Optional[str] = None
    tags: Optional[List[str]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert the item to a plain dict with ISO-8601 timestamps."""
        return {
            'title': self.title,
            'content': self.content,
            'url': self.url,
            'source': self.source,
            'category': self.category,
            'published_at': self.published_at.isoformat() if self.published_at else None,
            'crawled_at': self.crawled_at.isoformat(),
            'content_hash': self.content_hash,
            'author': self.author,
            'tags': self.tags,
        }


class NewsFetcher:
    """Fetches news from the enabled RSS sources and from single URLs.

    The instance owns an ``httpx.AsyncClient``; release it with
    ``await fetcher.close()`` or by using the fetcher as an async context
    manager (``async with NewsFetcher() as fetcher: ...``).
    """

    # Maximum number of entries read from one feed per fetch.
    MAX_ENTRIES_PER_FEED = 50
    # Maximum number of characters kept from cleaned content.
    MAX_CONTENT_LENGTH = 5000
    # Entries with a known publication time older than this are skipped.
    MAX_NEWS_AGE = timedelta(hours=24)

    def __init__(self):
        self.sources = get_enabled_sources()
        self.client = httpx.AsyncClient(
            timeout=30.0,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
        )

    async def __aenter__(self) -> "NewsFetcher":
        """Enter the async context; returns the fetcher itself."""
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        """Close the HTTP client when leaving the async context."""
        await self.close()

    async def close(self):
        """Close the HTTP client."""
        await self.client.aclose()

    def _generate_content_hash(self, title: str, content: str) -> str:
        """Return the SHA-256 hex digest of title + content (used for de-duplication)."""
        combined = f"{title}{content}"
        return hashlib.sha256(combined.encode()).hexdigest()

    def _clean_html(self, html: str) -> str:
        """Strip markup from ``html`` and return whitespace-normalised plain text.

        ``<script>`` and ``<style>`` elements are removed entirely. The result
        is truncated to ``MAX_CONTENT_LENGTH`` characters. Empty or ``None``
        input yields an empty string.
        """
        if not html:
            return ""

        soup = BeautifulSoup(html, 'html.parser')

        # Drop script and style elements, including their text.
        for element in soup(['script', 'style']):
            element.decompose()

        text = soup.get_text()

        # Collapse line breaks and runs of spaces into single spaces.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(chunk for chunk in chunks if chunk)

        return text[:self.MAX_CONTENT_LENGTH]

    def _parse_rss_date(self, date_str: str) -> Optional[datetime]:
        """Parse an RSS/Atom date string into a naive UTC ``datetime``.

        RFC 2822 dates (the RSS format) are tried first, then ISO 8601
        (the Atom format). Values carrying a UTC offset are converted to
        UTC before the offset is dropped, so the result is comparable with
        the other timestamps produced by this class.

        Returns:
            The parsed datetime, or ``None`` when the input is empty or
            cannot be parsed.
        """
        if not date_str:
            return None

        text = date_str.strip()
        try:
            parsed = parsedate_to_datetime(text)
        except (TypeError, ValueError, IndexError):
            # Not RFC 2822 - try ISO 8601. A trailing "Z" is rewritten
            # because fromisoformat() only accepts it from Python 3.11 on.
            iso_text = text[:-1] + '+00:00' if text.endswith(('Z', 'z')) else text
            try:
                parsed = datetime.fromisoformat(iso_text)
            except ValueError as e:
                logger.debug(f"日期解析失败: {date_str}, 错误: {e}")
                return None

        if parsed.tzinfo is not None:
            parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
        return parsed

    @staticmethod
    def _entry_raw_content(entry: Any) -> str:
        """Return the raw (possibly HTML) body of a feed entry.

        Prefers the first ``content`` element and falls back to ``summary``
        and then ``description`` when it is missing or empty.
        """
        content_parts = entry.get('content')
        if content_parts:
            value = getattr(content_parts[0], 'value', '')
            if value:
                return value
        return entry.get('summary') or entry.get('description') or ''

    @staticmethod
    def _entry_published_at(entry: Any) -> Optional[datetime]:
        """Return the entry's publication time as a naive datetime, if known.

        feedparser may expose ``published_parsed`` / ``updated_parsed`` with a
        value of ``None`` when the date could not be parsed, so the value is
        checked for truthiness rather than mere presence.
        """
        time_struct = entry.get('published_parsed') or entry.get('updated_parsed')
        if not time_struct:
            return None
        return datetime(*time_struct[:6])

    def _entry_to_item(self, entry: Any, source: Dict[str, Any]) -> Optional[NewsItem]:
        """Convert one feed entry to a ``NewsItem``.

        Returns ``None`` when the entry has a known publication time older
        than ``MAX_NEWS_AGE``. Entries without a publication time are kept.
        """
        now = _utcnow()
        published_at = self._entry_published_at(entry)
        if published_at is not None and now - published_at > self.MAX_NEWS_AGE:
            return None

        title = entry.get('title', '')
        content = self._clean_html(self._entry_raw_content(entry))

        # Tags without a term carry no usable label, so they are skipped.
        tags = [
            term
            for term in (getattr(tag, 'term', None) for tag in entry.get('tags') or [])
            if term
        ]

        return NewsItem(
            title=title,
            content=content,
            url=entry.get('link', ''),
            source=source['name'],
            category=source['category'],
            published_at=published_at,
            crawled_at=now,
            content_hash=self._generate_content_hash(title, content),
            author=entry.get('author', None),
            tags=tags if tags else None
        )

    async def fetch_rss_feed(self, source: Dict[str, Any]) -> List[NewsItem]:
        """
        Fetch the news items of a single RSS source.

        Args:
            source: News source configuration; the keys 'name', 'url' and
                'category' are read.

        Returns:
            List of news items. Empty when the feed cannot be fetched;
            entries that fail to parse are skipped.
        """
        items: List[NewsItem] = []

        try:
            logger.debug(f"正在获取 {source['name']} 的 RSS...")

            # feedparser.parse() performs blocking network I/O. Running it in
            # the default executor keeps the event loop free, so the feeds
            # gathered by fetch_all_news() are really fetched concurrently.
            loop = asyncio.get_running_loop()
            feed = await loop.run_in_executor(None, feedparser.parse, source['url'])

            if feed.bozo:
                # The feed was malformed; feedparser may still have entries.
                logger.warning(f"{source['name']} RSS 解析警告: {feed.bozo_exception}")

            for entry in feed.entries[:self.MAX_ENTRIES_PER_FEED]:
                try:
                    item = self._entry_to_item(entry, source)
                except Exception as e:
                    # Best effort: one broken entry must not lose the feed.
                    logger.debug(f"解析新闻条目失败: {e}")
                    continue
                if item is not None:
                    items.append(item)

            logger.info(f"从 {source['name']} 获取到 {len(items)} 条新闻")

        except Exception as e:
            logger.error(f"获取 {source['name']} 失败: {e}")

        return items

    async def fetch_all_news(self, category: Optional[str] = None) -> List[NewsItem]:
        """
        Fetch news from all enabled sources concurrently.

        Args:
            category: Category filter ('crypto', 'stock'; None means all).

        Returns:
            Combined list of news items from every source.
        """
        sources = get_enabled_sources(category)

        if not sources:
            logger.warning("没有启用的新闻源")
            return []

        logger.info(f"开始从 {len(sources)} 个新闻源获取新闻...")

        # Fetch every source concurrently; a failure in one source is
        # returned as an exception object instead of cancelling the rest.
        tasks = [self.fetch_rss_feed(source) for source in sources]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        all_items: List[NewsItem] = []
        for result in results:
            if isinstance(result, Exception):
                logger.error(f"获取新闻时出错: {result}")
                continue
            all_items.extend(result)

        logger.info(f"总共获取到 {len(all_items)} 条新闻")

        return all_items

    async def fetch_single_url(self, url: str, source: str = "manual") -> Optional[NewsItem]:
        """
        Fetch the news content of a single URL.

        The title is taken from the first <h1> or <title> element (the URL
        itself when neither exists); the body is the cleaned text of the
        whole page. The publication time is not extracted from the page:
        it is set to the fetch time.

        Args:
            url: News URL.
            source: Name of the news source.

        Returns:
            The news item, or None when the request or parsing fails.
        """
        try:
            response = await self.client.get(url)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            title_tag = soup.find(['h1', 'title'])
            title = title_tag.get_text().strip() if title_tag else url

            # Simple whole-page extraction; site-specific handling would be
            # needed for cleaner article bodies.
            content = self._clean_html(response.text)

            now = _utcnow()
            return NewsItem(
                title=title,
                content=content,
                url=url,
                source=source,
                category="manual",
                published_at=now,
                crawled_at=now,
                content_hash=self._generate_content_hash(title, content)
            )

        except Exception as e:
            logger.error(f"获取 URL {url} 失败: {e}")
            return None