"""
新闻获取模块 - 从 RSS 源获取新闻
"""
|
||
import asyncio
import hashlib
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from email.utils import parsedate_to_datetime
from typing import Any, Dict, List, Optional

import feedparser
import httpx
from bs4 import BeautifulSoup

from app.utils.logger import logger
from app.news_agent.sources import get_enabled_sources
|
||
|
||
|
||
@dataclass
class NewsItem:
    """A single normalized news article pulled from an RSS source."""
    title: str
    content: str
    url: str
    source: str
    category: str
    published_at: Optional[datetime]
    crawled_at: datetime
    content_hash: str
    author: Optional[str] = None
    tags: Optional[List[str]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict; datetimes become ISO-8601 strings."""
        # Publication time is optional, so convert it separately.
        published = self.published_at.isoformat() if self.published_at is not None else None
        return dict(
            title=self.title,
            content=self.content,
            url=self.url,
            source=self.source,
            category=self.category,
            published_at=published,
            crawled_at=self.crawled_at.isoformat(),
            content_hash=self.content_hash,
            author=self.author,
            tags=self.tags,
        )
|
||
|
||
|
||
class NewsFetcher:
    """Fetches news from the enabled RSS sources.

    Owns a shared httpx.AsyncClient; call close() when done with the
    fetcher to release the connection pool.
    """

    def __init__(self):
        # Enabled source configs (dicts with 'name', 'url', 'category').
        self.sources = get_enabled_sources()
        # Browser-like UA avoids naive bot blocking on some sites.
        self.client = httpx.AsyncClient(
            timeout=30.0,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
        )

    async def close(self):
        """Close the underlying HTTP client."""
        await self.client.aclose()

    def _generate_content_hash(self, title: str, content: str) -> str:
        """Return a SHA-256 hex digest of title+content, used for dedup."""
        combined = f"{title}{content}"
        return hashlib.sha256(combined.encode()).hexdigest()

    def _clean_html(self, html: str) -> str:
        """Strip markup from *html* and return collapsed plain text.

        Scripts/styles are removed, whitespace is normalized to single
        spaces, and the result is truncated to 5000 characters.
        """
        if not html:
            return ""

        soup = BeautifulSoup(html, 'html.parser')

        # Drop non-content elements before extracting text.
        for script in soup(['script', 'style']):
            script.decompose()

        text = soup.get_text()

        # Collapse runs of whitespace into single spaces.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(chunk for chunk in chunks if chunk)

        return text[:5000]  # cap length

    def _parse_rss_date(self, date_str: str) -> Optional[datetime]:
        """Parse an RSS/Atom date string into a naive UTC datetime.

        Tries RFC 2822 first (the RSS convention), then ISO-8601 (common
        in Atom feeds). Returns None when the string cannot be parsed.
        """
        if not date_str:
            return None

        # RFC 2822, e.g. "Mon, 06 Sep 2021 12:00:00 GMT".
        # (The previous implementation ran feedparser.parse() on the date
        # string — that function parses *feeds*, not dates, so it never
        # produced a usable result.)
        try:
            dt = parsedate_to_datetime(date_str)
        except (TypeError, ValueError):
            dt = None

        if dt is None:
            # ISO-8601 fallback; fromisoformat() before 3.11 rejects 'Z'.
            try:
                dt = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
            except ValueError as e:
                logger.debug(f"日期解析失败: {date_str}, 错误: {e}")
                return None

        # Normalize to naive UTC to match the module's other timestamps.
        if dt.tzinfo is not None:
            dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
        return dt

    async def fetch_rss_feed(self, source: Dict[str, Any]) -> List[NewsItem]:
        """
        Fetch one RSS source and convert its entries to NewsItems.

        Args:
            source: source config with 'name', 'url' and 'category' keys.

        Returns:
            Items published within the last 24 hours (undated items pass).
        """
        items: List[NewsItem] = []

        try:
            logger.debug(f"正在获取 {source['name']} 的 RSS...")

            # feedparser.parse() performs blocking network I/O; run it in
            # a worker thread so the asyncio.gather in fetch_all_news gets
            # real concurrency instead of serializing the event loop.
            feed = await asyncio.to_thread(feedparser.parse, source['url'])

            if feed.bozo:  # malformed feed — parse best-effort, but warn
                logger.warning(f"{source['name']} RSS 解析警告: {feed.bozo_exception}")

            for entry in feed.entries[:50]:  # at most 50 entries per fetch
                try:
                    title = entry.get('title', '')

                    # Prefer full content, then summary, then description.
                    content = ''
                    if hasattr(entry, 'content'):
                        content = entry.content[0].value if entry.content else ''
                    elif hasattr(entry, 'summary'):
                        content = entry.summary
                    elif hasattr(entry, 'description'):
                        content = entry.description

                    content = self._clean_html(content)

                    url = entry.get('link', '')
                    author = entry.get('author', None)

                    tags = []
                    if hasattr(entry, 'tags'):
                        tags = [tag.term for tag in entry.tags]

                    # *_parsed can be present but None when feedparser could
                    # not understand the date; a plain hasattr() check then
                    # crashes on datetime(*None[:6]) and drops the entry.
                    published_at = None
                    time_struct = (entry.get('published_parsed')
                                   or entry.get('updated_parsed'))
                    if time_struct:
                        published_at = datetime(*time_struct[:6])

                    # Skip anything older than 24 hours (dates are naive UTC).
                    if published_at:
                        if datetime.utcnow() - published_at > timedelta(hours=24):
                            continue

                    content_hash = self._generate_content_hash(title, content)

                    items.append(NewsItem(
                        title=title,
                        content=content,
                        url=url,
                        source=source['name'],
                        category=source['category'],
                        published_at=published_at,
                        crawled_at=datetime.utcnow(),
                        content_hash=content_hash,
                        author=author,
                        tags=tags if tags else None
                    ))

                except Exception as e:
                    # One bad entry should not abort the whole feed.
                    logger.debug(f"解析新闻条目失败: {e}")
                    continue

            logger.info(f"从 {source['name']} 获取到 {len(items)} 条新闻")

        except Exception as e:
            logger.error(f"获取 {source['name']} 失败: {e}")

        return items

    async def fetch_all_news(self, category: Optional[str] = None) -> List[NewsItem]:
        """
        Fetch news from every enabled source, concurrently.

        Args:
            category: optional filter ('crypto', 'stock', None for all).

        Returns:
            Combined list of NewsItems from all sources.
        """
        sources = get_enabled_sources(category)

        if not sources:
            logger.warning("没有启用的新闻源")
            return []

        logger.info(f"开始从 {len(sources)} 个新闻源获取新闻...")

        # Fan out to all sources at once; per-source exceptions are
        # collected rather than cancelling the whole batch.
        tasks = [self.fetch_rss_feed(source) for source in sources]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        all_items: List[NewsItem] = []
        for result in results:
            if isinstance(result, Exception):
                logger.error(f"获取新闻时出错: {result}")
                continue
            all_items.extend(result)

        logger.info(f"总共获取到 {len(all_items)} 条新闻")

        return all_items

    async def fetch_single_url(self, url: str, source: str = "manual") -> Optional[NewsItem]:
        """
        Fetch and extract a single article by URL.

        Args:
            url: article URL.
            source: source label to attach to the resulting item.

        Returns:
            A NewsItem, or None when the request or extraction fails.
        """
        try:
            response = await self.client.get(url)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Best-effort title: first <h1>, else <title>, else the URL.
            title_tag = soup.find(['h1', 'title'])
            title = title_tag.get_text().strip() if title_tag else url

            # Naive body extraction — whole-page text; site-specific
            # selectors would be needed for higher quality.
            content = self._clean_html(response.text)

            content_hash = self._generate_content_hash(title, content)

            return NewsItem(
                title=title,
                content=content,
                url=url,
                source=source,
                category="manual",
                published_at=datetime.utcnow(),
                crawled_at=datetime.utcnow(),
                content_hash=content_hash
            )

        except Exception as e:
            logger.error(f"获取 URL {url} 失败: {e}")
            return None
|