stock-ai-agent/backend/app/services/news_service.py
2026-02-20 21:39:47 +08:00

558 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
新闻舆情服务 - 获取加密货币和股票相关新闻
"""
import re
import html
import asyncio
import aiohttp
import xml.etree.ElementTree as ET
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta
from app.utils.logger import logger
from app.config import get_settings
class NewsService:
    """News & sentiment service: aggregates crypto and stock news.

    Crypto news comes from three RSS feeds (BlockBeats, Cointelegraph,
    CoinDesk); stock news is searched via the Brave Search API.
    Results are cached for a short window to limit outbound requests.
    """

    # BlockBeats newsflash RSS (Chinese-language crypto quick news)
    BLOCKBEATS_RSS = "https://api.theblockbeats.news/v2/rss/newsflash"
    # Cointelegraph RSS - English crypto news
    COINTELEGRAPH_RSS = "https://cointelegraph.com/rss"
    # CoinDesk RSS - authoritative English crypto coverage
    COINDESK_RSS = "https://www.coindesk.com/arc/outboundfeeds/rss/"
    # Brave Search API endpoint (used for stock news lookups)
    BRAVE_SEARCH_API = "https://api.search.brave.com/res/v1/web/search"

    def __init__(self):
        """Initialize the news service with an empty two-part cache."""
        # 'crypto' -> list of news dicts; 'stock' -> {"<symbol>_<name>": list of news dicts}.
        # NOTE: values have two different shapes, so the annotation is Dict[str, Any]
        # (the previous Dict[str, List[...]] annotation did not match the 'stock' value).
        self._cache: Dict[str, Any] = {'crypto': [], 'stock': {}}
        self._cache_time: Optional[datetime] = None
        self._cache_duration = timedelta(minutes=5)  # cache entries stay fresh for 5 minutes
        self.settings = get_settings()
        logger.info("新闻舆情服务初始化完成")
async def get_latest_news(self, limit: int = 20) -> List[Dict[str, Any]]:
"""
获取最新加密货币新闻(多源聚合)
数据源:
1. 律动快讯 - 中文快讯
2. Cointelegraph - 英文新闻
3. CoinDesk - 英文深度分析
Args:
limit: 获取数量
Returns:
新闻列表
"""
# 检查缓存
if self._cache and self._cache_time:
if datetime.now() - self._cache_time < self._cache_duration:
return self._cache['crypto'][:limit] if isinstance(self._cache, dict) else self._cache[:limit]
try:
# 并发获取所有源的新闻
news_tasks = [
self._fetch_blockbeats_news(),
self._fetch_cointelegraph_news(),
self._fetch_coindesk_news(),
]
results = await asyncio.gather(*news_tasks, return_exceptions=True)
# 合并新闻
all_news = []
for result in results:
if isinstance(result, list):
all_news.extend(result)
elif isinstance(result, Exception):
logger.warning(f"获取新闻失败: {result}")
# 按时间排序
all_news.sort(key=lambda x: x.get('time') or datetime.min, reverse=True)
# 更新缓存
self._cache = {'crypto': all_news, 'stock': self._cache.get('stock', {}) if isinstance(self._cache, dict) else {}}
self._cache_time = datetime.now()
logger.info(f"获取到 {len(all_news)} 条加密货币新闻(律动+Cointelegraph+CoinDesk")
return all_news[:limit]
except Exception as e:
logger.error(f"获取新闻失败: {e}")
# 返回缓存
if isinstance(self._cache, dict):
return self._cache.get('crypto', [])[:limit]
return self._cache[:limit] if self._cache else []
async def _fetch_blockbeats_news(self) -> List[Dict[str, Any]]:
"""获取律动快讯"""
news_list = []
try:
async with aiohttp.ClientSession() as session:
async with session.get(self.BLOCKBEATS_RSS, timeout=10) as response:
if response.status != 200:
logger.error(f"获取律动快讯失败: HTTP {response.status}")
return []
content = await response.text()
# 解析 XML
root = ET.fromstring(content)
channel = root.find('channel')
if channel is None:
return []
for item in channel.findall('item'):
title_elem = item.find('title')
desc_elem = item.find('description')
pub_date_elem = item.find('pubDate')
link_elem = item.find('link')
if title_elem is None:
continue
# 提取标题
title = self._clean_cdata(title_elem.text or '')
# 提取描述(去除 HTML 标签)
description = ''
if desc_elem is not None and desc_elem.text:
description = self._clean_html(self._clean_cdata(desc_elem.text))
# 解析时间
pub_time = None
if pub_date_elem is not None and pub_date_elem.text:
pub_time = self._parse_rss_date(self._clean_cdata(pub_date_elem.text))
# 链接
link = ''
if link_elem is not None and link_elem.text:
link = self._clean_cdata(link_elem.text)
news_list.append({
'title': title,
'description': description[:500], # 限制长度
'time': pub_time,
'time_str': pub_time.strftime('%m-%d %H:%M') if pub_time else '',
'link': link,
'source': '律动BlockBeats'
})
logger.info(f"获取到 {len(news_list)} 条律动快讯")
return news_list
except ET.ParseError as e:
logger.error(f"解析律动快讯 XML 失败: {e}")
# 记录部分内容用于调试
if 'content' in locals():
logger.debug(f"RSS 内容前 500 字符: {content[:500]}")
return []
except Exception as e:
logger.error(f"获取律动快讯失败: {e}")
import traceback
logger.debug(traceback.format_exc())
return []
async def _fetch_cointelegraph_news(self) -> List[Dict[str, Any]]:
"""获取 Cointelegraph 新闻(英文)"""
news_list = []
try:
async with aiohttp.ClientSession() as session:
async with session.get(self.COINTELEGRAPH_RSS, timeout=10) as response:
if response.status != 200:
logger.error(f"获取 Cointelegraph 失败: HTTP {response.status}")
return []
content = await response.text()
# 解析 XML
root = ET.fromstring(content)
channel = root.find('channel')
if channel is None:
return []
for item in channel.findall('item')[:20]: # 最多取20条
title_elem = item.find('title')
desc_elem = item.find('description')
pub_date_elem = item.find('pubDate')
link_elem = item.find('link')
if title_elem is None:
continue
# 提取标题
title = self._clean_cdata(title_elem.text or '')
# 提取描述(去除 HTML 标签)
description = ''
if desc_elem is not None and desc_elem.text:
description = self._clean_html(self._clean_cdata(desc_elem.text))
# 解析时间
pub_time = None
if pub_date_elem is not None and pub_date_elem.text:
pub_time = self._parse_rss_date(self._clean_cdata(pub_date_elem.text))
# 链接
link = ''
if link_elem is not None and link_elem.text:
link = self._clean_cdata(link_elem.text)
news_list.append({
'title': title,
'description': description[:500],
'time': pub_time,
'time_str': pub_time.strftime('%m-%d %H:%M') if pub_time else '',
'link': link,
'source': 'Cointelegraph'
})
logger.info(f"获取到 {len(news_list)} 条 Cointelegraph 新闻")
return news_list
except Exception as e:
logger.error(f"获取 Cointelegraph 失败: {e}")
return []
async def _fetch_coindesk_news(self) -> List[Dict[str, Any]]:
"""获取 CoinDesk 新闻(英文)"""
news_list = []
try:
async with aiohttp.ClientSession() as session:
async with session.get(self.COINDESK_RSS, timeout=10) as response:
if response.status != 200:
logger.error(f"获取 CoinDesk 失败: HTTP {response.status}")
return []
content = await response.text()
# 解析 XML
root = ET.fromstring(content)
channel = root.find('channel')
if channel is None:
return []
for item in channel.findall('item')[:20]: # 最多取20条
title_elem = item.find('title')
desc_elem = item.find('description')
pub_date_elem = item.find('pubDate')
link_elem = item.find('link')
if title_elem is None:
continue
# 提取标题
title = self._clean_cdata(title_elem.text or '')
# 提取描述(去除 HTML 标签)
description = ''
if desc_elem is not None and desc_elem.text:
description = self._clean_html(self._clean_cdata(desc_elem.text))
# 解析时间
pub_time = None
if pub_date_elem is not None and pub_date_elem.text:
pub_time = self._parse_rss_date(self._clean_cdata(pub_date_elem.text))
# 链接
link = ''
if link_elem is not None and link_elem.text:
link = self._clean_cdata(link_elem.text)
news_list.append({
'title': title,
'description': description[:500],
'time': pub_time,
'time_str': pub_time.strftime('%m-%d %H:%M') if pub_time else '',
'link': link,
'source': 'CoinDesk'
})
logger.info(f"获取到 {len(news_list)} 条 CoinDesk 新闻")
return news_list
except Exception as e:
logger.error(f"获取 CoinDesk 失败: {e}")
return []
def _clean_cdata(self, text: str) -> str:
"""清理 CDATA 标记"""
if not text:
return ''
# 移除 CDATA 包装
text = re.sub(r'<!\[CDATA\[(.*?)\]\]>', r'\1', text, flags=re.DOTALL)
return text.strip()
def _clean_html(self, text: str) -> str:
"""清理 HTML 标签"""
if not text:
return ''
# 移除 HTML 标签
text = re.sub(r'<[^>]+>', '', text)
# 解码 HTML 实体
text = html.unescape(text)
# 清理多余空白
text = re.sub(r'\s+', ' ', text)
return text.strip()
def _parse_rss_date(self, date_str: str) -> Optional[datetime]:
"""解析 RSS 日期格式"""
if not date_str:
return None
# RSS 日期格式: "Sat, 07 Feb 2026 00:30:33 +0800"
formats = [
'%a, %d %b %Y %H:%M:%S %z',
'%a, %d %b %Y %H:%M:%S',
'%Y-%m-%d %H:%M:%S'
]
for fmt in formats:
try:
return datetime.strptime(date_str, fmt)
except ValueError:
continue
return None
def filter_relevant_news(self, news_list: List[Dict[str, Any]],
symbols: List[str] = None,
hours: int = 4) -> List[Dict[str, Any]]:
"""
过滤相关新闻
Args:
news_list: 新闻列表
symbols: 关注的交易对(如 ['BTCUSDT', 'ETHUSDT']
hours: 只保留最近几小时的新闻
Returns:
过滤后的新闻
"""
if not news_list:
return []
# 时间过滤
cutoff_time = datetime.now() - timedelta(hours=hours)
filtered = []
# 关键词映射
symbol_keywords = {
'BTCUSDT': ['比特币', 'BTC', 'Bitcoin'],
'ETHUSDT': ['以太坊', 'ETH', 'Ethereum'],
'BNBUSDT': ['BNB', 'Binance'],
'SOLUSDT': ['SOL', 'Solana'],
}
# 通用关键词(影响整体市场)
market_keywords = [
'市场', '行情', '反弹', '下跌', '暴跌', '暴涨', '清算',
'资金费率', '多单', '空单', '杠杆', '爆仓',
'美联储', 'Fed', '利率', '通胀',
'监管', 'SEC', 'ETF',
'鲸鱼', '巨鲸', '大户',
'交易所', 'Binance', 'Coinbase'
]
for news in news_list:
# 时间过滤
if news.get('time'):
# 处理带时区的时间
news_time = news['time']
if news_time.tzinfo:
news_time = news_time.replace(tzinfo=None)
if news_time < cutoff_time:
continue
title = news.get('title', '')
desc = news.get('description', '')
content = title + ' ' + desc
# 检查是否与关注的交易对相关
is_relevant = False
if symbols:
for symbol in symbols:
keywords = symbol_keywords.get(symbol, [])
for kw in keywords:
if kw.lower() in content.lower():
is_relevant = True
news['related_symbol'] = symbol
break
if is_relevant:
break
# 检查是否包含市场关键词
if not is_relevant:
for kw in market_keywords:
if kw.lower() in content.lower():
is_relevant = True
news['related_symbol'] = 'MARKET'
break
if is_relevant:
filtered.append(news)
return filtered
async def search_stock_news(self, symbol: str, stock_name: str = '',
max_results: int = 10) -> List[Dict[str, Any]]:
"""
使用 Brave Search API 搜索股票相关新闻
Args:
symbol: 股票代码(如 AAPL, 0700.HK
stock_name: 股票中文名称(可选)
max_results: 最大结果数
Returns:
新闻列表
"""
api_key = self.settings.brave_api_key
if not api_key:
logger.warning("未配置 Brave API Key跳过新闻搜索")
return []
# 检查缓存
cache_key = f"{symbol}_{stock_name}"
if self._cache_time and cache_key in self._cache.get('stock', {}):
if datetime.now() - self._cache_time < self._cache_duration:
return self._cache['stock'][cache_key][:max_results]
# 构建搜索查询
# 根据股票类型构建不同的搜索词
if symbol.endswith('.HK'):
# 港股
if stock_name:
query = f"{stock_name} 港股 新闻 最新"
else:
query = f"{symbol.replace('.HK', '')} 港股 新闻 最新"
else:
# 美股
if stock_name:
query = f"{stock_name} 股票 {symbol} news latest"
else:
query = f"{symbol} stock news latest"
try:
headers = {
'Accept': 'application/json',
'Accept-Encoding': 'gzip',
'X-Subscription-Token': api_key
}
params = {
'q': query,
'count': max_results,
'text_decorations': 'false', # 改为字符串
'search_lang': 'zh-hans', # Brave Search 使用 zh-hans 而非 zh-CN
# 'result_filter': 'news', # 免费计划不支持,移除此参数
'freshness': 'pd' # 过去24小时
}
async with aiohttp.ClientSession() as session:
async with session.get(
self.BRAVE_SEARCH_API,
headers=headers,
params=params,
timeout=10
) as response:
if response.status != 200:
logger.error(f"Brave Search API 请求失败: HTTP {response.status}")
return []
data = await response.json()
# 解析搜索结果
news_list = []
web_results = data.get('web', {}).get('results', [])
for item in web_results:
title = item.get('title', '')
url = item.get('url', '')
description = item.get('description', '')
# 清理描述
description = self._clean_html(description)
news_list.append({
'title': title,
'description': description[:500],
'time': datetime.now(), # Brave Search 不返回精确时间
'time_str': datetime.now().strftime('%m-%d %H:%M'),
'link': url,
'source': 'Brave Search'
})
logger.info(f"Brave Search 搜索 {symbol} 获取到 {len(news_list)} 条新闻")
# 更新缓存
if 'stock' not in self._cache:
self._cache['stock'] = {}
self._cache['stock'][cache_key] = news_list
self._cache_time = datetime.now()
return news_list[:max_results]
except aiohttp.ClientError as e:
logger.error(f"Brave Search API 请求失败: {e}")
return []
except Exception as e:
logger.error(f"搜索股票新闻失败: {e}")
import traceback
logger.debug(traceback.format_exc())
return []
def format_news_for_llm(self, news_list: List[Dict[str, Any]],
max_items: int = 10) -> str:
"""
格式化新闻供 LLM 分析
Args:
news_list: 新闻列表
max_items: 最大条数
Returns:
格式化的新闻文本
"""
if not news_list:
return "暂无相关新闻"
lines = ["## 最新市场新闻\n"]
for i, news in enumerate(news_list[:max_items], 1):
time_str = news.get('time_str', '')
title = news.get('title', '')
desc = news.get('description', '')[:200] # 限制描述长度
lines.append(f"### {i}. [{time_str}] {title}")
if desc:
lines.append(f"{desc}")
lines.append("")
return "\n".join(lines)
# Module-level singleton instance
_news_service: Optional[NewsService] = None


def get_news_service() -> NewsService:
    """Return the process-wide NewsService, creating it lazily on first use."""
    global _news_service
    if _news_service is None:
        _news_service = NewsService()
    return _news_service