tradusai/core/deduplicator.py

"""
Message deduplication using event time (E field) and LRU cache
"""
import logging
import time
from collections import OrderedDict
from typing import Dict, Any, Optional
from config import settings

logger = logging.getLogger(__name__)


class MessageDeduplicator:
"""
LRU-based message deduplicator with TTL support.
Uses the 'E' field (event time) from Binance messages as unique identifier.
Automatically evicts old entries to prevent memory leaks.
"""
def __init__(
self,
max_size: int = settings.DEDUP_CACHE_SIZE,
ttl_seconds: int = settings.DEDUP_TTL_SECONDS,
):
"""
Initialize deduplicator
Args:
max_size: Maximum number of entries to keep in cache
ttl_seconds: Time-to-live for cache entries in seconds
"""
self.max_size = max_size
self.ttl_seconds = ttl_seconds
# OrderedDict for LRU cache: {message_key: timestamp}
self._cache: OrderedDict[str, float] = OrderedDict()
# Statistics
self.stats = {
"total_checked": 0,
"duplicates_found": 0,
"cache_evictions": 0,
"ttl_evictions": 0,
        }

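    # Stats glossary (describing the counters above): "cache_evictions"
    # counts LRU removals made when the cache exceeds max_size;
    # "ttl_evictions" counts entries dropped after aging past ttl_seconds.
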
def _generate_key(self, message: Dict[str, Any]) -> Optional[str]:
"""
Generate unique key for message
Uses combination of:
- Stream name (_stream field)
- Event time (E field)
- Symbol (s field)
Args:
message: Message data
Returns:
Unique key or None if key cannot be generated
"""
try:
# Get stream name
stream = message.get("_stream", "unknown")
# Get event time (E field) - primary dedup identifier
event_time = message.get("E")
            if not event_time:
                # Fall back to the T field (trade/transaction time),
                # which some message types carry instead of E
                event_time = message.get("T")
                if not event_time:
                    logger.warning(f"No event time found in message: {message}")
                    return None
# Get symbol (s field)
symbol = message.get("s", "")
# Create composite key
key = f"{stream}:{symbol}:{event_time}"
return key
except Exception as e:
logger.error(f"Error generating dedup key: {e}")
return None
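
    # Example key (illustrative values, not real exchange data): a payload
    # like {"_stream": "btcusdt@aggTrade", "s": "BTCUSDT", "E": 1700000000000}
    # produces the key "btcusdt@aggTrade:BTCUSDT:1700000000000".
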
def _evict_expired(self) -> None:
"""Remove expired entries based on TTL"""
if not self._cache:
return
current_time = time.time()
expired_keys = []
# Find expired entries
for key, timestamp in self._cache.items():
if current_time - timestamp > self.ttl_seconds:
expired_keys.append(key)
            else:
                # Entries are ordered by timestamp: new keys are appended
                # and duplicate hits are re-inserted at the end with a
                # fresh timestamp, so once we reach a non-expired entry,
                # every later entry is non-expired too
                break
# Remove expired entries
for key in expired_keys:
del self._cache[key]
self.stats["ttl_evictions"] += 1
def _evict_lru(self) -> None:
"""Remove least recently used entry"""
if self._cache:
self._cache.popitem(last=False) # FIFO: remove oldest
self.stats["cache_evictions"] += 1
def is_duplicate(self, message: Dict[str, Any]) -> bool:
"""
Check if message is a duplicate
Args:
message: Message data to check
Returns:
True if duplicate, False if new message
"""
self.stats["total_checked"] += 1
# Generate unique key
key = self._generate_key(message)
        if not key:
            # Fail open: if we can't generate a key, treat the message
            # as new rather than dropping it
            return False
        # Clean up expired entries periodically (every 100 checks,
        # to amortize the scan cost)
        if self.stats["total_checked"] % 100 == 0:
            self._evict_expired()
# Check if key exists in cache
current_time = time.time()
if key in self._cache:
# Update timestamp (move to end for LRU)
del self._cache[key]
self._cache[key] = current_time
self.stats["duplicates_found"] += 1
return True
# New message - add to cache
self._cache[key] = current_time
# Enforce max size
if len(self._cache) > self.max_size:
self._evict_lru()
return False
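
    # Note: a duplicate hit refreshes the entry's timestamp and moves it to
    # the back of the cache, so a key that keeps arriving stays suppressed
    # beyond a single TTL window rather than being re-delivered.
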
def clear(self) -> None:
"""Clear all cache entries"""
self._cache.clear()
        logger.info("Deduplication cache cleared")

def get_stats(self) -> Dict[str, Any]:
"""Get deduplication statistics"""
duplicate_rate = (
self.stats["duplicates_found"] / self.stats["total_checked"]
if self.stats["total_checked"] > 0
else 0.0
)
return {
**self.stats,
"cache_size": len(self._cache),
"duplicate_rate": f"{duplicate_rate:.2%}",
}
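

# Minimal usage sketch (assumes config.settings provides DEDUP_CACHE_SIZE and
# DEDUP_TTL_SECONDS; the payload below uses illustrative values, not real
# exchange data):
if __name__ == "__main__":
    dedup = MessageDeduplicator(max_size=1_000, ttl_seconds=60)
    sample = {"_stream": "btcusdt@aggTrade", "s": "BTCUSDT", "E": 1700000000000}
    print(dedup.is_duplicate(sample))  # False - first time this key is seen
    print(dedup.is_duplicate(sample))  # True  - same stream/symbol/event time
    # Expected stats: total_checked=2, duplicates_found=1, cache_size=1,
    # duplicate_rate="50.00%"
    print(dedup.get_stats())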