""" Message deduplication using event time (E field) and LRU cache """ import logging import time from collections import OrderedDict from typing import Dict, Any, Optional from config import settings logger = logging.getLogger(__name__) class MessageDeduplicator: """ LRU-based message deduplicator with TTL support. Uses the 'E' field (event time) from Binance messages as unique identifier. Automatically evicts old entries to prevent memory leaks. """ def __init__( self, max_size: int = settings.DEDUP_CACHE_SIZE, ttl_seconds: int = settings.DEDUP_TTL_SECONDS, ): """ Initialize deduplicator Args: max_size: Maximum number of entries to keep in cache ttl_seconds: Time-to-live for cache entries in seconds """ self.max_size = max_size self.ttl_seconds = ttl_seconds # OrderedDict for LRU cache: {message_key: timestamp} self._cache: OrderedDict[str, float] = OrderedDict() # Statistics self.stats = { "total_checked": 0, "duplicates_found": 0, "cache_evictions": 0, "ttl_evictions": 0, } def _generate_key(self, message: Dict[str, Any]) -> Optional[str]: """ Generate unique key for message Uses combination of: - Stream name (_stream field) - Event time (E field) - Symbol (s field) Args: message: Message data Returns: Unique key or None if key cannot be generated """ try: # Get stream name stream = message.get("_stream", "unknown") # Get event time (E field) - primary dedup identifier event_time = message.get("E") if not event_time: # Fallback to T field for some message types event_time = message.get("T") if not event_time: logger.warning(f"No event time found in message: {message}") return None # Get symbol (s field) symbol = message.get("s", "") # Create composite key key = f"{stream}:{symbol}:{event_time}" return key except Exception as e: logger.error(f"Error generating dedup key: {e}") return None def _evict_expired(self) -> None: """Remove expired entries based on TTL""" if not self._cache: return current_time = time.time() expired_keys = [] # Find expired entries for key, timestamp in self._cache.items(): if current_time - timestamp > self.ttl_seconds: expired_keys.append(key) else: # OrderedDict is sorted by insertion time # Once we hit a non-expired entry, all following entries are also non-expired break # Remove expired entries for key in expired_keys: del self._cache[key] self.stats["ttl_evictions"] += 1 def _evict_lru(self) -> None: """Remove least recently used entry""" if self._cache: self._cache.popitem(last=False) # FIFO: remove oldest self.stats["cache_evictions"] += 1 def is_duplicate(self, message: Dict[str, Any]) -> bool: """ Check if message is a duplicate Args: message: Message data to check Returns: True if duplicate, False if new message """ self.stats["total_checked"] += 1 # Generate unique key key = self._generate_key(message) if not key: # If we can't generate a key, assume it's not a duplicate return False # Clean up expired entries periodically if self.stats["total_checked"] % 100 == 0: self._evict_expired() # Check if key exists in cache current_time = time.time() if key in self._cache: # Update timestamp (move to end for LRU) del self._cache[key] self._cache[key] = current_time self.stats["duplicates_found"] += 1 return True # New message - add to cache self._cache[key] = current_time # Enforce max size if len(self._cache) > self.max_size: self._evict_lru() return False def clear(self) -> None: """Clear all cache entries""" self._cache.clear() logger.info("Deduplication cache cleared") def get_stats(self) -> Dict[str, Any]: """Get deduplication statistics""" 
        duplicate_rate = (
            self.stats["duplicates_found"] / self.stats["total_checked"]
            if self.stats["total_checked"] > 0
            else 0.0
        )
        return {
            **self.stats,
            "cache_size": len(self._cache),
            "duplicate_rate": f"{duplicate_rate:.2%}",
        }
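

# ---------------------------------------------------------------------------
# Minimal usage sketch, not part of the production API. The message shape
# below (a Binance combined-stream payload with "_stream", "E" event time in
# milliseconds, and "s" symbol fields) is an assumption based on what
# _generate_key reads; the field values are illustrative only.
if __name__ == "__main__":
    dedup = MessageDeduplicator(max_size=10_000, ttl_seconds=60)

    # Hypothetical trade event
    msg = {"_stream": "btcusdt@trade", "E": 1700000000000, "s": "BTCUSDT"}

    print(dedup.is_duplicate(msg))  # False: first time this key is seen
    print(dedup.is_duplicate(msg))  # True: same stream/symbol/event time

    # Expect total_checked=2, duplicates_found=1, duplicate_rate='50.00%'
    print(dedup.get_stats())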