"""
|
|
Message deduplication using event time (E field) and LRU cache
|
|
"""
|
|
import logging
|
|
import time
|
|
from collections import OrderedDict
|
|
from typing import Dict, Any, Optional
|
|
|
|
from config import settings
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class MessageDeduplicator:
    """
    LRU-based message deduplicator with TTL support.

    Uses the 'E' field (event time) from Binance messages as the unique
    identifier. Automatically evicts old entries to prevent memory leaks.
    """

    def __init__(
        self,
        max_size: int = settings.DEDUP_CACHE_SIZE,
        ttl_seconds: int = settings.DEDUP_TTL_SECONDS,
    ):
        """
        Initialize the deduplicator.

        Args:
            max_size: Maximum number of entries to keep in the cache
            ttl_seconds: Time-to-live for cache entries, in seconds
        """
        self.max_size = max_size
        self.ttl_seconds = ttl_seconds

        # OrderedDict as LRU cache: {message_key: last_seen_timestamp}
        self._cache: OrderedDict[str, float] = OrderedDict()

        # Statistics
        self.stats = {
            "total_checked": 0,
            "duplicates_found": 0,
            "cache_evictions": 0,
            "ttl_evictions": 0,
        }

    def _generate_key(self, message: Dict[str, Any]) -> Optional[str]:
        """
        Generate a unique key for a message.

        Uses a combination of:
        - Stream name (_stream field)
        - Event time (E field)
        - Symbol (s field)

        Args:
            message: Message data

        Returns:
            Unique key, or None if a key cannot be generated
        """
        try:
            # Get stream name
            stream = message.get("_stream", "unknown")

            # Get event time (E field) - primary dedup identifier.
            # Compare against None explicitly so a falsy-but-present
            # value is not mistaken for a missing field.
            event_time = message.get("E")
            if event_time is None:
                # Fall back to the T field, which some message types
                # carry instead
                event_time = message.get("T")

            if event_time is None:
                logger.warning(f"No event time found in message: {message}")
                return None

            # Get symbol (s field)
            symbol = message.get("s", "")

            # Create composite key
            key = f"{stream}:{symbol}:{event_time}"
            return key

        except Exception as e:
            logger.error(f"Error generating dedup key: {e}")
            return None

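    # Illustrative key shape (values made up): an aggTrade update for
    # BTCUSDT with event time 1700000000000 would produce the key
    # "btcusdt@aggTrade:BTCUSDT:1700000000000".
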
    def _evict_expired(self) -> None:
        """Remove expired entries based on TTL"""
        if not self._cache:
            return

        current_time = time.time()
        expired_keys = []

        # Find expired entries. The cache stays ordered by timestamp
        # because is_duplicate() moves every refreshed key to the end,
        # so once we hit a non-expired entry we can stop scanning.
        for key, timestamp in self._cache.items():
            if current_time - timestamp > self.ttl_seconds:
                expired_keys.append(key)
            else:
                break

        # Remove expired entries
        for key in expired_keys:
            del self._cache[key]
            self.stats["ttl_evictions"] += 1

    def _evict_lru(self) -> None:
        """Remove the least recently used entry"""
        if self._cache:
            # The front of the OrderedDict is the least recently used
            # entry, since hits are moved to the end in is_duplicate()
            self._cache.popitem(last=False)
            self.stats["cache_evictions"] += 1

    def is_duplicate(self, message: Dict[str, Any]) -> bool:
        """
        Check whether a message is a duplicate.

        Args:
            message: Message data to check

        Returns:
            True if duplicate, False if new message
        """
        self.stats["total_checked"] += 1

        # Generate unique key
        key = self._generate_key(message)
        if not key:
            # If we can't generate a key, assume it's not a duplicate
            return False

        # Clean up expired entries periodically
        if self.stats["total_checked"] % 100 == 0:
            self._evict_expired()

        current_time = time.time()

        if key in self._cache:
            # Refresh the timestamp and move the key to the end (LRU update)
            self._cache[key] = current_time
            self._cache.move_to_end(key)

            self.stats["duplicates_found"] += 1
            return True

        # New message - add to cache
        self._cache[key] = current_time

        # Enforce max size
        if len(self._cache) > self.max_size:
            self._evict_lru()

        return False

    def clear(self) -> None:
        """Clear all cache entries"""
        self._cache.clear()
        logger.info("Deduplication cache cleared")

    def get_stats(self) -> Dict[str, Any]:
        """Get deduplication statistics"""
        duplicate_rate = (
            self.stats["duplicates_found"] / self.stats["total_checked"]
            if self.stats["total_checked"] > 0
            else 0.0
        )

        return {
            **self.stats,
            "cache_size": len(self._cache),
            "duplicate_rate": f"{duplicate_rate:.2%}",
        }
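

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): runs a synthetic Binance-style payload
# through the deduplicator twice and prints the resulting statistics. The
# payload values are made up, and running this standalone still requires a
# config module exposing DEDUP_CACHE_SIZE and DEDUP_TTL_SECONDS.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    dedup = MessageDeduplicator(max_size=1000, ttl_seconds=60)

    # Minimal trade-like message; "_stream" is assumed to be attached
    # upstream by the websocket layer before dedup runs
    msg = {"_stream": "btcusdt@aggTrade", "E": 1700000000000, "s": "BTCUSDT"}

    print(dedup.is_duplicate(msg))   # False - first time this key is seen
    print(dedup.is_duplicate(msg))   # True  - same stream/symbol/event time
    # A new event time produces a new key, so the message passes through
    print(dedup.is_duplicate({**msg, "E": 1700000000001}))  # False

    print(dedup.get_stats())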