# tradusai/analysis/data_reader.py
"""
Data reader for fetching market data from Redis Streams
"""
import logging
from typing import Optional, List, Dict, Any
from datetime import datetime, timedelta
import pandas as pd
import redis
import orjson
import requests
import time
from .config import config
logger = logging.getLogger(__name__)


class MarketDataReader:
    """Read and aggregate market data from Redis Streams."""

    def __init__(self):
        self.redis_client = redis.Redis(
            host=config.REDIS_HOST,
            port=config.REDIS_PORT,
            db=config.REDIS_DB,
            decode_responses=False,
        )

    def fetch_historical_klines_from_api(
        self, symbol: str = 'BTCUSDT', interval: str = '5m', limit: int = 200
    ) -> pd.DataFrame:
        """
        Fetch historical kline data from the Binance futures API.

        Args:
            symbol: Trading pair (e.g., 'BTCUSDT')
            interval: Kline interval (e.g., '5m', '15m', '1h', '4h')
            limit: Number of candles to fetch (capped at the API maximum of 1500)

        Returns:
            DataFrame with historical OHLCV data, indexed by timestamp
        """
        try:
            url = 'https://fapi.binance.com/fapi/v1/klines'
            params = {
                'symbol': symbol,
                'interval': interval,
                'limit': min(limit, 1500),  # API maximum per request
            }
            logger.info(f"Fetching {limit} historical candles from Binance API ({symbol} {interval})...")
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()

            # Each kline is a flat list; pick out the OHLCV fields by position
            klines = []
            for item in data:
                klines.append({
                    'timestamp': datetime.fromtimestamp(item[0] / 1000),
                    'open': float(item[1]),
                    'high': float(item[2]),
                    'low': float(item[3]),
                    'close': float(item[4]),
                    'volume': float(item[5]),
                    'quote_volume': float(item[7]),
                    'trades': int(item[8]),
                    'is_closed': True,  # Historical candles are always closed
                })

            df = pd.DataFrame(klines)
            if not df.empty:
                df.set_index('timestamp', inplace=True)
                df.sort_index(inplace=True)
            logger.info(f"✅ Fetched {len(df)} candles from Binance API")
            return df
        except Exception as e:
            logger.error(f"Error fetching from Binance API: {e}")
            return pd.DataFrame()
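
    # Illustrative usage (assumes network access to fapi.binance.com):
    #     reader = MarketDataReader()
    #     df = reader.fetch_historical_klines_from_api('BTCUSDT', '1h', limit=200)
    #     print(df[['close', 'volume']].tail())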

    def read_kline_stream(
        self, stream_key: str, count: Optional[int] = None, use_api_fallback: bool = True
    ) -> pd.DataFrame:
        """
        Read kline data from a Redis Stream and convert it to a DataFrame.

        Only completed candles (x: true) are included. If the stream holds
        fewer candles than requested, historical data is fetched from the
        Binance API to fill the gap.

        Args:
            stream_key: Redis stream key (e.g., 'binance:raw:kline:5m')
            count: Number of recent candles to fetch (default: LOOKBACK_PERIODS)
            use_api_fallback: Whether to fetch from the API when Redis data is insufficient

        Returns:
            DataFrame with OHLCV data, indexed by timestamp
        """
        if count is None:
            count = config.LOOKBACK_PERIODS
        try:
            # Over-fetch by 10x: each candle is re-published many times while
            # it is still open, so only a fraction of the messages are closed,
            # unique candles.
            messages = self.redis_client.xrevrange(stream_key, count=count * 10)
            if not messages:
                logger.warning(f"No data found in stream: {stream_key}")
                if use_api_fallback:
                    return self._fetch_from_api_with_interval(stream_key, count)
                return pd.DataFrame()

            # Parse messages, keeping ONLY completed candles (x: true).
            # Iterate newest-first (XREVRANGE order) so the most recent closed
            # candles are kept; the DataFrame is sorted chronologically below.
            klines = []
            seen_timestamps = set()
            for msg_id, fields in messages:
                data = orjson.loads(fields[b'data'])
                k = data.get('k', {})
                # Skip candles that are still open
                if not k.get('x', False):
                    continue
                # Deduplicate by open timestamp
                timestamp = k['t']
                if timestamp in seen_timestamps:
                    continue
                seen_timestamps.add(timestamp)
                klines.append({
                    'timestamp': datetime.fromtimestamp(k['t'] / 1000),
                    'open': float(k['o']),
                    'high': float(k['h']),
                    'low': float(k['l']),
                    'close': float(k['c']),
                    'volume': float(k['v']),
                    'quote_volume': float(k['q']),
                    'trades': int(k['n']),
                    'is_closed': k['x'],
                })
                # Stop once we have enough candles
                if len(klines) >= count:
                    break

            df = pd.DataFrame(klines)
            if df.empty:
                logger.warning(f"No completed candles found in stream: {stream_key}")
                if use_api_fallback:
                    return self._fetch_from_api_with_interval(stream_key, count)
                return df

            df.set_index('timestamp', inplace=True)
            df.sort_index(inplace=True)
            logger.info(f"Loaded {len(df)} completed candles from {stream_key}")

            # If still insufficient, supplement with API data
            if len(df) < count and use_api_fallback:
                logger.warning(f"Insufficient data: {len(df)}/{count} candles. Fetching from API...")
                api_df = self._fetch_from_api_with_interval(stream_key, count)
                if not api_df.empty:
                    # Merge Redis and API data, preferring Redis rows where
                    # the timestamps overlap
                    combined = pd.concat([api_df, df])
                    combined = combined[~combined.index.duplicated(keep='last')]
                    combined.sort_index(inplace=True)
                    logger.info(f"Combined data: {len(combined)} candles (Redis: {len(df)}, API: {len(api_df)})")
                    return combined
            return df
        except Exception as e:
            logger.error(f"Error reading kline stream {stream_key}: {e}")
            return pd.DataFrame()
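
    # Illustrative usage (assumes a collector is populating the stream):
    #     reader = MarketDataReader()
    #     df_5m = reader.read_kline_stream('binance:raw:kline:5m', count=200)
    #     if not df_5m.empty:
    #         print(df_5m['close'].iloc[-1])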

    def _fetch_from_api_with_interval(self, stream_key: str, count: int) -> pd.DataFrame:
        """Extract the interval from a stream key and fetch that timeframe from the API."""
        try:
            # e.g. 'binance:raw:kline:5m' -> '5m'
            interval = stream_key.split(':')[-1]
            return self.fetch_historical_klines_from_api(
                symbol='BTCUSDT',
                interval=interval,
                limit=count,
            )
        except Exception as e:
            logger.error(f"Error extracting interval from {stream_key}: {e}")
            return pd.DataFrame()

    def read_latest_depth(self) -> Optional[Dict[str, Any]]:
        """
        Read the latest order book depth snapshot.

        Returns:
            Dict with 'timestamp', 'bids', and 'asks', or None if no data
        """
        try:
            messages = self.redis_client.xrevrange(config.DEPTH_KEY, count=1)
            if not messages:
                return None
            msg_id, fields = messages[0]
            data = orjson.loads(fields[b'data'])
            return {
                'timestamp': datetime.fromtimestamp(data['E'] / 1000),
                'bids': [[float(p), float(q)] for p, q in data['b']],
                'asks': [[float(p), float(q)] for p, q in data['a']],
            }
        except Exception as e:
            logger.error(f"Error reading depth data: {e}")
            return None
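
    # Illustrative usage: derive a mid price and a simple book imbalance from
    # the snapshot (bids/asks are [price, quantity] lists, best level first):
    #     depth = reader.read_latest_depth()
    #     if depth:
    #         mid = (depth['bids'][0][0] + depth['asks'][0][0]) / 2
    #         bid_qty = sum(q for _, q in depth['bids'])
    #         ask_qty = sum(q for _, q in depth['asks'])
    #         imbalance = (bid_qty - ask_qty) / (bid_qty + ask_qty)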

    def read_recent_trades(self, count: int = 100) -> List[Dict[str, Any]]:
        """
        Read recent trade data.

        Args:
            count: Number of recent trades to fetch

        Returns:
            List of trade dictionaries, newest first
        """
        try:
            messages = self.redis_client.xrevrange(config.TRADE_KEY, count=count)
            if not messages:
                return []
            trades = []
            for msg_id, fields in messages:
                data = orjson.loads(fields[b'data'])
                trades.append({
                    'timestamp': datetime.fromtimestamp(data['T'] / 1000),
                    'price': float(data['p']),
                    'quantity': float(data['q']),
                    # True: buyer was the maker, i.e. a taker sell;
                    # False: a taker buy
                    'is_buyer_maker': data['m'],
                })
            return trades
        except Exception as e:
            logger.error(f"Error reading trade data: {e}")
            return []
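
    # Illustrative usage: split taker buy/sell volume from recent trades:
    #     trades = reader.read_recent_trades(count=500)
    #     sell_vol = sum(t['quantity'] for t in trades if t['is_buyer_maker'])
    #     buy_vol = sum(t['quantity'] for t in trades if not t['is_buyer_maker'])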

    def get_multi_timeframe_data(self) -> Dict[str, pd.DataFrame]:
        """
        Fetch data for multiple timeframes.

        Returns:
            Dict mapping timeframe label to DataFrame
        """
        # Different timeframes need different amounts of data:
        # shorter timeframes get 200 candles for detailed analysis,
        # longer timeframes need fewer (100 for 1d, 65 for 1w).
        timeframes = {
            '5m': (config.KLINE_5M_KEY, 200),
            '15m': (config.KLINE_15M_KEY, 200),
            '1h': (config.KLINE_1H_KEY, 200),
            '4h': (config.KLINE_4H_KEY, 200),
            '1d': (config.KLINE_1D_KEY, 100),  # 100 days ≈ 3+ months
            '1w': (config.KLINE_1W_KEY, 65),   # 65 weeks ≈ 15 months
        }
        data = {}
        for tf, (key, count) in timeframes.items():
            df = self.read_kline_stream(key, count=count)
            if not df.empty:
                data[tf] = df
        return data
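
    # Illustrative usage: pick out a single timeframe from the result:
    #     mtf = reader.get_multi_timeframe_data()
    #     if '1h' in mtf:
    #         print(mtf['1h'][['open', 'high', 'low', 'close']].tail(3))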

    def get_latest_price(self) -> Optional[float]:
        """Get the latest close price from the 5m kline stream."""
        try:
            df = self.read_kline_stream(config.KLINE_5M_KEY, count=1)
            if not df.empty:
                return float(df.iloc[-1]['close'])
        except Exception as e:
            logger.error(f"Error getting latest price: {e}")
        return None
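

# Minimal smoke test, as a sketch: it assumes a reachable Redis instance as
# configured in .config and, for the API fallback path, network access to
# fapi.binance.com. Because of the relative import above, run it as a module:
#     python -m tradusai.analysis.data_reader
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    reader = MarketDataReader()
    print(f"Latest price: {reader.get_latest_price()}")
    for tf, df in reader.get_multi_timeframe_data().items():
        print(f"{tf}: {len(df)} candles, last close {df['close'].iloc[-1]}")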