"""
Feature Engineering for HK Racing Analytics
Computes features from raw data for ML model
"""
from typing import Dict, Optional
from loguru import logger
import pandas as pd
import numpy as np


class FeatureEngineer:
    """Engineers features for ML predictions.

    Every ``_compute_*`` method returns a dict with a fixed set of keys.
    A value stays ``None`` (``0`` for counters) when it cannot be derived
    from the supplied data; NaN is never emitted in its place. Unexpected
    errors inside a feature group are logged rather than raised, so one bad
    input does not abort the whole feature vector.

    Run DataFrames are read with row 0 as the most recent run.
    """

    # Finishing positions up to and including this value count as "placed".
    _PLACE_CUTOFF = 3
    # Number of leading horse runs used for the form features.
    _FORM_WINDOW = 6
    # Look-back window (days) for the ``runs_in_last_30_days`` counter.
    _FORM_RECENCY_DAYS = 30
    # Number of leading jockey/trainer runs used for their strike rates.
    _CONNECTION_SAMPLE = 50
    # Past runs within +/- this many metres count as "the same distance".
    _DISTANCE_TOLERANCE = 200
    # (highest barrier in band, win rate, place rate); first matching band wins.
    _BARRIER_BANDS = (
        (6, 0.15, 0.35),
        (10, 0.10, 0.30),
        (float('inf'), 0.08, 0.25),
    )

    def __init__(self):
        logger.info("Feature Engineer initialized")

    @staticmethod
    def _has_rows(frame) -> bool:
        """Return True when *frame* is not None and holds at least one row."""
        return frame is not None and len(frame) > 0

    def _win_place_rates(self, positions: pd.Series) -> tuple:
        """Return ``(win_rate, place_rate)`` over the non-null positions.

        Returns ``(None, None)`` when no non-null position is available, which
        avoids a 0/0 division producing NaN.
        """
        valid = positions.dropna()
        total = len(valid)
        if total == 0:
            return None, None
        wins = (valid == 1).sum()
        places = (valid <= self._PLACE_CUTOFF).sum()
        return float(wins / total), float(places / total)

    def _connection_rates(self, runs: pd.DataFrame) -> tuple:
        """Return ``(win_rate, place_rate)`` over the leading rows of *runs*.

        Shared by the jockey and trainer features. Returns ``(None, None)``
        when *runs* is empty or has no ``finishing_position`` column.
        """
        if not self._has_rows(runs):
            return None, None
        sample = runs.head(self._CONNECTION_SAMPLE)
        if 'finishing_position' not in sample.columns:
            return None, None
        return self._win_place_rates(sample['finishing_position'])

    def compute_all_features(
        self,
        horse_runs: pd.DataFrame,
        jockey_runs: pd.DataFrame,
        trainer_runs: pd.DataFrame,
        runner_info: Dict
    ) -> Dict:
        """
        Compute all features for a runner

        Args:
            horse_runs: DataFrame of horse's past runs
            jockey_runs: DataFrame of jockey's recent runs
            trainer_runs: DataFrame of trainer's recent runs
            runner_info: Current runner info

        Returns:
            Dict of feature values, merged from the form, class, distance,
            track, jockey, trainer and barrier feature groups (in that order)
        """
        features = {}

        # Form features
        features.update(self._compute_form_features(horse_runs))

        # Class features
        features.update(self._compute_class_features(horse_runs, runner_info))

        # Distance features
        features.update(self._compute_distance_features(horse_runs, runner_info))

        # Track features
        features.update(self._compute_track_features(horse_runs, runner_info))

        # Jockey features
        features.update(self._compute_jockey_features(jockey_runs))

        # Trainer features
        features.update(self._compute_trainer_features(trainer_runs))

        # Barrier features
        features.update(self._compute_barrier_features(runner_info))

        return features

    def _compute_form_features(self, horse_runs: pd.DataFrame) -> Dict:
        """Compute form-based features from the horse's leading runs.

        Args:
            horse_runs: DataFrame of horse's past runs; ``finishing_position``
                and ``race_date`` columns are used when present

        Returns:
            Dict with average finish position, win rate and place rate over
            the first six rows, days since the run in row 0, and the number
            of those rows dated within the last 30 days
        """
        features = {
            'avg_finish_pos_last_6': None,
            'win_rate_last_6': None,
            'place_rate_last_6': None,
            'days_since_last_run': None,
            'runs_in_last_30_days': 0
        }

        if not self._has_rows(horse_runs):
            return features

        try:
            recent = horse_runs.head(self._FORM_WINDOW)

            if 'finishing_position' in recent.columns:
                positions = recent['finishing_position'].dropna()
                if len(positions) > 0:
                    features['avg_finish_pos_last_6'] = float(positions.mean())
                    win_rate, place_rate = self._win_place_rates(positions)
                    features['win_rate_last_6'] = win_rate
                    features['place_rate_last_6'] = place_rate

            if 'race_date' in recent.columns:
                # Take "now" once so both date features share one reference.
                now = pd.Timestamp.now()

                last_date = pd.to_datetime(recent['race_date'].iloc[0])
                # A missing date (NaT) would otherwise yield NaN days.
                if pd.notna(last_date):
                    features['days_since_last_run'] = (now - last_date).days

                cutoff = now - pd.Timedelta(days=self._FORM_RECENCY_DAYS)
                race_dates = pd.to_datetime(recent['race_date'])
                features['runs_in_last_30_days'] = int((race_dates > cutoff).sum())

        except Exception as e:
            logger.error(f"Error computing form features: {e}")

        return features

    def _compute_class_features(self, horse_runs: pd.DataFrame, runner_info: Dict) -> Dict:
        """Compute class-based features.

        Args:
            horse_runs: DataFrame of horse's past runs; row 0 supplies the
                previous ``handicap_rating`` when that column exists
            runner_info: Current runner info; ``handicap_rating`` is read
                (defaults to 0 when the key is absent)

        Returns:
            Dict with the current rating and its difference from the rating
            in row 0. The difference is 0 when the column is absent and
            ``None`` when either rating is missing.
        """
        features = {
            'class_rating': None,
            'class_diff_from_last': None
        }

        try:
            current_rating = runner_info.get('handicap_rating', 0)
            features['class_rating'] = current_rating

            # An explicit None rating means "unknown": no diff can be formed.
            if current_rating is not None and self._has_rows(horse_runs):
                last_rating = horse_runs.iloc[0].get('handicap_rating', current_rating)
                # A null previous rating would otherwise yield a NaN diff.
                if pd.notna(last_rating):
                    features['class_diff_from_last'] = current_rating - last_rating

        except Exception as e:
            logger.error(f"Error computing class features: {e}")

        return features

    def _compute_distance_features(self, horse_runs: pd.DataFrame, runner_info: Dict) -> Dict:
        """Compute distance-based features.

        Args:
            horse_runs: DataFrame of horse's past runs; ``distance``,
                ``finishing_position`` and ``margin`` columns are used when
                present
            runner_info: Current runner info; ``distance`` is read

        Returns:
            Dict with the number of past runs within 200 of the current
            distance, plus win rate, place rate and mean margin over those
            runs. Rates and margin stay ``None`` when no non-null value
            exists.
        """
        features = {
            'distance_starts': 0,
            'distance_win_rate': None,
            'distance_place_rate': None,
            'distance_avg_margin': None
        }

        try:
            # A missing or None distance is treated as 0, i.e. "unknown".
            current_distance = runner_info.get('distance') or 0

            if (
                self._has_rows(horse_runs)
                and current_distance > 0
                and 'distance' in horse_runs.columns
            ):
                gap = (horse_runs['distance'] - current_distance).abs()
                distance_runs = horse_runs[gap <= self._DISTANCE_TOLERANCE]

                features['distance_starts'] = len(distance_runs)

                if len(distance_runs) > 0:
                    if 'finishing_position' in distance_runs.columns:
                        win_rate, place_rate = self._win_place_rates(
                            distance_runs['finishing_position']
                        )
                        features['distance_win_rate'] = win_rate
                        features['distance_place_rate'] = place_rate

                    if 'margin' in distance_runs.columns:
                        avg_margin = distance_runs['margin'].mean()
                        # mean() of an all-null column is NaN; keep None.
                        if pd.notna(avg_margin):
                            features['distance_avg_margin'] = float(avg_margin)

        except Exception as e:
            logger.error(f"Error computing distance features: {e}")

        return features

    def _compute_track_features(self, horse_runs: pd.DataFrame, runner_info: Dict) -> Dict:
        """Compute track-based features.

        Args:
            horse_runs: DataFrame of horse's past runs; ``venue`` and
                ``finishing_position`` columns are used when present
            runner_info: Current runner info; ``venue`` is read

        Returns:
            Dict with the number of past runs at the current venue and the
            win/place rates over them. Rates stay ``None`` when no non-null
            finishing position exists.
        """
        features = {
            'track_starts': 0,
            'track_win_rate': None,
            'track_place_rate': None
        }

        try:
            current_venue = runner_info.get('venue', '')

            if (
                self._has_rows(horse_runs)
                and current_venue
                and 'venue' in horse_runs.columns
            ):
                track_runs = horse_runs[horse_runs['venue'] == current_venue]

                features['track_starts'] = len(track_runs)

                if len(track_runs) > 0 and 'finishing_position' in track_runs.columns:
                    win_rate, place_rate = self._win_place_rates(
                        track_runs['finishing_position']
                    )
                    features['track_win_rate'] = win_rate
                    features['track_place_rate'] = place_rate

        except Exception as e:
            logger.error(f"Error computing track features: {e}")

        return features

    def _compute_jockey_features(self, jockey_runs: pd.DataFrame) -> Dict:
        """Compute jockey-based features.

        The ``*_30_days`` rates are taken over the first 50 rows of
        *jockey_runs*; no date filter is applied here, so the caller is
        presumably expected to pass runs already limited to the window
        (TODO confirm). ``jockey_horse_win_rate`` is not computed and is
        always ``None``.

        Args:
            jockey_runs: DataFrame of jockey's recent runs

        Returns:
            Dict of jockey win/place rates (``None`` when unavailable)
        """
        features = {
            'jockey_win_rate_30_days': None,
            'jockey_place_rate_30_days': None,
            'jockey_horse_win_rate': None
        }

        try:
            win_rate, place_rate = self._connection_rates(jockey_runs)
            features['jockey_win_rate_30_days'] = win_rate
            features['jockey_place_rate_30_days'] = place_rate

        except Exception as e:
            logger.error(f"Error computing jockey features: {e}")

        return features

    def _compute_trainer_features(self, trainer_runs: pd.DataFrame) -> Dict:
        """Compute trainer-based features.

        The ``*_30_days`` rates are taken over the first 50 rows of
        *trainer_runs*; no date filter is applied here, so the caller is
        presumably expected to pass runs already limited to the window
        (TODO confirm). ``trainer_horse_win_rate`` is not computed and is
        always ``None``.

        Args:
            trainer_runs: DataFrame of trainer's recent runs

        Returns:
            Dict of trainer win/place rates (``None`` when unavailable)
        """
        features = {
            'trainer_win_rate_30_days': None,
            'trainer_place_rate_30_days': None,
            'trainer_horse_win_rate': None
        }

        try:
            win_rate, place_rate = self._connection_rates(trainer_runs)
            features['trainer_win_rate_30_days'] = win_rate
            features['trainer_place_rate_30_days'] = place_rate

        except Exception as e:
            logger.error(f"Error computing trainer features: {e}")

        return features

    def _compute_barrier_features(self, runner_info: Dict) -> Dict:
        """Compute barrier-based features.

        Uses fixed, generic rates per barrier band (1-6, 7-10, 11+) rather
        than statistics derived from historical data.

        Args:
            runner_info: Current runner info; ``barrier`` is read

        Returns:
            Dict with barrier win/place rates; both stay ``None`` when the
            barrier is missing, ``None`` or not positive
        """
        features = {
            'barrier_win_rate': None,
            'barrier_place_rate': None
        }

        try:
            # A missing or None barrier is treated as 0, i.e. "unknown".
            barrier = runner_info.get('barrier') or 0

            if barrier > 0:
                for upper_bound, win_rate, place_rate in self._BARRIER_BANDS:
                    if barrier <= upper_bound:
                        features['barrier_win_rate'] = win_rate
                        features['barrier_place_rate'] = place_rate
                        break

        except Exception as e:
            logger.error(f"Error computing barrier features: {e}")

        return features


if __name__ == "__main__":
    # Manual smoke test: build one mock runner and print its feature dict.
    # NOTE(review): the mock dates ascend, so row 0 is the oldest run, while
    # FeatureEngineer reads row 0 as the most recent one — confirm intent.
    mock_history = pd.DataFrame({
        'race_date': pd.date_range(end=pd.Timestamp.now(), periods=6),
        'venue': ['ST', 'HV', 'ST', 'ST', 'HV', 'ST'],
        'distance': [1200, 1400, 1200, 1600, 1200, 1200],
        'handicap_rating': [80, 82, 84, 85, 86, 88],
        'finishing_position': [1, 3, 2, 5, 4, 1],
    })

    # The runner being scored today.
    mock_runner = dict(venue='ST', distance=1200, barrier=3, handicap_rating=88)

    # No jockey or trainer history is supplied in this demo.
    no_runs = pd.DataFrame()

    features = FeatureEngineer().compute_all_features(
        horse_runs=mock_history,
        jockey_runs=no_runs,
        trainer_runs=no_runs,
        runner_info=mock_runner,
    )

    print("Features:", features)
