"""
HKJC Playwright Scraper - Extracts Real Race Data
"""
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
from typing import List, Dict, Optional
from loguru import logger
from datetime import datetime, timedelta
import re


class HKJCRealScraper:
    """Scrapes real race-card data from the HKJC website using Playwright.

    Usage: instantiate, then call :meth:`get_race_card` with a date/venue.
    All scraping failures are logged and degrade to empty results rather
    than raising, so callers always receive a well-formed race-card dict.
    """

    BASE_URL = "https://racing.hkjc.com/racing/information/english/racing/racecard.aspx"

    # Compiled once for the whole class: extracts the horse ID from an
    # HKJC horse-detail URL (e.g. ...?horseid=HK_2021_E123).
    _HORSE_ID_RE = re.compile(r'horseid=([A-Z0-9_]+)')

    # HKJC fields at most 14 runners per race. Used by the fallback parser
    # to split a flat list of horse links into races.
    MAX_RUNNERS_PER_RACE = 14

    def __init__(self, headless: bool = True, timeout: int = 60000):
        """
        Args:
            headless: Run the browser without a visible window.
            timeout: Page-navigation timeout in milliseconds.
        """
        self.headless = headless
        self.timeout = timeout
        logger.info("HKJC Real Scraper initialized")

    def get_race_card(self, race_date: str, venue: str = "ST") -> Dict:
        """
        Get the race card for a specific date.

        Args:
            race_date: Date in YYYYMMDD format.
            venue: "ST" (Sha Tin) or "HV" (Happy Valley).

        Returns:
            Dict with keys 'race_date', 'venue' and 'races' (a list of race
            dicts; empty if the page could not be fetched or parsed).
        """
        race_card = {
            'race_date': race_date,
            'venue': venue,
            'races': []
        }

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=self.headless)
            page = browser.new_page()

            try:
                logger.info(f"Fetching race card for {race_date}")
                page.goto(self.BASE_URL, timeout=self.timeout)
                # The race card is rendered client-side; give the JS a
                # moment to populate the tables before parsing.
                page.wait_for_timeout(3000)

                races = self._parse_races(page, race_date, venue)
                race_card['races'] = races

                logger.info(f"Parsed {len(races)} races")

            except Exception as e:
                # Best-effort scrape: log and return the (empty) card
                # rather than propagating network/parse failures.
                logger.error(f"Error fetching race card: {e}")
            finally:
                browser.close()

        return race_card

    def _parse_races(self, page, race_date: str, venue: str) -> List[Dict]:
        """Parse all races from the loaded page.

        Primary strategy: walk the 'bg_white' tables HKJC uses for race
        cards, one table per race. Falls back to
        :meth:`_parse_races_alternative` if that yields nothing.
        """
        races = []

        try:
            # HKJC renders each race as a table with class 'bg_white'.
            race_tables = page.query_selector_all('table.bg_white')

            logger.info(f"Found {len(race_tables)} race tables")

            race_num = 0
            for table in race_tables:
                rows = table.query_selector_all('tr')

                # A table with only a header row carries no runner data.
                if len(rows) > 1:
                    race_num += 1
                    race = {
                        'race_id': f"{race_date}_{venue}_{race_num}",
                        'race_number': race_num,
                        # Distance is not available on this view; callers
                        # treat 0 as "unknown".
                        'distance': 0,
                        'track': 'Turf',
                        'runners': []
                    }

                    # Parse each data row (skip the header row) for a runner.
                    for row in rows[1:]:
                        runner = self._parse_runner(row, race['race_id'])
                        if runner:
                            race['runners'].append(runner)

                    if race['runners']:
                        races.append(race)

            # The table layout changes occasionally; fall back to scanning
            # horse links directly when no race tables matched.
            if not races:
                logger.info("Trying alternative parsing method...")
                races = self._parse_races_alternative(page, race_date, venue)

        except Exception as e:
            logger.error(f"Error parsing races: {e}")

        return races

    def _parse_runner(self, row, race_id: str) -> Optional[Dict]:
        """Parse a single runner from a table row.

        Returns a runner dict, or None when the row is not a valid runner
        (too few cells, or no plausible horse name). Jockey/trainer/weight
        fields are not present on this view and are returned empty/zero.
        """
        try:
            cells = row.query_selector_all('td')

            if len(cells) < 3:
                return None

            # First cell: saddle-cloth number.
            horse_number_text = cells[0].inner_text().strip()
            horse_number = int(horse_number_text) if horse_number_text.isdigit() else 0

            # Second cell: horse name, linked to the horse-detail page
            # whose URL carries the horse ID.
            horse_link = cells[1].query_selector('a')
            if horse_link:
                horse_name = horse_link.inner_text().strip()
                href = horse_link.get_attribute('href') or ''

                horse_id_match = self._HORSE_ID_RE.search(href)
                horse_id = horse_id_match.group(1) if horse_id_match else f'H{horse_number:03d}'
            else:
                horse_name = cells[1].inner_text().strip()
                horse_id = f'H{horse_number:03d}'

            # Reject rows whose "name" is empty or a single stray character.
            if not horse_name or len(horse_name) < 2:
                return None

            return {
                'horse_number': horse_number,
                'horse': {
                    'horse_id': horse_id,
                    'horse_name': horse_name
                },
                'jockey': {
                    'jockey_id': '',
                    'jockey_name': ''
                },
                'trainer': {
                    'trainer_id': '',
                    'trainer_name': ''
                },
                'weight_carried': 0,
                # Barrier draw is not on this view; use the horse number
                # as a stand-in so downstream code has a plausible value.
                'barrier': horse_number,
                'handicap_rating': 0,
                'race_id': race_id
            }

        except Exception as e:
            logger.error(f"Error parsing runner: {e}")
            return None

    def _parse_races_alternative(self, page, race_date: str, venue: str) -> List[Dict]:
        """Fallback parser: group horse-detail links into races.

        Scans every anchor whose href carries a horse ID and chunks them
        into races of at most MAX_RUNNERS_PER_RACE runners. Race
        boundaries are therefore approximate — this is a degraded mode
        used only when the table layout could not be parsed.
        """
        races = []

        try:
            horse_links = page.query_selector_all('a[href*="horseid="]')

            logger.info(f"Found {len(horse_links)} horse links")

            if horse_links:
                race_num = 1
                runners = []

                for link in horse_links:
                    horse_name = link.inner_text().strip()
                    href = link.get_attribute('href') or ''

                    horse_id_match = self._HORSE_ID_RE.search(href)

                    horse_number = len(runners) + 1
                    # Same fallback ID scheme as _parse_runner for
                    # consistency across the two parsing paths.
                    horse_id = horse_id_match.group(1) if horse_id_match else f'H{horse_number:03d}'

                    runners.append({
                        'horse_number': horse_number,
                        'horse': {
                            'horse_id': horse_id,
                            'horse_name': horse_name
                        },
                        'jockey': {'jockey_id': '', 'jockey_name': ''},
                        'trainer': {'trainer_id': '', 'trainer_name': ''},
                        'weight_carried': 0,
                        'barrier': horse_number,
                        'handicap_rating': 0,
                        'race_id': f"{race_date}_{venue}_{race_num}"
                    })

                    # A full field means the next link starts a new race.
                    if len(runners) >= self.MAX_RUNNERS_PER_RACE:
                        races.append({
                            'race_id': f"{race_date}_{venue}_{race_num}",
                            'race_number': race_num,
                            'distance': 1200,
                            'track': 'Turf',
                            'runners': runners
                        })
                        runners = []
                        race_num += 1

                # Any leftover horses form the final (short-field) race.
                if runners:
                    races.append({
                        'race_id': f"{race_date}_{venue}_{race_num}",
                        'race_number': race_num,
                        'distance': 1200,
                        'track': 'Turf',
                        'runners': runners
                    })

        except Exception as e:
            logger.error(f"Alternative parsing error: {e}")

        return races

    def get_upcoming_race_dates(self) -> List[Dict]:
        """Generate the typical HKJC race days in the next 14 days.

        HKJC races on Wednesday (Happy Valley) and Saturday/Sunday
        (Sha Tin). This is a schedule heuristic, not scraped data.

        Returns:
            List of dicts with 'date' (YYYYMMDD str), 'date_obj'
            (datetime), 'venue' ("HV"/"ST") and 'description'.
        """
        dates = []

        today = datetime.now()
        for i in range(14):
            check_date = today + timedelta(days=i)
            # weekday(): Monday == 0, so 2/5/6 are Wed/Sat/Sun.
            if check_date.weekday() in [2, 5, 6]:
                venue = "HV" if check_date.weekday() == 2 else "ST"
                dates.append({
                    'date': check_date.strftime('%Y%m%d'),
                    'date_obj': check_date,
                    'venue': venue,
                    'description': 'Happy Valley' if venue == 'HV' else 'Sha Tin'
                })

        return dates


# Manual smoke test: fetch the next scheduled meeting and print a summary.
if __name__ == "__main__":
    hkjc = HKJCRealScraper(headless=True)

    upcoming = hkjc.get_upcoming_race_dates()
    next_meeting = upcoming[0]
    print(f"Testing with date: {next_meeting['date']}")

    result = hkjc.get_race_card(next_meeting['date'], next_meeting['venue'])

    print(f"\nRaces: {len(result['races'])}")
    # Show the first couple of races as a sanity check.
    for entry in result['races'][:2]:
        print(f"  Race {entry['race_number']}: {len(entry['runners'])} runners")
        if entry['runners']:
            print(f"    First: {entry['runners'][0]['horse']['horse_name']}")
