import logging
from functools import lru_cache
from typing import Optional, Dict

import pandas as pd
from huggingface_hub import snapshot_download

from config import CONFIG

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DataManager:
    """Lazily load and memoize the parquet datasets named in ``CONFIG["dataset"]``.

    Each dataset is read from disk on first access of its property and then
    kept on the instance until :meth:`refresh_datasets` succeeds, at which
    point the memoized frames are dropped so the next access re-reads them.
    """

    def __init__(self) -> None:
        # Per-instance memo of each loaded frame; ``None`` means "not loaded yet".
        self._leaderboard_data: Optional[pd.DataFrame] = None
        self._responses_data: Optional[pd.DataFrame] = None
        self._section_results_data: Optional[pd.DataFrame] = None

    def _load_dataset(self, path: str) -> pd.DataFrame:
        """Read the parquet file at *path* into a DataFrame.

        Memoization is done by the calling properties (one attribute per
        dataset), so this method always reads from disk.

        Raises:
            RuntimeError: if the file cannot be read; the underlying
                exception is attached as ``__cause__``.
        """
        try:
            return pd.read_parquet(path)
        except Exception as e:
            logger.error("Error loading dataset from %s: %s", path, e)
            raise RuntimeError(f"Failed to load dataset: {e}") from e

    def refresh_datasets(self) -> None:
        """Re-download the dataset snapshot and invalidate loaded frames.

        Best-effort: a failed download is logged and the previously loaded
        frames are kept, so callers keep working with the old data.
        """
        try:
            # NOTE(review): repo_id has no "namespace/name" form here —
            # confirm this is the intended dataset repository id.
            snapshot_download(
                repo_id="alibayram",
                repo_type="dataset",
                local_dir=CONFIG["dataset"].cache_dir,
            )
        except Exception as e:
            logger.error("Error refreshing datasets: %s", e)
            return

        # Drop the memoized frames so the properties re-read the new files.
        self._leaderboard_data = None
        self._responses_data = None
        self._section_results_data = None
        logger.info("Datasets refreshed successfully")

    @property
    def leaderboard_data(self) -> pd.DataFrame:
        """Frame read from ``CONFIG["dataset"].leaderboard_path`` (loaded on first access)."""
        if self._leaderboard_data is None:
            self._leaderboard_data = self._load_dataset(
                CONFIG["dataset"].leaderboard_path
            )
        return self._leaderboard_data

    @property
    def responses_data(self) -> pd.DataFrame:
        """Frame read from ``CONFIG["dataset"].responses_path`` (loaded on first access)."""
        if self._responses_data is None:
            self._responses_data = self._load_dataset(
                CONFIG["dataset"].responses_path
            )
        return self._responses_data

    @property
    def section_results_data(self) -> pd.DataFrame:
        """Frame read from ``CONFIG["dataset"].section_results_path`` (loaded on first access)."""
        if self._section_results_data is None:
            self._section_results_data = self._load_dataset(
                CONFIG["dataset"].section_results_path
            )
        return self._section_results_data


# Global instance
data_manager = DataManager()