# environment_loader.py
'''Utility module to load .xlsx files from environment-specific folders and optional Huggingface repositories.

Local reports are looked up under ``<base_path>/<env_code>/XLSX``. A workbook
is selected either as the most recently modified file in that folder or as the
file belonging to a given calendar date (date in the file name first, file
modification date as fallback).
'''
import os
from datetime import date, datetime, timedelta
from pathlib import Path
from typing import Optional, Union

import pandas as pd

# Optional Huggingface support
try:
    from huggingface_hub import hf_hub_download
except ImportError:
    hf_hub_download = None

# Anything accepted by ``pathlib.Path(...)``.
_PathInput = Union[str, os.PathLike]

# Excel writes a hidden owner/lock file named "~$<workbook>.xlsx" next to a
# workbook while it is open. It matches "*.xlsx" but is not a readable
# workbook, so it must never be selected.
_EXCEL_LOCK_PREFIX = '~$'


def _list_workbooks(directory_path: _PathInput, pattern: str) -> list:
    '''Return regular files in directory_path matching pattern, skipping Excel lock files.'''
    return [
        entry
        for entry in Path(directory_path).glob(pattern)
        if entry.is_file() and not entry.name.startswith(_EXCEL_LOCK_PREFIX)
    ]


def _most_recent(files: list) -> Optional[Path]:
    '''Return the file with the greatest modification time, or None for an empty list.'''
    return max(files, key=lambda f: f.stat().st_mtime, default=None)


def _require_hf_hub_download():
    '''Return ``hf_hub_download`` or raise ImportError if huggingface_hub is missing.'''
    if hf_hub_download is None:
        raise ImportError('huggingface_hub is not installed. Please pip install huggingface_hub')
    return hf_hub_download


def get_latest_file_in_directory(directory_path: _PathInput, pattern: str = '*.xlsx') -> Optional[Path]:
    '''Return the Path to the latest modified file matching pattern in directory_path.

    Directories and Excel lock files (``~$*.xlsx``) that happen to match the
    pattern are ignored.

    Args:
        directory_path: Folder to search (not recursive).
        pattern: Glob pattern applied inside the folder.

    Returns:
        The most recently modified matching file, or None if there is none.
    '''
    return _most_recent(_list_workbooks(directory_path, pattern))


def get_file_by_date(directory_path: _PathInput, target_date: date, pattern: str = '*.xlsx') -> Optional[Path]:
    '''Return Path to the file whose name contains target_date or whose modification date matches target_date.

    Files whose name contains the date formatted as ``YYYY-MM-DD`` take
    precedence. Only if no name matches are files selected by their local
    modification date. When several files qualify, the most recently modified
    one is returned.

    Args:
        directory_path: Folder to search (not recursive).
        target_date: Calendar date to look for. A ``datetime`` is accepted and
            reduced to its date part.
        pattern: Glob pattern applied inside the folder.

    Returns:
        The matching file, or None if nothing matches.
    '''
    # datetime is a subclass of date, but ``date == datetime`` is always
    # False, so the modification-date fallback would silently never match.
    if isinstance(target_date, datetime):
        target_date = target_date.date()

    date_str = target_date.strftime('%Y-%m-%d')
    files = _list_workbooks(directory_path, pattern)

    # First try matching date string in filename
    named = [f for f in files if date_str in f.name]
    if named:
        return _most_recent(named)

    # Fallback to checking file modification date
    modified_that_day = [
        f for f in files
        if datetime.fromtimestamp(f.stat().st_mtime).date() == target_date
    ]
    return _most_recent(modified_that_day)


def load_latest_xlsx_for_env(env_code: str, base_path: str = 'Q:/Selenium_Reports', use_date: bool = False, target_date: Optional[date] = None) -> pd.DataFrame:
    '''Load the latest or date-specific .xlsx file for the given environment code.

    Args:
        env_code: Environment folder name below base_path.
        base_path: Root folder containing one sub-folder per environment.
        use_date: If True, pick the file for target_date instead of the latest.
        target_date: Date to look up; required when use_date is True.

    Returns:
        The workbook read with ``pandas.read_excel``.

    Raises:
        FileNotFoundError: If the environment folder or a matching file is missing.
        ValueError: If use_date is True and target_date is None.
    '''
    # Path selection is shared with get_latest_xlsx_path_for_env so the two
    # functions cannot drift apart.
    file_path = get_latest_xlsx_path_for_env(
        env_code=env_code,
        base_path=base_path,
        use_date=use_date,
        target_date=target_date,
    )
    return pd.read_excel(file_path)


def load_environments(env_codes: list, base_path: str = 'Q:/Selenium_Reports', by_date: bool = False, days_ago: int = 1) -> dict:
    '''Load DataFrame for each environment code; by_date chooses file from days_ago days before.

    Args:
        env_codes: Environment folder names to load.
        base_path: Root folder containing one sub-folder per environment.
        by_date: If True, select the file dated days_ago days before today
            instead of the latest file.
        days_ago: Offset in days from today's local date; used only with by_date.

    Returns:
        Mapping of environment code to its DataFrame.

    Raises:
        FileNotFoundError: If any environment folder or file is missing.
    '''
    target_date = datetime.now().date() - timedelta(days=days_ago) if by_date else None
    return {
        env: load_latest_xlsx_for_env(
            env_code=env,
            base_path=base_path,
            use_date=by_date,
            target_date=target_date,
        )
        for env in env_codes
    }


def load_from_huggingface(repo_id: str, filenames: list, revision: str = 'main') -> dict:
    '''Download files from a Huggingface repo and load as DataFrames.

    Args:
        repo_id: Repository identifier passed to ``hf_hub_download``.
        filenames: File names inside the repository.
        revision: Revision passed to ``hf_hub_download``.

    Returns:
        Mapping of file name to the DataFrame read with ``pandas.read_excel``.

    Raises:
        ImportError: If huggingface_hub is not installed.
    '''
    local_paths = get_huggingface_paths(repo_id, filenames, revision=revision)
    return {fname: pd.read_excel(path) for fname, path in local_paths.items()}


def get_latest_xlsx_path_for_env(env_code: str, base_path: str = 'Q:/Selenium_Reports', use_date: bool = False, target_date: Optional[date] = None) -> Path:
    '''Return the Path to the desired .xlsx file for the given environment code without loading.

    Args:
        env_code: Environment folder name below base_path.
        base_path: Root folder containing one sub-folder per environment.
        use_date: If True, pick the file for target_date instead of the latest.
        target_date: Date to look up; required when use_date is True.

    Returns:
        Path of the selected workbook inside ``<base_path>/<env_code>/XLSX``.

    Raises:
        FileNotFoundError: If the environment folder or a matching file is missing.
        ValueError: If use_date is True and target_date is None.
    '''
    folder = Path(base_path) / env_code / 'XLSX'
    if not folder.exists():
        raise FileNotFoundError(f"Environment folder not found: {folder}")

    if use_date:
        if target_date is None:
            raise ValueError('target_date must be provided when use_date is True')
        file_path = get_file_by_date(folder, target_date)
    else:
        file_path = get_latest_file_in_directory(folder)

    if file_path is None:
        raise FileNotFoundError(f'No .xlsx files found for environment {env_code} in {folder}')
    return file_path


def get_environments_paths(env_codes: list, base_path: str = 'Q:/Selenium_Reports', by_date: bool = False, days_ago: int = 1) -> dict:
    '''Return file Paths for each environment code; by_date chooses file from days_ago days before.

    Args:
        env_codes: Environment folder names to resolve.
        base_path: Root folder containing one sub-folder per environment.
        by_date: If True, select the file dated days_ago days before today
            instead of the latest file.
        days_ago: Offset in days from today's local date; used only with by_date.

    Returns:
        Mapping of environment code to the selected workbook Path.

    Raises:
        FileNotFoundError: If any environment folder or file is missing.
    '''
    target_date = datetime.now().date() - timedelta(days=days_ago) if by_date else None
    return {
        env: get_latest_xlsx_path_for_env(
            env_code=env,
            base_path=base_path,
            use_date=by_date,
            target_date=target_date,
        )
        for env in env_codes
    }


def get_huggingface_paths(repo_id: str, filenames: list, revision: str = 'main') -> dict:
    '''Download files from a Huggingface repo and return local Paths without loading.

    Args:
        repo_id: Repository identifier passed to ``hf_hub_download``.
        filenames: File names inside the repository.
        revision: Revision passed to ``hf_hub_download``.

    Returns:
        Mapping of file name to the local Path returned by the download.

    Raises:
        ImportError: If huggingface_hub is not installed.
    '''
    download = _require_hf_hub_download()
    return {
        fname: Path(download(repo_id=repo_id, filename=fname, revision=revision))
        for fname in filenames
    }