# [page residue] Spaces: Sleeping
# environment_loader.py
'''Utility module to load .xlsx files from environment-specific folders and, optionally, Hugging Face repositories.'''
import os | |
from pathlib import Path | |
import pandas as pd | |
from datetime import datetime, timedelta | |
# Optional Huggingface support | |
try: | |
from huggingface_hub import hf_hub_download | |
except ImportError: | |
hf_hub_download = None | |
def get_latest_file_in_directory(directory_path: str, pattern: str = '*.xlsx') -> Path:
    '''Return the most recently modified regular file matching pattern in directory_path.

    Args:
        directory_path: Folder whose direct entries are globbed.
        pattern: Glob pattern applied inside that folder.

    Returns:
        The matching Path with the greatest modification time, or None when
        no usable file matches.
    '''
    candidates = [
        entry
        for entry in Path(directory_path).glob(pattern)
        # Directories can match the pattern too, and Office leaves a
        # '~$<name>' owner/lock file next to a workbook that is open; that
        # lock file is usually the newest entry but is not a readable workbook.
        if entry.is_file() and not entry.name.startswith('~$')
    ]
    # default=None keeps the "nothing found" contract without a separate check.
    return max(candidates, key=lambda entry: entry.stat().st_mtime, default=None)
def get_file_by_date(directory_path: str, target_date: datetime.date, pattern: str = '*.xlsx') -> Path:
    '''Return the file for target_date, matched by filename first and by modification date second.

    Args:
        directory_path: Folder whose direct entries are globbed.
        target_date: Calendar day to look for; a datetime is reduced to its date.
        pattern: Glob pattern applied inside that folder.

    Returns:
        The newest file whose name contains the date formatted as YYYY-MM-DD.
        If no name matches, the newest file whose local modification date
        equals target_date. None when neither search finds a file.
    '''
    # NOTE: with `from datetime import datetime` in scope, the annotation
    # `datetime.date` resolves to the method, not the `date` class; it is left
    # unchanged so the signature stays as callers see it today.
    # A datetime never compares equal to a date, so the modification-date
    # fallback below would silently match nothing; reduce it to a date first.
    if isinstance(target_date, datetime):
        target_date = target_date.date()
    date_str = target_date.strftime('%Y-%m-%d')

    # List the folder once; skip directories and Office '~$' owner/lock files,
    # neither of which is a readable workbook.
    files = [
        entry
        for entry in Path(directory_path).glob(pattern)
        if entry.is_file() and not entry.name.startswith('~$')
    ]

    # First try matching the date string in the filename.
    matches = [entry for entry in files if date_str in entry.name]
    if not matches:
        # Fall back to the file's (local-time) modification date.
        matches = [
            entry
            for entry in files
            if datetime.fromtimestamp(entry.stat().st_mtime).date() == target_date
        ]
    return max(matches, key=lambda entry: entry.stat().st_mtime, default=None)
def load_latest_xlsx_for_env(env_code: str,
                             base_path: str = 'Q:/Selenium_Reports',
                             use_date: bool = False,
                             target_date: datetime.date = None) -> pd.DataFrame:
    '''Load the latest or date-specific .xlsx file for the given environment code.

    Args:
        env_code: Environment sub-folder name under base_path.
        base_path: Root folder; files are looked up in <base_path>/<env_code>/XLSX.
        use_date: When true, select the file for target_date instead of the newest one.
        target_date: Day to select; required when use_date is true.

    Returns:
        The selected workbook read with pandas.read_excel.

    Raises:
        FileNotFoundError: The environment folder is missing or no file was selected.
        ValueError: use_date is true but target_date is None.
    '''
    # File selection (and every error above) lives in get_latest_xlsx_path_for_env;
    # delegating keeps the "load" and "path" variants from drifting apart.
    file_path = get_latest_xlsx_path_for_env(
        env_code=env_code,
        base_path=base_path,
        use_date=use_date,
        target_date=target_date,
    )
    return pd.read_excel(file_path)
def load_environments(env_codes: list,
                      base_path: str = 'Q:/Selenium_Reports',
                      by_date: bool = False,
                      days_ago: int = 1) -> dict:
    '''Build a mapping of environment code to the DataFrame loaded for it.

    The target day is today minus days_ago days; it is only used for file
    selection when by_date is true. Any error raised while loading one
    environment propagates and aborts the whole call.
    '''
    wanted_day = datetime.now().date() - timedelta(days=days_ago)
    return {
        code: load_latest_xlsx_for_env(
            env_code=code,
            base_path=base_path,
            use_date=by_date,
            target_date=wanted_day,
        )
        for code in env_codes
    }
def load_from_huggingface(repo_id: str,
                          filenames: list,
                          revision: str = 'main') -> dict:
    '''Fetch each named file from a Huggingface repo and read it into a DataFrame.

    Returns a dict keyed by the requested filename. Raises ImportError when
    the optional huggingface_hub dependency was not importable.
    '''
    if hf_hub_download is None:
        raise ImportError('huggingface_hub is not installed. Please pip install huggingface_hub')
    # Each file is downloaded and then read before moving on to the next name.
    return {
        name: pd.read_excel(
            hf_hub_download(repo_id=repo_id, filename=name, revision=revision)
        )
        for name in filenames
    }
def get_latest_xlsx_path_for_env(env_code: str,
                                 base_path: str = 'Q:/Selenium_Reports',
                                 use_date: bool = False,
                                 target_date: datetime.date = None) -> Path:
    '''Locate, without loading it, the .xlsx file to use for an environment code.

    Files are searched in <base_path>/<env_code>/XLSX. With use_date the file
    for target_date is chosen; otherwise the most recently modified one.

    Raises:
        FileNotFoundError: The folder does not exist or no file was selected.
        ValueError: use_date is true but target_date is None.
    '''
    folder = Path(base_path).joinpath(env_code, 'XLSX')
    if not folder.exists():
        raise FileNotFoundError(f"Environment folder not found: {folder}")
    if use_date and target_date is None:
        raise ValueError('target_date must be provided when use_date is True')

    chosen = (
        get_file_by_date(folder, target_date)
        if use_date
        else get_latest_file_in_directory(folder)
    )
    if chosen is None:
        raise FileNotFoundError(f'No .xlsx files found for environment {env_code} in {folder}')
    return chosen
def get_environments_paths(env_codes: list,
                           base_path: str = 'Q:/Selenium_Reports',
                           by_date: bool = False,
                           days_ago: int = 1) -> dict:
    '''Build a mapping of environment code to the Path of its selected .xlsx file.

    The target day is today minus days_ago days; it is only used for file
    selection when by_date is true. Any error raised while resolving one
    environment propagates and aborts the whole call.
    '''
    wanted_day = datetime.now().date() - timedelta(days=days_ago)

    def resolve(code):
        # One lookup per environment, all sharing the same target day.
        return get_latest_xlsx_path_for_env(
            env_code=code,
            base_path=base_path,
            use_date=by_date,
            target_date=wanted_day,
        )

    return {code: resolve(code) for code in env_codes}
def get_huggingface_paths(repo_id: str,
                          filenames: list,
                          revision: str = 'main') -> dict:
    '''Fetch each named file from a Huggingface repo and return its local Path, unloaded.

    Returns a dict keyed by the requested filename. Raises ImportError when
    the optional huggingface_hub dependency was not importable.
    '''
    if hf_hub_download is None:
        raise ImportError('huggingface_hub is not installed. Please pip install huggingface_hub')
    return {
        name: Path(hf_hub_download(repo_id=repo_id, filename=name, revision=revision))
        for name in filenames
    }