batch-run-csv-analyser / environment_loader.py
BananaSauce's picture
Added auto loader that doesn't work
295a9df
# environment_loader.py
'''Utility module to load .xlsx files from environment-specific folders and optional Huggingface repositories.'''
import os
from pathlib import Path
import pandas as pd
from datetime import datetime, timedelta
# Optional Huggingface support
try:
from huggingface_hub import hf_hub_download
except ImportError:
hf_hub_download = None
def get_latest_file_in_directory(directory_path: str, pattern: str = '*.xlsx') -> Path:
    '''Return the most recently modified regular file matching pattern.

    Args:
        directory_path: Directory whose direct entries are searched.
        pattern: Glob pattern applied to the entries of directory_path.

    Returns:
        The Path with the greatest modification time, or None when no usable
        file matches (note that the return annotation only mentions Path).
    '''
    newest = None
    newest_mtime = None
    for entry in Path(directory_path).glob(pattern):
        # Office applications keep a '~$<name>' owner/lock file next to an open
        # workbook. It matches '*.xlsx', is typically the newest entry, and is
        # not a readable workbook, so it must never be selected.
        if entry.name.startswith('~$'):
            continue
        try:
            if not entry.is_file():
                # A directory whose name matches the pattern is not a report.
                continue
            mtime = entry.stat().st_mtime
        except OSError:
            # The entry vanished or became unreadable between listing and stat.
            continue
        # Strict '>' keeps the first entry on ties, like max() would.
        if newest_mtime is None or mtime > newest_mtime:
            newest, newest_mtime = entry, mtime
    return newest
def get_file_by_date(directory_path: str, target_date: datetime.date, pattern: str = '*.xlsx') -> Path:
    '''Return the file for target_date, matched by file name or by modification date.

    A file whose name contains target_date formatted as YYYY-MM-DD wins. Only
    when no name matches are files considered whose local modification date
    equals target_date. Within either group the most recently modified file is
    returned.

    Args:
        directory_path: Directory whose direct entries are searched.
        target_date: The day to look for; a datetime is accepted and reduced
            to its date part.
        pattern: Glob pattern applied to the entries of directory_path.

    Returns:
        The matching Path, or None when nothing matches (note that the return
        annotation only mentions Path).
    '''
    # A date never compares equal to a datetime, so an unconverted datetime
    # would silently disable the modification-date fallback below.
    if isinstance(target_date, datetime):
        target_date = target_date.date()
    date_str = target_date.strftime('%Y-%m-%d')

    # List the directory once and stat each usable file exactly once.
    stamped = []
    for entry in Path(directory_path).glob(pattern):
        # Skip Office '~$<name>' owner/lock files: they match '*.xlsx' and
        # carry the workbook's name (including any date) but are not workbooks.
        if entry.name.startswith('~$'):
            continue
        try:
            if not entry.is_file():
                continue
            stamped.append((entry, entry.stat().st_mtime))
        except OSError:
            # The entry vanished or became unreadable between listing and stat.
            continue

    # First preference: the date string appears in the file name.
    by_name = [item for item in stamped if date_str in item[0].name]
    if by_name:
        return max(by_name, key=lambda item: item[1])[0]

    # Fallback: the file was last modified on the requested day (local time).
    by_mtime = [item for item in stamped
                if datetime.fromtimestamp(item[1]).date() == target_date]
    if by_mtime:
        return max(by_mtime, key=lambda item: item[1])[0]
    return None
def load_latest_xlsx_for_env(env_code: str,
                             base_path: str = 'Q:/Selenium_Reports',
                             use_date: bool = False,
                             target_date: datetime.date = None) -> pd.DataFrame:
    '''Load the latest or date-specific .xlsx file for the given environment code.

    File selection is delegated to get_latest_xlsx_path_for_env so that the
    lookup rules and error conditions live in one place instead of being
    duplicated here; this function only adds the pandas read.

    Args:
        env_code: Environment sub-folder name under base_path.
        base_path: Root folder; the file is looked up in
            <base_path>/<env_code>/XLSX.
        use_date: When True, pick the file for target_date instead of the
            most recently modified one.
        target_date: The day to look for; required when use_date is True.

    Returns:
        The DataFrame produced by pd.read_excel for the selected file.

    Raises:
        FileNotFoundError: Propagated from get_latest_xlsx_path_for_env when
            the environment folder or a matching file is missing.
        ValueError: Propagated when use_date is True and target_date is None.
    '''
    file_path = get_latest_xlsx_path_for_env(
        env_code=env_code,
        base_path=base_path,
        use_date=use_date,
        target_date=target_date,
    )
    return pd.read_excel(file_path)
def load_environments(env_codes: list,
                      base_path: str = 'Q:/Selenium_Reports',
                      by_date: bool = False,
                      days_ago: int = 1) -> dict:
    '''Build a mapping of environment code to its loaded DataFrame.

    Each code is handed to load_latest_xlsx_for_env. The requested day is
    today's local date minus days_ago; it is always computed and passed on,
    together with by_date as the use_date flag.

    Args:
        env_codes: Environment codes to load, processed in order.
        base_path: Root folder forwarded to load_latest_xlsx_for_env.
        by_date: Forwarded as use_date.
        days_ago: Number of days before today used to derive the target date.

    Returns:
        A dict keyed by environment code; empty when env_codes is empty.
    '''
    wanted_day = datetime.now().date() - timedelta(days=days_ago)
    return {
        code: load_latest_xlsx_for_env(
            env_code=code,
            base_path=base_path,
            use_date=by_date,
            target_date=wanted_day,
        )
        for code in env_codes
    }
def load_from_huggingface(repo_id: str,
                          filenames: list,
                          revision: str = 'main') -> dict:
    '''Fetch files from a Huggingface repo and read each one with pandas.

    Every name in filenames is passed to hf_hub_download together with
    repo_id and revision; the value it returns is then given to pd.read_excel.
    Files are processed one at a time, in the order given.

    Args:
        repo_id: Repository identifier forwarded to hf_hub_download.
        filenames: Names of the files to fetch from the repository.
        revision: Revision forwarded to hf_hub_download.

    Returns:
        A dict mapping each requested file name to its DataFrame.

    Raises:
        ImportError: If huggingface_hub could not be imported by this module.
    '''
    if hf_hub_download is None:
        raise ImportError('huggingface_hub is not installed. Please pip install huggingface_hub')
    return {
        name: pd.read_excel(
            hf_hub_download(repo_id=repo_id, filename=name, revision=revision)
        )
        for name in filenames
    }
def get_latest_xlsx_path_for_env(env_code: str,
                                 base_path: str = 'Q:/Selenium_Reports',
                                 use_date: bool = False,
                                 target_date: datetime.date = None) -> Path:
    '''Return the Path to the desired .xlsx file for the given environment code without loading.

    The file is looked up in <base_path>/<env_code>/XLSX, either by date
    (get_file_by_date) or by most recent modification
    (get_latest_file_in_directory).

    Args:
        env_code: Environment sub-folder name under base_path.
        base_path: Root folder containing one sub-folder per environment.
        use_date: When True, pick the file for target_date instead of the
            most recently modified one.
        target_date: The day to look for; required when use_date is True.

    Returns:
        The Path of the selected file.

    Raises:
        FileNotFoundError: If the XLSX folder is missing (or is not a
            directory), or if no file could be selected.
        ValueError: If use_date is True and target_date is None.
    '''
    folder = Path(base_path) / env_code / 'XLSX'
    # is_dir() rather than exists(): a stray plain file named 'XLSX' is not a
    # usable report folder and should be reported as such.
    if not folder.is_dir():
        raise FileNotFoundError(f"Environment folder not found: {folder}")

    if use_date:
        if target_date is None:
            raise ValueError('target_date must be provided when use_date is True')
        file_path = get_file_by_date(folder, target_date)
        # A failed date lookup does not mean the folder has no .xlsx files at
        # all, so say which date was missing.
        missing_message = (
            f'No .xlsx file for date {target_date} found for environment {env_code} in {folder}'
        )
    else:
        file_path = get_latest_file_in_directory(folder)
        missing_message = f'No .xlsx files found for environment {env_code} in {folder}'

    if file_path is None:
        raise FileNotFoundError(missing_message)
    return file_path
def get_environments_paths(env_codes: list,
                           base_path: str = 'Q:/Selenium_Reports',
                           by_date: bool = False,
                           days_ago: int = 1) -> dict:
    '''Build a mapping of environment code to the Path of its selected file.

    Each code is handed to get_latest_xlsx_path_for_env. The requested day is
    today's local date minus days_ago; it is always computed and passed on,
    together with by_date as the use_date flag.

    Args:
        env_codes: Environment codes to resolve, processed in order.
        base_path: Root folder forwarded to get_latest_xlsx_path_for_env.
        by_date: Forwarded as use_date.
        days_ago: Number of days before today used to derive the target date.

    Returns:
        A dict keyed by environment code; empty when env_codes is empty.
    '''
    wanted_day = datetime.now().date() - timedelta(days=days_ago)

    def _resolve(code):
        # One lookup per environment, with identical settings for all of them.
        return get_latest_xlsx_path_for_env(
            env_code=code,
            base_path=base_path,
            use_date=by_date,
            target_date=wanted_day,
        )

    return {code: _resolve(code) for code in env_codes}
def get_huggingface_paths(repo_id: str,
                          filenames: list,
                          revision: str = 'main') -> dict:
    '''Fetch files from a Huggingface repo and return their local Paths without loading.

    Every name in filenames is passed to hf_hub_download together with
    repo_id and revision; the value it returns is wrapped in a Path.

    Args:
        repo_id: Repository identifier forwarded to hf_hub_download.
        filenames: Names of the files to fetch from the repository.
        revision: Revision forwarded to hf_hub_download.

    Returns:
        A dict mapping each requested file name to a Path.

    Raises:
        ImportError: If huggingface_hub could not be imported by this module.
    '''
    if hf_hub_download is None:
        raise ImportError('huggingface_hub is not installed. Please pip install huggingface_hub')
    resolved = {}
    for name in filenames:
        downloaded = hf_hub_download(repo_id=repo_id, filename=name, revision=revision)
        resolved[name] = Path(downloaded)
    return resolved