# Spaces: Running
import os

import pandas as pd
def load_dataset(file_path: str) -> pd.DataFrame:
    """
    Load a CSV, Excel, or JSON file into a pandas DataFrame.

    The reader is selected from the extension of the file name (compared
    case-insensitively): ``csv`` uses ``pd.read_csv``, ``xlsx``/``xls`` use
    ``pd.read_excel``, and ``json`` uses ``pd.read_json``. A file that loads
    successfully but yields an empty DataFrame is rejected.

    Parameters:
    - file_path (str): Path to the dataset file on the filesystem.

    Returns:
    pd.DataFrame: The loaded, non-empty dataset.

    Raises:
    - TypeError: If ``file_path`` is not a string.
    - FileNotFoundError: If ``file_path`` does not exist.
    - ValueError: If the file extension is not supported, or the reader
      rejects the file contents with a ValueError.
    - pd.errors.EmptyDataError: If the file has no data, or the resulting
      DataFrame is empty. (Subclass of ValueError.)
    - pd.errors.ParserError: If the file cannot be parsed. (Subclass of
      ValueError.)
    - Exception: Any other failure raised by the reader, re-raised with the
      file path in the message and the original error chained as the cause.

    Examples:
    >>> df = load_dataset('data/my_dataset.csv')
    >>> print(df.head())
    """
    if not isinstance(file_path, str):
        raise TypeError(f"Expected file path to be a string, but got {type(file_path).__name__}.")

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}. Please check the path and try again.")

    # Take the extension from the file name only, so a dot in a directory
    # name (e.g. "dir.v2/data") is not mistaken for an extension separator.
    _, dot, suffix = os.path.basename(file_path).rpartition('.')
    file_extension = suffix.lower() if dot else ''

    readers = {
        'csv': pd.read_csv,
        'xlsx': pd.read_excel,
        'xls': pd.read_excel,
        'json': pd.read_json,
    }
    reader = readers.get(file_extension)
    if reader is None:
        # Raised outside the try block so this specific message reaches the
        # caller instead of being replaced by the generic loader error.
        raise ValueError(
            f"Unsupported file format: {file_extension or '<none>'}. "
            "Supported formats are CSV, Excel, and JSON."
        )

    try:
        dataset = reader(file_path)
    # EmptyDataError and ParserError both subclass ValueError, so they must be
    # handled before the generic ValueError clause or they would never match.
    except pd.errors.EmptyDataError as empty_data_error:
        raise pd.errors.EmptyDataError(f"The file at {file_path} is empty and cannot be loaded into a DataFrame.") from empty_data_error
    except pd.errors.ParserError as parser_error:
        raise pd.errors.ParserError(f"Error parsing the file at {file_path}. Please check the file format and contents.") from parser_error
    except ValueError as value_error:
        raise ValueError(f"Error loading the dataset from {file_path}. Please ensure the file is in a supported format and not empty.") from value_error
    except Exception as e:
        raise Exception(f"An error occurred while loading the file: {file_path}. Error details: {str(e)}") from e

    # A file with only a header row parses fine but produces no rows.
    if dataset.empty:
        raise pd.errors.EmptyDataError(f"The file at {file_path} is empty and cannot be loaded into a DataFrame.")

    return dataset
# Demonstration of load_dataset. Each entry pairs a path with a flag saying
# whether a preview is printed after a successful load. The last entry points
# at a file that is expected to be missing, so it should trigger the error
# handler below. The first failure aborts the remaining examples.
try:
    for _example_path, _show_preview in (
        ('data/my_dataset.csv', True),         # Example 1: CSV file
        ('data/my_dataset.xlsx', True),        # Example 2: Excel file
        ('data/non_existent_file.csv', False), # Example 3: missing file
    ):
        dataset = load_dataset(_example_path)
        if _show_preview:
            print("Dataset loaded successfully!")
            print(dataset.head())  # First few rows of the dataset
except Exception as error:
    print(f"An error occurred: {error}")