# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
""" | |
Module for preparing the input dataset. | |
""" | |
import logging | |
from pathlib import Path | |
from typing import Dict | |
from src.classes import data_preparation as dp | |
from src.utils import default_variables as dv | |
from src.utils import general_utilities as gu | |
__author__ = ["Victor Calderon"] | |
__copyright__ = ["Copyright 2023 Victor Calderon"] | |
__all__ = [] | |
logger = logging.getLogger(__name__) | |
logging.basicConfig( | |
level=logging.INFO, | |
format="%(asctime)s [%(levelname)s]: %(message)s", | |
) | |
logger.setLevel(logging.INFO) | |
# ---------------------------- PROJECT VARIABLES ------------------------------ | |
MODULE_DESCRIPTION = "Module for data preparation" | |
MODULE_VERSION = "1.0" | |
# ----------------------------- INPUT PARAMETERS ------------------------------


def get_parser():
    """
    Function to get the input parameters to the script.
    """
    # Defining the 'parser' object to use
    parser = gu._get_parser_obj(description=MODULE_DESCRIPTION)
    # Path / URL to the input dataset
    parser.add_argument(
        "--dataset-path",
        dest="dataset_path",
        default=dv.cicero_dataset_url,
        type=str,
        help="""
        Path / URL to the input dataset.
        [Default: '%(default)s']
        """,
    )

    return parser.parse_args()
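

# Example invocation (hypothetical -- assumes this module lives at
# ``src/data_prep.py``; adjust the module path to the repo's actual layout).
# With no arguments, the script falls back to ``dv.cicero_dataset_url``:
#
#   python -m src.data_prep --dataset-path ./data/articles.csv
#   python -m src.data_prep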
# ------------------------------- FUNCTIONS ----------------------------------


def _resolve_input_object_path(object_path: str) -> str:
    """
    Check whether the input path corresponds to a local file or a URL.

    Parameters
    ----------
    object_path : str
        Path of the input object.

    Returns
    -------
    parsed_object_path : str
        Modified / parsed version of the input ``object_path``.

    Raises
    ------
    TypeError
        This error gets raised whenever the input object is neither
        a 'file' nor a valid 'url'.
    """
    object_type = gu.check_url_or_file_type(object_path=object_path)

    if object_type == "unspecified":
        msg = (
            f">>> Unspecified data type for '{object_path}' or does not exist"
        )
        logger.error(msg)
        raise TypeError(msg)

    return (
        object_path
        if object_type == "url"
        else str(Path(object_path).resolve())
    )
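

# Behavior sketch (assuming ``gu.check_url_or_file_type`` returns one of
# 'url', 'file', or 'unspecified'; paths below are illustrative only):
#
#   _resolve_input_object_path("https://example.com/data.csv")
#   # -> "https://example.com/data.csv"    (URLs pass through unchanged)
#
#   _resolve_input_object_path("data/articles.csv")
#   # -> "/abs/path/to/data/articles.csv"  (local paths become absolute)
#
#   _resolve_input_object_path("no-such-file")
#   # -> raises TypeError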
def _temp_create_dataset_with_summaries():
    """
    Function to **temporarily** create the Dataset object in HuggingFace
    using the dataset with summaries for each of the articles.

    Notes
    -----
    This is a temporary solution UNTIL the ``Summarizer`` is put in place.
    """
    # Path to the dataset with pre-computed summaries
    dataset_filepath = str(
        (
            gu.get_project_paths()
            .get("src")
            .joinpath(
                "utils",
                "gpt35_summaries",
                "df_embed_out2.csv",
            )
        ).resolve()
    )
    # Reading in the dataset
    data_prep_obj = dp.DatasetPrep(dataset_path=dataset_filepath)
    # Uploading it to the HuggingFace Hub
    data_prep_obj.push_dataset_to_hub(
        dataset=data_prep_obj.raw_dataset,
        dataset_name=dv.summaries_dataset_name,
    )

    return
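

# Note: pushing datasets to the HuggingFace Hub requires authentication,
# typically via ``huggingface-cli login`` or an ``HF_TOKEN`` environment
# variable. How credentials are picked up here is an assumption about this
# repo's setup -- ``push_dataset_to_hub`` is defined in
# ``src.classes.data_preparation``.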
# ------------------------------ MAIN FUNCTIONS -------------------------------


def main(params_dict: Dict):
    """
    Main function to process the data.
    """
    # Determining whether the path corresponds to a local file or a URL
    params_dict["object_path"] = _resolve_input_object_path(
        params_dict["dataset_path"]
    )
    # Showing the set of input parameters
    gu.show_params(params_dict=params_dict, logger=logger)

    # Initializing the data-preparation object and cleaning the dataset
    data_prep_obj = dp.DatasetPrep(dataset_path=params_dict["object_path"])
    data_prep_obj.show_params()

    clean_dataset = data_prep_obj.clean_dataset()
    logger.info(f"\n>>> Raw dataset: \n{data_prep_obj.raw_dataset}\n")
    logger.info(f"\n>>> Clean dataset: \n{clean_dataset}\n")

    # --- Pushing datasets to the HuggingFace Hub
    # 'Raw' dataset
    data_prep_obj.push_dataset_to_hub(
        dataset=data_prep_obj.raw_dataset,
        dataset_name=dv.raw_dataset_name,
    )
    # 'Clean' dataset
    data_prep_obj.push_dataset_to_hub(
        dataset=clean_dataset,
        dataset_name=dv.clean_dataset_name,
    )
    # Dataset with summaries (temporary, until the ``Summarizer`` exists)
    _temp_create_dataset_with_summaries()

    return


if __name__ == "__main__":
    # Getting input parameters
    params_dict = vars(get_parser())
    # Running main function
    main(params_dict=params_dict)