# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
Module for preparing the input dataset.
"""
import logging
from pathlib import Path
from typing import Dict

from src.classes import data_preparation as dp
from src.utils import default_variables as dv
from src.utils import general_utilities as gu

__author__ = ["Victor Calderon"]
__copyright__ = ["Copyright 2023 Victor Calderon"]
__all__ = []

logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s]: %(message)s",
)
logger.setLevel(logging.INFO)
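
# With the format above, emitted records look like (illustrative):
#
#   2023-08-01 12:34:56,789 [INFO]: >>> Some message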

# ---------------------------- PROJECT VARIABLES ------------------------------

MODULE_DESCRIPTION = "Module for data preparation"
MODULE_VERSION = "1.0"

# ----------------------------- INPUT PARAMETERS ------------------------------


def get_parser():
    """
    Parse the input parameters to the script. Note that, despite its
    name, this function returns the parsed ``argparse.Namespace``, not
    the parser itself.
    """
    # Defining the 'parser' object to use
    parser = gu._get_parser_obj(description=MODULE_DESCRIPTION)
    # Path to the input dataset
    parser.add_argument(
        "--dataset-path",
        dest="dataset_path",
        default=dv.cicero_dataset_url,
        type=str,
        help="""
        Path / URL to the input dataset.
        [Default: '%(default)s']
        """,
    )

    return parser.parse_args()
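
# Illustrative parse result (no CLI flags passed):
#
#   args = get_parser()          # parsed ``argparse.Namespace``
#   args.dataset_path            # -> dv.cicero_dataset_url (the default)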

# ------------------------------- FUNCTIONS -----------------------------------


def _resolve_input_object_path(object_path: str) -> str:
    """
    Check whether the input path corresponds to a local file or a URL,
    and return a normalized version of it.

    Parameters
    ----------
    object_path : str
        Path of the input object.

    Returns
    -------
    parsed_object_path : str
        Modified / parsed version of the input ``object_path``: a URL is
        returned unchanged, while a local path is resolved to an
        absolute path.

    Raises
    ------
    TypeError
        Raised whenever the input object is neither a 'file' nor a
        valid 'url'.
    """
    object_type = gu.check_url_or_file_type(object_path=object_path)

    if object_type == "unspecified":
        msg = (
            f">>> Unspecified data type for '{object_path}' or does not exist"
        )
        logger.error(msg)
        raise TypeError(msg)

    return (
        object_path
        if object_type == "url"
        else str(Path(object_path).resolve())
    )
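
# Illustrative behavior (assuming ``gu.check_url_or_file_type`` classifies
# inputs as "url", "file", or "unspecified"; the paths are placeholders):
#
#   _resolve_input_object_path("https://example.com/data.csv")
#   # -> "https://example.com/data.csv"    (URLs pass through unchanged)
#
#   _resolve_input_object_path("data/data.csv")
#   # -> "/abs/path/to/data/data.csv"      (local paths become absolute)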


def _temp_create_dataset_with_summaries():
    """
    Function to **temporarily** create the Dataset object in HuggingFace
    using the dataset with summaries for each of the articles.

    Notes
    -----
    This is a temporary solution UNTIL the ``Summarizer`` is put in place.
    """
    # Path to the dataset
    dataset_filepath = str(
        (
            gu.get_project_paths()
            .get("src")
            .joinpath(
                "utils",
                "gpt35_summaries",
                "df_embed_out2.csv",
            )
        ).resolve()
    )
    # Reading in dataset
    data_prep_obj = dp.DatasetPrep(dataset_path=dataset_filepath)
    # Uploading it to HuggingFace Hub
    data_prep_obj.push_dataset_to_hub(
        dataset=data_prep_obj.raw_dataset,
        dataset_name=dv.summaries_dataset_name,
    )

    return


# ------------------------------ MAIN FUNCTIONS -------------------------------


def main(params_dict: Dict):
    """
    Main function to process the data.
    """
    # Determine if the path corresponds to a file or a URL
    params_dict["object_path"] = _resolve_input_object_path(
        params_dict["dataset_path"]
    )
    # Showing set of input parameters
    gu.show_params(params_dict=params_dict, logger=logger)

    # Initializing input parameters
    data_prep_obj = dp.DatasetPrep(dataset_path=params_dict["object_path"])
    data_prep_obj.show_params()

    clean_dataset = data_prep_obj.clean_dataset()

    logger.info(f"\n>>> Raw dataset: \n{data_prep_obj.raw_dataset}\n")
    logger.info(f"\n>>> Clean dataset: \n{clean_dataset}\n")

    # --- Pushing datasets to HuggingFace Hub
    # 'Raw' dataset
    data_prep_obj.push_dataset_to_hub(
        dataset=data_prep_obj.raw_dataset,
        dataset_name=dv.raw_dataset_name,
    )
    # 'Clean' dataset
    data_prep_obj.push_dataset_to_hub(
        dataset=clean_dataset,
        dataset_name=dv.clean_dataset_name,
    )
    # Dataset with summaries
    _temp_create_dataset_with_summaries()

    return
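
# Programmatic usage (illustrative; the path below is hypothetical). Callers
# can bypass the CLI and invoke ``main`` with a parameter dictionary whose
# keys mirror the CLI flags:
#
#   main(params_dict={"dataset_path": "./data/articles.csv"})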


if __name__ == "__main__":
    # Getting input parameters
    params_dict = vars(get_parser())
    # Running main function
    main(params_dict=params_dict)