# MIT License # # Copyright (c) 2023 Victor Calderon # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. """ Module for preparing the input dataset. """ import logging from pathlib import Path from typing import Dict from src.classes import data_preparation as dp from src.utils import default_variables as dv from src.utils import general_utilities as gu __author__ = ["Victor Calderon"] __copyright__ = ["Copyright 2023 Victor Calderon"] __all__ = [] logger = logging.getLogger(__name__) logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s]: %(message)s", ) logger.setLevel(logging.INFO) # ---------------------------- PROJECT VARIABLES ------------------------------ MODULE_DESCRIPTION = "Module for data preparation" MODULE_VERSION = "1.0" # ----------------------------- INPUT PARAMETERS ------------------------------ def get_parser(): """ Function to get the input parameters to the script. """ # Defining the 'parser' object to use parser = gu._get_parser_obj(description=MODULE_DESCRIPTION) # Path to the input dataset parser.add_argument( "--dataset-path", dest="dataset_path", default=dv.cicero_dataset_url, type=str, help=""" Path / URL to the input dataset. [Default: '%(default)s'] """, ) return parser.parse_args() # ------------------------------- FUNCTIONS ---------------------------------- def _resolve_input_object_path(object_path: str) -> str: """ Check whether or not the path corresponds to a local file or a URL. Parameters ------------- object_path : str Path of the input object. Returns ---------- parsed_object_path : str Modified / parsed version of the input object ``object_path``. Raises ------------ TypeError ; Error This error gets raised whenever the input object is neither a 'file' nor a valid 'url'. """ object_type = gu.check_url_or_file_type(object_path=object_path) if object_type == "unspecified": msg = ( f">>> Unspecified data type for '{object_path}' or does not exist" ) logger.error(msg) raise TypeError(msg) return ( object_path if object_type == "url" else str(Path(object_path).resolve()) ) def _temp_create_dataset_with_summaries(): """ Function to **temporarily** create the Dataset object in HuggingFace using the dataset with summaries for each of the articles. Notes -------- This is a temporary solution UNTIL the ``Summarizer`` is put in place. """ # Path to the dataset dataset_filepath = str( ( gu.get_project_paths() .get("src") .joinpath( "utils", "gpt35_summaries", "df_embed_out2.csv", ) ).resolve() ) # Reading in dataset data_prep_obj = dp.DatasetPrep(dataset_path=dataset_filepath) # Uploading it to HuggingFace Hub data_prep_obj.push_dataset_to_hub( dataset=data_prep_obj.raw_dataset, dataset_name=dv.summaries_dataset_name, ) return # ------------------------------ MAIN FUNCTIONS ------------------------------- def main(params_dict: Dict): """ Main function to process the data. """ # Determine if the path corresponds to a file or a URL params_dict["object_path"] = _resolve_input_object_path( params_dict["dataset_path"] ) # Showing set of input parameters gu.show_params(params_dict=params_dict, logger=logger) # Initializing input parameters data_prep_obj = dp.DatasetPrep(dataset_path=params_dict["object_path"]) data_prep_obj.show_params() clean_dataset = data_prep_obj.clean_dataset() logger.info(f"\n>>> Raw dataset: \n{data_prep_obj.raw_dataset}\n") logger.info(f"\n>>> Clean dataset: \n{clean_dataset}\n") # --- Pushing datasets to HuggingFace Hub # 'Raw' dataset data_prep_obj.push_dataset_to_hub( dataset=data_prep_obj.raw_dataset, dataset_name=dv.raw_dataset_name, ) # 'Clean' dataset data_prep_obj.push_dataset_to_hub( dataset=clean_dataset, dataset_name=dv.clean_dataset_name, ) # Dataset with summaries _temp_create_dataset_with_summaries() return if __name__ == "__main__": # Getting input parameters params_dict = vars(get_parser()) # Running main function main(params_dict=params_dict)