# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
""" | |
Module for preparing the input dataset. | |
""" | |
import logging | |
from pathlib import Path | |
from typing import Dict | |
from src.classes import data_preparation as dp | |
from src.utils import default_variables as dv | |
from src.utils import general_utilities as gu | |
__author__ = ["Victor Calderon"] | |
__copyright__ = ["Copyright 2023 Victor Calderon"] | |
__all__ = [] | |
logger = logging.getLogger(__name__) | |
logging.basicConfig( | |
level=logging.INFO, | |
format="%(asctime)s [%(levelname)s]: %(message)s", | |
) | |
logger.setLevel(logging.INFO) | |
# ---------------------------- PROJECT VARIABLES ------------------------------ | |
MODULE_DESCRIPTION = "Module for data preparation" | |
MODULE_VERSION = "1.0" | |
# ----------------------------- INPUT PARAMETERS ------------------------------


def get_parser():
    """
    Function to get the input parameters to the script.
    """
    # Defining the 'parser' object to use
    parser = gu._get_parser_obj(description=MODULE_DESCRIPTION)
    # Path / URL to the input dataset
    parser.add_argument(
        "--dataset-path",
        dest="dataset_path",
        default=dv.cicero_dataset_url,
        type=str,
        help="""
        Path / URL to the input dataset.
        [Default: '%(default)s']
        """,
    )

    return parser.parse_args()
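

# Example invocation (hypothetical -- assumes this module lives at
# ``src/data_prep.py``; adjust the module path to the repo's actual layout).
# With no arguments, the script falls back to ``dv.cicero_dataset_url``:
#
#   python -m src.data_prep --dataset-path ./data/articles.csv
#   python -m src.data_prep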
# ------------------------------- FUNCTIONS ----------------------------------


def _resolve_input_object_path(object_path: str) -> str:
    """
    Check whether the input path corresponds to a local file or a URL.

    Parameters
    ----------
    object_path : str
        Path of the input object.

    Returns
    -------
    parsed_object_path : str
        Modified / parsed version of the input ``object_path``.

    Raises
    ------
    TypeError
        This error gets raised whenever the input object is neither
        a 'file' nor a valid 'url'.
    """
    object_type = gu.check_url_or_file_type(object_path=object_path)

    if object_type == "unspecified":
        msg = (
            f">>> Unspecified data type for '{object_path}' or does not exist"
        )
        logger.error(msg)
        raise TypeError(msg)

    return (
        object_path
        if object_type == "url"
        else str(Path(object_path).resolve())
    )
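

# Behavior sketch (assuming ``gu.check_url_or_file_type`` returns one of
# 'url', 'file', or 'unspecified'; paths below are illustrative only):
#
#   _resolve_input_object_path("https://example.com/data.csv")
#   # -> "https://example.com/data.csv"    (URLs pass through unchanged)
#
#   _resolve_input_object_path("data/articles.csv")
#   # -> "/abs/path/to/data/articles.csv"  (local paths become absolute)
#
#   _resolve_input_object_path("no-such-file")
#   # -> raises TypeError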
def _temp_create_dataset_with_summaries():
    """
    Function to **temporarily** create the Dataset object in HuggingFace
    using the dataset with summaries for each of the articles.

    Notes
    -----
    This is a temporary solution UNTIL the ``Summarizer`` is put in place.
    """
    # Path to the dataset with pre-computed summaries
    dataset_filepath = str(
        (
            gu.get_project_paths()
            .get("src")
            .joinpath(
                "utils",
                "gpt35_summaries",
                "df_embed_out2.csv",
            )
        ).resolve()
    )
    # Reading in the dataset
    data_prep_obj = dp.DatasetPrep(dataset_path=dataset_filepath)
    # Uploading it to the HuggingFace Hub
    data_prep_obj.push_dataset_to_hub(
        dataset=data_prep_obj.raw_dataset,
        dataset_name=dv.summaries_dataset_name,
    )

    return
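

# Note: pushing datasets to the HuggingFace Hub requires authentication,
# typically via ``huggingface-cli login`` or an ``HF_TOKEN`` environment
# variable. How credentials are picked up here is an assumption about this
# repo's setup -- ``push_dataset_to_hub`` is defined in
# ``src.classes.data_preparation``.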
# ------------------------------ MAIN FUNCTIONS -------------------------------


def main(params_dict: Dict):
    """
    Main function to process the data.
    """
    # Determining whether the path corresponds to a local file or a URL
    params_dict["object_path"] = _resolve_input_object_path(
        params_dict["dataset_path"]
    )
    # Showing the set of input parameters
    gu.show_params(params_dict=params_dict, logger=logger)

    # Initializing the data-preparation object and cleaning the dataset
    data_prep_obj = dp.DatasetPrep(dataset_path=params_dict["object_path"])
    data_prep_obj.show_params()

    clean_dataset = data_prep_obj.clean_dataset()
    logger.info(f"\n>>> Raw dataset: \n{data_prep_obj.raw_dataset}\n")
    logger.info(f"\n>>> Clean dataset: \n{clean_dataset}\n")

    # --- Pushing datasets to the HuggingFace Hub
    # 'Raw' dataset
    data_prep_obj.push_dataset_to_hub(
        dataset=data_prep_obj.raw_dataset,
        dataset_name=dv.raw_dataset_name,
    )
    # 'Clean' dataset
    data_prep_obj.push_dataset_to_hub(
        dataset=clean_dataset,
        dataset_name=dv.clean_dataset_name,
    )
    # Dataset with summaries (temporary, until the ``Summarizer`` exists)
    _temp_create_dataset_with_summaries()

    return


if __name__ == "__main__":
    # Getting input parameters
    params_dict = vars(get_parser())
    # Running main function
    main(params_dict=params_dict)