# vectorsearch/preprocessing.py
# Author: JPBianchi (commit 30ffb9e, "temp before HF pull")
import os
import json
import pandas as pd
from typing import List, Union, Dict
from loguru import logger
import pandas as pd
import pathlib
## Set of helper functions that support data preprocessing
class FileIO:
    '''
    Convenience class for saving and loading data in parquet and
    json formats to/from disk.
    '''

    def save_as_parquet(self,
                        file_path: str,
                        data: Union[List[dict], pd.DataFrame],
                        overwrite: bool=False) -> None:
        '''
        Saves DataFrame to disk as a parquet file. Removes the index.

        Args:
        -----
        file_path : str
            Output path to save file, if not included ".parquet" will be
            appended as file extension.
        data : Union[List[dict], pd.DataFrame]
            Data to save as parquet file. If data is a list of dicts, it will be
            converted to a DataFrame before saving.
        overwrite : bool
            Overwrite existing file if True, otherwise raise FileExistsError.
        '''
        if isinstance(data, list):
            data = self._convert_toDataFrame(data)
        # Check the full '.parquet' suffix so that e.g. 'myparquet'
        # is still renamed to 'myparquet.parquet'.
        if not file_path.endswith('.parquet'):
            file_path = self._rename_file_extension(file_path, 'parquet')
        self._check_file_path(file_path, overwrite=overwrite)
        data.to_parquet(file_path, index=False)
        logger.info(f'DataFrame saved as parquet file here: {file_path}')

    def _convert_toDataFrame(self, data: List[dict]) -> pd.DataFrame:
        '''Converts a list of record dicts into a DataFrame.'''
        # The DataFrame constructor handles a list of record dicts directly;
        # the previous pd.DataFrame().from_dict(data) misused the classmethod
        # through a throwaway instance.
        return pd.DataFrame(data)

    def _rename_file_extension(self, file_path: str, extension: str) -> str:
        '''
        Renames file with appropriate extension if file_path
        does not already have correct extension.
        '''
        prefix = os.path.splitext(file_path)[0]
        return f'{prefix}.{extension}'

    def _check_file_path(self, file_path: str, overwrite: bool) -> None:
        '''
        Checks for existence of file and overwrite permissions; creates
        any missing parent directories for a new file.

        Raises:
        -------
        FileExistsError : if the file exists and overwrite is False.
        '''
        if os.path.exists(file_path):
            if not overwrite:
                raise FileExistsError(f'File by name {file_path} already exists, try using another file name or set overwrite to True.')
            os.remove(file_path)
        else:
            # os.path.dirname replaces the old str.replace(file_name, '')
            # trick, which corrupted the path whenever the file name also
            # occurred inside the directory portion (e.g. 'data/data').
            dir_structure = os.path.dirname(file_path) or '.'
            pathlib.Path(dir_structure).mkdir(parents=True, exist_ok=True)

    def load_parquet(self, file_path: str, verbose: bool=True) -> List[dict]:
        '''
        Loads parquet from disk, converts to pandas DataFrame as intermediate
        step and outputs a list of dicts (docs). Known vector columns are
        converted from numpy arrays back into plain Python lists.
        '''
        df = pd.read_parquet(file_path)
        vector_labels = ['content_vector', 'image_vector', 'content_embedding']
        for label in vector_labels:
            if label in df.columns:
                df[label] = df[label].apply(lambda x: x.tolist())
        if verbose:
            memory_usage = round(df.memory_usage().sum()/(1024*1024), 2)
            print(f'Shape of data: {df.values.shape}')
            print(f'Memory Usage: {memory_usage}+ MB')
        return df.to_dict('records')

    def load_json(self, file_path: str):
        '''
        Loads json file from disk.
        '''
        with open(file_path) as f:
            return json.load(f)

    def save_as_json(self,
                     file_path: str,
                     data: Union[List[dict], dict],
                     indent: int=4,
                     overwrite: bool=False
                     ) -> None:
        '''
        Saves data to disk as a json file. Data can be a list of dicts or a single dict.

        Args:
        -----
        file_path : str
            Output path; a ".json" extension is appended if missing.
        data : Union[List[dict], dict]
            JSON-serializable payload.
        indent : int
            Indentation passed through to json.dump.
        overwrite : bool
            Overwrite existing file if True, otherwise raise FileExistsError.
        '''
        # Check the full '.json' suffix (not just 'json') so names like
        # 'myjson' still get the proper extension appended.
        if not file_path.endswith('.json'):
            file_path = self._rename_file_extension(file_path, 'json')
        self._check_file_path(file_path, overwrite=overwrite)
        with open(file_path, 'w') as f:
            json.dump(data, f, indent=indent)
        logger.info(f'Data saved as json file here: {file_path}')
class Utilities:
    '''Miscellaneous helpers that do not belong to a specific pipeline stage.'''

    def create_video_url(self, video_id: str, playlist_id: str) -> str:
        '''
        Builds a clickable YouTube link for a video episode.

        Args:
        -----
        video_id : str
            Video id of the episode from YouTube
        playlist_id : str
            Playlist id of the episode from YouTube
        '''
        url = f'https://www.youtube.com/watch?v={video_id}&list={playlist_id}'
        return url