# vectorsearch/preprocessing.py
# Author: JPBianchi (commit 30ffb9e, "temp before HF pull")
import os
import json
import pandas as pd
from typing import List, Union, Dict
from loguru import logger
import pandas as pd
import pathlib
## Set of helper functions that support data preprocessing
class FileIO:
    '''
    Convenience class for saving and loading data in parquet and
    json formats to/from disk.
    '''

    def save_as_parquet(self,
                        file_path: str,
                        data: Union[List[dict], pd.DataFrame],
                        overwrite: bool=False) -> None:
        '''
        Saves DataFrame to disk as a parquet file. Removes the index.

        Args:
        -----
        file_path : str
            Output path to save file, if not included ".parquet" will be
            appended as file extension.
        data : Union[List[dict], pd.DataFrame]
            Data to save as parquet file. If data is a list of dicts, it will be
            converted to a DataFrame before saving.
        overwrite : bool
            Overwrite existing file if True, otherwise raise FileExistsError.
        '''
        if isinstance(data, list):
            data = self._convert_toDataFrame(data)
        # Check the full '.parquet' suffix so that e.g. 'myparquet'
        # is still renamed to 'myparquet.parquet'.
        if not file_path.endswith('.parquet'):
            file_path = self._rename_file_extension(file_path, 'parquet')
        self._check_file_path(file_path, overwrite=overwrite)
        data.to_parquet(file_path, index=False)
        logger.info(f'DataFrame saved as parquet file here: {file_path}')

    def _convert_toDataFrame(self, data: List[dict]) -> pd.DataFrame:
        '''Converts a list of record dicts into a DataFrame.'''
        # The DataFrame constructor handles a list of record dicts directly;
        # the previous pd.DataFrame().from_dict(data) misused the classmethod
        # through a throwaway instance.
        return pd.DataFrame(data)

    def _rename_file_extension(self, file_path: str, extension: str) -> str:
        '''
        Renames file with appropriate extension if file_path
        does not already have correct extension.
        '''
        prefix = os.path.splitext(file_path)[0]
        return f'{prefix}.{extension}'

    def _check_file_path(self, file_path: str, overwrite: bool) -> None:
        '''
        Checks for existence of file and overwrite permissions; creates
        any missing parent directories for a new file.

        Raises:
        -------
        FileExistsError : if the file exists and overwrite is False.
        '''
        if os.path.exists(file_path):
            if not overwrite:
                raise FileExistsError(f'File by name {file_path} already exists, try using another file name or set overwrite to True.')
            os.remove(file_path)
        else:
            # os.path.dirname replaces the old str.replace(file_name, '')
            # trick, which corrupted the path whenever the file name also
            # occurred inside the directory portion (e.g. 'data/data').
            dir_structure = os.path.dirname(file_path) or '.'
            pathlib.Path(dir_structure).mkdir(parents=True, exist_ok=True)

    def load_parquet(self, file_path: str, verbose: bool=True) -> List[dict]:
        '''
        Loads parquet from disk, converts to pandas DataFrame as intermediate
        step and outputs a list of dicts (docs). Known vector columns are
        converted from numpy arrays back into plain Python lists.
        '''
        df = pd.read_parquet(file_path)
        vector_labels = ['content_vector', 'image_vector', 'content_embedding']
        for label in vector_labels:
            if label in df.columns:
                df[label] = df[label].apply(lambda x: x.tolist())
        if verbose:
            memory_usage = round(df.memory_usage().sum()/(1024*1024), 2)
            print(f'Shape of data: {df.values.shape}')
            print(f'Memory Usage: {memory_usage}+ MB')
        return df.to_dict('records')

    def load_json(self, file_path: str):
        '''
        Loads json file from disk.
        '''
        with open(file_path) as f:
            return json.load(f)

    def save_as_json(self,
                     file_path: str,
                     data: Union[List[dict], dict],
                     indent: int=4,
                     overwrite: bool=False
                     ) -> None:
        '''
        Saves data to disk as a json file. Data can be a list of dicts or a single dict.

        Args:
        -----
        file_path : str
            Output path; a ".json" extension is appended if missing.
        data : Union[List[dict], dict]
            JSON-serializable payload.
        indent : int
            Indentation passed through to json.dump.
        overwrite : bool
            Overwrite existing file if True, otherwise raise FileExistsError.
        '''
        # Check the full '.json' suffix (not just 'json') so names like
        # 'myjson' still get the proper extension appended.
        if not file_path.endswith('.json'):
            file_path = self._rename_file_extension(file_path, 'json')
        self._check_file_path(file_path, overwrite=overwrite)
        with open(file_path, 'w') as f:
            json.dump(data, f, indent=indent)
        logger.info(f'Data saved as json file here: {file_path}')
class Utilities:
    '''Miscellaneous helpers that do not belong to a specific pipeline stage.'''

    def create_video_url(self, video_id: str, playlist_id: str) -> str:
        '''
        Builds a clickable YouTube link for a video episode.

        Args:
        -----
        video_id : str
            Video id of the episode from YouTube
        playlist_id : str
            Playlist id of the episode from YouTube
        '''
        url = f'https://www.youtube.com/watch?v={video_id}&list={playlist_id}'
        return url