Spaces:
Sleeping
Sleeping
import gradio as gr | |
import os | |
import re | |
import pandas as pd | |
import numpy as np | |
import glob | |
import huggingface_hub | |
print("hfh", huggingface_hub.__version__) | |
from huggingface_hub import hf_hub_download, upload_file, delete_file, snapshot_download, list_repo_files, dataset_info | |
# --- Hub dataset configuration, read from the environment at import time ---
# Identifier of the dataset repository used for persistence (None if unset).
DATASET_REPO_ID = os.getenv("DATASET_REPO_ID")
DATASET_REPO_URL = f"https://huggingface.co/{DATASET_REPO_ID}"

# Directory inside the dataset repository that holds the CSV files.
HF_DATA_DIRNAME = "data"
# Local directory passed to the Hub client as the download location.
LOCAL_DATA_DIRNAME = "data"
# Local directory where CSV files are written before being uploaded.
LOCAL_SAVE_DIRNAME = "save"

# Credentials: one token is used for uploads, the other for downloads.
ds_write_token = os.getenv("DS_WRITE_TOKEN")
HF_TOKEN = os.getenv("HF_TOKEN")

# Report only whether each token is present; never print the secrets themselves.
print("ds_write_token:", ds_write_token is not None)
print("hf_token:", HF_TOKEN is not None)
print("hfh_verssion", huggingface_hub.__version__)
def retrieveAllSaved():
    """Return the file listing of the persistence dataset repository.

    Calls ``list_repo_files`` for the repository named by the module-level
    ``DATASET_REPO_ID`` with ``repo_type="dataset"`` and returns its result
    unchanged.
    """
    # API reference:
    # https://huggingface.co/docs/huggingface_hub/v0.8.1/en/package_reference/hf_api
    return list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
def store_group_sentences(filename: str, df):
    """Write *df* to a local CSV file and upload that file to the dataset repo.

    Args:
        filename: Name of the CSV file; used both under ``LOCAL_SAVE_DIRNAME``
            locally and under ``HF_DATA_DIRNAME`` inside the repository.
        df: Object exposing ``to_csv`` (a pandas DataFrame in the visible
            caller); it is written without its index.

    Returns:
        None. The value returned by ``upload_file`` is printed.
    """
    csv_name = f"{filename}"
    local_csv = os.path.join(LOCAL_SAVE_DIRNAME, csv_name)
    repo_csv = os.path.join(HF_DATA_DIRNAME, csv_name)
    print(f"Trying to save to: {repo_csv}")

    # Make sure the local target directory exists before writing.
    os.makedirs(os.path.dirname(local_csv), exist_ok=True)
    df.to_csv(local_csv, index=False)

    upload_kwargs = {
        "path_or_fileobj": local_csv,
        "path_in_repo": repo_csv,
        "repo_id": DATASET_REPO_ID,
        "repo_type": "dataset",
        "token": ds_write_token,
    }
    commit_url = upload_file(**upload_kwargs)
    print(commit_url)
def saveSentences(sentences_df):
    """Append *sentences_df* to the saved sentences for its prompt and store them.

    The prompt is read from the first row of the ``"prompt"`` column, the
    previously saved frame for that prompt is fetched via
    ``getSavedSentences``, both frames are concatenated (saved rows first),
    and the result is handed to ``store_group_sentences``.

    Args:
        sentences_df: DataFrame with a ``"prompt"`` column and at least one row.
    """
    # A frame holds a single prompt, so the first row identifies the group.
    group_prompt = sentences_df["prompt"].iloc[0]
    _, previously_saved, csv_name = getSavedSentences(group_prompt)

    merged = pd.concat([previously_saved, sentences_df])
    # NOTE(review): the label says "new_df" but the incoming frame is what gets
    # printed, not the merged one - confirm whether that is intended.
    print(f"new_df: \n{sentences_df}")
    store_group_sentences(csv_name, merged)
# Reference: https://huggingface.co/spaces/elonmuskceo/persistent-data/blob/main/app.py
def get_sentence_csv(file_path: str):
    """Fetch a CSV from the dataset repository and load it as a DataFrame.

    The file is requested from ``HF_DATA_DIRNAME/<file_path>`` in the repo,
    with ``cache_dir=LOCAL_DATA_DIRNAME`` and ``force_filename`` set to the
    file's base name. Any download error is printed and otherwise ignored;
    the CSV at ``LOCAL_DATA_DIRNAME/<base name>`` is then read regardless.

    Args:
        file_path: File name relative to the repository's data directory.

    Returns:
        The DataFrame parsed from the local CSV.

    Raises:
        Whatever ``pd.read_csv`` raises when the local file is missing or
        unreadable (for example after a failed download).
    """
    file_path = os.path.join(HF_DATA_DIRNAME, file_path)
    print(f"File path: {file_path}")

    base_name = os.path.basename(file_path)
    local_csv = os.path.join(LOCAL_DATA_DIRNAME, base_name)

    try:
        hf_hub_download(
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            filename=file_path,
            cache_dir=LOCAL_DATA_DIRNAME,
            force_filename=base_name,
            force_download=True,  # to get updates of the dataset
            token=HF_TOKEN,
        )
    except Exception as err:
        # Most likely the file does not exist in the repository; carry on and
        # let the read below decide whether a local copy is available.
        print(f"file not found, probably: {err}")

    # Debug aid. The pattern contains no "**", so it matches at most the
    # directory itself even though recursive=True is passed.
    matched = glob.glob(f"./{LOCAL_DATA_DIRNAME}/", recursive=True)
    print("Files glob: " + ', '.join(matched))

    print(f"trying to get: {local_csv}")
    return pd.read_csv(local_csv, encoding='UTF8')
def getSavedSentences(grp):
    """Load the saved sentences for the group/prompt *grp*.

    The CSV name is *grp* with spaces replaced by dashes plus ``.csv``.

    Args:
        grp: Group (prompt) string identifying the saved CSV.

    Returns:
        A ``(message, dataframe, filename)`` tuple. On success the message
        announces the load and the dataframe holds the CSV contents. If
        loading raises any ``Exception``, the message reports the failure and
        the dataframe is empty.
    """
    filename = f"{grp.replace(' ','-')}.csv"

    try:
        loaded = get_sentence_csv(filename)
    except Exception:
        # Best effort: a missing or unreadable file yields an empty frame
        # instead of propagating the error to the caller.
        return f"Error, no saved generations for {filename}", pd.DataFrame(), filename

    return f"Loading sentences: {filename}\n", loaded, filename