"""Helpers for persisting generated sentences as CSV files in a Hugging Face dataset repo.

Configuration is read from the environment at import time:

* ``DATASET_REPO_ID`` - id of the dataset repository used for storage.
* ``DS_WRITE_TOKEN``  - token passed to ``upload_file`` when saving.
* ``HF_TOKEN``        - token passed to ``hf_hub_download`` when loading.
"""
import glob
import os
import re

import gradio as gr
import numpy as np
import pandas as pd

import huggingface_hub

# Printed before the ``from huggingface_hub import ...`` line below so the
# installed version is shown first (presumably to help diagnose a failing
# import of one of those names - confirm before reordering).
print("hfh", huggingface_hub.__version__)
from huggingface_hub import (
    hf_hub_download,
    upload_file,
    delete_file,
    snapshot_download,
    list_repo_files,
    dataset_info,
)

DATASET_REPO_ID = os.environ.get("DATASET_REPO_ID")
DATASET_REPO_URL = f"https://huggingface.co/{DATASET_REPO_ID}"
HF_DATA_DIRNAME = "data"      # directory inside the dataset repo
LOCAL_DATA_DIRNAME = "data"   # local directory that downloads are written to
LOCAL_SAVE_DIRNAME = "save"   # local staging directory for uploads

ds_write_token = os.environ.get("DS_WRITE_TOKEN")
HF_TOKEN = os.environ.get("HF_TOKEN")

# Only report whether the tokens are set; never print their values.
print("ds_write_token:", ds_write_token is not None)
print("hf_token:", HF_TOKEN is not None)
print("hfh_verssion", huggingface_hub.__version__)


def retrieveAllSaved():
    """Return the list of file paths present in the dataset repo.

    Listing the files:
    https://huggingface.co/docs/huggingface_hub/v0.8.1/en/package_reference/hf_api
    """
    return list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")


def store_group_sentences(filename: str, df: pd.DataFrame) -> None:
    """Write *df* to a local CSV and upload it to the dataset repo.

    Args:
        filename: Name of the CSV file; used both under ``LOCAL_SAVE_DIRNAME``
            locally and under ``HF_DATA_DIRNAME`` in the repo.
        df: Frame to persist. It is written without the index column.
    """
    local_path = os.path.join(LOCAL_SAVE_DIRNAME, filename)
    # Repo paths always use forward slashes, independent of the host OS.
    repo_path = f"{HF_DATA_DIRNAME}/{filename}"

    print(f"Trying to save to: {repo_path}")

    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    df.to_csv(local_path, index=False)

    commit_url = upload_file(
        path_or_fileobj=local_path,
        path_in_repo=repo_path,
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        token=ds_write_token,
    )

    print(commit_url)


def saveSentences(sentences_df):
    """Append *sentences_df* to the sentences already saved for its prompt.

    The prompt is taken from the first row of the ``"prompt"`` column (there is
    only one prompt per dataframe). Previously saved rows for that prompt are
    loaded, the new rows are concatenated after them, and the combined frame is
    uploaded under the same file name.
    """
    prompt = sentences_df["prompt"].iloc[0]  # there's only one prompt per dataframe

    _, grp_saved_df, filename = getSavedSentences(prompt)
    new_df = pd.concat([grp_saved_df, sentences_df])
    # Log the merged frame that is actually stored, matching the label.
    print(f"new_df: \n{new_df}")
    store_group_sentences(filename, new_df)


# https://huggingface.co/spaces/elonmuskceo/persistent-data/blob/main/app.py
def get_sentence_csv(file_path: str):
    """Download a CSV from the dataset repo and return it as a DataFrame.

    Args:
        file_path: File name relative to ``HF_DATA_DIRNAME`` in the repo.

    Returns:
        The parsed CSV read from ``LOCAL_DATA_DIRNAME/<basename of file_path>``.

    Raises:
        Whatever ``pd.read_csv`` raises when that local file is missing or
        unreadable. A failed download is only logged, so a local copy that
        already exists is still read.
    """
    # Repo paths always use forward slashes, independent of the host OS.
    file_path = f"{HF_DATA_DIRNAME}/{file_path}"
    print(f"File path: {file_path}")
    base_name = os.path.basename(file_path)
    local_path = os.path.join(LOCAL_DATA_DIRNAME, base_name)

    try:
        hf_hub_download(
            force_download=True,  # to get updates of the dataset
            repo_type="dataset",
            repo_id=DATASET_REPO_ID,
            filename=file_path,
            cache_dir=LOCAL_DATA_DIRNAME,
            force_filename=base_name,
            token=HF_TOKEN,
        )
    except Exception as e:
        # file not found
        print(f"file not found, probably: {e}")

    # "**" is required for recursive=True to have any effect.
    files = glob.glob(f"./{LOCAL_DATA_DIRNAME}/**", recursive=True)
    print("Files glob: " + ', '.join(files))

    print(f"trying to get: {local_path}")
    df = pd.read_csv(local_path, encoding='UTF8')

    return df


def getSavedSentences(grp):
    """Load the saved sentences for group/prompt *grp*.

    The file name is *grp* with spaces replaced by dashes plus ``.csv``.

    Returns:
        A ``(text, sentence_df, filename)`` tuple. ``text`` is a status
        message; ``sentence_df`` is the loaded frame, or an empty DataFrame
        when loading fails for any reason.
    """
    filename = f"{grp.replace(' ', '-')}.csv"
    sentence_df = pd.DataFrame()

    try:
        text = f"Loading sentences: {filename}\n"
        sentence_df = get_sentence_csv(filename)
    except Exception:
        # Deliberately best-effort: a failed load is reported through the
        # status text and an empty frame instead of being raised.
        text = f"Error, no saved generations for {filename}"

    return text, sentence_df, filename