import gradio as gr
import os
import re
import pandas as pd
import numpy as np
import glob
import huggingface_hub
print("hfh", huggingface_hub.__version__)
from huggingface_hub import hf_hub_download, upload_file, delete_file, snapshot_download, list_repo_files, dataset_info
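
# Persistence helpers: generated sentences are saved as CSVs, uploaded to a
# Hugging Face dataset repo (under HF_DATA_DIRNAME), and re-downloaded on demand.
# The repo id and access tokens are read from environment variables.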

DATASET_REPO_ID = os.environ.get("DATASET_REPO_ID")
DATASET_REPO_URL = f"https://huggingface.co/{DATASET_REPO_ID}"
HF_DATA_DIRNAME = "data"
LOCAL_DATA_DIRNAME = "data"
LOCAL_SAVE_DIRNAME = "save"

ds_write_token = os.environ.get("DS_WRITE_TOKEN")
HF_TOKEN = os.environ.get("HF_TOKEN")

print("ds_write_token:", ds_write_token!=None)
print("hf_token:", HF_TOKEN!=None)
print("hfh_verssion", huggingface_hub.__version__)

def retrieveAllSaved():
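    """Return the list of all files currently stored in the dataset repository."""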
    # Listing the files - https://huggingface.co/docs/huggingface_hub/v0.8.1/en/package_reference/hf_api
    repo_files = list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
    # print("Repo files: " + str(repo_files))

    return repo_files

def store_group_sentences(filename: str, df):
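    """Save df as a CSV locally, then upload it to the dataset repo under HF_DATA_DIRNAME."""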
    DATA_FILENAME_1 = filename
    LOCAL_PATH_FILE = os.path.join(LOCAL_SAVE_DIRNAME, DATA_FILENAME_1)
    DATA_FILE_1 = os.path.join(HF_DATA_DIRNAME, DATA_FILENAME_1)

    print(f"Trying to save to: {DATA_FILE_1}")

    # Write the CSV locally first, then push it to the dataset repository
    os.makedirs(os.path.dirname(LOCAL_PATH_FILE), exist_ok=True)
    df.to_csv(LOCAL_PATH_FILE, index=False)

    commit_url = upload_file(
        path_or_fileobj=LOCAL_PATH_FILE,
        path_in_repo=DATA_FILE_1,
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        token=ds_write_token,
    )

    print(commit_url)

def saveSentences(sentences_df):
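    """Append sentences_df to any previously saved sentences for the same prompt."""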
    prompt = sentences_df["prompt"].iloc[0]  # there's only one prompt per dataframe
    msg, grp_saved_df, filename = getSavedSentences(prompt)
    new_df = pd.concat([grp_saved_df, sentences_df])
    print(f"new_df: \n{new_df}")
    store_group_sentences(filename, new_df)

# https://huggingface.co/spaces/elonmuskceo/persistent-data/blob/main/app.py
def get_sentence_csv(file_path: str):
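    """Download a CSV from the dataset repo and load it into a DataFrame."""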
    file_path = os.path.join(HF_DATA_DIRNAME, file_path)
    print(f"File path: {file_path}")
    try:
        hf_hub_download(
            force_download=True,  # to get updates of the dataset
            repo_type="dataset",
            repo_id=DATASET_REPO_ID,
            filename=file_path,
            cache_dir=LOCAL_DATA_DIRNAME,
            # force_filename is deprecated/removed in newer huggingface_hub
            # releases; this assumes an older version that still supports it
            force_filename=os.path.basename(file_path),
            token=HF_TOKEN
        )
    except Exception as e:
        # File not found (or the download failed)
        print(f"file not found, probably: {e}")

    # List the files in the local data directory
    files = glob.glob(f"./{LOCAL_DATA_DIRNAME}/*")
    print("Files glob: " + ', '.join(files))
    # print("Save file: " + str(os.path.basename(file_path)))
    print(f"trying to get: {os.path.join(LOCAL_DATA_DIRNAME, os.path.basename(file_path))}")
    df = pd.read_csv(os.path.join(LOCAL_DATA_DIRNAME, os.path.basename(file_path)), encoding='UTF8')

    return df

def getSavedSentences(grp):
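    """Load saved sentences for group grp; returns (status message, DataFrame, filename)."""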
    filename = f"{grp.replace(' ','-')}.csv"
    sentence_df = pd.DataFrame()

    try:
        text = f"Loading sentences: {filename}\n"
        sentence_df = get_sentence_csv(filename)

    except Exception as e:
        text = f"Error, no saved generations for {filename}: {e}"
        # raise gr.Error(f"Cannot load sentences: {filename}!")

    return text, sentence_df, filename
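
# Example usage (a sketch; assumes DATASET_REPO_ID, DS_WRITE_TOKEN and HF_TOKEN
# are set in the environment, and that the dataframe carries a "prompt" column;
# the other column names here are illustrative):
# df = pd.DataFrame({"prompt": ["young woman"], "sentence": ["She is ..."]})
# saveSentences(df)  # appends to data/young-woman.csv in the dataset repo
# msg, saved_df, fname = getSavedSentences("young woman")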