Create ds_manager.py
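Adds a small persistence layer for the Space: generated sentences are staged locally as CSV files and mirrored to the AnimaLab/vision_model_init_data dataset repo (upload_file for writes, hf_hub_download for reads), following the persistent-data pattern linked in the code, so saved data survives Space restarts.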
ds_manager.py  ADDED  (+111 -0)
@@ -0,0 +1,111 @@
import gradio as gr
import os
import re
import pandas as pd
import numpy as np
import glob
import huggingface_hub
from huggingface_hub import hf_hub_download, upload_file, delete_file, snapshot_download, list_repo_files, dataset_info

DATASET_REPO_ID = "AnimaLab/vision_model_init_data"
DATASET_REPO_URL = f"https://huggingface.co/{DATASET_REPO_ID}"
HF_DATA_DIRNAME = "data"     # directory inside the dataset repo
LOCAL_DATA_DIRNAME = "data"  # local download cache
LOCAL_SAVE_DIRNAME = "save"  # local staging directory for uploads

ds_write_token = os.environ.get("DS_WRITE_TOKEN")  # write access to the dataset repo
HF_TOKEN = os.environ.get("HF_TOKEN")              # read access

print("ds_write_token:", ds_write_token is not None)
print("hf_token:", HF_TOKEN is not None)
print("hfh_version:", huggingface_hub.__version__)


def retrieveAllSaved():
    # Listing the files - https://huggingface.co/docs/huggingface_hub/v0.8.1/en/package_reference/hf_api
    repo_files = list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
    # print("Repo files: " + str(repo_files))

    return repo_files


def store_group_sentences(filename: str, df):
    # Write df to a local CSV, then push it into the repo's data/ directory.
    DATA_FILENAME_1 = f"{filename}"
    LOCAL_PATH_FILE = os.path.join(LOCAL_SAVE_DIRNAME, DATA_FILENAME_1)
    DATA_FILE_1 = os.path.join(HF_DATA_DIRNAME, DATA_FILENAME_1)

    print(f"Trying to save to: {DATA_FILE_1}")

    os.makedirs(os.path.dirname(LOCAL_PATH_FILE), exist_ok=True)
    df.to_csv(LOCAL_PATH_FILE, index=False)

    commit_url = upload_file(
        path_or_fileobj=LOCAL_PATH_FILE,
        path_in_repo=DATA_FILE_1,
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        token=ds_write_token,
    )

    print(commit_url)


def saveSentences(sentences_df):
    # Merge the new sentences with whatever is already saved for this prompt, then re-upload.
    prompt = sentences_df["prompt"].iloc[0]  # there's only one prompt per dataframe
    msg, grp_saved_df, filename = getSavedSentences(prompt)
    new_df = pd.concat([grp_saved_df, sentences_df])
    print(f"new_df:\n{new_df}")
    store_group_sentences(filename, new_df)
    # for grp_term in list(sentences_df['org_grp_term'].unique()):
    #     print(f"Retrieving sentences for group: {grp_term}")
    #     msg, grp_saved_df, filename = getSavedSentences(grp_term)
    #     print(f"Num for group: {grp_term} -> {grp_saved_df.shape[0]}")
    #     add_df = sentences_df[sentences_df['org_grp_term'] == grp_term]
    #     print(f"Adding {add_df.shape[0]} sentences...")
    #
    #     new_grp_df = pd.concat([grp_saved_df, add_df], ignore_index=True)
    #     new_grp_df = new_grp_df.drop_duplicates(subset="sentence")
    #
    #     print(f"Org size: {grp_saved_df.shape[0]}, Mrg size: {new_grp_df.shape[0]}")
    #     store_group_sentences(filename, new_grp_df)


# https://huggingface.co/spaces/elonmuskceo/persistent-data/blob/main/app.py
def get_sentence_csv(file_path: str):
    # Download one CSV from the dataset repo into the local cache and load it.
    file_path = os.path.join(HF_DATA_DIRNAME, file_path)
    print(f"File path: {file_path}")
    try:
        hf_hub_download(
            force_download=True,  # to get updates of the dataset
            repo_type="dataset",
            repo_id=DATASET_REPO_ID,
            filename=file_path,
            cache_dir=LOCAL_DATA_DIRNAME,
            force_filename=os.path.basename(file_path),  # note: removed in newer huggingface_hub releases
            token=HF_TOKEN,
        )
    except Exception as e:
        # file not found
        print(f"file not found, probably: {e}")

    files = glob.glob(f"./{LOCAL_DATA_DIRNAME}/**", recursive=True)
    print("Files glob: " + ', '.join(files))
    # print("Save file: " + str(os.path.basename(file_path)))
    print(f"trying to get: {os.path.join(LOCAL_DATA_DIRNAME, os.path.basename(file_path))}")
    df = pd.read_csv(os.path.join(LOCAL_DATA_DIRNAME, os.path.basename(file_path)), encoding='UTF8')

    return df


def getSavedSentences(grp):
    # Return (status message, dataframe, filename) for a prompt; empty dataframe if nothing is saved yet.
    filename = f"{grp.replace(' ', '-')}.csv"
    sentence_df = pd.DataFrame()

    try:
        text = f"Loading sentences: {filename}\n"
        sentence_df = get_sentence_csv(filename)

    except Exception as e:
        text = f"Error, no saved generations for {filename}"
        # raise gr.Error(f"Cannot load sentences: {filename}!")

    return text, sentence_df, filename
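A minimal usage sketch of the round trip, assuming DS_WRITE_TOKEN and HF_TOKEN are set in the Space secrets and the dataset repo exists; the "prompt"/"sentence" column names are inferred from saveSentences and the commented-out drop_duplicates call, not confirmed elsewhere in the diff:

import pandas as pd
import ds_manager

# One prompt per dataframe, as saveSentences expects.
sentences_df = pd.DataFrame({
    "prompt": ["a photo of a doctor"] * 2,
    "sentence": ["a photo of a doctor smiling", "a photo of a doctor at work"],
})

# Merges with any previously saved rows for this prompt and uploads
# data/a-photo-of-a-doctor.csv to the dataset repo.
ds_manager.saveSentences(sentences_df)

# Reads the file back; returns a status message, the dataframe, and the filename.
msg, saved_df, fname = ds_manager.getSavedSentences("a photo of a doctor")
print(msg, fname, saved_df.shape)

Note that saveSentences does not deduplicate, so calling it twice with the same rows stores them twice; the commented-out per-group branch shows the intended drop_duplicates handling.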