anon4757 committed on
Commit
06185c8
·
1 Parent(s): 1fa7b63

Create ds_manager.py

Browse files
Files changed (1) hide show
  1. ds_manager.py +111 -0
ds_manager.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import re
4
+ import pandas as pd
5
+ import numpy as np
6
+ import glob
7
+ import huggingface_hub
8
+ print("hfh", huggingface_hub.__version__)
9
+ from huggingface_hub import hf_hub_download, upload_file, delete_file, snapshot_download, list_repo_files, dataset_info
10
+
11
# Hugging Face dataset repo used as persistent storage for this Space.
DATASET_REPO_ID = "AnimaLab/vision_model_init_data"
DATASET_REPO_URL = f"https://huggingface.co/{DATASET_REPO_ID}"
HF_DATA_DIRNAME = "data"      # directory inside the HF dataset repo
LOCAL_DATA_DIRNAME = "data"   # local cache dir for downloaded CSVs
LOCAL_SAVE_DIRNAME = "save"   # local staging dir for files to upload

# Tokens come from the environment; the write token is required for uploads.
ds_write_token = os.environ.get("DS_WRITE_TOKEN")
HF_TOKEN = os.environ.get("HF_TOKEN")

# Log only token *presence*, never the token values themselves.
# Idiom fix: `x != None` -> `x is not None` (PEP 8); also fixed the
# "hfh_verssion" typo in the log label.
print("ds_write_token:", ds_write_token is not None)
print("hf_token:", HF_TOKEN is not None)
print("hfh_version", huggingface_hub.__version__)
23
+
24
def retrieveAllSaved():
    """Return the list of file paths currently stored in the HF dataset repo.

    Returns:
        list[str]: repo-relative paths of every file committed to
        DATASET_REPO_ID (repo_type="dataset").
    """
    # Fix: dropped the needless `global DATASET_REPO_ID` — the name is only
    # read here, never rebound, so `global` has no effect.
    # Listing files - https://huggingface.co/docs/huggingface_hub/v0.8.1/en/package_reference/hf_api
    repo_files = list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
    return repo_files
32
+
33
def store_group_sentences(filename: str, df):
    """Persist `df` as a CSV locally, then upload it to the HF dataset repo.

    Args:
        filename: target CSV file name (e.g. "my-group.csv").
        df: pandas DataFrame to save.

    Side effects:
        Writes LOCAL_SAVE_DIRNAME/<filename> locally and pushes it to
        HF_DATA_DIRNAME/<filename> in DATASET_REPO_ID using the write token.
    """
    # BUG FIX: the original assigned the placeholder-free f-string
    # f"(unknown)" here and ignored the `filename` argument entirely,
    # so every group would have been written to the same bogus file.
    DATA_FILENAME_1 = filename
    LOCAL_PATH_FILE = os.path.join(LOCAL_SAVE_DIRNAME, DATA_FILENAME_1)
    DATA_FILE_1 = os.path.join(HF_DATA_DIRNAME, DATA_FILENAME_1)

    print(f"Trying to save to: {DATA_FILE_1}")

    # Ensure the local staging directory exists before writing the CSV.
    os.makedirs(os.path.dirname(LOCAL_PATH_FILE), exist_ok=True)
    df.to_csv(LOCAL_PATH_FILE, index=False)

    # Push the CSV into the dataset repo; requires DS_WRITE_TOKEN.
    commit_url = upload_file(
        path_or_fileobj=LOCAL_PATH_FILE,
        path_in_repo=DATA_FILE_1,
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        token=ds_write_token,
    )

    print(commit_url)
52
+
53
def saveSentences(sentences_df):
    """Append newly generated sentences to the saved CSV for their prompt.

    Loads previously saved sentences for the dataframe's prompt, concatenates
    the new rows after them, and re-uploads the merged CSV to the dataset repo.

    Args:
        sentences_df: DataFrame of new sentences; must contain a "prompt"
            column. Only the first row's prompt is used — there's only one
            prompt per dataframe.
    """
    prompt = sentences_df["prompt"].iloc[0]  # there's only one prompt per dataframe
    msg, grp_saved_df, filename = getSavedSentences(prompt)
    new_df = pd.concat([grp_saved_df, sentences_df])
    # BUG FIX: the log line claimed to show new_df but printed sentences_df.
    print(f"new_df: \n{new_df}")
    # NOTE(review): no drop_duplicates here — re-saving the same sentences
    # appends duplicate rows (the removed commented-out draft deduplicated
    # on "sentence"); confirm whether dedup is wanted before adding it.
    store_group_sentences(filename, new_df)
71
+
72
+
73
# https://huggingface.co/spaces/elonmuskceo/persistent-data/blob/main/app.py
def get_sentence_csv(file_path: str):
    """Download a CSV from the HF dataset repo and load it as a DataFrame.

    Args:
        file_path: CSV file name; resolved relative to HF_DATA_DIRNAME
            inside the repo.

    Returns:
        pandas DataFrame parsed from the downloaded (or previously cached)
        local copy of the CSV.

    Raises:
        FileNotFoundError: from pd.read_csv if the Hub download failed and
            no local copy exists (the download error itself is swallowed).
    """
    file_path = os.path.join(HF_DATA_DIRNAME, file_path)
    print(f"File path: {file_path}")
    try:
        hf_hub_download(
            force_download=True,  # to get updates of the dataset
            repo_type="dataset",
            repo_id=DATASET_REPO_ID,
            filename=file_path,
            cache_dir=LOCAL_DATA_DIRNAME,
            # NOTE(review): force_filename was deprecated and later removed
            # from huggingface_hub — confirm the pinned version supports it.
            force_filename=os.path.basename(file_path),
            token=HF_TOKEN
        )
    except Exception as e:
        # Best-effort: the file may simply not exist yet for this group.
        print(f"file not found, probably: {e}")

    # BUG FIX: the original pattern f"./{LOCAL_DATA_DIRNAME}/" matched only
    # the directory itself, so the debug listing was always empty of files;
    # "/**" with recursive=True actually enumerates its contents.
    files = glob.glob(f"./{LOCAL_DATA_DIRNAME}/**", recursive=True)
    print("Files glob: " + ', '.join(files))
    print(f"trying to get: {os.path.join(LOCAL_DATA_DIRNAME, os.path.basename(file_path))}")
    df = pd.read_csv(os.path.join(LOCAL_DATA_DIRNAME, os.path.basename(file_path)), encoding='UTF8')

    return df
98
+
99
def getSavedSentences(grp):
    """Fetch the previously saved sentences for a group/prompt term.

    Args:
        grp: group or prompt term; spaces are replaced with dashes to form
            the CSV file name.

    Returns:
        tuple: (status message, DataFrame of saved sentences — empty if
        none could be loaded, CSV filename used for this group).
    """
    filename = f"{grp.replace(' ','-')}.csv"
    sentence_df = pd.DataFrame()

    try:
        # BUG FIX: both messages below previously contained the literal text
        # "(unknown)" instead of an interpolated value (f-strings with no
        # placeholder) — restored the intended {filename} / {grp}.
        text = f"Loading sentences: {filename}\n"
        sentence_df = get_sentence_csv(filename)
    except Exception as e:
        # Best-effort: a missing saved file is expected for a new group,
        # so report it in the status text instead of raising.
        text = f"Error, no saved generations for {grp}"

    return text, sentence_df, filename