Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
app.py
CHANGED
@@ -16,9 +16,9 @@ from background import process_arxiv_ids
|
|
16 |
from apscheduler.schedulers.background import BackgroundScheduler
|
17 |
|
18 |
gemini_api_key, hf_token, dataset_repo_id, request_arxiv_repo_id, restart_repo_id = get_secrets()
|
19 |
-
initialize_repos(dataset_repo_id, request_arxiv_repo_id, hf_token)
|
20 |
|
21 |
-
titles, date_dict, requested_arxiv_ids_df, arxivid2data = initialize_data(dataset_repo_id, request_arxiv_repo_id)
|
22 |
|
23 |
from ui import (
|
24 |
get_paper_by_year, get_paper_by_month, get_paper_by_day,
|
@@ -26,15 +26,42 @@ from ui import (
|
|
26 |
before_chat_begin, chat_stream, chat_reset
|
27 |
)
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
with gr.Blocks(css="constants/styles.css", theme=gr.themes.Soft()) as demo:
|
40 |
cur_arxiv_id = gr.Textbox(selected_paper['arxiv_id'], visible=False)
|
|
|
16 |
from apscheduler.schedulers.background import BackgroundScheduler
|
17 |
|
18 |
gemini_api_key, hf_token, dataset_repo_id, request_arxiv_repo_id, restart_repo_id = get_secrets()
|
19 |
+
empty_src_dataset = initialize_repos(dataset_repo_id, request_arxiv_repo_id, hf_token)
|
20 |
|
21 |
+
titles, date_dict, requested_arxiv_ids_df, arxivid2data = initialize_data(dataset_repo_id, request_arxiv_repo_id, empty_src_dataset)
|
22 |
|
23 |
from ui import (
|
24 |
get_paper_by_year, get_paper_by_month, get_paper_by_day,
|
|
|
26 |
before_chat_begin, chat_stream, chat_reset
|
27 |
)
|
28 |
|
29 |
+
def _placeholder_paper():
    """Build an all-blank paper record used when no real paper is available.

    The record carries the basic fields plus, for each of 10 question slots,
    the question, its answers and the depth/breadth follow-ups, all set to
    empty strings.

    Returns:
        dict: placeholder paper whose only non-empty value is
        ``target_date`` ("2024-01-01").
    """
    paper = {
        "title": "",
        "summary": "",
        "arxiv_id": "",
        "target_date": "2024-01-01",
    }
    # Key spellings (including "breath") are runtime keys and must stay
    # exactly as they are.
    field_suffixes = (
        "question",
        "answer:eli5",
        "answer:expert",
        "additional_depth_q:follow up question",
        "additional_depth_q:answers:eli5",
        "additional_depth_q:answers:expert",
        "additional_breath_q:follow up question",
        "additional_breath_q:answers:eli5",
        "additional_breath_q:answers:expert",
    )
    for idx in range(10):
        for suffix in field_suffixes:
            paper[f"{idx}_{suffix}"] = ""
    return paper


def _latest_papers(dates):
    """Locate the papers filed under the most recent year/month/day.

    Args:
        dates: nested mapping ``year -> month -> day -> list of papers``.

    Returns:
        tuple | None: ``(sorted_years, sorted_months, sorted_days, papers)``
        for the latest date, or ``None`` when any level (or the final paper
        list) is empty.
    """
    years = sorted(dates)
    if not years:
        return None

    months = sorted(dates[years[-1]])
    if not months:
        return None

    days = sorted(dates[years[-1]][months[-1]])
    if not days:
        return None

    papers = dates[years[-1]][months[-1]][days[-1]]
    if not papers:
        return None

    return years, months, days, papers


# Choose the paper shown at start-up: the first paper of the latest date when
# data exists, otherwise a blank placeholder dated 2024-01-01.
_latest = _latest_papers(date_dict)
if _latest is not None:
    sorted_year, sorted_month, sorted_day, last_papers = _latest
    visible = True
else:
    # No usable data (empty dict, or an empty year/month/day bucket): fall
    # back to fixed defaults instead of raising IndexError on `[-1]` / `[0]`.
    sorted_year, sorted_month, sorted_day = ["2024"], ["01"], ["01"]
    last_papers = [_placeholder_paper()]
    # NOTE(review): presumably consumed by the UI below to hide paper widgets
    # when there is nothing to show — confirm against the gr.Blocks section.
    visible = False

last_year = sorted_year[-1]
last_month = sorted_month[-1]
last_day = sorted_day[-1]
selected_paper = last_papers[0]
|
65 |
|
66 |
with gr.Blocks(css="constants/styles.css", theme=gr.themes.Soft()) as demo:
|
67 |
cur_arxiv_id = gr.Textbox(selected_paper['arxiv_id'], visible=False)
|
init.py
CHANGED
@@ -35,35 +35,37 @@ def _initialize_paper_info(source_ds):
|
|
35 |
arxivid2data = {}
|
36 |
count = 0
|
37 |
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
for paper in papers:
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
year, month, day = date.split("-")
|
56 |
-
papers = date2qna[date]
|
57 |
-
for paper in papers:
|
58 |
-
title2qna[paper["title"]] = paper
|
59 |
-
arxivid2data[paper['arxiv_id']] = {"idx": count, "paper": paper}
|
60 |
-
date_dict[year][month][day].append(paper)
|
61 |
-
|
62 |
-
titles = [f"[{v['arxiv_id']}] {k}" for k, v in title2qna.items()]
|
63 |
-
|
64 |
-
return titles, date_dict, arxivid2data
|
65 |
-
|
66 |
-
def initialize_data(source_data_repo_id, request_data_repo_id):
|
67 |
global date_dict, arxivid2data
|
68 |
global requested_arxiv_ids_df
|
69 |
|
@@ -86,6 +88,10 @@ def initialize_repos(
|
|
86 |
):
|
87 |
if create_hf_hub(source_data_repo_id, hf_token) is False:
|
88 |
print(f"{source_data_repo_id} repository already exists")
|
|
|
|
|
|
|
|
|
89 |
|
90 |
if create_hf_hub(request_data_repo_id, hf_token) is False:
|
91 |
print(f"{request_data_repo_id} repository already exists")
|
|
|
35 |
arxivid2data = {}
|
36 |
count = 0
|
37 |
|
38 |
+
if len(source_ds["train"]) > 1:
|
39 |
+
for data in source_ds["train"]:
|
40 |
+
date = data["target_date"].strftime("%Y-%m-%d")
|
41 |
+
arxiv_id = data["arxiv_id"]
|
42 |
+
|
43 |
+
if date in date2qna:
|
44 |
+
papers = copy.deepcopy(date2qna[date])
|
45 |
+
for paper in papers:
|
46 |
+
if paper["title"] == data["title"]:
|
47 |
+
if _count_nans(paper) > _count_nans(data):
|
48 |
+
date2qna[date].remove(paper)
|
49 |
+
|
50 |
+
date2qna[date].append(data)
|
51 |
+
del papers
|
52 |
+
else:
|
53 |
+
date2qna[date] = [data]
|
54 |
+
|
55 |
+
for date in date2qna:
|
56 |
+
year, month, day = date.split("-")
|
57 |
+
papers = date2qna[date]
|
58 |
for paper in papers:
|
59 |
+
title2qna[paper["title"]] = paper
|
60 |
+
arxivid2data[paper['arxiv_id']] = {"idx": count, "paper": paper}
|
61 |
+
date_dict[year][month][day].append(paper)
|
62 |
+
|
63 |
+
titles = [f"[{v['arxiv_id']}] {k}" for k, v in title2qna.items()]
|
64 |
+
return titles, date_dict, arxivid2data
|
65 |
+
else:
|
66 |
+
return [], {}, {}
|
67 |
+
|
68 |
+
def initialize_data(source_data_repo_id, request_data_repo_id, empty_src_dataset):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
global date_dict, arxivid2data
|
70 |
global requested_arxiv_ids_df
|
71 |
|
|
|
88 |
):
|
89 |
if create_hf_hub(source_data_repo_id, hf_token) is False:
|
90 |
print(f"{source_data_repo_id} repository already exists")
|
91 |
+
else:
|
92 |
+
dummy_row = {"title": ["dummy"]}
|
93 |
+
ds = Dataset.from_dict(dummy_row)
|
94 |
+
ds.push_to_hub(source_data_repo_id, token=hf_token)
|
95 |
|
96 |
if create_hf_hub(request_data_repo_id, hf_token) is False:
|
97 |
print(f"{request_data_repo_id} repository already exists")
|