chansung committed on
Commit 1be7f01 • 1 Parent(s): e518ed3
Files changed (2):
  1. app.py +38 -11
  2. init.py +34 -28
app.py CHANGED
@@ -16,9 +16,9 @@ from background import process_arxiv_ids
 from apscheduler.schedulers.background import BackgroundScheduler
 
 gemini_api_key, hf_token, dataset_repo_id, request_arxiv_repo_id, restart_repo_id = get_secrets()
-initialize_repos(dataset_repo_id, request_arxiv_repo_id, hf_token)
+empty_src_dataset = initialize_repos(dataset_repo_id, request_arxiv_repo_id, hf_token)
 
-titles, date_dict, requested_arxiv_ids_df, arxivid2data = initialize_data(dataset_repo_id, request_arxiv_repo_id)
+titles, date_dict, requested_arxiv_ids_df, arxivid2data = initialize_data(dataset_repo_id, request_arxiv_repo_id, empty_src_dataset)
 
 from ui import (
     get_paper_by_year, get_paper_by_month, get_paper_by_day,
@@ -26,15 +26,42 @@ from ui import (
     before_chat_begin, chat_stream, chat_reset
 )
 
-sorted_year = sorted(date_dict.keys())
-last_year = sorted_year[-1]
-sorted_month = sorted(date_dict[last_year].keys())
-last_month = sorted_month[-1]
-sorted_day = sorted(date_dict[last_year][last_month].keys())
-last_day = sorted_day[-1]
-last_papers = date_dict[last_year][last_month][last_day]
-selected_paper = last_papers[0]
-visible = True if len(sorted_year) > 0 else False
+if len(date_dict.keys()) > 0:
+    sorted_year = sorted(date_dict.keys())
+    last_year = sorted_year[-1] if len(sorted_year) > 0 else ""
+    sorted_month = sorted(date_dict[last_year].keys())
+    last_month = sorted_month[-1] if len(sorted_year) > 0 else ""
+    sorted_day = sorted(date_dict[last_year][last_month].keys())
+    last_day = sorted_day[-1] if len(sorted_year) > 0 else ""
+    last_papers = date_dict[last_year][last_month][last_day] if len(sorted_year) > 0 else [""]
+    selected_paper = last_papers[0]
+    visible = True
+else:
+    sorted_year = ["2024"]
+    last_year = sorted_year[-1]
+    sorted_month = ["01"]
+    last_month = sorted_month[-1]
+    sorted_day = ["01"]
+    last_day = sorted_day[-1]
+
+    selected_paper = {}
+    selected_paper["title"] = ""
+    selected_paper["summary"] = ""
+    selected_paper["arxiv_id"] = ""
+    selected_paper["target_date"] = "2024-01-01"
+    for idx in range(10):
+        selected_paper[f"{idx}_question"] = ""
+        selected_paper[f"{idx}_answer:eli5"] = ""
+        selected_paper[f"{idx}_answer:expert"] = ""
+        selected_paper[f"{idx}_additional_depth_q:follow up question"] = ""
+        selected_paper[f"{idx}_additional_depth_q:answers:eli5"] = ""
+        selected_paper[f"{idx}_additional_depth_q:answers:expert"] = ""
+        selected_paper[f"{idx}_additional_breath_q:follow up question"] = ""
+        selected_paper[f"{idx}_additional_breath_q:answers:eli5"] = ""
+        selected_paper[f"{idx}_additional_breath_q:answers:expert"] = ""
+
+    last_papers = [selected_paper]
+    visible = False
 
 with gr.Blocks(css="constants/styles.css", theme=gr.themes.Soft()) as demo:
     cur_arxiv_id = gr.Textbox(selected_paper['arxiv_id'], visible=False)
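Note: the new empty-dataset branch builds selected_paper with the same key layout as a real record, so the Gradio UI can index any field without guarding for missing keys. A minimal sketch of that schema as a factory function; make_placeholder_paper is a hypothetical helper, not part of the commit, and the key names (including the "breath" spelling) are copied verbatim from app.py:

def make_placeholder_paper(target_date="2024-01-01", num_qna=10):
    # Mirror the schema of a real paper record, with empty strings everywhere.
    paper = {
        "title": "",
        "summary": "",
        "arxiv_id": "",
        "target_date": target_date,
    }
    for idx in range(num_qna):
        paper[f"{idx}_question"] = ""
        paper[f"{idx}_answer:eli5"] = ""
        paper[f"{idx}_answer:expert"] = ""
        for kind in ("depth", "breath"):  # "breath" matches the dataset's column names
            paper[f"{idx}_additional_{kind}_q:follow up question"] = ""
            paper[f"{idx}_additional_{kind}_q:answers:eli5"] = ""
            paper[f"{idx}_additional_{kind}_q:answers:expert"] = ""
    return paper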
init.py CHANGED
@@ -35,35 +35,37 @@ def _initialize_paper_info(source_ds):
     arxivid2data = {}
     count = 0
 
-    for data in source_ds["train"]:
-        date = data["target_date"].strftime("%Y-%m-%d")
-        arxiv_id = data["arxiv_id"]
-
-        if date in date2qna:
-            papers = copy.deepcopy(date2qna[date])
+    if len(source_ds["train"]) > 1:
+        for data in source_ds["train"]:
+            date = data["target_date"].strftime("%Y-%m-%d")
+            arxiv_id = data["arxiv_id"]
+
+            if date in date2qna:
+                papers = copy.deepcopy(date2qna[date])
+                for paper in papers:
+                    if paper["title"] == data["title"]:
+                        if _count_nans(paper) > _count_nans(data):
+                            date2qna[date].remove(paper)
+
+                date2qna[date].append(data)
+                del papers
+            else:
+                date2qna[date] = [data]
+
+        for date in date2qna:
+            year, month, day = date.split("-")
+            papers = date2qna[date]
             for paper in papers:
-                if paper["title"] == data["title"]:
-                    if _count_nans(paper) > _count_nans(data):
-                        date2qna[date].remove(paper)
-
-            date2qna[date].append(data)
-            del papers
-        else:
-            date2qna[date] = [data]
-
-    for date in date2qna:
-        year, month, day = date.split("-")
-        papers = date2qna[date]
-        for paper in papers:
-            title2qna[paper["title"]] = paper
-            arxivid2data[paper['arxiv_id']] = {"idx": count, "paper": paper}
-            date_dict[year][month][day].append(paper)
-
-    titles = [f"[{v['arxiv_id']}] {k}" for k, v in title2qna.items()]
-
-    return titles, date_dict, arxivid2data
-
-def initialize_data(source_data_repo_id, request_data_repo_id):
+                title2qna[paper["title"]] = paper
+                arxivid2data[paper['arxiv_id']] = {"idx": count, "paper": paper}
+                date_dict[year][month][day].append(paper)
+
+        titles = [f"[{v['arxiv_id']}] {k}" for k, v in title2qna.items()]
+        return titles, date_dict, arxivid2data
+    else:
+        return [], {}, {}
+
+def initialize_data(source_data_repo_id, request_data_repo_id, empty_src_dataset):
     global date_dict, arxivid2data
     global requested_arxiv_ids_df
 
@@ -86,6 +88,10 @@ def initialize_repos(
 ):
     if create_hf_hub(source_data_repo_id, hf_token) is False:
         print(f"{source_data_repo_id} repository already exists")
+    else:
+        dummy_row = {"title": ["dummy"]}
+        ds = Dataset.from_dict(dummy_row)
+        ds.push_to_hub(source_data_repo_id, token=hf_token)
 
    if create_hf_hub(request_data_repo_id, hf_token) is False:
        print(f"{request_data_repo_id} repository already exists")