shaocongma committed
Commit 2dc9347
1 Parent(s): 6eb659a

use most relevant papers as references.

.gitignore ADDED
@@ -0,0 +1 @@
+/.idea/
api_wrapper.py ADDED
@@ -0,0 +1,29 @@
+'''
+This script is used to wrap all generation methods together.
+
+todo:
+A worker keeps running on the server and monitors the Amazon SQS. Once it receives a new message, it does the following:
+    Download the corresponding configuration files from S3.
+    Change the task status from Pending to Running.
+    Call `generator_wrapper` and wait for the outputs.
+    If `generator_wrapper` returns results:
+        evaluate the results; compile them; upload the results to S3 ... change the task status from Running to Completed.
+        If anything goes wrong, raise an error.
+    If `generator_wrapper` returns nothing, times out, or raises any error:
+        change the task status from Running to Failed.
+'''
+
+from auto_backgrounds import generate_draft
+import json
+
+
+GENERATOR_MAPPING = {"draft": generate_draft}
+
+def generator_wrapper(path_to_config_json):
+    # Read the configuration file and call the corresponding generator
+    with open(path_to_config_json, "r", encoding='utf-8') as f:
+        config = json.load(f)
+
+    generator = GENERATOR_MAPPING.get(config["generator"])
+    if generator is None:
+        pass
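
The todo above sketches a queue-driven worker. One possible shape for that loop, assuming boto3 plus a hypothetical queue URL, bucket name, and status helper (none of which are part of this commit):

import boto3  # assumed dependency; not shown anywhere in this diff

from api_wrapper import generator_wrapper

sqs = boto3.client("sqs")
s3 = boto3.client("s3")
QUEUE_URL = "https://sqs.us-east-1.amazonaws.com/123456789012/auto-draft"  # hypothetical
BUCKET = "auto-draft-tasks"  # hypothetical

def poll_once():
    # long-poll the queue for a single task message
    resp = sqs.receive_message(QueueUrl=QUEUE_URL, MaxNumberOfMessages=1, WaitTimeSeconds=20)
    for msg in resp.get("Messages", []):
        config_key = msg["Body"]  # assume the body carries the S3 key of the task config
        s3.download_file(BUCKET, config_key, "config.json")
        # set_status(config_key, "Running")  # hypothetical status helper
        try:
            generator_wrapper("config.json")
            # set_status(config_key, "Completed")
        except Exception:
            # set_status(config_key, "Failed")
            raise
        finally:
            sqs.delete_message(QueueUrl=QUEUE_URL, ReceiptHandle=msg["ReceiptHandle"])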
app.py CHANGED
@@ -119,12 +119,31 @@ ACADEMIC_PAPER = """## 一键生成论文初稿
 
 
 REFERENCES = """## 一键搜索相关论文
-
+(此功能已经被整合进一键生成论文初稿)
 1. 在Title文本框中输入想要搜索文献的论文(比如Playing Atari with Deep Reinforcement Learning).
 2. 点击Submit. 等待大概十分钟.
 3. 在右侧JSON处会显示相关文献.
 """
 
+REFERENCES_INSTRUCTION = """### References
+这一行用于定义AI如何选取参考文献. 目前是两种方式混合:
+1. GPT自动根据标题生成关键字,使用Semantic Scholar搜索引擎搜索文献,利用Specter获取Paper Embedding来自动选取最相关的文献作为GPT的参考资料.
+2. 用户上传bibtex文件,使用Google Scholar搜索摘要作为GPT的参考资料.
+关于有希望利用本地文件来供GPT参考的功能将在未来实装.
+"""
+
+DOMAIN_KNOWLEDGE_INSTRUCTION = """### Domain Knowledge
+(暂未实装)
+这一行用于定义AI的知识库. 将提供两种选择:
+1. 各个领域内由专家预先收集资料并构建的的FAISS向量数据库. 每个数据库内包含了数百万页经过同行评议的论文和专业经典书籍.
+2. 自行构建的使用OpenAI text-embedding-ada-002模型创建的FAISS向量数据库.
+"""
+
+OTHERS_INSTRUCTION = """### Others
+
+"""
+
+
 with gr.Blocks(theme=theme) as demo:
     gr.Markdown('''
     # Auto-Draft: 文献整理辅助工具
@@ -152,20 +171,40 @@ with gr.Blocks(theme=theme) as demo:
         title = gr.Textbox(value="Playing Atari with Deep Reinforcement Learning", lines=1, max_lines=1,
                            label="Title", info="论文标题")
 
+        slider = gr.Slider(minimum=1, maximum=100, value=20, step=1,
+                           interactive=True, visible=False, label="最大参考文献数目")
        with gr.Accordion("高级设置", open=False):
-            description_pp = gr.Textbox(lines=5, label="Description (Optional)", visible=True,
-                                        info="对希望生成的论文的一些描述. 包括这篇论文的创新点, 主要贡献, 等.")
+            with gr.Row():
+                description_pp = gr.Textbox(lines=5, label="Description (Optional)", visible=True,
+                                            info="对希望生成的论文的一些描述. 包括这篇论文的创新点, 主要贡献, 等.")
+            with gr.Row():
+                template = gr.Dropdown(label="Template", choices=["ICLR2022"], value="ICLR2022",
+                                       interactive=False,
+                                       info="生成论文的参考模板. (暂不支持修改)")
+                model_selection = gr.Dropdown(label="Model", choices=["gpt-4", "gpt-3.5-turbo"],
+                                              value="gpt-3.5-turbo",
+                                              interactive=True,
+                                              info="生成论文用到的语言模型.")
+                sections = gr.CheckboxGroup(
+                    choices=["introduction", "related works", "backgrounds", "methodology", "experiments",
+                             "conclusion", "abstract"],
+                    type="value", label="生成章节", interactive=True,
+                    value=["introduction", "related works"])
 
         with gr.Row():
-            with gr.Column():
-                with gr.Row():
-                    template = gr.Dropdown(label="Template", choices=["ICLR2022"], value="ICLR2022",
-                                           interactive=False,
-                                           info="生成论文的参考模板. (暂不支持修改)")
-                    model_selection = gr.Dropdown(label="Model", choices=["gpt-4", "gpt-3.5-turbo"],
-                                                  value="gpt-3.5-turbo",
-                                                  interactive=True,
-                                                  info="生成论文用到的语言模型.")
+            with gr.Column(scale=1):
+                gr.Markdown(REFERENCES_INSTRUCTION)
+
+            with gr.Column(scale=2):
+                search_engine = gr.Dropdown(label="Search Engine",
+                                            choices=["ArXiv", "Semantic Scholar", "Google Scholar", "None"],
+                                            value="Semantic Scholar",
+                                            interactive=False,
+                                            visible=False,
+                                            info="用于决定GPT用什么搜索引擎来搜索文献. (暂不支持修改)")
+                tldr_checkbox = gr.Checkbox(value=True, label="TLDR;",
+                                            info="选择此筐表示将使用Semantic Scholar的TLDR作为文献的总结.",
+                                            interactive=True)
            gr.Markdown('''
            上传.bib文件提供AI需要参考的文献.
            ''')
@@ -175,28 +214,23 @@ with gr.Blocks(theme=theme) as demo:
                examples=["latex_templates/example_references.bib"],
                inputs=bibtex_file
            )
-            with gr.Column():
-                search_engine = gr.Dropdown(label="Search Engine",
-                                            choices=["ArXiv", "Semantic Scholar", "Google Scholar", "None"],
-                                            value="Semantic Scholar",
-                                            interactive=False,
-                                            info="用于决定GPT-4用什么搜索引擎来搜索文献. (暂不支持修改)")
-                tldr_checkbox = gr.Checkbox(value=True, label="TLDR;",
-                                            info="选择此筐表示将使用Semantic Scholar的TLDR作为文献的总结.",
-                                            interactive=True)
-                sections = gr.CheckboxGroup(
-                    choices=["introduction", "related works", "backgrounds", "methodology", "experiments",
-                             "conclusion", "abstract"],
-                    type="value", label="生成章节", interactive=True,
-                    value=["introduction", "related works"])
-                slider = gr.Slider(minimum=1, maximum=100, value=20, step=1,
-                                   interactive=True, label="最大参考文献数目")
 
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown(DOMAIN_KNOWLEDGE_INSTRUCTION)
+
+            with gr.Column(scale=2):
+                domain_knowledge = gr.Dropdown(label="预载知识库",
+                                               choices=["(None)", "Machine Learning"],
+                                               value="(None)",
+                                               interactive=False,
+                                               info="使用预先构建的知识库. (暂未实装)")
+                local_domain_knowledge = gr.File(label="本地知识库 (暂未实装)", interactive=False)
        with gr.Row():
            clear_button_pp = gr.Button("Clear")
            submit_button_pp = gr.Button("Submit", variant="primary")
 
-    with gr.Tab("文献搜索 (NEW!)"):
+    with gr.Tab("文献搜索"):
        gr.Markdown(REFERENCES)
 
        title_refs = gr.Textbox(value="Playing Atari with Deep Reinforcement Learning", lines=1, max_lines=1,
auto_backgrounds.py CHANGED
@@ -1,12 +1,13 @@
 import os.path
-import json
 from utils.references import References
 from utils.file_operations import hash_name, make_archive, copy_templates
 from utils.tex_processing import create_copies
 from section_generator import section_generation_bg, keywords_generation, figures_generation, section_generation
+from references_generator import generate_top_k_references
 import logging
 import time
 
+
 TOTAL_TOKENS = 0
 TOTAL_PROMPTS_TOKENS = 0
 TOTAL_COMPLETION_TOKENS = 0
@@ -32,7 +33,7 @@ def log_usage(usage, generating_target, print_out=True):
     logging.info(message)
 
 def _generation_setup(title, description="", template="ICLR2022", tldr=False,
-                      max_kw_refs=10, max_num_refs=50, bib_refs=None):
+                      max_kw_refs=10, max_num_refs=50, bib_refs=None, max_tokens=2048):
     """
     This function handles the setup process for paper generation; it contains three folds
     1. Copy the template to the outputs folder. Create the log file `generation.log`
@@ -54,7 +55,7 @@ def _generation_setup(title, description="", template="ICLR2022", tldr=False,
     - destination_folder (str): The path to the destination folder where the generation log is saved.
     - all_paper_ids (list): A list of all paper IDs collected for the references.
     """
-    print("Generation setup...")
+    # print("Generation setup...")
     paper = {}
     paper_body = {}
 
@@ -63,9 +64,8 @@ def _generation_setup(title, description="", template="ICLR2022", tldr=False,
     logging.basicConfig(level=logging.INFO, filename=os.path.join(destination_folder, "generation.log") )
 
     # Generate keywords and references
-    print("Initialize the paper information ...")
+    # print("Initialize the paper information ...")
     input_dict = {"title": title, "description": description}
-    # keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo", max_kw_refs=max_kw_refs)
     keywords, usage = keywords_generation(input_dict)
     log_usage(usage, "keywords")
 
@@ -75,13 +75,13 @@ def _generation_setup(title, description="", template="ICLR2022", tldr=False,
 
     ref = References(title, bib_refs)
     ref.collect_papers(keywords, tldr=tldr)
-    all_paper_ids = ref.to_bibtex(bibtex_path, max_num_refs) #todo: max_num_refs has not implemented yet
+    all_paper_ids = ref.to_bibtex(bibtex_path)
 
     print(f"The paper information has been initialized. References are saved to {bibtex_path}.")
 
     paper["title"] = title
     paper["description"] = description
-    paper["references"] = ref.to_prompts()
+    paper["references"] = ref.to_prompts(max_tokens=max_tokens)
     paper["body"] = paper_body
     paper["bibtex"] = bibtex_path
     return paper, destination_folder, all_paper_ids #todo: use `all_paper_ids` to check if all citations are in this list
@@ -107,15 +107,20 @@ def generate_backgrounds(title, description="", template="ICLR2022", model="gpt-
     return make_archive(destination_folder, filename)
 
 
+
 def generate_draft(title, description="", template="ICLR2022",
                    tldr=True, max_kw_refs=10, max_num_refs=30, sections=None, bib_refs=None, model="gpt-4"):
     # pre-processing `sections` parameter;
+    print("================PRE-PROCESSING================")
     if sections is None:
         sections = ["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"]
 
     # todo: add more parameters; select which section to generate; select maximum refs.
     paper, destination_folder, _ = _generation_setup(title, description, template, tldr, max_kw_refs, max_num_refs, bib_refs)
+
+    # main components
     for section in sections:
+        print(f"================Generate {section}================")
         max_attempts = 4
         attempts_count = 0
         while attempts_count < max_attempts:
@@ -124,12 +129,14 @@ def generate_draft(title, description="", template="ICLR2022",
                 log_usage(usage, section)
                 break
             except Exception as e:
-                message = f"Failed to generate {section}. {type(e).__name__} was raised: {e}"
+                message = f"Failed to generate {section}. {type(e).__name__} was raised: {e}\n"
                 print(message)
                 logging.info(message)
                 attempts_count += 1
-                time.sleep(20)
+                time.sleep(15)
+
     # post-processing
+    print("================POST-PROCESSING================")
     create_copies(destination_folder)
     input_dict = {"title": title, "description": description, "generator": "generate_draft"}
     filename = hash_name(input_dict) + ".zip"
@@ -137,6 +144,11 @@ def generate_draft(title, description="", template="ICLR2022",
     return make_archive(destination_folder, filename)
 
 
+
+
+
+
+
 if __name__ == "__main__":
     import openai
     openai.api_key = os.getenv("OPENAI_API_KEY")
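
For orientation, a minimal driver for the updated generate_draft (a sketch; it assumes OPENAI_API_KEY is set, reuses the repo's example title, and leaves every other parameter at its default):

import os

import openai
from auto_backgrounds import generate_draft

openai.api_key = os.getenv("OPENAI_API_KEY")

# references now enter the prompt most-relevant-first under a token budget
# (ref.to_prompts(max_tokens=2048)) instead of being randomly truncated
zip_path = generate_draft(
    "Playing Atari with Deep Reinforcement Learning",
    sections=["introduction", "related works"],
    model="gpt-3.5-turbo",
)
print(zip_path)  # path to the archive produced by make_archive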
references_generator.py CHANGED
@@ -5,8 +5,10 @@ from section_generator import section_generation_bg, keywords_generation, figure
 import itertools
 from gradio_client import Client
 
+
 def generate_raw_references(title, description="",
-                            bib_refs=None, tldr=False, max_kw_refs=10, save_to="ref.bib"):
+                            bib_refs=None, tldr=False, max_kw_refs=10,
+                            save_to="ref.bib"):
     # load pre-provided references
     ref = References(title, bib_refs)
 
@@ -21,16 +23,17 @@ def generate_raw_references(title, description="",
     print(f"keywords: {keywords}\n\n")
 
     ref.collect_papers(keywords, tldr=tldr)
-    paper_json = ref.to_json()
+    # paper_json = ref.to_json()
 
     with open(save_to, "w") as f:
         json.dump(paper_json, f)
 
-    return save_to, paper_json
+    return save_to, ref # paper_json
 
 def generate_top_k_references(title, description="",
                               bib_refs=None, tldr=False, max_kw_refs=10, save_to="ref.bib", top_k=5):
-    json_path, json_content = generate_raw_references(title, description, bib_refs, tldr, max_kw_refs, save_to)
+    json_path, ref_raw = generate_raw_references(title, description, bib_refs, tldr, max_kw_refs, save_to)
+    json_content = ref_raw.to_json()
 
     client = Client("https://shaocongma-evaluate-specter-embeddings.hf.space/")
     result = client.predict(
@@ -43,6 +46,7 @@ def generate_top_k_references(title, description="",
         result = json.load(f)
     return result
 
+
 if __name__ == "__main__":
     import openai
     openai.api_key = os.getenv("OPENAI_API_KEY")
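
Both generate_top_k_references and the new References.to_prompts defer the "most relevant" ranking to the same public Space. Stripped of the surrounding plumbing, that call looks roughly like this (a sketch; papers.json stands in for a file produced by References.to_json):

import json

from gradio_client import Client

client = Client("https://shaocongma-evaluate-specter-embeddings.hf.space/")
result_path = client.predict(
    "Playing Atari with Deep Reinforcement Learning",  # 'Title' Textbox component
    "papers.json",  # 'Papers JSON (as string)' File component
    5,              # 'Top-k Relevant Papers' Slider component (1-50)
    api_name="/get_k_relevant_papers",
)
with open(result_path) as f:
    top_k = json.load(f)  # a JSON object keyed by paper_id, as to_prompts expects
print(list(top_k))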
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
section_generator.py CHANGED
@@ -3,7 +3,7 @@ from utils.gpt_interaction import get_responses, extract_responses, extract_keyw
 from utils.figures import generate_random_figures
 import time
 import os
-from utils.prompts import KEYWORDS_SYSTEM
+from utils.prompts import KEYWORDS_SYSTEM, SECTION_GENERATION_SYSTEM
 from utils.gpt_interaction import get_gpt_responses
 import json
 
@@ -13,6 +13,7 @@ import json
 # 3. figure_generation: used to generate sample figures.
 # all generator should return the token usage.
 
+MAX_ATTEMPTS = 6
 
 def section_generation_bg(paper, section, save_to_path, model):
     """
@@ -47,7 +48,7 @@ def section_generation_bg(paper, section, save_to_path, model):
     return usage
 
 
-def section_generation(paper, section, save_to_path, model):
+def section_generation(paper, section, save_to_path, model, research_field="machine learning"):
     """
     The main pipeline of generating a section.
     1. Generate prompts.
@@ -56,39 +57,32 @@ def section_generation(paper, section, save_to_path, model):
     4. Save the text to .tex file.
     :return usage
     """
-    print(f"Generating {section}...")
     prompts = generate_paper_prompts(paper, section)
-    gpt_response, usage = get_responses(prompts, model)
-    output = gpt_response # extract_responses(gpt_response)
+    output, usage= get_gpt_responses(SECTION_GENERATION_SYSTEM.format(research_field=research_field), prompts,
+                                     model=model, temperature=0.4)
     paper["body"][section] = output
     tex_file = os.path.join(save_to_path, f"{section}.tex")
-    # tex_file = save_to_path + f"/{section}.tex"
-    if section == "abstract":
-        with open(tex_file, "w") as f:
-            f.write(output)
-    else:
-        with open(tex_file, "w") as f:
-            f.write(output)
+    with open(tex_file, "w") as f:
+        f.write(output)
     time.sleep(5)
-    print(f"{section} has been generated. Saved to {tex_file}.")
     return usage
 
-# def keywords_generation(input_dict, model, max_kw_refs = 10):
-#     title = input_dict.get("title")
-#     description = input_dict.get("description", "")
-#     if title is not None:
-#         prompts = generate_keywords_prompts(title, description, max_kw_refs)
-#         gpt_response, usage = get_responses(prompts, model)
-#         keywords = extract_keywords(gpt_response)
-#         return keywords, usage
-#     else:
-#         raise ValueError("`input_dict` must include the key 'title'.")
 
-def keywords_generation(input_dict):
+def keywords_generation(input_dict, default_keywords=None):
+    '''
+    Input:
+        input_dict: a dictionary containing the title of a paper.
+        default_keywords: if anything went wrong, return this keywords.
+
+    Output:
+        a dictionary including all keywords and their importance score.
+
+    Input example: {"title": "The title of a Machine Learning Paper"}
+    Output Example: {"machine learning": 5, "reinforcement learning": 2}
+    '''
     title = input_dict.get("title")
-    max_attempts = 10
     attempts_count = 0
-    while attempts_count < max_attempts:
+    while (attempts_count < MAX_ATTEMPTS) and (title is not None):
         try:
             keywords, usage= get_gpt_responses(KEYWORDS_SYSTEM.format(min_refs_num=1, max_refs_num=10), title,
                                                model="gpt-3.5-turbo", temperature=0.4)
@@ -97,10 +91,16 @@ def keywords_generation(input_dict):
             return output.keys(), usage
         except json.decoder.JSONDecodeError:
             attempts_count += 1
-            time.sleep(20)
-    raise RuntimeError("Fail to generate keywords.")
+            time.sleep(10)
+    # Default references
+    print("Error: Keywords generation has failed. Return the default keywords.")
+    if default_keywords is None or isinstance(default_keywords, dict):
+        return {"machine learning": 10}
+    else:
+        return default_keywords
 
 def figures_generation(paper, save_to_path, model):
+    # todo: this function is not complete.
     prompts = generate_experiments_prompts(paper)
     gpt_response, usage = get_responses(prompts, model)
     list_of_methods = list(extract_json(gpt_response))
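
With this change, keywords_generation no longer raises RuntimeError once its retries are exhausted; it degrades to a default keyword dict instead. A usage sketch mirroring the call in auto_backgrounds.py (assumes an OpenAI key is configured):

from section_generator import keywords_generation

# on success this returns (keywords, usage); after MAX_ATTEMPTS failed JSON
# parses it instead returns the fallback {"machine learning": 10}
# (or a caller-supplied default_keywords dict) as a single value
keywords, usage = keywords_generation({"title": "Playing Atari with Deep Reinforcement Learning"})
print(list(keywords))

Note that the fallback path returns one value while the success path returns a pair, so the two-value unpacking above only holds when generation succeeds.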
utils/references.py CHANGED
@@ -24,6 +24,9 @@ import bibtexparser
 import random
 from scholarly import scholarly
 from scholarly import ProxyGenerator
+import tiktoken
+import itertools, uuid, json
+from gradio_client import Client
 
 
 ######################################################################################################################
@@ -86,6 +89,16 @@ def load_papers_from_bibtex(bib_file_path):
         bib_papers.append(result)
     return bib_papers
 
+
+
+# `tokenizer`: used to count how many tokens
+tokenizer_name = tiktoken.encoding_for_model('gpt-4')
+tokenizer = tiktoken.get_encoding(tokenizer_name.name)
+
+def tiktoken_len(text):
+    # evaluate how many tokens for the given text
+    tokens = tokenizer.encode(text, disallowed_special=())
+    return len(tokens)
 ######################################################################################################################
 # Semantic Scholar (SS) API
 ######################################################################################################################
@@ -209,10 +222,10 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
 ######################################################################################################################
 
 class References:
-    def __init__(self, title, load_papers):
+    def __init__(self, title, load_papers=None, keyword="customized_refs"):
         if load_papers is not None:
             self.papers = {}
-            self.papers["customized_refs"] = load_papers_from_bibtex(load_papers)
+            self.papers[keyword] = load_papers_from_bibtex(load_papers)
         else:
             self.papers = {}
         self.title = title
@@ -228,15 +241,23 @@ class References:
 
     def collect_papers(self, keywords_dict, tldr=False):
         """
+        Collect as many papers as possible
+
         keywords_dict:
             {"machine learning": 5, "language model": 2};
             the first is the keyword, the second is how many references are needed.
         """
-        for key, counts in keywords_dict.items():
-            self.papers[key] = _collect_papers_ss(key, counts, tldr)
+        keywords = list(keywords_dict)
+        comb_keywords = list(itertools.combinations(keywords, 2))
+        for comb_keyword in comb_keywords:
+            keywords.append(" ".join(comb_keyword))
+        for key in keywords:
+            self.papers[key] = _collect_papers_ss(key, 10, tldr)
+        # for key, counts in keywords_dict.items():
+        #     self.papers[key] = _collect_papers_ss(key, counts, tldr)
 
 
-    def to_bibtex(self, path_to_bibtex="ref.bib", max_num_refs=50):
+    def to_bibtex(self, path_to_bibtex="ref.bib"):
         """
         Turn the saved paper list into bibtex file "ref.bib". Return a list of all `paper_id`.
         """
@@ -244,8 +265,6 @@ class References:
         # use embeddings to evaluate; keep top k relevant references in papers
        # send (title, .bib file) to evaluate embeddings; recieve truncated papers
        papers = self._get_papers(keyword = "_all")
-        random.shuffle(papers)
-        papers = papers[:max_num_refs]
 
        # clear the bibtex file
        with open(path_to_bibtex, "w", encoding="utf-8") as file:
@@ -283,14 +302,42 @@ class References:
            papers = self.papers["keyword"]
        return papers
 
-    def to_prompts(self, keyword = "_all"):
+    def to_prompts(self, keyword = "_all", max_tokens = 2048):
        # `prompts`:
        # {"paper1_bibtex_id": "paper_1_abstract", "paper2_bibtex_id": "paper2_abstract"}
        # this will be used to instruct GPT model to cite the correct bibtex entry.
-        papers = self._get_papers(keyword)
+
+        # two steps:
+        #   1. Sort everything from most relevant to less relevant
+        #   2. Add paper to prompts until max_tokens
+        json_path = str(uuid.uuid1()) + ".json"
+        papers_json = self.to_json()
+        with open(json_path, "w") as f:
+            json.dump(papers_json, f)
+
+        try:
+            title = self.title
+            client = Client("https://shaocongma-evaluate-specter-embeddings.hf.space/")
+            result = client.predict(
+                title,  # str in 'Title' Textbox component
+                json_path,  # str (filepath or URL to file) in 'Papers JSON (as string)' File component
+                50,  # int | float (numeric value between 1 and 50) in 'Top-k Relevant Papers' Slider component
+                api_name="/get_k_relevant_papers"
+            )
+            with open(result) as f:
+                result = json.load(f)
+            result = [item for key, item in result.items()]
+        except Exception as e:
+            print(f"Error occurs during calling external API: {e}\n")
+            print("Use default method instead!")
+            result = self._get_papers(keyword)
        prompts = {}
-        for paper in papers:
+        tokens = 0
+        for paper in result:
            prompts[paper["paper_id"]] = paper["abstract"]
+            tokens += tiktoken_len(paper["abstract"])
+            if tokens >= max_tokens:
+                break
        return prompts
 
    def to_json(self, keyword = "_all"):
@@ -304,39 +351,44 @@
 
 if __name__ == "__main__":
    # testing search results
+    print("================Testing `ss_search`================")
    r = ss_search("Deep Q-Networks", limit=1)  # a list of raw papers
    if r['total'] > 0:
        paper = r['data'][0]
        # print(paper)
 
    # resting References
-    refs = References()
-    # keywords_dict = {
-    #     "Deep Q-Networks": 5,
-    #     "Actor-Critic Algorithms": 4,
-    #     "Exploration-Exploitation Trade-off": 3
-    # }
-    # refs.collect_papers(keywords_dict, tldr=True)
-    # for k in refs.papers:
-    #     papers = refs.papers[k]  # for each keyword, there is a list of papers
-    #     print("keyword: ", k)
-    #     for paper in papers:
-    #         print(paper["paper_id"])
-    #
-    # refs.to_bibtex()
-    # papers_json = refs.to_json()  # this json can be used to find the most relevant papers
-    # with open("papers.json", "w", encoding='utf-8') as text_file:
-    #     text_file.write(f"{papers_json}")
-    #
-    # prompts = refs.to_prompts()
-    # print(prompts)
+    print("================Testing `References`================")
+    refs = References(title="Super Deep Q-Networks")
+    keywords_dict = {
+        "Deep Q-Networks": 5,
+        "Actor-Critic Algorithms": 4,
+        "Exploration-Exploitation Trade-off": 3
+    }
+    print("================Testing `References.collect_papers`================")
+    refs.collect_papers(keywords_dict, tldr=True)
+    for k in refs.papers:
+        papers = refs.papers[k]  # for each keyword, there is a list of papers
+        print("keyword: ", k)
+        for paper in papers:
+            print(paper["paper_id"])
 
-    bib = "test.bib"
-    refs.load_papers(bib, "variance-reduction rl")
-    print(refs.papers)
+    print("================Testing `References.to_bibtex`================")
+    refs.to_bibtex()
 
+    print("================Testing `References.to_json`================")
+    papers_json = refs.to_json()  # this json can be used to find the most relevant papers
+    with open("papers.json", "w", encoding='utf-8') as text_file:
+        text_file.write(f"{papers_json}")
+
+    print("================Testing `References.to_prompts`================")
    prompts = refs.to_prompts()
-    for k in prompts:
-        print(f"{k}: {prompts[k]}\n")
-    # for paper in papers:
-    #     print(paper)
+    print(prompts)
+
+    # bib = "test.bib"
+    # refs.load_papers(bib, "variance-reduction rl")
+    # print(refs.papers)
+    #
+    # prompts = refs.to_prompts()
+    # for k in prompts:
+    #     print(f"{k}: {prompts[k]}\n")
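
The token budget in the new to_prompts is driven by the tiktoken_len helper added above; in isolation, the counting-and-cutoff logic reduces to the following (a self-contained sketch with hypothetical paper ids and abstracts):

import tiktoken

# same construction as in utils/references.py: count tokens with the gpt-4 encoding
tokenizer_name = tiktoken.encoding_for_model("gpt-4")
tokenizer = tiktoken.get_encoding(tokenizer_name.name)

def tiktoken_len(text):
    return len(tokenizer.encode(text, disallowed_special=()))

# papers are assumed to arrive sorted from most to least relevant
abstracts = {"paper_a": "We present a deep RL agent ...", "paper_b": "A survey of value-based methods ..."}  # hypothetical
max_tokens, tokens, prompts = 2048, 0, {}
for paper_id, abstract in abstracts.items():
    prompts[paper_id] = abstract
    tokens += tiktoken_len(abstract)
    if tokens >= max_tokens:
        break  # stop adding abstracts once the budget is spent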