Spaces · shaocongma committed · commit 2dc9347 · 1 parent: 6eb659a

use most relevant papers as references.

Files changed:
- .gitignore              +1  -0
- api_wrapper.py          +29 -0
- app.py                  +63 -29
- auto_backgrounds.py     +21 -9
- references_generator.py +8  -4
- requirements.txt        +0  -0 (binary)
- section_generator.py    +28 -28
- utils/references.py     +89 -37
.gitignore (ADDED)
@@ -0,0 +1 @@
+/.idea/
api_wrapper.py (ADDED)
@@ -0,0 +1,29 @@
+'''
+This script is used to wrap all generation methods together.
+
+todo:
+A worker keeps running on the server and monitors Amazon SQS. Once it receives a new message, it does the following:
+    Download the corresponding configuration file from S3.
+    Change the task status from Pending to Running.
+    Call `generator_wrapper` and wait for the outputs.
+    If `generator_wrapper` returns results:
+        evaluate the results; compile them; upload the results to S3; change the task status from Running to Completed.
+        If anything goes wrong, raise an error.
+    If `generator_wrapper` returns nothing, times out, or raises any error:
+        change the task status from Running to Failed.
+'''
+
+from auto_backgrounds import generate_draft
+import json
+
+
+GENERATOR_MAPPING = {"draft": generate_draft}
+
+
+def generator_wrapper(path_to_config_json):
+    # Read configuration file and call corresponding function
+    with open(path_to_config_json, "r", encoding='utf-8') as f:
+        config = json.load(f)
+
+    generator = GENERATOR_MAPPING.get(config["generator"])
+    if generator is None:
+        pass
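The todo above outlines a queue-driven worker. For illustration, a minimal sketch of that loop using boto3; QUEUE_URL, BUCKET, and the message fields are hypothetical, and the Pending/Running/Completed/Failed status store is left as comments since it is not part of this commit:

    import json
    import boto3
    from api_wrapper import generator_wrapper

    sqs = boto3.client("sqs")
    s3 = boto3.client("s3")
    QUEUE_URL = "https://sqs.us-east-1.amazonaws.com/123456789012/auto-draft"  # hypothetical
    BUCKET = "auto-draft-tasks"  # hypothetical

    while True:
        # Long-poll the queue for one task message at a time.
        resp = sqs.receive_message(QueueUrl=QUEUE_URL, MaxNumberOfMessages=1, WaitTimeSeconds=20)
        for msg in resp.get("Messages", []):
            task = json.loads(msg["Body"])  # hypothetical message schema
            config_path = task["config_key"]
            s3.download_file(BUCKET, task["config_key"], config_path)
            # ... mark the task Running ...
            try:
                generator_wrapper(config_path)
                # ... evaluate, compile, upload results to S3, mark Completed ...
            except Exception:
                pass  # ... mark the task Failed ...
            sqs.delete_message(QueueUrl=QUEUE_URL, ReceiptHandle=msg["ReceiptHandle"])

Note that `generator_wrapper` itself is still a stub at this commit: it silently does nothing when the configured generator name is unknown.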
app.py (CHANGED)
@@ -119,12 +119,31 @@ ACADEMIC_PAPER = """## 一键生成论文初稿
 
 
 REFERENCES = """## 一键搜索相关论文
-
+(此功能已经被整合进一键生成论文初稿)
 1. 在Title文本框中输入想要搜索文献的论文(比如Playing Atari with Deep Reinforcement Learning).
 2. 点击Submit. 等待大概十分钟.
 3. 在右侧JSON处会显示相关文献.
 """
 
+REFERENCES_INSTRUCTION = """### References
+这一行用于定义AI如何选取参考文献. 目前是两种方式混合:
+1. GPT自动根据标题生成关键字,使用Semantic Scholar搜索引擎搜索文献,利用Specter获取Paper Embedding来自动选取最相关的文献作为GPT的参考资料.
+2. 用户上传bibtex文件,使用Google Scholar搜索摘要作为GPT的参考资料.
+关于有希望利用本地文件来供GPT参考的功能将在未来实装.
+"""
+
+DOMAIN_KNOWLEDGE_INSTRUCTION = """### Domain Knowledge
+(暂未实装)
+这一行用于定义AI的知识库. 将提供两种选择:
+1. 各个领域内由专家预先收集资料并构建的的FAISS向量数据库. 每个数据库内包含了数百万页经过同行评议的论文和专业经典书籍.
+2. 自行构建的使用OpenAI text-embedding-ada-002模型创建的FAISS向量数据库.
+"""
+
+OTHERS_INSTRUCTION = """### Others
+
+"""
+
+
 with gr.Blocks(theme=theme) as demo:
     gr.Markdown('''
     # Auto-Draft: 文献整理辅助工具
@@ -152,20 +171,40 @@ with gr.Blocks(theme=theme) as demo:
         title = gr.Textbox(value="Playing Atari with Deep Reinforcement Learning", lines=1, max_lines=1,
                            label="Title", info="论文标题")
 
+        slider = gr.Slider(minimum=1, maximum=100, value=20, step=1,
+                           interactive=True, visible=False, label="最大参考文献数目")
         with gr.Accordion("高级设置", open=False):
+            with gr.Row():
+                description_pp = gr.Textbox(lines=5, label="Description (Optional)", visible=True,
+                                            info="对希望生成的论文的一些描述. 包括这篇论文的创新点, 主要贡献, 等.")
+            with gr.Row():
+                template = gr.Dropdown(label="Template", choices=["ICLR2022"], value="ICLR2022",
+                                       interactive=False,
+                                       info="生成论文的参考模板. (暂不支持修改)")
+                model_selection = gr.Dropdown(label="Model", choices=["gpt-4", "gpt-3.5-turbo"],
+                                              value="gpt-3.5-turbo",
+                                              interactive=True,
+                                              info="生成论文用到的语言模型.")
+                sections = gr.CheckboxGroup(
+                    choices=["introduction", "related works", "backgrounds", "methodology", "experiments",
+                             "conclusion", "abstract"],
+                    type="value", label="生成章节", interactive=True,
+                    value=["introduction", "related works"])
 
         with gr.Row():
-            with gr.Column():
+            with gr.Column(scale=1):
+                gr.Markdown(REFERENCES_INSTRUCTION)
+
+            with gr.Column(scale=2):
+                search_engine = gr.Dropdown(label="Search Engine",
+                                            choices=["ArXiv", "Semantic Scholar", "Google Scholar", "None"],
+                                            value="Semantic Scholar",
+                                            interactive=False,
+                                            visible=False,
+                                            info="用于决定GPT用什么搜索引擎来搜索文献. (暂不支持修改)")
+                tldr_checkbox = gr.Checkbox(value=True, label="TLDR;",
+                                            info="选择此筐表示将使用Semantic Scholar的TLDR作为文献的总结.",
+                                            interactive=True)
                 gr.Markdown('''
                 上传.bib文件提供AI需要参考的文献.
                 ''')
@@ -175,28 +214,23 @@ with gr.Blocks(theme=theme) as demo:
                     examples=["latex_templates/example_references.bib"],
                     inputs=bibtex_file
                 )
-            with gr.Column():
-                search_engine = gr.Dropdown(label="Search Engine",
-                                            choices=["ArXiv", "Semantic Scholar", "Google Scholar", "None"],
-                                            value="Semantic Scholar",
-                                            interactive=False,
-                                            info="用于决定GPT-4用什么搜索引擎来搜索文献. (暂不支持修改)")
-                tldr_checkbox = gr.Checkbox(value=True, label="TLDR;",
-                                            info="选择此筐表示将使用Semantic Scholar的TLDR作为文献的总结.",
-                                            interactive=True)
-                sections = gr.CheckboxGroup(
-                    choices=["introduction", "related works", "backgrounds", "methodology", "experiments",
-                             "conclusion", "abstract"],
-                    type="value", label="生成章节", interactive=True,
-                    value=["introduction", "related works"])
-                slider = gr.Slider(minimum=1, maximum=100, value=20, step=1,
-                                   interactive=True, label="最大参考文献数目")
 
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown(DOMAIN_KNOWLEDGE_INSTRUCTION)
+
+            with gr.Column(scale=2):
+                domain_knowledge = gr.Dropdown(label="预载知识库",
+                                               choices=["(None)", "Machine Learning"],
+                                               value="(None)",
+                                               interactive=False,
+                                               info="使用预先构建的知识库. (暂未实装)")
+                local_domain_knowledge = gr.File(label="本地知识库 (暂未实装)", interactive=False)
        with gr.Row():
            clear_button_pp = gr.Button("Clear")
            submit_button_pp = gr.Button("Submit", variant="primary")
 
    with gr.Tab("文献搜索"):
        gr.Markdown(REFERENCES)
 
        title_refs = gr.Textbox(value="Playing Atari with Deep Reinforcement Learning", lines=1, max_lines=1,
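Several of the controls added here are declared but hidden or locked (`visible=False`, `interactive=False`) so the layout can ship ahead of the feature. A minimal standalone sketch of that pattern (a hypothetical demo, not code from this repo):

    import gradio as gr

    with gr.Blocks() as demo:
        title = gr.Textbox(label="Title")
        # Declared now, revealed later by flipping visible=True once supported.
        slider = gr.Slider(minimum=1, maximum=100, value=20, step=1,
                           visible=False, label="最大参考文献数目")
        # Shown but locked until alternative engines are implemented.
        engine = gr.Dropdown(choices=["Semantic Scholar"], value="Semantic Scholar",
                             interactive=False, label="Search Engine")

    if __name__ == "__main__":
        demo.launch()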
auto_backgrounds.py (CHANGED)
@@ -1,12 +1,13 @@
 import os.path
-import json
 from utils.references import References
 from utils.file_operations import hash_name, make_archive, copy_templates
 from utils.tex_processing import create_copies
 from section_generator import section_generation_bg, keywords_generation, figures_generation, section_generation
+from references_generator import generate_top_k_references
 import logging
 import time
 
+
 TOTAL_TOKENS = 0
 TOTAL_PROMPTS_TOKENS = 0
 TOTAL_COMPLETION_TOKENS = 0
@@ -32,7 +33,7 @@ def log_usage(usage, generating_target, print_out=True):
     logging.info(message)
 
 def _generation_setup(title, description="", template="ICLR2022", tldr=False,
-                      max_kw_refs=10, max_num_refs=50, bib_refs=None):
+                      max_kw_refs=10, max_num_refs=50, bib_refs=None, max_tokens=2048):
     """
     This function handles the setup process for paper generation; it contains three folds
     1. Copy the template to the outputs folder. Create the log file `generation.log`
@@ -54,7 +55,7 @@ def _generation_setup(title, description="", template="ICLR2022", tldr=False,
     - destination_folder (str): The path to the destination folder where the generation log is saved.
     - all_paper_ids (list): A list of all paper IDs collected for the references.
     """
-    print("Generation setup...")
+    # print("Generation setup...")
     paper = {}
     paper_body = {}
@@ -63,9 +64,8 @@ def _generation_setup(title, description="", template="ICLR2022", tldr=False,
     logging.basicConfig(level=logging.INFO, filename=os.path.join(destination_folder, "generation.log"))
 
     # Generate keywords and references
-    print("Initialize the paper information ...")
+    # print("Initialize the paper information ...")
     input_dict = {"title": title, "description": description}
-    # keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo", max_kw_refs=max_kw_refs)
     keywords, usage = keywords_generation(input_dict)
     log_usage(usage, "keywords")
@@ -75,13 +75,13 @@ def _generation_setup(title, description="", template="ICLR2022", tldr=False,
 
     ref = References(title, bib_refs)
     ref.collect_papers(keywords, tldr=tldr)
-    all_paper_ids = ref.to_bibtex(bibtex_path
+    all_paper_ids = ref.to_bibtex(bibtex_path)
 
     print(f"The paper information has been initialized. References are saved to {bibtex_path}.")
 
     paper["title"] = title
     paper["description"] = description
-    paper["references"] = ref.to_prompts()
+    paper["references"] = ref.to_prompts(max_tokens=max_tokens)
     paper["body"] = paper_body
     paper["bibtex"] = bibtex_path
     return paper, destination_folder, all_paper_ids  # todo: use `all_paper_ids` to check if all citations are in this list
@@ -107,15 +107,20 @@ def generate_backgrounds(title, description="", template="ICLR2022", model="gpt-
     return make_archive(destination_folder, filename)
 
 
+
 def generate_draft(title, description="", template="ICLR2022",
                    tldr=True, max_kw_refs=10, max_num_refs=30, sections=None, bib_refs=None, model="gpt-4"):
     # pre-processing `sections` parameter;
+    print("================PRE-PROCESSING================")
     if sections is None:
         sections = ["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"]
 
     # todo: add more parameters; select which section to generate; select maximum refs.
     paper, destination_folder, _ = _generation_setup(title, description, template, tldr, max_kw_refs, max_num_refs, bib_refs)
+
+    # main components
     for section in sections:
+        print(f"================Generate {section}================")
         max_attempts = 4
         attempts_count = 0
         while attempts_count < max_attempts:
@@ -124,12 +129,14 @@ def generate_draft(title, description="", template="ICLR2022",
             log_usage(usage, section)
             break
         except Exception as e:
-            message = f"Failed to generate {section}. {type(e).__name__} was raised: {e}"
+            message = f"Failed to generate {section}. {type(e).__name__} was raised: {e}\n"
             print(message)
             logging.info(message)
             attempts_count += 1
-            time.sleep(
+            time.sleep(15)
+
     # post-processing
+    print("================POST-PROCESSING================")
     create_copies(destination_folder)
     input_dict = {"title": title, "description": description, "generator": "generate_draft"}
     filename = hash_name(input_dict) + ".zip"
@@ -137,6 +144,11 @@ def generate_draft(title, description="", template="ICLR2022",
     return make_archive(destination_folder, filename)
 
 
+
+
+
+
+
 if __name__ == "__main__":
     import openai
     openai.api_key = os.getenv("OPENAI_API_KEY")
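For reference, a hypothetical end-to-end invocation of `generate_draft` as it stands after this commit (assumes OPENAI_API_KEY is set and the bundled ICLR2022 template is present; names are illustrative):

    import os
    import openai
    from auto_backgrounds import generate_draft

    openai.api_key = os.getenv("OPENAI_API_KEY")
    zip_path = generate_draft(
        "Playing Atari with Deep Reinforcement Learning",
        description="",
        sections=["introduction", "related works"],  # generate a subset of sections
        model="gpt-3.5-turbo",
    )
    print(zip_path)  # path to the zipped LaTeX project produced by make_archive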
references_generator.py (CHANGED)
@@ -5,8 +5,10 @@ from section_generator import section_generation_bg, keywords_generation, figure
 import itertools
 from gradio_client import Client
 
+
 def generate_raw_references(title, description="",
-                            bib_refs=None, tldr=False, max_kw_refs=10,
+                            bib_refs=None, tldr=False, max_kw_refs=10,
+                            save_to="ref.bib"):
     # load pre-provided references
     ref = References(title, bib_refs)
@@ -21,16 +23,17 @@ def generate_raw_references(title, description="",
     print(f"keywords: {keywords}\n\n")
 
     ref.collect_papers(keywords, tldr=tldr)
-    paper_json = ref.to_json()
+    # paper_json = ref.to_json()
 
     with open(save_to, "w") as f:
         json.dump(paper_json, f)
 
-    return save_to, paper_json
+    return save_to, ref  # paper_json
 
 def generate_top_k_references(title, description="",
                               bib_refs=None, tldr=False, max_kw_refs=10, save_to="ref.bib", top_k=5):
-    json_path,
+    json_path, ref_raw = generate_raw_references(title, description, bib_refs, tldr, max_kw_refs, save_to)
+    json_content = ref_raw.to_json()
 
     client = Client("https://shaocongma-evaluate-specter-embeddings.hf.space/")
     result = client.predict(
@@ -43,6 +46,7 @@ def generate_top_k_references(title, description="",
     result = json.load(f)
     return result
 
+
 if __name__ == "__main__":
     import openai
     openai.api_key = os.getenv("OPENAI_API_KEY")
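Note that, as committed, `generate_raw_references` comments out `paper_json` yet still passes it to `json.dump`, so the end-to-end wrapper would raise a NameError. The external ranking call it builds on can still be exercised directly; a sketch, with the title and file names as placeholders (the argument order and `api_name` follow the calls in this commit, while the shape of the returned JSON is an assumption):

    import json
    from gradio_client import Client

    client = Client("https://shaocongma-evaluate-specter-embeddings.hf.space/")
    result_path = client.predict(
        "Playing Atari with Deep Reinforcement Learning",  # 'Title' Textbox
        "papers.json",  # papers JSON written by References.to_json()
        5,              # top-k relevant papers (the slider accepts 1-50)
        api_name="/get_k_relevant_papers",
    )
    with open(result_path) as f:
        top_papers = json.load(f)  # assumed: dict keyed by paper id
    print(list(top_papers))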
requirements.txt (CHANGED)
Binary files a/requirements.txt and b/requirements.txt differ
section_generator.py (CHANGED)
@@ -3,7 +3,7 @@ from utils.gpt_interaction import get_responses, extract_responses, extract_keyw
 from utils.figures import generate_random_figures
 import time
 import os
-from utils.prompts import KEYWORDS_SYSTEM
+from utils.prompts import KEYWORDS_SYSTEM, SECTION_GENERATION_SYSTEM
 from utils.gpt_interaction import get_gpt_responses
 import json
 
@@ -13,6 +13,7 @@ import json
 # 3. figure_generation: used to generate sample figures.
 # all generator should return the token usage.
 
+MAX_ATTEMPTS = 6
 
 def section_generation_bg(paper, section, save_to_path, model):
     """
@@ -47,7 +48,7 @@ def section_generation_bg(paper, section, save_to_path, model):
     return usage
 
 
-def section_generation(paper, section, save_to_path, model):
+def section_generation(paper, section, save_to_path, model, research_field="machine learning"):
     """
     The main pipeline of generating a section.
     1. Generate prompts.
@@ -56,39 +57,32 @@ def section_generation(paper, section, save_to_path, model):
     4. Save the text to .tex file.
     :return usage
     """
-    print(f"Generating {section}...")
     prompts = generate_paper_prompts(paper, section)
+    output, usage= get_gpt_responses(SECTION_GENERATION_SYSTEM.format(research_field=research_field), prompts,
+                                     model=model, temperature=0.4)
     paper["body"][section] = output
     tex_file = os.path.join(save_to_path, f"{section}.tex")
-    with open(tex_file, "w") as f:
-        f.write(output)
-    else:
-        with open(tex_file, "w") as f:
-            f.write(output)
+    with open(tex_file, "w") as f:
+        f.write(output)
     time.sleep(5)
-    print(f"{section} has been generated. Saved to {tex_file}.")
     return usage
 
-# def keywords_generation(input_dict, model, max_kw_refs = 10):
-#     title = input_dict.get("title")
-#     description = input_dict.get("description", "")
-#     if title is not None:
-#         prompts = generate_keywords_prompts(title, description, max_kw_refs)
-#         gpt_response, usage = get_responses(prompts, model)
-#         keywords = extract_keywords(gpt_response)
-#         return keywords, usage
-#     else:
-#         raise ValueError("`input_dict` must include the key 'title'.")
 
-def keywords_generation(input_dict):
+def keywords_generation(input_dict, default_keywords=None):
+    '''
+    Input:
+        input_dict: a dictionary containing the title of a paper.
+        default_keywords: if anything went wrong, return this keywords.
+
+    Output:
+        a dictionary including all keywords and their importance score.
+
+    Input example: {"title": "The title of a Machine Learning Paper"}
+    Output Example: {"machine learning": 5, "reinforcement learning": 2}
+    '''
     title = input_dict.get("title")
-    max_attempts = 10
     attempts_count = 0
-    while attempts_count < max_attempts:
+    while (attempts_count < MAX_ATTEMPTS) and (title is not None):
         try:
             keywords, usage= get_gpt_responses(KEYWORDS_SYSTEM.format(min_refs_num=1, max_refs_num=10), title,
                                                model="gpt-3.5-turbo", temperature=0.4)
@@ -97,10 +91,16 @@ def keywords_generation(input_dict):
             return output.keys(), usage
         except json.decoder.JSONDecodeError:
             attempts_count += 1
-            time.sleep(
+            time.sleep(10)
+    # Default references
+    print("Error: Keywords generation has failed. Return the default keywords.")
+    if default_keywords is None or isinstance(default_keywords, dict):
+        return {"machine learning": 10}
+    else:
+        return default_keywords
 
 def figures_generation(paper, save_to_path, model):
+    # todo: this function is not complete.
     prompts = generate_experiments_prompts(paper)
     gpt_response, usage = get_responses(prompts, model)
     list_of_methods = list(extract_json(gpt_response))
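`keywords_generation` above retries because the model is asked for a JSON dict of keywords and malformed output is common. A minimal standalone sketch of the same retry-then-fallback pattern (`call_model` is a hypothetical stand-in for `get_gpt_responses`):

    import json
    import time

    MAX_ATTEMPTS = 6

    def json_with_retries(call_model, prompt, default=None, wait=10):
        # Retry until the response parses as JSON; fall back to a default.
        for _ in range(MAX_ATTEMPTS):
            try:
                response, usage = call_model(prompt)
                return json.loads(response), usage
            except json.decoder.JSONDecodeError:
                time.sleep(wait)
        return (default if default is not None else {"machine learning": 10}), None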
utils/references.py (CHANGED)
@@ -24,6 +24,9 @@ import bibtexparser
 import random
 from scholarly import scholarly
 from scholarly import ProxyGenerator
+import tiktoken
+import itertools, uuid, json
+from gradio_client import Client
 
 
 ######################################################################################################################
@@ -86,6 +89,16 @@ def load_papers_from_bibtex(bib_file_path):
         bib_papers.append(result)
     return bib_papers
 
+
+
+# `tokenizer`: used to count how many tokens
+tokenizer_name = tiktoken.encoding_for_model('gpt-4')
+tokenizer = tiktoken.get_encoding(tokenizer_name.name)
+
+def tiktoken_len(text):
+    # evaluate how many tokens for the given text
+    tokens = tokenizer.encode(text, disallowed_special=())
+    return len(tokens)
 ######################################################################################################################
 # Semantic Scholar (SS) API
 ######################################################################################################################
@@ -209,10 +222,10 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
 ######################################################################################################################
 
 class References:
-    def __init__(self, title, load_papers):
+    def __init__(self, title, load_papers=None, keyword="customized_refs"):
         if load_papers is not None:
             self.papers = {}
-            self.papers[
+            self.papers[keyword] = load_papers_from_bibtex(load_papers)
         else:
             self.papers = {}
         self.title = title
@@ -228,15 +241,23 @@ class References:
 
     def collect_papers(self, keywords_dict, tldr=False):
         """
+        Collect as many papers as possible
+
         keywords_dict:
             {"machine learning": 5, "language model": 2};
             the first is the keyword, the second is how many references are needed.
         """
+        keywords = list(keywords_dict)
+        comb_keywords = list(itertools.combinations(keywords, 2))
+        for comb_keyword in comb_keywords:
+            keywords.append(" ".join(comb_keyword))
+        for key in keywords:
+            self.papers[key] = _collect_papers_ss(key, 10, tldr)
+        # for key, counts in keywords_dict.items():
+        #     self.papers[key] = _collect_papers_ss(key, counts, tldr)
 
 
-    def to_bibtex(self, path_to_bibtex="ref.bib"
+    def to_bibtex(self, path_to_bibtex="ref.bib"):
         """
         Turn the saved paper list into bibtex file "ref.bib". Return a list of all `paper_id`.
         """
@@ -244,8 +265,6 @@ class References:
         # use embeddings to evaluate; keep top k relevant references in papers
         # send (title, .bib file) to evaluate embeddings; recieve truncated papers
         papers = self._get_papers(keyword = "_all")
-        random.shuffle(papers)
-        papers = papers[:max_num_refs]
 
         # clear the bibtex file
         with open(path_to_bibtex, "w", encoding="utf-8") as file:
@@ -283,14 +302,42 @@ class References:
             papers = self.papers["keyword"]
         return papers
 
-    def to_prompts(self, keyword = "_all"):
+    def to_prompts(self, keyword = "_all", max_tokens = 2048):
         # `prompts`:
         # {"paper1_bibtex_id": "paper_1_abstract", "paper2_bibtex_id": "paper2_abstract"}
         # this will be used to instruct GPT model to cite the correct bibtex entry.
+
+        # two steps:
+        #     1. Sort everything from most relevant to less relevant
+        #     2. Add paper to prompts until max_tokens
+        json_path = str(uuid.uuid1()) + ".json"
+        papers_json = self.to_json()
+        with open(json_path, "w") as f:
+            json.dump(papers_json, f)
+
+        try:
+            title = self.title
+            client = Client("https://shaocongma-evaluate-specter-embeddings.hf.space/")
+            result = client.predict(
+                title,      # str in 'Title' Textbox component
+                json_path,  # str (filepath or URL to file) in 'Papers JSON (as string)' File component
+                50,         # int | float (numeric value between 1 and 50) in 'Top-k Relevant Papers' Slider component
+                api_name="/get_k_relevant_papers"
+            )
+            with open(result) as f:
+                result = json.load(f)
+            result = [item for key, item in result.items()]
+        except Exception as e:
+            print(f"Error occurs during calling external API: {e}\n")
+            print("Use default method instead!")
+            result = self._get_papers(keyword)
         prompts = {}
+        tokens = 0
+        for paper in result:
             prompts[paper["paper_id"]] = paper["abstract"]
+            tokens += tiktoken_len(paper["abstract"])
+            if tokens >= max_tokens:
+                break
         return prompts
 
     def to_json(self, keyword = "_all"):
@@ -304,39 +351,44 @@ class References:
 
 if __name__ == "__main__":
     # testing search results
+    print("================Testing `ss_search`================")
     r = ss_search("Deep Q-Networks", limit=1)  # a list of raw papers
     if r['total'] > 0:
         paper = r['data'][0]
         # print(paper)
 
     # resting References
-    # papers_json = refs.to_json() # this json can be used to find the most relevant papers
-    # with open("papers.json", "w", encoding='utf-8') as text_file:
-    #     text_file.write(f"{papers_json}")
-    #
-    # prompts = refs.to_prompts()
-    # print(prompts)
-
-    refs.
-    print(refs.papers)
+    print("================Testing `References`================")
+    refs = References(title="Super Deep Q-Networks")
+    keywords_dict = {
+        "Deep Q-Networks": 5,
+        "Actor-Critic Algorithms": 4,
+        "Exploration-Exploitation Trade-off": 3
+    }
+    print("================Testing `References.collect_papers`================")
+    refs.collect_papers(keywords_dict, tldr=True)
+    for k in refs.papers:
+        papers = refs.papers[k]  # for each keyword, there is a list of papers
+        print("keyword: ", k)
+        for paper in papers:
+            print(paper["paper_id"])
+
+    print("================Testing `References.to_bibtex`================")
+    refs.to_bibtex()
+
+    print("================Testing `References.to_json`================")
+    papers_json = refs.to_json()  # this json can be used to find the most relevant papers
+    with open("papers.json", "w", encoding='utf-8') as text_file:
+        text_file.write(f"{papers_json}")
+
+    print("================Testing `References.to_prompts`================")
     prompts = refs.to_prompts()
-    #
-    #
+    print(prompts)
+
+    # bib = "test.bib"
+    # refs.load_papers(bib, "variance-reduction rl")
+    # print(refs.papers)
+    #
+    # prompts = refs.to_prompts()
+    # for k in prompts:
+    #     print(f"{k}: {prompts[k]}\n")