# auto-draft/utils/references.py
# Generate references:
# 1. Select the most relevant references from the "references" dataset or the arXiv search engine.
# 2. Generate BibTeX entries from the selected papers.  --> to_bibtex()
# 3. Generate prompts from the selected papers:         --> to_prompts()
#    {"paper_id": "paper summary"}
# A usage sketch is included at the bottom of this file.
import requests
import re
def _collect_papers_arxiv(keyword, counts=3):
    #
    # The following code retrieves the most related papers from the arXiv API.
    #
    # Build the arXiv API query URL with the given keyword and other parameters.
    def build_query_url(keyword, results_limit=3, sort_by="relevance", sort_order="descending"):
        base_url = "http://export.arxiv.org/api/query?"
        query = f"search_query=all:{keyword}&start=0&max_results={results_limit}"
        query += f"&sortBy={sort_by}&sortOrder={sort_order}"
        return base_url + query
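    # Illustrative example: for the keyword "diffusion model" and the defaults above,
    # build_query_url would return
    #   http://export.arxiv.org/api/query?search_query=all:diffusion model&start=0&max_results=3&sortBy=relevance&sortOrder=descending
    # (the space in the keyword is percent-encoded by `requests` when the request is sent).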
    # Fetch search results from the arXiv API using the constructed URL.
    def fetch_search_results(query_url):
        response = requests.get(query_url)
        return response.text
    # Parse the XML content of the API response to extract paper information.
    def parse_results(content):
        from xml.etree import ElementTree as ET

        root = ET.fromstring(content)
        namespace = "{http://www.w3.org/2005/Atom}"
        entries = root.findall(f"{namespace}entry")

        results = []
        for entry in entries:
            title = entry.find(f"{namespace}title").text
            link = entry.find(f"{namespace}id").text
            summary = entry.find(f"{namespace}summary").text

            # Extract the authors; join with " and " so the string is valid in a BibTeX author field.
            authors = entry.findall(f"{namespace}author")
            author_list = []
            for author in authors:
                name = author.find(f"{namespace}name").text
                author_list.append(name)
            authors_str = " and ".join(author_list)

            # Extract the year from the "published" timestamp (e.g. "2023-05-01T...").
            published = entry.find(f"{namespace}published").text
            year = published.split("-")[0]

            # Use the numeric arXiv identifier (e.g. "2305.01234") as the paper_id.
            arxiv_id = re.search(r'\d+\.\d+', link).group(0)
            journal = f"arXiv preprint arXiv:{arxiv_id}"

            result = {
                "paper_id": arxiv_id,
                "title": title,
                "link": link,
                "abstract": summary,
                "authors": authors_str,
                "year": year,
                "journal": journal
            }
            results.append(result)
        return results
    query_url = build_query_url(keyword, counts)
    content = fetch_search_results(query_url)
    results = parse_results(content)
    return results
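# Illustrative shape of one entry returned by _collect_papers_arxiv (values are placeholders,
# not real search results):
#   {"paper_id": "1234.56789", "title": "An Example Title", "link": "http://arxiv.org/abs/1234.56789",
#    "abstract": "...", "authors": "Author One and Author Two", "year": "2023",
#    "journal": "arXiv preprint arXiv:1234.56789"}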
# Each `paper` is a dictionary containing (1) paper_id (2) title (3) authors (4) year (5) link (6) abstract (7) journal
class References:
    def __init__(self, load_papers=""):
        self.papers = []
        if load_papers:
            # todo: read a json file from the given path
            # this could be used to support pre-defined references
            pass
    def collect_papers(self, keywords_dict, method="arxiv"):
        """
        keywords_dict:
            {"machine learning": 5, "language model": 2};
            the key is the keyword, the value is how many references are needed.
        """
        # match/case requires Python 3.10+.
        match method:
            case "arxiv":
                process = _collect_papers_arxiv
            case _:
                raise NotImplementedError("Other sources are not supported yet.")
        for key, counts in keywords_dict.items():
            self.papers = self.papers + process(key, counts)
    def to_bibtex(self, path_to_bibtex="ref.bib"):
        """
        Turn the saved paper list into a BibTeX file (default "ref.bib"). Return a list of all `paper_id`s.
        """
        papers = self.papers

        # Clear the BibTeX file before writing.
        with open(path_to_bibtex, "w", encoding="utf-8") as file:
            file.write("")

        bibtex_entries = []
        paper_ids = []
        for paper in papers:
            bibtex_entry = f"""@article{{{paper["paper_id"]},
  title = {{{paper["title"]}}},
  author = {{{paper["authors"]}}},
  journal = {{{paper["journal"]}}},
  year = {{{paper["year"]}}},
  url = {{{paper["link"]}}}
}}"""
            bibtex_entries.append(bibtex_entry)
            paper_ids.append(paper["paper_id"])

        # Save the generated BibTeX entries to the file.
        with open(path_to_bibtex, "a", encoding="utf-8") as file:
            for entry in bibtex_entries:
                file.write(entry)
                file.write("\n\n")
        return paper_ids
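    # Illustrative output of to_bibtex for a single paper (placeholder values):
    #   @article{1234.56789,
    #     title = {An Example Title},
    #     author = {Author One and Author Two},
    #     journal = {arXiv preprint arXiv:1234.56789},
    #     year = {2023},
    #     url = {http://arxiv.org/abs/1234.56789}
    #   }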
    def to_prompts(self):
        # `prompts`:
        #   {"paper1_bibtex_id": "paper_1_abstract", "paper2_bibtex_id": "paper2_abstract"}
        # This will be used to instruct the GPT model to cite the correct BibTeX entry.
        prompts = {}
        for paper in self.papers:
            prompts[paper["paper_id"]] = paper["abstract"]
        return prompts
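# Usage sketch (assumes network access to the arXiv API; keywords and counts are illustrative).
# Running this module directly collects a few papers, writes "ref.bib", and builds the prompt dict.
if __name__ == "__main__":
    refs = References()
    refs.collect_papers({"machine learning": 2, "language model": 1})
    paper_ids = refs.to_bibtex("ref.bib")
    prompts = refs.to_prompts()
    print(paper_ids)
    print(list(prompts.keys()))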