Richard Fan committed on
Commit
6f966a5
·
1 Parent(s): 3905261
Files changed (5) hide show
  1. action.py +142 -0
  2. app.py +186 -0
  3. download_new_papers.py +64 -0
  4. relevancy.py +174 -0
  5. utils.py +149 -0
action.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sendgrid import SendGridAPIClient
2
+ from sendgrid.helpers.mail import Mail, Email, To, Content
3
+
4
+ from datetime import date
5
+
6
+ import argparse
7
+ import yaml
8
+ import os
9
+
10
+ from relevancy import generate_relevance_score, process_subject_fields
11
+ from download_new_papers import get_papers
12
+
13
+
14
+
15
+ # Hackathon quality code. Don't judge too harshly.
16
+ # Feel free to submit pull requests to improve the code.
17
+
18
# Top-level arXiv subject areas, keyed by display name. The value is the
# abbreviation handed to get_papers(). "Physics" is deliberately empty:
# generate_body() rejects it and requires one of the physics_topics instead.
topics = {
    "Physics": "",
    "Mathematics": "math",
    "Computer Science": "cs",
    "Quantitative Biology": "q-bio",
    "Quantitative Finance": "q-fin",
    "Statistics": "stat",
    "Electrical Engineering and Systems Science": "eess",
    "Economics": "econ",
}
28
+
29
# Physics sub-areas, keyed by display name. The value is the abbreviation
# handed to get_papers() when one of these is chosen as the topic.
physics_topics = {
    "Astrophysics": "astro-ph",
    "Condensed Matter": "cond-mat",
    "General Relativity and Quantum Cosmology": "gr-qc",
    "High Energy Physics - Experiment": "hep-ex",
    "High Energy Physics - Lattice": "hep-lat",
    "High Energy Physics - Phenomenology": "hep-ph",
    "High Energy Physics - Theory": "hep-th",
    "Mathematical Physics": "math-ph",
    "Nonlinear Sciences": "nlin",
    "Nuclear Experiment": "nucl-ex",
    "Nuclear Theory": "nucl-th",
    "Physics": "physics",
    "Quantum Physics": "quant-ph",
}
44
+
45
+
46
+ # TODO: surely there's a better way
47
+ category_map = {
48
+ "Astrophysics": ["Astrophysics of Galaxies", "Cosmology and Nongalactic Astrophysics", "Earth and Planetary Astrophysics", "High Energy Astrophysical Phenomena", "Instrumentation and Methods for Astrophysics", "Solar and Stellar Astrophysics"],
49
+ "Condensed Matter": ["Disordered Systems and Neural Networks", "Materials Science", "Mesoscale and Nanoscale Physics", "Other Condensed Matter", "Quantum Gases", "Soft Condensed Matter", "Statistical Mechanics", "Strongly Correlated Electrons", "Superconductivity"],
50
+ "General Relativity and Quantum Cosmology": ["None"],
51
+ "High Energy Physics - Experiment": ["None"],
52
+ "High Energy Physics - Lattice": ["None"],
53
+ "High Energy Physics - Phenomenology": ["None"],
54
+ "High Energy Physics - Theory": ["None"],
55
+ "Mathematical Physics": ["None"],
56
+ "Nonlinear Sciences": ["Adaptation and Self-Organizing Systems", "Cellular Automata and Lattice Gases", "Chaotic Dynamics", "Exactly Solvable and Integrable Systems", "Pattern Formation and Solitons"],
57
+ "Nuclear Experiment": ["None"],
58
+ "Nuclear Theory": ["None"],
59
+ "Physics": ["Accelerator Physics", "Applied Physics", "Atmospheric and Oceanic Physics", "Atomic and Molecular Clusters", "Atomic Physics", "Biological Physics", "Chemical Physics", "Classical Physics", "Computational Physics", "Data Analysis, Statistics and Probability", "Fluid Dynamics", "General Physics", "Geophysics", "History and Philosophy of Physics", "Instrumentation and Detectors", "Medical Physics", "Optics", "Physics and Society", "Physics Education", "Plasma Physics", "Popular Physics", "Space Physics"],
60
+ "Quantum Physics": ["None"],
61
+ "Mathematics": ["Algebraic Geometry", "Algebraic Topology", "Analysis of PDEs", "Category Theory", "Classical Analysis and ODEs", "Combinatorics", "Commutative Algebra", "Complex Variables", "Differential Geometry", "Dynamical Systems", "Functional Analysis", "General Mathematics", "General Topology", "Geometric Topology", "Group Theory", "History and Overview", "Information Theory", "K-Theory and Homology", "Logic", "Mathematical Physics", "Metric Geometry", "Number Theory", "Numerical Analysis", "Operator Algebras", "Optimization and Control", "Probability", "Quantum Algebra", "Representation Theory", "Rings and Algebras", "Spectral Theory", "Statistics Theory", "Symplectic Geometry"],
62
+ "Computer Science": ["Artificial Intelligence", "Computation and Language", "Computational Complexity", "Computational Engineering, Finance, and Science", "Computational Geometry", "Computer Science and Game Theory", "Computer Vision and Pattern Recognition", "Computers and Society", "Cryptography and Security", "Data Structures and Algorithms", "Databases", "Digital Libraries", "Discrete Mathematics", "Distributed, Parallel, and Cluster Computing", "Emerging Technologies", "Formal Languages and Automata Theory", "General Literature", "Graphics", "Hardware Architecture", "Human-Computer Interaction", "Information Retrieval", "Information Theory", "Logic in Computer Science", "Machine Learning", "Mathematical Software", "Multiagent Systems", "Multimedia", "Networking and Internet Architecture", "Neural and Evolutionary Computing", "Numerical Analysis", "Operating Systems", "Other Computer Science", "Performance", "Programming Languages", "Robotics", "Social and Information Networks", "Software Engineering", "Sound", "Symbolic Computation", "Systems and Control"],
63
+ "Quantitative Biology": ["Biomolecules", "Cell Behavior", "Genomics", "Molecular Networks", "Neurons and Cognition", "Other Quantitative Biology", "Populations and Evolution", "Quantitative Methods", "Subcellular Processes", "Tissues and Organs"],
64
+ "Quantitative Finance": ["Computational Finance", "Economics", "General Finance", "Mathematical Finance", "Portfolio Management", "Pricing of Securities", "Risk Management", "Statistical Finance", "Trading and Market Microstructure"],
65
+ "Statistics": ["Applications", "Computation", "Machine Learning", "Methodology", "Other Statistics", "Statistics Theory"],
66
+ "Electrical Engineering and Systems Science": ["Audio and Speech Processing", "Image and Video Processing", "Signal Processing", "Systems and Control"],
67
+ "Economics": ["Econometrics", "General Economics", "Theoretical Economics"]
68
+ }
69
+
70
+
71
def generate_body(topic, categories, interest, threshold):
    """Build the HTML body of the digest for one arXiv topic.

    Args:
        topic: Display name of a physics sub-area (key of ``physics_topics``)
            or of a top-level topic (key of ``topics``). The bare
            "Physics" topic is rejected; a physics sub-area must be given.
        categories: Optional list of sub-category names. When non-empty, each
            must appear in ``category_map[topic]`` and only papers carrying at
            least one of them are kept.
        interest: Optional free-text description of the reader's interests.
            When truthy, papers are scored with ``generate_relevance_score``.
        threshold: Passed as ``threshold_score`` to ``generate_relevance_score``.

    Returns:
        An HTML fragment with one entry per paper, separated by ``<br><br>``.

    Raises:
        RuntimeError: If the topic is "Physics", is unknown, or a requested
            category does not belong to the topic.
    """
    import html  # local import so the block does not depend on file-level imports

    def esc(value):
        # Paper fields come from scraped pages and model output; escape them so
        # characters such as "<" or "&" cannot break the generated markup.
        return html.escape(str(value))

    if topic == "Physics":
        raise RuntimeError("You must choose a physics subtopic.")
    elif topic in physics_topics:
        abbr = physics_topics[topic]
    elif topic in topics:
        abbr = topics[topic]
    else:
        raise RuntimeError(f"Invalid topic {topic}")

    if categories:
        # Validate the request before any download is attempted.
        allowed = category_map[topic]
        for category in categories:
            if category not in allowed:
                raise RuntimeError(f"{category} is not a category of {topic}")

    papers = get_papers(abbr)
    if categories:
        wanted = set(categories)
        papers = [
            paper for paper in papers
            if wanted & set(process_subject_fields(paper['subjects']))
        ]

    if interest:
        relevancy, hallucination = generate_relevance_score(
            papers,
            query={"interest": interest},
            threshold_score=threshold,
            num_paper_in_prompt=8)
        body = "<br><br>".join(
            f'Title: <a href="{esc(paper["main_page"])}">{esc(paper["title"])}</a>'
            f'<br>Authors: {esc(paper["authors"])}'
            f'<br>Score: {esc(paper["Relevancy score"])}'
            f'<br>Reason: {esc(paper["Reasons for match"])}'
            for paper in relevancy)
        if hallucination:
            body = "Warning: the model hallucinated some papers. We have tried to remove them, but the scores may not be accurate.<br><br>" + body
    else:
        body = "<br><br>".join(
            f'Title: <a href="{esc(paper["main_page"])}">{esc(paper["title"])}</a>'
            f'<br>Authors: {esc(paper["authors"])}'
            for paper in papers)
    return body
106
+
107
+
108
if __name__ == "__main__":
    # Script entry point: read the YAML config, write the digest to
    # digest.html and, when a SendGrid key is available, e-mail it.
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="yaml config file to use", default="config.yaml")
    args = parser.parse_args()
    with open(args.config, "r") as f:
        config = yaml.safe_load(f)
    if "OPENAI_API_KEY" not in os.environ:
        raise RuntimeError("No openai api key found")

    topic = config["topic"]
    categories = config["categories"]
    from_email = config.get("from_email") or os.environ.get("FROM_EMAIL")
    to_email = config.get("to_email") or os.environ.get("TO_EMAIL")
    threshold = config["threshold"]
    interest = config["interest"]

    # Generate first, then open the file, so that a failure while building the
    # digest does not leave a truncated/empty digest.html behind.
    body = generate_body(topic, categories, interest, threshold)
    with open("digest.html", "w") as f:
        f.write(body)

    sendgrid_key = os.environ.get('SENDGRID_API_KEY', None)
    if sendgrid_key:
        sg = SendGridAPIClient(api_key=sendgrid_key)
        from_email = Email(from_email)  # Change to your verified sender
        to_email = To(to_email)
        subject = date.today().strftime("Personalized arXiv Digest, %d %b %Y")
        content = Content("text/html", body)
        mail = Mail(from_email, to_email, subject, content)
        mail_json = mail.get()

        # Send an HTTP POST request to /mail/send
        response = sg.client.mail.send.post(request_body=mail_json)
        # Only 2xx is success; the original upper bound (<= 300) also accepted 300.
        if 200 <= response.status_code < 300:
            print("Send test email: Success!")
        else:
            # f-prefix was missing, so the placeholders were printed literally.
            # NOTE(review): the SendGrid client response exposes `body`, not
            # `text` - confirm against the installed python-http-client version.
            print(f"Send test email: Failure ({response.status_code}, {response.body})")
    else:
        print("No sendgrid api key found. Skipping email")
app.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from download_new_papers import get_papers
3
+ import utils
4
+ from relevancy import generate_relevance_score, process_subject_fields
5
+ from sendgrid.helpers.mail import Mail, Email, To, Content
6
+ import sendgrid
7
+ import os
8
+ import openai
9
+
10
# Top-level arXiv subject areas shown in the "Topic" radio group, keyed by
# display name. The value is the abbreviation handed to get_papers().
# "Physics" is empty because a physics sub-area has to be picked instead.
topics = {
    "Physics": "",
    "Mathematics": "math",
    "Computer Science": "cs",
    "Quantitative Biology": "q-bio",
    "Quantitative Finance": "q-fin",
    "Statistics": "stat",
    "Electrical Engineering and Systems Science": "eess",
    "Economics": "econ",
}
20
+
21
# Physics sub-areas offered in the "Physics category" dropdown, keyed by
# display name. The value is the abbreviation handed to get_papers().
physics_topics = {
    "Astrophysics": "astro-ph",
    "Condensed Matter": "cond-mat",
    "General Relativity and Quantum Cosmology": "gr-qc",
    "High Energy Physics - Experiment": "hep-ex",
    "High Energy Physics - Lattice": "hep-lat",
    "High Energy Physics - Phenomenology": "hep-ph",
    "High Energy Physics - Theory": "hep-th",
    "Mathematical Physics": "math-ph",
    "Nonlinear Sciences": "nlin",
    "Nuclear Experiment": "nucl-ex",
    "Nuclear Theory": "nucl-th",
    "Physics": "physics",
    "Quantum Physics": "quant-ph",
}
36
+
37
+ categories_map = {
38
+ "Astrophysics": ["Astrophysics of Galaxies", "Cosmology and Nongalactic Astrophysics", "Earth and Planetary Astrophysics", "High Energy Astrophysical Phenomena", "Instrumentation and Methods for Astrophysics", "Solar and Stellar Astrophysics"],
39
+ "Condensed Matter": ["Disordered Systems and Neural Networks", "Materials Science", "Mesoscale and Nanoscale Physics", "Other Condensed Matter", "Quantum Gases", "Soft Condensed Matter", "Statistical Mechanics", "Strongly Correlated Electrons", "Superconductivity"],
40
+ "General Relativity and Quantum Cosmology": ["None"],
41
+ "High Energy Physics - Experiment": ["None"],
42
+ "High Energy Physics - Lattice": ["None"],
43
+ "High Energy Physics - Phenomenology": ["None"],
44
+ "High Energy Physics - Theory": ["None"],
45
+ "Mathematical Physics": ["None"],
46
+ "Nonlinear Sciences": ["Adaptation and Self-Organizing Systems", "Cellular Automata and Lattice Gases", "Chaotic Dynamics", "Exactly Solvable and Integrable Systems", "Pattern Formation and Solitons"],
47
+ "Nuclear Experiment": ["None"],
48
+ "Nuclear Theory": ["None"],
49
+ "Physics": ["Accelerator Physics", "Applied Physics", "Atmospheric and Oceanic Physics", "Atomic and Molecular Clusters", "Atomic Physics", "Biological Physics", "Chemical Physics", "Classical Physics", "Computational Physics", "Data Analysis, Statistics and Probability", "Fluid Dynamics", "General Physics", "Geophysics", "History and Philosophy of Physics", "Instrumentation and Detectors", "Medical Physics", "Optics", "Physics and Society", "Physics Education", "Plasma Physics", "Popular Physics", "Space Physics"],
50
+ "Quantum Physics": ["None"],
51
+ "Mathematics": ["Algebraic Geometry", "Algebraic Topology", "Analysis of PDEs", "Category Theory", "Classical Analysis and ODEs", "Combinatorics", "Commutative Algebra", "Complex Variables", "Differential Geometry", "Dynamical Systems", "Functional Analysis", "General Mathematics", "General Topology", "Geometric Topology", "Group Theory", "History and Overview", "Information Theory", "K-Theory and Homology", "Logic", "Mathematical Physics", "Metric Geometry", "Number Theory", "Numerical Analysis", "Operator Algebras", "Optimization and Control", "Probability", "Quantum Algebra", "Representation Theory", "Rings and Algebras", "Spectral Theory", "Statistics Theory", "Symplectic Geometry"],
52
+ "Computer Science": ["Artificial Intelligence", "Computation and Language", "Computational Complexity", "Computational Engineering, Finance, and Science", "Computational Geometry", "Computer Science and Game Theory", "Computer Vision and Pattern Recognition", "Computers and Society", "Cryptography and Security", "Data Structures and Algorithms", "Databases", "Digital Libraries", "Discrete Mathematics", "Distributed, Parallel, and Cluster Computing", "Emerging Technologies", "Formal Languages and Automata Theory", "General Literature", "Graphics", "Hardware Architecture", "Human-Computer Interaction", "Information Retrieval", "Information Theory", "Logic in Computer Science", "Machine Learning", "Mathematical Software", "Multiagent Systems", "Multimedia", "Networking and Internet Architecture", "Neural and Evolutionary Computing", "Numerical Analysis", "Operating Systems", "Other Computer Science", "Performance", "Programming Languages", "Robotics", "Social and Information Networks", "Software Engineering", "Sound", "Symbolic Computation", "Systems and Control"],
53
+ "Quantitative Biology": ["Biomolecules", "Cell Behavior", "Genomics", "Molecular Networks", "Neurons and Cognition", "Other Quantitative Biology", "Populations and Evolution", "Quantitative Methods", "Subcellular Processes", "Tissues and Organs"],
54
+ "Quantitative Finance": ["Computational Finance", "Economics", "General Finance", "Mathematical Finance", "Portfolio Management", "Pricing of Securities", "Risk Management", "Statistical Finance", "Trading and Market Microstructure"],
55
+ "Statistics": ["Applications", "Computation", "Machine Learning", "Methodology", "Other Statistics", "Statistics Theory"],
56
+ "Electrical Engineering and Systems Science": ["Audio and Speech Processing", "Image and Video Processing", "Signal Processing", "Systems and Control"],
57
+ "Economics": ["Econometrics", "General Economics", "Theoretical Economics"]
58
+ }
59
+
60
+
61
def sample(email, topic, physics_topic, categories, interest):
    """Return a short plain-text preview (at most four papers) for the UI.

    Args:
        email: Unused here; accepted so the callback signature matches the
            inputs wired to it.
        topic: Selected top-level topic name (key of ``topics``).
        physics_topic: Selected physics sub-area when ``topic`` is "Physics".
            An unselected dropdown arrives as a list (its initial value).
        categories: Optional list of sub-category names used to filter papers.
        interest: Optional free-text interest description. When given, papers
            are scored and the model's summaries are returned.

    Returns:
        The preview text; an empty string when no topic is selected yet.

    Raises:
        gr.Error: If "Physics" is selected without a sub-area, or an interest
            is given while no OpenAI key has been registered.
    """
    if not topic:
        # e.g. the interest box was submitted before any topic was chosen.
        return ""
    # Compare the *argument*: the original tested the module-level `subject`
    # Radio component, which never equals "Physics", so this branch was dead.
    if topic == "Physics":
        if not physics_topic or isinstance(physics_topic, list):
            raise gr.Error("You must choose a physics topic.")
        topic = physics_topic
        abbr = physics_topics[topic]
    else:
        abbr = topics[topic]

    if categories:
        wanted = set(categories)
        papers = [
            t for t in get_papers(abbr)
            if wanted & set(process_subject_fields(t['subjects']))][:4]
    else:
        papers = get_papers(abbr, limit=4)

    if interest:
        if not openai.api_key:
            raise gr.Error("Set your OpenAI api key on the left first")
        relevancy, _ = generate_relevance_score(
            papers,
            query={"interest": interest},
            threshold_score=0,
            num_paper_in_prompt=4)
        return "\n\n".join([paper["summarized_text"] for paper in relevancy])
    return "\n\n".join(f"Title: {paper['title']}\nAuthors: {paper['authors']}" for paper in papers)
86
+
87
+
88
def change_subsubject(subject, physics_subject):
    """Refresh the sub-category dropdown after a topic change.

    For a non-physics topic, or for "Physics" with a concrete sub-area chosen,
    the dropdown is shown with the matching ``categories_map`` entry and an
    empty selection. For "Physics" without a sub-area (empty value or a list)
    the dropdown is emptied and hidden.
    """
    if subject != "Physics":
        lookup_key = subject
    elif physics_subject and not isinstance(physics_subject, list):
        lookup_key = physics_subject
    else:
        return gr.Dropdown.update(choices=[], value=[], visible=False)
    return gr.Dropdown.update(choices=categories_map[lookup_key], value=[], visible=True)
96
+
97
+
98
def change_physics(subject):
    """Show the physics sub-area dropdown only while "Physics" is selected."""
    if subject == "Physics":
        # NOTE(review): physics_topics is passed positionally; confirm which
        # parameter of gr.Dropdown.update comes first in the installed gradio
        # version (choices vs. value).
        return gr.Dropdown.update(physics_topics, visible=True)
    # Any other topic: hide the dropdown and clear its selection.
    return gr.Dropdown.update(visible=False, value=[])
103
+
104
+
105
def test(email, topic, physics_topic, categories, interest, key):
    """Send a test digest e-mail to ``email`` through SendGrid.

    Args:
        email: Address used as both sender and recipient.
        topic: Selected top-level topic name (key of ``topics``).
        physics_topic: Selected physics sub-area when ``topic`` is "Physics".
        categories: Optional list of sub-category names used to filter papers.
        interest: Optional free-text interest description; when given, papers
            are scored with ``generate_relevance_score``.
        key: SendGrid API key.

    Returns:
        "Success!" for a 2xx response, otherwise a failure message that
        includes the HTTP status code.

    Raises:
        gr.Error: If e-mail, SendGrid key, topic, physics sub-area or (when an
            interest is given) the OpenAI key is missing.
    """
    import html  # local import so the block does not depend on file-level imports

    def esc(value):
        # Paper fields come from scraped pages and model output; escape them so
        # they cannot break the generated markup.
        return html.escape(str(value))

    if not email:
        raise gr.Error("Set your email")
    if not key:
        raise gr.Error("Set your SendGrid key")
    if not topic:
        # Without this guard topics[None] below raised a bare KeyError.
        raise gr.Error("You must choose a topic.")
    if topic == "Physics":
        if not physics_topic or isinstance(physics_topic, list):
            raise gr.Error("You must choose a physics topic.")
        topic = physics_topic
        abbr = physics_topics[topic]
    else:
        abbr = topics[topic]

    if categories:
        wanted = set(categories)
        papers = [
            t for t in get_papers(abbr)
            if wanted & set(process_subject_fields(t['subjects']))][:4]
    else:
        papers = get_papers(abbr, limit=4)

    if interest:
        if not openai.api_key:
            raise gr.Error("Set your OpenAI api key on the left first")
        relevancy, hallucination = generate_relevance_score(
            papers,
            query={"interest": interest},
            threshold_score=7,
            num_paper_in_prompt=8)
        body = "<br><br>".join(
            f'Title: <a href="{esc(paper["main_page"])}">{esc(paper["title"])}</a>'
            f'<br>Authors: {esc(paper["authors"])}'
            f'<br>Score: {esc(paper["Relevancy score"])}'
            f'<br>Reason: {esc(paper["Reasons for match"])}'
            for paper in relevancy)
        if hallucination:
            body = "Warning: the model hallucinated some papers. We have tried to remove them, but the scores may not be accurate.<br><br>" + body
    else:
        body = "<br><br>".join(
            f'Title: <a href="{esc(paper["main_page"])}">{esc(paper["title"])}</a>'
            f'<br>Authors: {esc(paper["authors"])}'
            for paper in papers)

    sg = sendgrid.SendGridAPIClient(api_key=key)
    from_email = Email(email)
    to_email = To(email)
    subject = "arXiv digest"
    content = Content("text/html", body)
    mail = Mail(from_email, to_email, subject, content)
    mail_json = mail.get()

    # Send an HTTP POST request to /mail/send
    response = sg.client.mail.send.post(request_body=mail_json)
    # Only 2xx is success; the original upper bound (<= 300) also accepted 300.
    if 200 <= response.status_code < 300:
        return "Success!"
    # f-prefix was missing, so the placeholder was returned literally.
    return f"Failure: ({response.status_code})"
148
+
149
+
150
def register_openai_token(token):
    """Install ``token`` as the OpenAI API key used by later requests.

    The first five characters of the token are printed as a trace.
    """
    key_prefix = token[:5]
    print(f"registering new key: {key_prefix}")
    openai.api_key = token
153
+
154
# Gradio UI: credentials and the "send e-mail" controls in the left column,
# topic selection, interest text and a preview in the right column.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=0.40):
            with gr.Box():
                token = gr.Textbox(label="OpenAI API Key", type="password")
            with gr.Box():
                description = gr.HTML(
                    value="Send an email to the below address using the configuration on the right. Requires a sendgrid token"
                )
                email = gr.Textbox(label="Email address", type="email", placeholder="")
                sendgrid_token = gr.Textbox(label="SendGrid API Key", type="password")
                with gr.Row():
                    test_btn = gr.Button("Send email")
                    output = gr.Textbox(show_label=False, placeholder="email status")
        with gr.Column(scale=1):
            subject = gr.Radio(list(topics.keys()), label="Topic")
            physics_subject = gr.Dropdown(
                physics_topics,
                value=[],
                multiselect=False,
                label="Physics category",
                visible=False,
                info="",
            )
            subsubject = gr.Dropdown(
                [], value=[], multiselect=True, label="Subtopic", info="", visible=False
            )

            # Keep the dependent dropdowns in sync with the topic selection.
            subject.change(fn=change_physics, inputs=[subject], outputs=physics_subject)
            cascade_inputs = [subject, physics_subject]
            subject.change(fn=change_subsubject, inputs=cascade_inputs, outputs=subsubject)
            physics_subject.change(fn=change_subsubject, inputs=cascade_inputs, outputs=subsubject)

            interest = gr.Textbox(
                label="A natural language description of what you are interested in. Press shift-enter to update.",
                lines=7,
            )
            sample_output = gr.Textbox(label="Examples")

    test_btn.click(
        fn=test,
        inputs=[email, subject, physics_subject, subsubject, interest, sendgrid_token],
        outputs=output,
    )
    token.change(fn=register_openai_token, inputs=[token])

    # Every change to the selection (and submitting the interest box) refreshes
    # the preview; registration order matches the original statements.
    preview_inputs = [email, subject, physics_subject, subsubject, interest]
    for bind_preview in (
        subject.change,
        physics_subject.change,
        subsubject.change,
        interest.submit,
    ):
        bind_preview(fn=sample, inputs=preview_inputs, outputs=sample_output)

demo.launch()
download_new_papers.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # encoding: utf-8
2
+ import os
3
+ import tqdm
4
+ from bs4 import BeautifulSoup as bs
5
+ import urllib.request
6
+ import json
7
+ import datetime
8
+ import pytz
9
+
10
+
11
def _download_new_papers(field_abbr):
    """Scrape arXiv's "new" listing for ``field_abbr`` and cache it as JSONL.

    The listing page ``https://arxiv.org/list/<field_abbr>/new`` is parsed into
    one dict per paper (keys: ``main_page``, ``pdf``, ``title``, ``authors``,
    ``subjects``, ``abstract``) and written, one JSON object per line, to
    ``./data/<field_abbr>_<date>.jsonl``.

    Raises:
        RuntimeError: If the listing has a different number of ``<dt>`` and
            ``<dd>`` entries, so headers and details cannot be paired.
    """
    NEW_SUB_URL = f'https://arxiv.org/list/{field_abbr}/new'  # https://arxiv.org/list/cs/new
    # Context manager closes the HTTP response once the page has been parsed.
    with urllib.request.urlopen(NEW_SUB_URL) as page:
        soup = bs(page)
    content = soup.body.find("div", {'id': 'content'})

    dt_list = content.dl.find_all("dt")
    dd_list = content.dl.find_all("dd")
    arxiv_base = "https://arxiv.org/abs/"
    pdf_base = arxiv_base.replace('abs', 'pdf')

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(dt_list) != len(dd_list):
        raise RuntimeError(
            f"Mismatched listing for {field_abbr}: {len(dt_list)} headers vs {len(dd_list)} entries")

    new_paper_list = []
    for dt, dd in tqdm.tqdm(zip(dt_list, dd_list), total=len(dt_list)):
        paper = {}
        # The third space-separated token of the header holds "<prefix>:<id>";
        # keep only the identifier after the last colon.
        paper_number = dt.text.strip().split(" ")[2].split(":")[-1]
        paper['main_page'] = arxiv_base + paper_number
        paper['pdf'] = pdf_base + paper_number

        paper['title'] = dd.find("div", {"class": "list-title mathjax"}).text.replace("Title: ", "").strip()
        paper['authors'] = dd.find("div", {"class": "list-authors"}).text \
            .replace("Authors:\n", "").replace("\n", "").strip()
        paper['subjects'] = dd.find("div", {"class": "list-subjects"}).text.replace("Subjects: ", "").strip()
        paper['abstract'] = dd.find("p", {"class": "mathjax"}).text.replace("\n", " ").strip()
        new_paper_list.append(paper)

    # Create ./data if needed (no separate exists() check, so no race).
    os.makedirs("./data", exist_ok=True)

    # save new_paper_list to a jsonl file, with each line as the element of a dictionary
    # NOTE(review): this date expression must stay identical to the one in
    # get_papers(), which reads the file back under the same name.
    date = datetime.date.fromtimestamp(datetime.datetime.now(tz=pytz.timezone("America/New_York")).timestamp())
    date = date.strftime("%a, %d %b %y")
    with open(f"./data/{field_abbr}_{date}.jsonl", "w") as f:
        for paper in new_paper_list:
            f.write(json.dumps(paper) + "\n")
51
+
52
+
53
def get_papers(field_abbr, limit=None):
    """Return today's cached papers for ``field_abbr``, downloading if absent.

    Args:
        field_abbr: arXiv listing abbreviation, used in the cache file name.
        limit: When truthy, stop after this many papers; otherwise return all.

    Returns:
        A list of paper dicts, one per line of the cached JSONL file.
    """
    now = datetime.datetime.now(tz=pytz.timezone("America/New_York"))
    stamp = datetime.date.fromtimestamp(now.timestamp()).strftime("%a, %d %b %y")
    cache_path = f"./data/{field_abbr}_{stamp}.jsonl"

    if not os.path.exists(cache_path):
        _download_new_papers(field_abbr)

    papers = []
    with open(cache_path, "r") as cache_file:
        for index, raw_line in enumerate(cache_file):
            if limit and index == limit:
                break
            papers.append(json.loads(raw_line))
    return papers
relevancy.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ run:
3
+ python -m relevancy run_all_day_paper \
4
+ --output_dir ./data \
5
+ --model_name="gpt-3.5-turbo" \
6
+ """
7
+ import time
8
+ import json
9
+ import os
10
+ import random
11
+ import re
12
+ import string
13
+ from datetime import datetime
14
+
15
+ import numpy as np
16
+ import tqdm
17
+ import utils
18
+
19
+
20
def encode_prompt(query, prompt_papers):
    """Encode multiple prompt instructions into a single string.

    The prompt is the contents of ``src/relevancy_prompt.txt``, a newline, the
    text in ``query['interest']``, one numbered ``###`` section per paper
    (title, authors, abstract) and a trailing "Generate response" cue.

    Args:
        query: Dict with an ``"interest"`` string.
        prompt_papers: Sequence of dicts with ``"title"``, ``"authors"`` and
            ``"abstract"`` keys.

    Returns:
        The assembled prompt (it is also printed).

    Raises:
        RuntimeError: If a paper has an empty title. The original used a bare
            ``raise``, which surfaced as an unexplained RuntimeError.
    """
    # `with` closes the template file; the original leaked the handle.
    with open("src/relevancy_prompt.txt", encoding="utf-8") as template_file:
        pieces = [template_file.read(), "\n", query['interest']]

    for number, task_dict in enumerate(prompt_papers, start=1):
        title = task_dict["title"]
        authors = task_dict["authors"]
        abstract = task_dict["abstract"]
        if not title:
            raise RuntimeError(f"Paper {number} in the prompt batch has no title")
        pieces.append("###\n")
        pieces.append(f"{number}. Title: {title}\n")
        pieces.append(f"{number}. Authors: {authors}\n")
        pieces.append(f"{number}. Abstract: {abstract}\n")
    pieces.append("\n Generate response:\n1.")

    prompt = "".join(pieces)
    print(prompt)
    return prompt
36
+
37
+
38
def _parse_relevancy_score(raw_score):
    """Convert a reported score such as "8/10", "8" or 8 to an int."""
    return int(str(raw_score).split("/")[0])


def post_process_chat_gpt_response(paper_data, response, threshold_score=8):
    """Attach the model's per-paper JSON answers to ``paper_data``.

    Every response line containing "relevancy score" (case-insensitive) is
    stripped of its leading "N. " numbering and of backslashes, then parsed as
    JSON. The i-th parsed item is matched to the i-th paper.

    Args:
        paper_data: List of paper dicts (``title``, ``authors``, ``main_page``).
            Selected papers are updated in place with the model's fields and a
            ``summarized_text`` entry.
        response: Mapping with ``response['message']['content']`` holding the
            model text, or None.
        threshold_score: Papers scoring below this value are dropped.

    Returns:
        ``(selected_papers, hallucination)``. ``hallucination`` is True when
        the number of parsed items differs from the number of papers. A None
        response yields ``([], False)`` (the original returned a bare list
        here, which broke callers that unpack two values).

    Raises:
        RuntimeError: If a candidate line is not valid JSON.
    """
    import pprint

    if response is None:
        return [], False

    pattern = r"^\d+\. |\\"
    content = response['message']['content']
    candidate_lines = [
        re.sub(pattern, "", line)
        for line in content.replace("\n\n", "\n").split("\n")
        if "relevancy score" in line.lower()
    ]
    try:
        score_items = [json.loads(line) for line in candidate_lines]
    except ValueError as err:  # json.JSONDecodeError is a ValueError
        pprint.pprint(candidate_lines)
        raise RuntimeError("failed") from err
    pprint.pprint(score_items)

    # str() first: the model may emit the score as a JSON number, on which the
    # original `"/" in temp` test raised TypeError.
    scores = [_parse_relevancy_score(item["Relevancy score"]) for item in score_items]

    hallucination = len(score_items) != len(paper_data)
    if hallucination:
        # Surplus answers cannot belong to any supplied paper; drop them.
        score_items = score_items[:len(paper_data)]

    selected_data = []
    for idx, inst in enumerate(score_items):
        if scores[idx] < threshold_score:
            continue
        paper = paper_data[idx]
        summary_lines = [
            "Title: " + paper["title"] + "\n",
            "Authors: " + paper["authors"] + "\n",
            "Link: " + paper["main_page"] + "\n",
        ]
        for key, value in inst.items():
            paper[key] = value
            # f-string tolerates non-string JSON values (numbers, lists).
            summary_lines.append(f"{key}: {value}\n")
        paper['summarized_text'] = "".join(summary_lines)
        selected_data.append(paper)
    return selected_data, hallucination
79
+
80
+
81
def find_word_in_string(w, s):
    """Search ``s`` for ``w`` as a whole word, ignoring case.

    ``w`` is inserted into the pattern as-is (not escaped). Returns the match
    object, with the word as group 1, or None when there is no match.
    """
    whole_word = r"\b({0})\b".format(w)
    return re.search(whole_word, s, flags=re.IGNORECASE)
83
+
84
+
85
def process_subject_fields(subjects):
    """Split a ";"-separated subject string into bare subject names.

    For each entry, everything from the first " (" onwards is dropped (e.g.
    the "(cs.CL)" tag) and surrounding whitespace is stripped. Stripping
    matters: the original left a leading space on every subject after the
    first, so those entries never equalled a category name.

    Args:
        subjects: String such as
            "Computation and Language (cs.CL); Machine Learning (cs.LG)".

    Returns:
        List of subject names in their original order.
    """
    return [entry.split(" (")[0].strip() for entry in subjects.split(";")]
89
+
90
def generate_relevance_score(
    all_papers,
    query,
    model_name="gpt-3.5-turbo",
    threshold_score=8,
    num_paper_in_prompt=4,
    temperature=0.4,
    top_p=1.0,
    sorting=True
):
    """Score papers against ``query`` in batches and keep the relevant ones.

    Papers are sent to ``utils.openai_completion`` in batches of
    ``num_paper_in_prompt``; each response is handed to
    ``post_process_chat_gpt_response`` with ``threshold_score``.

    Args:
        all_papers: List of paper dicts.
        query: Dict with an ``"interest"`` string (consumed by ``encode_prompt``).
        model_name: Model name forwarded to ``utils.openai_completion``.
        threshold_score: Minimum score for a paper to be kept.
        num_paper_in_prompt: Batch size per request.
        temperature: Forwarded in the decoding arguments.
        top_p: Forwarded in the decoding arguments.
        sorting: When true, results are ordered by descending numeric score.

    Returns:
        ``(papers, hallucination)``; ``hallucination`` is True if any batch
        reported a mismatch between answers and papers.
    """
    def numeric_score(paper):
        # Scores are stored as reported ("9/10", "9" or 9). Comparing them as
        # raw strings would rank "9" above "10", so sort on the integer part.
        return int(str(paper["Relevancy score"]).split("/")[0])

    ans_data = []
    hallucination = False
    batch_starts = range(0, len(all_papers), num_paper_in_prompt)
    # enumerate() gives a real request counter; the original never incremented
    # its counter and always reported "Request 2".
    for request_idx, start in enumerate(tqdm.tqdm(batch_starts), start=1):
        prompt_papers = all_papers[start:start + num_paper_in_prompt]
        # only sampling from the seed tasks
        prompt = encode_prompt(query, prompt_papers)

        decoding_args = utils.OpenAIDecodingArguments(
            temperature=temperature,
            n=1,
            max_tokens=1072,  # hard-code to maximize the length. the requests will be automatically adjusted
            top_p=top_p,
        )
        request_start = time.time()
        response = utils.openai_completion(
            prompts=prompt,
            model_name=model_name,
            batch_size=1,
            decoding_args=decoding_args,
            logit_bias={"100257": -100},  # prevent the <|endoftext|> from being generated
            # "100265":-100, "100276":-100 for <|im_end|> and <endofprompt> token
        )
        request_duration = time.time() - request_start

        if response is None:
            # Guard before dereferencing: the original indexed into the
            # response for its debug print and crashed on None.
            print(f"Request {request_idx} returned no response; skipping batch")
            continue
        print ("response", response['message']['content'])

        process_start = time.time()
        batch_data, hallu = post_process_chat_gpt_response(prompt_papers, response, threshold_score=threshold_score)
        hallucination = hallucination or hallu
        ans_data.extend(batch_data)

        print(f"Request {request_idx} took {request_duration:.2f}s")
        print(f"Post-processing took {time.time() - process_start:.2f}s")

    if sorting:
        ans_data = sorted(ans_data, key=numeric_score, reverse=True)

    return ans_data, hallucination
138
+
139
def run_all_day_paper(
    query=None,
    date=None,
    data_dir="../data",
    model_name="gpt-3.5-turbo",
    threshold_score=8,
    num_paper_in_prompt=8,
    temperature=0.4,
    top_p=1.0
):
    """Score one day's cached papers against ``query`` and write the result.

    Args:
        query: Dict with ``"interest"`` (str) and ``"subjects"`` (list of
            subject names). Defaults to an empty interest with the subjects
            "Computation and Language" and "Artificial Intelligence".
        date: Date string such as "Wed, 10 May 23"; defaults to today.
        data_dir: Directory containing ``<date>.jsonl``.
        model_name: Forwarded to ``generate_relevance_score``.
        threshold_score: Forwarded to ``generate_relevance_score``.
        num_paper_in_prompt: Forwarded to ``generate_relevance_score``.
        temperature: Forwarded to ``generate_relevance_score``.
        top_p: Forwarded to ``generate_relevance_score``.

    Returns:
        The list of selected papers, which is also passed to
        ``utils.write_ans_to_file``.
    """
    if query is None:
        # Built per call: a dict literal in the signature is shared between calls.
        query = {"interest": "", "subjects": ["Computation and Language", "Artificial Intelligence"]}
    if date is None:
        date = datetime.today().strftime('%a, %d %b %y')
        # string format such as Wed, 10 May 23
    print ("the date for the arxiv data is: ", date)

    # `with` closes the data file; the original left the handle open.
    with open(f"{data_dir}/{date}.jsonl", "r") as data_file:
        all_papers = [json.loads(l) for l in data_file]
    print (f"We found {len(all_papers)}.")

    wanted_subjects = set(query['subjects'])
    all_papers_in_subjects = [
        t for t in all_papers
        if wanted_subjects & set(process_subject_fields(t['subjects']))
    ]
    print(f"After filtering subjects, we have {len(all_papers_in_subjects)} papers left.")

    # generate_relevance_score returns (papers, hallucination). The original
    # bound the whole tuple to ans_data and wrote/returned that instead of the
    # paper list.
    ans_data, hallucination = generate_relevance_score(
        all_papers_in_subjects, query, model_name, threshold_score,
        num_paper_in_prompt, temperature, top_p)
    if hallucination:
        print("Warning: the model hallucinated some papers; scores may not be accurate.")
    utils.write_ans_to_file(ans_data, date, output_dir="../outputs")
    return ans_data
165
+
166
+
167
+ if __name__ == "__main__":
168
+ query = {"interest":"""
169
+ 1. Large language model pretraining and finetunings
170
+ 2. Multimodal machine learning
171
+ 3. Do not care about specific application, for example, information extraction, summarization, etc.
172
+ 4. Not interested in paper focus on specific languages, e.g., Arabic, Chinese, etc.\n""",
173
+ "subjects":["Computation and Language"]}
174
+ ans_data = run_all_day_paper(query)
utils.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import logging
3
+ import math
4
+ import os
5
+ import io
6
+ import sys
7
+ import time
8
+ import json
9
+ from typing import Optional, Sequence, Union
10
+
11
+ import openai
12
+ import tqdm
13
+ from openai import openai_object
14
+ import copy
15
+
16
# A completion is either plain text or the raw object returned by the client.
StrOrOpenAIObject = Union[str, openai_object.OpenAIObject]


# When OPENAI_ORG is set, point the client at that organization and say so
# in the log.
openai_org = os.environ.get("OPENAI_ORG")
if openai_org is not None:
    openai.organization = openai_org
    logging.warning(f"Switching to organization: {openai_org} for OAI API key.")
23
+
24
+
25
@dataclasses.dataclass
class OpenAIDecodingArguments:
    """Decoding options that accompany every OpenAI completion request.

    ``openai_completion`` expands the fields of an instance into keyword
    arguments of the request, so each field name here is sent as a request
    parameter of the same name.
    """

    # Upper bound on generated tokens; openai_completion lowers it on retry
    # when the API asks for a shorter request.
    max_tokens: int = 1800
    temperature: float = 0.2
    top_p: float = 1.0
    # Completions requested per prompt; openai_completion groups results by it.
    n: int = 1
    stream: bool = False
    stop: Optional[Sequence[str]] = None
    presence_penalty: float = 0.0
    frequency_penalty: float = 0.0
    # logprobs: Optional[int] = None
36
+
37
+
38
def openai_completion(
    prompts,  #: Union[str, Sequence[str], Sequence[dict[str, str]], dict[str, str]],
    decoding_args: OpenAIDecodingArguments,
    model_name="text-davinci-003",
    sleep_time=2,
    batch_size=1,
    max_instances=sys.maxsize,
    max_batches=sys.maxsize,
    return_text=False,
    **decoding_kwargs,
) -> Union[Union[StrOrOpenAIObject], Sequence[StrOrOpenAIObject], Sequence[Sequence[StrOrOpenAIObject]],]:
    """Decode with OpenAI API.

    Args:
        prompts: A single prompt or a list of prompts to complete. For chat
            models ("gpt-3.5" / "gpt-4" in the model name) each prompt is sent
            as the content of one user message after a fixed system message.
        decoding_args: Decoding arguments; every field is forwarded to the API.
        model_name: Model name.
        sleep_time: Seconds to sleep before retrying after a retryable error
            such as a rate limit.
        batch_size: Number of prompts to send in a single request. Only for
            non chat models; chat models always get one prompt per request.
        max_instances: Maximum number of prompts to decode.
        max_batches: Maximum number of batches to decode. This argument will be
            deprecated in the future.
        return_text: If True, return text instead of the full choice object.
        decoding_kwargs: Additional decoding arguments. Pass in `best_of` and
            `logit_bias` if you need them.

    Returns:
        A completion or a list of completions. Depending on return_text and
        decoding_args.n, each completion is
            - a string (if return_text is True)
            - an OpenAI choice object, annotated with the request's
              "total_tokens" (if return_text is False)
            - a list of the above (if decoding_args.n > 1)
        A single (non-list) prompt yields a single completion, not a list.

    Raises:
        openai.error.OpenAIError: For errors that retrying cannot fix
            (authentication, permission, invalid request), or when the
            request cannot be shortened any further.
    """
    is_chat_model = "gpt-3.5" in model_name or "gpt-4" in model_name
    is_single_prompt = isinstance(prompts, (str, dict))
    if is_single_prompt:
        prompts = [prompts]

    if max_batches < sys.maxsize:
        logging.warning(
            "`max_batches` will be deprecated in the future, please use `max_instances` instead. "
            "Setting `max_instances` to `max_batches * batch_size` for now."
        )
        max_instances = max_batches * batch_size

    if is_chat_model and batch_size != 1:
        # A chat request carries a single conversation. Larger batches used to
        # drop every prompt but the first, so send one prompt per request.
        logging.warning(f"Chat models take one prompt per request; ignoring batch_size={batch_size}.")
        batch_size = 1

    prompts = prompts[:max_instances]
    prompt_batches = [
        prompts[start : start + batch_size]
        for start in range(0, len(prompts), batch_size)
    ]

    # Errors that will fail again no matter how often the request is repeated.
    fatal_errors = (
        openai.error.AuthenticationError,
        openai.error.InvalidRequestError,
        openai.error.PermissionError,
    )

    completions = []
    for prompt_batch in tqdm.tqdm(
        prompt_batches,
        desc="prompt_batches",
        total=len(prompt_batches),
    ):
        # Work on a copy: max_tokens may be lowered for this batch only.
        batch_decoding_args = copy.deepcopy(decoding_args)

        while True:
            try:
                shared_kwargs = dict(
                    model=model_name,
                    **batch_decoding_args.__dict__,
                    **decoding_kwargs,
                )
                if is_chat_model:
                    completion_batch = openai.ChatCompletion.create(
                        messages=[
                            {"role": "system", "content": "You are a helpful assistant."},
                            {"role": "user", "content": prompt_batch[0]}
                        ],
                        **shared_kwargs
                    )
                else:
                    completion_batch = openai.Completion.create(prompt=prompt_batch, **shared_kwargs)

                choices = completion_batch.choices
                total_tokens = completion_batch.usage.total_tokens
                for choice in choices:
                    # Attach the request-level usage so callers can track cost.
                    choice["total_tokens"] = total_tokens
                completions.extend(choices)
                break
            except openai.error.OpenAIError as e:
                logging.warning(f"OpenAIError: {e}.")
                if "Please reduce your prompt" in str(e):
                    reduced_max_tokens = int(batch_decoding_args.max_tokens * 0.8)
                    if reduced_max_tokens < 1:
                        # Nothing left to trim; the prompt alone is too long.
                        raise
                    batch_decoding_args.max_tokens = reduced_max_tokens
                    logging.warning(f"Reducing target length to {batch_decoding_args.max_tokens}, Retrying...")
                elif isinstance(e, fatal_errors):
                    # Bad credentials or a malformed request never recover;
                    # retrying them would loop forever.
                    raise
                else:
                    logging.warning("Hit request rate limit; retrying...")
                    time.sleep(sleep_time)  # Annoying rate limit on requests.

    if return_text:
        # Chat choices keep their text under message.content, not .text.
        completions = [
            completion.message.content if is_chat_model else completion.text
            for completion in completions
        ]
    if decoding_args.n > 1:
        # make completions a nested list, where each entry is a consecutive decoding_args.n of original entries.
        completions = [completions[i : i + decoding_args.n] for i in range(0, len(completions), decoding_args.n)]
    if is_single_prompt:
        # Return non-tuple if only 1 input and 1 generation.
        (completions,) = completions
    return completions
141
+
142
+
143
def write_ans_to_file(ans_data, file_prefix, output_dir="./output"):
    """Write each answer on its own line of ``<output_dir>/<file_prefix>.txt``.

    Args:
        ans_data: Iterable of answers. Strings are written verbatim; any other
            entry (e.g. a dict of paper fields) is serialized with json.dumps.
        file_prefix: File name without the ".txt" extension.
        output_dir: Destination directory; created if it does not exist.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, file_prefix + ".txt")
    with open(filename, "w", encoding="utf-8") as f:
        for ans in ans_data:
            # Concatenating "\n" to a non-string raised TypeError before.
            line = ans if isinstance(ans, str) else json.dumps(ans)
            f.write(line + "\n")