Commit a5f6a02
Parent(s): e428506

added chemrxiv
app.py CHANGED

@@ -25,7 +25,7 @@ api_key = st.text_input('OpenAI API Key',
                         placeholder='sk-...',
                         help=f"['What is that?']({api_key_url})",
                         type="password",
-                        value = '')
+                        value = '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')

 os.environ["OPENAI_API_KEY"] = f"{api_key}" #
 if len(api_key) != 51:

@@ -45,7 +45,7 @@ def search_click_callback(search_query, max_results, XRxiv_servers=[]):
 with st.form(key='columns_in_form', clear_on_submit = False):
     c1, c2 = st.columns([5, 0.8])
     with c1:
-        search_query = st.text_input("Input search query here:", placeholder='Keywords for most relevant search...', value=''
+        search_query = st.text_input("Input search query here:", placeholder='Keywords for most relevant search...', value='hemolytic peptides'
         )#search_query, max_results_current))

     with c2:
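For reference, the field being changed is a password-type Streamlit input whose value is exported to the environment and then length-checked. A minimal sketch of that pattern; defaulting to an existing OPENAI_API_KEY environment variable is my assumption here, not something the commit does:

```python
import os
import streamlit as st

# Sketch of the app.py pattern: password-style key input exported to the
# environment. The env-var fallback is an assumption, not part of the commit.
api_key = st.text_input('OpenAI API Key',
                        placeholder='sk-...',
                        type='password',
                        value=os.environ.get('OPENAI_API_KEY', ''))
os.environ['OPENAI_API_KEY'] = api_key
if len(api_key) != 51:  # same length check app.py applies
    st.warning('Enter a valid OpenAI API key')
```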
utils.py CHANGED

@@ -10,6 +10,7 @@ from bs4 import BeautifulSoup as bs
 from datetime import datetime
 from random import uniform as rand
 import numpy as np
+import json


 class XRxivQuery:
@@ -74,9 +75,46 @@ class XRxivQuery:
             pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
             self.all_pdf_info.append(pdf_info)

+        if 'chemrxiv' in self.XRxiv_servers:
+            '''
+            See https://chemrxiv.org/engage/chemrxiv/public-api/documentation#tag/public-apiv1items/operation/getPublicapiV1Items
+
+            '''
+            # Call the chemRxiv API
+            journal = 'chemRxiv'
+            max_chemrxiv_papers = max_papers_in_server[1]
+            chemrxiv_url = f'https://chemrxiv.org/engage/chemrxiv/public-api/v1/items?term="{"%20".join(search_query)}"&sort=RELEVANT_ASC&limit={max_chemrxiv_papers}'
+            req = urllib.request.Request(
+                url=chemrxiv_url,
+                headers={'User-Agent': 'Mozilla/5.0'}
+            )
+            s = urllib.request.urlopen(req).read()
+            jsonResponse = json.loads(s.decode('utf-8'))
+            pdf_titles = []
+            pdf_authors = []
+            pdf_urls = []
+            pdf_categories = []
+            folder_names = []
+            pdf_citation = []
+            pdf_years = []
+            for i, d in enumerate(jsonResponse['itemHits']):
+                pdf_titles.append(d['item']['title'].replace("\n", ""))
+                authors_dict = d['item']['authors']
+                pdf_authors.append([n['firstName'] + ' ' + n['lastName'] for n in authors_dict])
+                pdf_urls.append('https://chemrxiv.org/engage/chemrxiv/article-details/' + str(d['item']['id']))
+                pdf_categories.append(journal)
+                folder_names.append(self.folder_name)
+                pdf_years.append(d['item']['statusDate'][:4])
+                pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {journal} [{pdf_categories[i][0]}] ({pdf_years[i]}), (available at {pdf_urls[i]}).")
+                # overwrite the article-page URL with the direct PDF asset URL,
+                # since chemRxiv's article page does not serve the PDF itself
+                pdf_urls[i] = d['item']['asset']['original']['url']
+            pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
+            self.all_pdf_info.append(pdf_info)
+
+
         if 'biorxiv' in self.XRxiv_servers or 'medrxiv' in self.XRxiv_servers:
             '''
-            Scraps the
+            Scrapes the bioRxiv and medRxiv HTML to get data from each entry in a search. Entries have the following formatting:
             <li class="first last odd search-result result-jcode-medrxiv search-result-highwire-citation">
             <div class="highwire-article-citation highwire-citation-type-highwire-article node" data-apath="/medrxiv/early/2021/02/18/2021.02.12.21251663.atom" data-pisa="medrxiv;2021.02.12.21251663v1" data-pisa-master="medrxiv;2021.02.12.21251663" id="node-medrxivearly202102182021021221251663atom1512875027"><div class="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix">
             <span class="highwire-cite-title">
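The new chemrxiv branch reduces to one GET request against the ChemRxiv public API plus a walk over the itemHits array. A minimal standalone sketch of the same call, shown for clarity; the helper name query_chemrxiv and the URL-encoding via urllib.parse.quote are illustrative additions, not part of the commit:

```python
# Standalone sketch of the ChemRxiv public-API call made in the diff above.
# Field names follow the response shape the commit relies on
# (itemHits -> item -> title/authors/id/statusDate/asset).
import json
import urllib.parse
import urllib.request

def query_chemrxiv(term, limit=5):
    # Same endpoint and parameters as the commit; quote() yields the %20
    # separators the f-string above builds by hand.
    url = ('https://chemrxiv.org/engage/chemrxiv/public-api/v1/items'
           f'?term="{urllib.parse.quote(term)}"&sort=RELEVANT_ASC&limit={limit}')
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req) as resp:
        payload = json.loads(resp.read().decode('utf-8'))
    papers = []
    for hit in payload['itemHits']:
        item = hit['item']
        papers.append({
            'title': item['title'].replace('\n', ''),
            'authors': [a['firstName'] + ' ' + a['lastName'] for a in item['authors']],
            'year': item['statusDate'][:4],
            # direct PDF asset, which the commit substitutes for the article page
            'pdf_url': item['asset']['original']['url'],
        })
    return papers

if __name__ == '__main__':
    for p in query_chemrxiv('hemolytic peptides', limit=3):
        print(p['year'], p['title'])
```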
@@ -132,7 +170,6 @@ class XRxivQuery:
             for i, pdf in enumerate(pdf_entries):
                 pdf_titles.append(pdf.find('span', attrs={'class': 'highwire-cite-title'}).text.strip())
                 pdf_authors.append(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip().split(', '))
-
                 pdf_url = pdf.find('a', href=True)['href']
                 if pdf_url[:4] != 'http':
                     pdf_url = f'http://www.biorxiv.org'+ pdf_url
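For context, the bioRxiv/medRxiv branch above pulls each field out of the highwire-citation markup quoted in the class docstring. A hedged sketch of that extraction step on a static snippet; the HTML here is a trimmed stand-in for a real search result:

```python
# Illustrative only: extract title, authors, and PDF link from one search
# result entry, using the same BeautifulSoup lookups as the diff.
from bs4 import BeautifulSoup as bs

html = '''
<li class="first last odd search-result result-jcode-medrxiv search-result-highwire-citation">
  <a href="/content/10.1101/2021.02.12.21251663v1">
    <span class="highwire-cite-title">Example paper title</span>
  </a>
  <span class="highwire-citation-authors">A. Author, B. Author</span>
</li>
'''

entry = bs(html, 'html.parser').find('li')
title = entry.find('span', attrs={'class': 'highwire-cite-title'}).text.strip()
authors = entry.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip().split(', ')
pdf_url = entry.find('a', href=True)['href']
if pdf_url[:4] != 'http':  # relative links get the biorxiv host prepended
    pdf_url = 'http://www.biorxiv.org' + pdf_url
print(title, authors, pdf_url)
```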
@@ -149,10 +186,6 @@ class XRxivQuery:
         return self.all_pdf_info

     def download_pdf(self):
-        # if len(os.listdir(f'./{folder_name}') ) != 0:
-        # check folder is empty to avoid using papers from old runs:
-        #     os.remove(f'./{folder_name}/*')
-        # print(pdf_info)
         all_reference_text = []
         for i,p in enumerate(stqdm(self.all_pdf_info, desc='🔍 Searching and downloading papers')):
             pdf_title=p[0]
@@ -160,7 +193,7 @@ class XRxivQuery:
             pdf_url=p[1]
             if pdf_category in ['medRxiv', 'bioRxiv']:
                 pdf_url += '.full.pdf'
-            pdf_file_name=p[0].replace(':','').replace('/','').replace('.','')
+            pdf_file_name=p[0].replace(':','').replace('/','').replace('.','').replace('\n','')
             folder_name=p[4]
             pdf_citation=p[5]
             r = requests.get(pdf_url, allow_redirects=True)
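The chained .replace(...) calls strip characters that are awkward in file names, and the added .replace('\n','') covers chemRxiv titles that can contain newlines. An equivalent single-pass variant using re.sub, named here as an alternative rather than what the commit does:

```python
import re

def safe_pdf_name(title):
    # Drops the same characters the diff strips: ':', '/', '.', and newlines.
    return re.sub(r"[:/.\n]", "", title)

assert safe_pdf_name("A title: with/odd.chars\n") == "A title withoddchars"
```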
|