mehradans92 committed on
Commit
a5f6a02
Β·
1 Parent(s): e428506

added chemrxiv

Browse files
Files changed (2) hide show
  1. app.py +2 -2
  2. utils.py +40 -7
app.py CHANGED
@@ -25,7 +25,7 @@ api_key = st.text_input('OpenAI API Key',
25
  placeholder='sk-...',
26
  help=f"['What is that?']({api_key_url})",
27
  type="password",
28
- value = '')
29
 
30
  os.environ["OPENAI_API_KEY"] = f"{api_key}" #
31
  if len(api_key) != 51:
@@ -45,7 +45,7 @@ def search_click_callback(search_query, max_results, XRxiv_servers=[]):
45
  with st.form(key='columns_in_form', clear_on_submit = False):
46
  c1, c2 = st.columns([5, 0.8])
47
  with c1:
48
- search_query = st.text_input("Input search query here:", placeholder='Keywords for most relevant search...', value=''
49
  )#search_query, max_results_current))
50
 
51
  with c2:
 
25
  placeholder='sk-...',
26
  help=f"['What is that?']({api_key_url})",
27
  type="password",
28
+ value = '')  # SECURITY(review): a live OpenAI API key was hard-coded here and has been redacted — the key is compromised and must be revoked; load credentials from an environment variable, never commit them
29
 
30
  os.environ["OPENAI_API_KEY"] = f"{api_key}" #
31
  if len(api_key) != 51:
 
45
  with st.form(key='columns_in_form', clear_on_submit = False):
46
  c1, c2 = st.columns([5, 0.8])
47
  with c1:
48
+ search_query = st.text_input("Input search query here:", placeholder='Keywords for most relevant search...', value='hemolytic peptides'
49
  )#search_query, max_results_current))
50
 
51
  with c2:
utils.py CHANGED
@@ -10,6 +10,7 @@ from bs4 import BeautifulSoup as bs
10
  from datetime import datetime
11
  from random import uniform as rand
12
  import numpy as np
 
13
 
14
 
15
  class XRxivQuery:
@@ -74,9 +75,46 @@ class XRxivQuery:
74
  pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
75
  self.all_pdf_info.append(pdf_info)
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  if 'biorxiv' in self.XRxiv_servers or 'medrxiv' in self.XRxiv_servers:
78
  '''
79
- Scraps the arXiv's html to get data from each entry in a search. Entries has the following formatting:
80
  <li class="first last odd search-result result-jcode-medrxiv search-result-highwire-citation">
81
  <div class="highwire-article-citation highwire-citation-type-highwire-article node" data-apath="/medrxiv/early/2021/02/18/2021.02.12.21251663.atom" data-pisa="medrxiv;2021.02.12.21251663v1" data-pisa-master="medrxiv;2021.02.12.21251663" id="node-medrxivearly202102182021021221251663atom1512875027"><div class="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix">
82
  <span class="highwire-cite-title">
@@ -132,7 +170,6 @@ class XRxivQuery:
132
  for i, pdf in enumerate(pdf_entries):
133
  pdf_titles.append(pdf.find('span', attrs={'class': 'highwire-cite-title'}).text.strip())
134
  pdf_authors.append(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip().split(', '))
135
-
136
  pdf_url = pdf.find('a', href=True)['href']
137
  if pdf_url[:4] != 'http':
138
  pdf_url = f'http://www.biorxiv.org'+ pdf_url
@@ -149,10 +186,6 @@ class XRxivQuery:
149
  return self.all_pdf_info
150
 
151
  def download_pdf(self):
152
- # if len(os.listdir(f'./{folder_name}') ) != 0:
153
- # check folder is empty to avoid using papers from old runs:
154
- # os.remove(f'./{folder_name}/*')
155
- # print(pdf_info)
156
  all_reference_text = []
157
  for i,p in enumerate(stqdm(self.all_pdf_info, desc='πŸ” Searching and downloading papers')):
158
  pdf_title=p[0]
@@ -160,7 +193,7 @@ class XRxivQuery:
160
  pdf_url=p[1]
161
  if pdf_category in ['medRxiv', 'bioRxiv']:
162
  pdf_url += '.full.pdf'
163
- pdf_file_name=p[0].replace(':','').replace('/','').replace('.','')
164
  folder_name=p[4]
165
  pdf_citation=p[5]
166
  r = requests.get(pdf_url, allow_redirects=True)
 
10
  from datetime import datetime
11
  from random import uniform as rand
12
  import numpy as np
13
+ import json
14
 
15
 
16
  class XRxivQuery:
 
75
  pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
76
  self.all_pdf_info.append(pdf_info)
77
 
78
+ if 'chemrxiv' in self.XRxiv_servers:
79
+ '''
80
+ See https://chemrxiv.org/engage/chemrxiv/public-api/documentation#tag/public-apiv1items/operation/getPublicapiV1Items
81
+
82
+ '''
83
+ # Call chemrxiv API
84
+ journal = 'chemRxiv'
85
+ max_chemrxiv_papers = max_papers_in_server[1]
86
+ chemrxiv_url = f'https://chemrxiv.org/engage/chemrxiv/public-api/v1/items?term="{"%20".join(search_query)}"&sort=RELEVANT_ASC&limit={max_chemrxiv_papers}'
87
+ req = urllib.request.Request(
88
+ url=chemrxiv_url,
89
+ headers={'User-Agent': 'Mozilla/5.0'}
90
+ )
91
+ s = urllib.request.urlopen(req).read()
92
+ jsonResponse = json.loads(s.decode('utf-8'))
93
+ pdf_titles = []
94
+ pdf_authors = []
95
+ pdf_urls = []
96
+ pdf_categories = []
97
+ folder_names = []
98
+ pdf_citation = []
99
+ pdf_years = []
100
+ for i,d in enumerate(jsonResponse['itemHits']):
101
+ pdf_titles.append(d['item']['title'].replace("\n", ""))
102
+ authors_dict = d['item']['authors']
103
+ pdf_authors.append([n['firstName']+' '+ n['lastName'] for n in authors_dict])
104
+ pdf_urls.append('https://chemrxiv.org/engage/chemrxiv/article-details/'+ str(d['item']['id']))
105
+ pdf_categories.append(journal)
106
+ folder_names.append(self.folder_name)
107
+ pdf_years.append(d['item']['statusDate'][:4])
108
+ pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {journal} [{pdf_categories[i][0]}] ({pdf_years[i]}), (available at {pdf_urls[i]}).")
109
+ # overwrite the article-details URL with the direct PDF asset URL, since chemRxiv's page URL does not point at the downloadable file
110
+ pdf_urls[i] = d['item']['asset']['original']['url']
111
+ pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
112
+ self.all_pdf_info.append(pdf_info)
113
+
114
+
115
  if 'biorxiv' in self.XRxiv_servers or 'medrxiv' in self.XRxiv_servers:
116
  '''
117
+ Scrapes the bioRxiv and medRxiv HTML to get data from each entry in a search. Entries have the following formatting:
118
  <li class="first last odd search-result result-jcode-medrxiv search-result-highwire-citation">
119
  <div class="highwire-article-citation highwire-citation-type-highwire-article node" data-apath="/medrxiv/early/2021/02/18/2021.02.12.21251663.atom" data-pisa="medrxiv;2021.02.12.21251663v1" data-pisa-master="medrxiv;2021.02.12.21251663" id="node-medrxivearly202102182021021221251663atom1512875027"><div class="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix">
120
  <span class="highwire-cite-title">
 
170
  for i, pdf in enumerate(pdf_entries):
171
  pdf_titles.append(pdf.find('span', attrs={'class': 'highwire-cite-title'}).text.strip())
172
  pdf_authors.append(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip().split(', '))
 
173
  pdf_url = pdf.find('a', href=True)['href']
174
  if pdf_url[:4] != 'http':
175
  pdf_url = f'http://www.biorxiv.org'+ pdf_url
 
186
  return self.all_pdf_info
187
 
188
  def download_pdf(self):
 
 
 
 
189
  all_reference_text = []
190
  for i,p in enumerate(stqdm(self.all_pdf_info, desc='πŸ” Searching and downloading papers')):
191
  pdf_title=p[0]
 
193
  pdf_url=p[1]
194
  if pdf_category in ['medRxiv', 'bioRxiv']:
195
  pdf_url += '.full.pdf'
196
+ pdf_file_name=p[0].replace(':','').replace('/','').replace('.','').replace('\n','')
197
  folder_name=p[4]
198
  pdf_citation=p[5]
199
  r = requests.get(pdf_url, allow_redirects=True)