Commit a5f6a02
Parent(s): e428506

added chemrxiv
app.py CHANGED

@@ -25,7 +25,7 @@ api_key = st.text_input('OpenAI API Key',
                         placeholder='sk-...',
                         help=f"['What is that?']({api_key_url})",
                         type="password",
-                        value = '')
+                        value = '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')

 os.environ["OPENAI_API_KEY"] = f"{api_key}" #
 if len(api_key) != 51:

@@ -45,7 +45,7 @@ def search_click_callback(search_query, max_results, XRxiv_servers=[]):
 with st.form(key='columns_in_form', clear_on_submit = False):
     c1, c2 = st.columns([5, 0.8])
     with c1:
-        search_query = st.text_input("Input search query here:", placeholder='Keywords for most relevant search...', value=''
+        search_query = st.text_input("Input search query here:", placeholder='Keywords for most relevant search...', value='hemolytic peptides'
         )#search_query, max_results_current))

     with c2:
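For reference, the field being changed is a password-type Streamlit input whose value is exported to the environment and then length-checked. A minimal sketch of that pattern; defaulting to an existing OPENAI_API_KEY environment variable is my assumption here, not something the commit does:

```python
import os
import streamlit as st

# Sketch of the app.py pattern: password-style key input exported to the
# environment. The env-var fallback is an assumption, not part of the commit.
api_key = st.text_input('OpenAI API Key',
                        placeholder='sk-...',
                        type='password',
                        value=os.environ.get('OPENAI_API_KEY', ''))
os.environ['OPENAI_API_KEY'] = api_key
if len(api_key) != 51:  # same length check app.py applies
    st.warning('Enter a valid OpenAI API key')
```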
utils.py CHANGED

@@ -10,6 +10,7 @@ from bs4 import BeautifulSoup as bs
 from datetime import datetime
 from random import uniform as rand
 import numpy as np
+import json


 class XRxivQuery:
@@ -74,9 +75,46 @@ class XRxivQuery:
             pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
             self.all_pdf_info.append(pdf_info)

+        if 'chemrxiv' in self.XRxiv_servers:
+            '''
+            See https://chemrxiv.org/engage/chemrxiv/public-api/documentation#tag/public-apiv1items/operation/getPublicapiV1Items
+
+            '''
+            # Call the chemRxiv API
+            journal = 'chemRxiv'
+            max_chemrxiv_papers = max_papers_in_server[1]
+            chemrxiv_url = f'https://chemrxiv.org/engage/chemrxiv/public-api/v1/items?term="{"%20".join(search_query)}"&sort=RELEVANT_ASC&limit={max_chemrxiv_papers}'
+            req = urllib.request.Request(
+                url=chemrxiv_url,
+                headers={'User-Agent': 'Mozilla/5.0'}
+            )
+            s = urllib.request.urlopen(req).read()
+            jsonResponse = json.loads(s.decode('utf-8'))
+            pdf_titles = []
+            pdf_authors = []
+            pdf_urls = []
+            pdf_categories = []
+            folder_names = []
+            pdf_citation = []
+            pdf_years = []
+            for i, d in enumerate(jsonResponse['itemHits']):
+                pdf_titles.append(d['item']['title'].replace("\n", ""))
+                authors_dict = d['item']['authors']
+                pdf_authors.append([n['firstName'] + ' ' + n['lastName'] for n in authors_dict])
+                pdf_urls.append('https://chemrxiv.org/engage/chemrxiv/article-details/' + str(d['item']['id']))
+                pdf_categories.append(journal)
+                folder_names.append(self.folder_name)
+                pdf_years.append(d['item']['statusDate'][:4])
+                pdf_citation.append(f"{', '.join(pdf_authors[i])}, {pdf_titles[i]}. {journal} [{pdf_categories[i][0]}] ({pdf_years[i]}), (available at {pdf_urls[i]}).")
+                # overwrite the article-page URL with the direct PDF asset URL,
+                # since chemRxiv's article page does not serve the PDF itself
+                pdf_urls[i] = d['item']['asset']['original']['url']
+            pdf_info = list(zip(pdf_titles, pdf_urls, pdf_authors, pdf_categories, folder_names, pdf_citation))
+            self.all_pdf_info.append(pdf_info)
+
+
         if 'biorxiv' in self.XRxiv_servers or 'medrxiv' in self.XRxiv_servers:
             '''
-            Scraps the
+            Scrapes the bioRxiv and medRxiv HTML to get data from each entry in a search. Entries have the following formatting:
             <li class="first last odd search-result result-jcode-medrxiv search-result-highwire-citation">
             <div class="highwire-article-citation highwire-citation-type-highwire-article node" data-apath="/medrxiv/early/2021/02/18/2021.02.12.21251663.atom" data-pisa="medrxiv;2021.02.12.21251663v1" data-pisa-master="medrxiv;2021.02.12.21251663" id="node-medrxivearly202102182021021221251663atom1512875027"><div class="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix">
             <span class="highwire-cite-title">
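The new chemrxiv branch reduces to one GET request against the ChemRxiv public API plus a walk over the itemHits array. A minimal standalone sketch of the same call, shown for clarity; the helper name query_chemrxiv and the URL-encoding via urllib.parse.quote are illustrative additions, not part of the commit:

```python
# Standalone sketch of the ChemRxiv public-API call made in the diff above.
# Field names follow the response shape the commit relies on
# (itemHits -> item -> title/authors/id/statusDate/asset).
import json
import urllib.parse
import urllib.request

def query_chemrxiv(term, limit=5):
    # Same endpoint and parameters as the commit; quote() yields the %20
    # separators the f-string above builds by hand.
    url = ('https://chemrxiv.org/engage/chemrxiv/public-api/v1/items'
           f'?term="{urllib.parse.quote(term)}"&sort=RELEVANT_ASC&limit={limit}')
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req) as resp:
        payload = json.loads(resp.read().decode('utf-8'))
    papers = []
    for hit in payload['itemHits']:
        item = hit['item']
        papers.append({
            'title': item['title'].replace('\n', ''),
            'authors': [a['firstName'] + ' ' + a['lastName'] for a in item['authors']],
            'year': item['statusDate'][:4],
            # direct PDF asset, which the commit substitutes for the article page
            'pdf_url': item['asset']['original']['url'],
        })
    return papers

if __name__ == '__main__':
    for p in query_chemrxiv('hemolytic peptides', limit=3):
        print(p['year'], p['title'])
```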
@@ -132,7 +170,6 @@ class XRxivQuery:
             for i, pdf in enumerate(pdf_entries):
                 pdf_titles.append(pdf.find('span', attrs={'class': 'highwire-cite-title'}).text.strip())
                 pdf_authors.append(pdf.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip().split(', '))
-
                 pdf_url = pdf.find('a', href=True)['href']
                 if pdf_url[:4] != 'http':
                     pdf_url = f'http://www.biorxiv.org'+ pdf_url
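For context, the bioRxiv/medRxiv branch above pulls each field out of the highwire-citation markup quoted in the class docstring. A hedged sketch of that extraction step on a static snippet; the HTML here is a trimmed stand-in for a real search result:

```python
# Illustrative only: extract title, authors, and PDF link from one search
# result entry, using the same BeautifulSoup lookups as the diff.
from bs4 import BeautifulSoup as bs

html = '''
<li class="first last odd search-result result-jcode-medrxiv search-result-highwire-citation">
  <a href="/content/10.1101/2021.02.12.21251663v1">
    <span class="highwire-cite-title">Example paper title</span>
  </a>
  <span class="highwire-citation-authors">A. Author, B. Author</span>
</li>
'''

entry = bs(html, 'html.parser').find('li')
title = entry.find('span', attrs={'class': 'highwire-cite-title'}).text.strip()
authors = entry.find('span', attrs={'class': 'highwire-citation-authors'}).text.strip().split(', ')
pdf_url = entry.find('a', href=True)['href']
if pdf_url[:4] != 'http':  # relative links get the biorxiv host prepended
    pdf_url = 'http://www.biorxiv.org' + pdf_url
print(title, authors, pdf_url)
```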
@@ -149,10 +186,6 @@ class XRxivQuery:
         return self.all_pdf_info

     def download_pdf(self):
-        # if len(os.listdir(f'./{folder_name}') ) != 0:
-        # check folder is empty to avoid using papers from old runs:
-        #     os.remove(f'./{folder_name}/*')
-        # print(pdf_info)
         all_reference_text = []
         for i,p in enumerate(stqdm(self.all_pdf_info, desc='🔍 Searching and downloading papers')):
             pdf_title=p[0]
@@ -160,7 +193,7 @@ class XRxivQuery:
             pdf_url=p[1]
             if pdf_category in ['medRxiv', 'bioRxiv']:
                 pdf_url += '.full.pdf'
-            pdf_file_name=p[0].replace(':','').replace('/','').replace('.','')
+            pdf_file_name=p[0].replace(':','').replace('/','').replace('.','').replace('\n','')
             folder_name=p[4]
             pdf_citation=p[5]
             r = requests.get(pdf_url, allow_redirects=True)
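The chained .replace(...) calls strip characters that are awkward in file names, and the added .replace('\n','') covers chemRxiv titles that can contain newlines. An equivalent single-pass variant using re.sub, named here as an alternative rather than what the commit does:

```python
import re

def safe_pdf_name(title):
    # Drops the same characters the diff strips: ':', '/', '.', and newlines.
    return re.sub(r"[:/.\n]", "", title)

assert safe_pdf_name("A title: with/odd.chars\n") == "A title withoddchars"
```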
|