oceansweep committed on
Commit
8619cce
·
verified ·
1 Parent(s): 1b24d60

Upload 2 files

Browse files
App_Function_Libraries/Third_Party/Arxiv.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Arxiv.py
2
+ # Description: This file contains the functions for searching and ingesting arXiv papers.
3
+ import time
4
+
5
+ import arxiv
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ from datetime import datetime
9
+
10
+ from requests.adapters import HTTPAdapter
11
+ from urllib3 import Retry
12
+
13
+ #
14
+ # Local Imports
15
+ from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
16
+ #
17
+ #####################################################################################################
18
+ #
19
+ # Functions:
20
+
21
# Page size used as ``max_results`` when building arXiv API query URLs.
ARXIV_PAGE_SIZE = 10
23
+
24
+
25
def fetch_arxiv_pdf_url(paper_id):
    """Look up the PDF link of an arXiv paper through the export API.

    Args:
        paper_id: arXiv identifier, placed verbatim in the ``id_list``
            query parameter.

    Returns:
        The ``href`` of the ``<link title="pdf">`` element found in the
        returned feed, or ``None`` when the request fails or the feed
        contains no such link.
    """
    base_url = f"http://export.arxiv.org/api/query?id_list={paper_id}"

    # Configure retry strategy
    retry_strategy = Retry(
        total=3,  # Maximum number of retries
        status_forcelist=[429, 500, 502, 503, 504],  # Retry on these status codes
        backoff_factor=1,  # Exponential backoff factor
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)

    try:
        # The context manager releases pooled connections once we are done.
        with requests.Session() as http:
            http.mount("https://", adapter)
            http.mount("http://", adapter)
            # A timeout keeps a stalled connection from blocking forever;
            # Timeout is a RequestException and is reported below.
            response = http.get(base_url, timeout=30)
            response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"**Error:** {e}")
        return None

    # Delay between requests to avoid rate limiting
    time.sleep(2)

    soup = BeautifulSoup(response.text, 'xml')
    pdf_link = soup.find('link', title='pdf')
    # find() returns None when the feed has no pdf link (e.g. unknown id);
    # subscripting it directly would raise an uncaught TypeError.
    if pdf_link is None or not pdf_link.get('href'):
        print(f"**Error:** No PDF link found for arXiv paper '{paper_id}'")
        return None
    return pdf_link['href']
50
+
51
+
52
def search_arxiv(query):
    """Run a relevance-sorted arXiv search and return up to 10 hits.

    Args:
        query: Search string handed to ``arxiv.Search``.

    Returns:
        A list with one ``[title, paper_id, authors, summary]`` list per
        result, where ``paper_id`` is the last ``/``-separated segment of
        the result's ``entry_id`` and ``authors`` is a comma-separated
        string of author names.
    """
    relevance_search = arxiv.Search(
        query=query,
        max_results=10,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    api_client = arxiv.Client()

    return [
        [
            paper.title,
            paper.entry_id.split('/')[-1],  # Extract the ID from the entry_id
            ', '.join(person.name for person in paper.authors),
            paper.summary,
        ]
        for paper in api_client.results(relevance_search)
    ]
70
+
71
+
72
def fetch_arxiv_xml(paper_id):
    """Download the arXiv API response for a single paper.

    Args:
        paper_id: arXiv identifier appended to the ``id_list`` parameter.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.RequestException: On connection failure,
            timeout, or a non-success HTTP status.
    """
    base_url = "http://export.arxiv.org/api/query?id_list="
    # Without a timeout a stalled connection would block ingestion forever.
    response = requests.get(base_url + paper_id, timeout=30)
    response.raise_for_status()
    return response.text
77
+
78
+
79
def parse_arxiv_feed(xml_content):
    """Turn an arXiv feed into a list of per-paper dictionaries.

    Args:
        xml_content: XML text of the feed.

    Returns:
        One dict per ``<entry>`` with the keys ``id`` (text after
        ``/abs/`` in the entry id), ``title``, ``authors`` (comma-separated
        names), ``published`` (text before the ``T`` of the timestamp) and
        ``abstract`` (the entry summary).
    """
    def _clean(node):
        # Whitespace-trimmed text content of a parsed element.
        return node.text.strip()

    feed = BeautifulSoup(xml_content, 'xml')
    parsed = []
    for item in feed.find_all('entry'):
        paper_title = _clean(item.find('title'))
        arxiv_id = _clean(item.find('id')).split('/abs/')[-1]
        author_names = [_clean(person.find('name')) for person in item.find_all('author')]
        publish_date = _clean(item.find('published')).split('T')[0]
        summary_text = _clean(item.find('summary'))
        parsed.append({
            'id': arxiv_id,
            'title': paper_title,
            'authors': ', '.join(author_names),
            'published': publish_date,
            'abstract': summary_text,
        })
    return parsed
96
+
97
+
98
def build_query_url(query, author, year, start):
    """Build an arXiv API query URL for one page of search results.

    Args:
        query: Free-text search terms, matched against all fields; skipped
            when falsy.
        author: Author name, searched as a quoted phrase; skipped when falsy.
        year: Year used to restrict ``submittedDate`` to that calendar
            year; skipped when falsy.
        start: Zero-based offset of the first result to return.

    Returns:
        The full query URL. Filters are combined with ``AND``; with no
        filters the search query is ``all:*``. ``max_results`` is always
        ``ARXIV_PAGE_SIZE``.
    """
    from urllib.parse import urlencode

    # HTTP? FIXME....
    base_url = "http://export.arxiv.org/api/query?"
    search_terms = []

    # Build search query
    if query:
        search_terms.append(f"all:{query}")
    if author:
        search_terms.append(f'au:"{author}"')
    if year:
        search_terms.append(f'submittedDate:[{year}01010000 TO {year}12312359]')

    search_query = " AND ".join(search_terms) if search_terms else "all:*"

    # Percent-encode the user-supplied text: a raw '&', '#', '+' or '%' in
    # the query or author would otherwise corrupt or truncate the URL.
    # Spaces are emitted as '+', so terms are still joined by "+AND+".
    encoded_params = urlencode(
        {
            "search_query": search_query,
            "start": start,
            "max_results": ARXIV_PAGE_SIZE,
        },
        safe=":",
    )
    return f"{base_url}{encoded_params}"
115
+
116
def convert_xml_to_markdown(xml_content):
    """Render the first entry of an arXiv feed as a markdown document.

    Args:
        xml_content: XML text of the feed.

    Returns:
        A tuple ``(markdown, title, authors, categories)`` where
        ``authors`` and ``categories`` are lists of strings.

    Raises:
        ValueError: If the feed contains no ``<entry>`` element.
    """
    soup = BeautifulSoup(xml_content, 'xml')

    # Extract title, authors, abstract, and other relevant information from the specific paper entry
    entry = soup.find('entry')
    if entry is None:
        # Without this guard the next line fails with an opaque
        # "'NoneType' object has no attribute 'find'".
        raise ValueError("No <entry> element found in arXiv XML response")

    title = entry.find('title').text.strip()
    authors = [author.find('name').text.strip() for author in entry.find_all('author')]
    abstract = entry.find('summary').text.strip()
    published = entry.find('published').text.strip()

    # Skip malformed <category> elements lacking a term instead of raising KeyError.
    categories = [
        category['term']
        for category in entry.find_all('category')
        if category.has_attr('term')
    ]

    # Constructing a markdown representation for the paper
    markdown = "".join([
        f"# {title}\n\n",
        f"**Authors:** {', '.join(authors)}\n\n",
        f"**Published Date:** {published}\n\n",
        f"**Abstract:**\n\n{abstract}\n\n",
        f"**Categories:** {', '.join(categories)}\n\n",
    ])

    return markdown, title, authors, categories
136
+
137
+
138
def process_and_ingest_arxiv_paper(paper_id, additional_keywords):
    """Fetch an arXiv paper's metadata and store it as a media entry.

    Args:
        paper_id: arXiv identifier of the paper to ingest.
        additional_keywords: Extra keywords appended after the arXiv
            categories; ignored when falsy.

    Returns:
        A status message: a success string naming the paper's title, or
        ``"Error processing arXiv paper: ..."`` if any step raised.
    """
    try:
        xml_content = fetch_arxiv_xml(paper_id)
        markdown, title, authors, categories = convert_xml_to_markdown(xml_content)

        # Join from a list so an empty category list yields "arxiv"
        # rather than "arxiv," with a trailing empty keyword.
        keyword_parts = ["arxiv", *categories]
        if additional_keywords:
            keyword_parts.append(str(additional_keywords))
        keywords = ",".join(keyword_parts)

        add_media_with_keywords(
            url=f"https://arxiv.org/abs/{paper_id}",
            title=title,
            media_type='document',
            content=markdown,
            keywords=keywords,
            prompt='No prompt for arXiv papers',
            summary='arXiv paper ingested from XML',
            transcription_model='None',
            author=', '.join(authors),
            ingestion_date=datetime.now().strftime('%Y-%m-%d')
        )

        return f"arXiv paper '{title}' ingested successfully."
    except Exception as e:
        # Top-level boundary: failures are reported to the caller as a
        # message string instead of being raised.
        return f"Error processing arXiv paper: {str(e)}"
163
+
164
+ #
165
+ # End of Arxiv.py
166
+ ####################################################################################################
App_Function_Libraries/Third_Party/__init__.py ADDED
File without changes