Spaces:
Runtime error
Runtime error
gamingflexer
committed on
Commit
•
59abadc
1
Parent(s):
9b39647
Add ArxivPaper class for scraping and extracting paper details
Browse files- src/scrapper/main.py +44 -0
src/scrapper/main.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from prompts.templates import reference_extraction
|
2 |
+
from scrapper.arxiv import get_paper_id,Arxiv
|
3 |
+
from scrapper.extractor import get_google_scrape,init_extractor
|
4 |
+
from tqdm import tqdm
|
5 |
+
import os
|
6 |
+
|
7 |
+
class ArxivPaper:
    """Find an author's arXiv papers via a Google search and save them as chunks.

    Typical flow: ``get_results_google`` -> ``get_paper_id`` ->
    ``get_paper_details_batch``.
    """

    def __init__(self, author_name: str):
        """Store the author name and build the reference extractor.

        Args:
            author_name: Name used in the search query and, with spaces
                replaced by underscores, as the per-author output directory.
        """
        self.author_name = author_name
        # This module never defines or imports an OPENAI_API_KEY name, so the
        # key is read from the environment instead of a bare global (which
        # raised NameError as soon as the class was instantiated).
        self.extractor, self.text_splitter = init_extractor(
            template=reference_extraction['template'],
            openai_api_key=os.getenv("OPENAI_API_KEY"),
        )

    def get_results_google(self, number_of_results: int = 25):
        """Search Google for the author's papers and keep the arxiv.org links.

        Args:
            number_of_results: Passed to ``get_google_scrape`` as ``num``.

        Returns:
            List of result links containing ``"arxiv.org"``; empty when the
            search response carries no ``'organic_results'``.
        """
        result_dict = get_google_scrape(
            self.author_name + " research papers arxiv.org",
            num=number_of_results,
        )
        # NOTE(review): the response looks like a SerpAPI-style payload where
        # 'organic_results' is absent when nothing is found - confirm against
        # get_google_scrape. Tolerate a missing key / missing 'link' either way.
        organic_results = (result_dict or {}).get('organic_results') or []
        paper_links = [
            entry['link']
            for entry in organic_results
            if "arxiv.org" in entry.get('link', '')
        ]
        print(f"Found {len(paper_links)} papers")
        return paper_links

    def get_paper_id(self, paper_link: list):
        """Derive paper ids from arxiv.org links.

        The id is the last path segment of the link with any ``.pdf`` suffix
        removed. Links that do not contain ``"arxiv.org"`` are ignored.

        Args:
            paper_link: Iterable of link strings.

        Returns:
            List of non-empty ids in first-seen order, without duplicates.
        """
        paper_ids = []
        seen = set()
        for link in paper_link:
            if "arxiv.org" not in link:
                continue
            # Splitting on ".pdf" is a no-op for links without that suffix,
            # so one expression covers both /abs/ and /pdf/ style links.
            paper_id = link.split("/")[-1].split(".pdf")[0]
            # Links ending in "/" yield an empty segment; drop every such
            # entry (not just the first) and skip ids already collected so
            # the same paper is not processed twice.
            if not paper_id or paper_id in seen:
                continue
            seen.add(paper_id)
            paper_ids.append(paper_id)
        return paper_ids

    def get_paper_details_batch(self, paper_ids: list, path: str = "./data/papers"):
        """Run the load / metadata / references / chunk / save steps per paper.

        Args:
            paper_ids: Ids passed one by one to ``Arxiv``.
            path: Base output directory; chunks are saved under
                ``<path>/<author name with spaces replaced by underscores>``.

        Returns:
            None. Results are written via ``paper.save_chunks``.
        """
        path_author = os.path.join(path, self.author_name.replace(" ", "_"))
        for paper_id in tqdm(paper_ids):
            paper = Arxiv(paper_id)
            paper.load()
            paper.get_meta()
            # The return value was never used; the call is kept because it
            # presumably records the references on `paper` - TODO confirm.
            paper.get_refs(
                extractor=self.extractor,
                text_splitter=self.text_splitter,
            )
            paper.chunker()
            paper.save_chunks(include_metadata=True, path=path_author)
|