gamingflexer committed on
Commit
59abadc
1 Parent(s): 9b39647

Add ArxivPaper class for scraping and extracting paper details

Browse files
Files changed (1) hide show
  1. src/scrapper/main.py +44 -0
src/scrapper/main.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from prompts.templates import reference_extraction
2
+ from scrapper.arxiv import get_paper_id,Arxiv
3
+ from scrapper.extractor import get_google_scrape,init_extractor
4
+ from tqdm import tqdm
5
+ import os
6
+
7
class ArxivPaper:
    """Find an author's arXiv papers via a Google scrape and save them as chunks.

    Typical flow: ``get_results_google`` -> ``get_paper_id`` ->
    ``get_paper_details_batch``.
    """

    def __init__(self, author_name: str, openai_api_key: "str | None" = None):
        """Store the author name and build the reference extractor.

        Args:
            author_name: Name used for the search query and for the output
                directory name.
            openai_api_key: Key forwarded to ``init_extractor``. When omitted,
                the ``OPENAI_API_KEY`` environment variable is used.

        Raises:
            ValueError: If no API key is passed and the environment variable
                is unset or empty.
        """
        # The module never defined a global OPENAI_API_KEY, so the bare name
        # raised NameError on every construction; resolve the key explicitly.
        api_key = openai_api_key or os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise ValueError(
                "No OpenAI API key: pass openai_api_key or set the "
                "OPENAI_API_KEY environment variable."
            )
        self.author_name = author_name
        self.extractor, self.text_splitter = init_extractor(
            template=reference_extraction['template'],
            openai_api_key=api_key,
        )

    def get_results_google(self, number_of_results: int = 25):
        """Search Google for the author's papers and keep the arxiv.org links.

        Args:
            number_of_results: Passed to ``get_google_scrape`` as ``num``.

        Returns:
            List of result links whose URL contains ``"arxiv.org"``; empty
            when the scrape yields no organic results.
        """
        result_dict = get_google_scrape(
            self.author_name + " research papers arxiv.org",
            num=number_of_results,
        )
        # NOTE(review): assumes the scrape returns a dict; the
        # 'organic_results' key may be absent when there are no hits, so
        # fall back to an empty list instead of raising KeyError.
        organic_results = result_dict.get('organic_results') or []
        paper_links = [
            entry['link']
            for entry in organic_results
            if "arxiv.org" in entry.get('link', '')
        ]
        print(f"Found {len(paper_links)} papers")
        return paper_links

    def get_paper_id(self, paper_link: list):
        """Derive paper ids from arxiv.org links.

        The id is the last path segment of the link; for links containing
        ``"pdf"`` anything from ``".pdf"`` onwards is stripped. Links that do
        not contain ``"arxiv.org"`` are ignored, and links that produce an
        empty id (e.g. a trailing slash) are dropped. Order and duplicates
        are preserved.

        Args:
            paper_link: Iterable of URL strings.

        Returns:
            List of non-empty id strings.
        """
        paper_ids = []
        for link in paper_link:
            if "arxiv.org" not in link:
                continue
            candidate = link.split("/")[-1]
            if "pdf" in link:
                candidate = candidate.split(".pdf")[0]
            # Skip every empty id; list.remove('') only dropped the first one.
            if candidate:
                paper_ids.append(candidate)
        return paper_ids

    def get_paper_details_batch(self, paper_ids: list, path: str = "./data/papers"):
        """Load, process and save each paper under an author-specific folder.

        For every id, an ``Arxiv`` object is created and its ``load``,
        ``get_meta``, ``get_refs``, ``chunker`` and ``save_chunks`` methods
        are called in that order. The output directory is ``path`` joined
        with the author name (spaces replaced by underscores).

        Args:
            paper_ids: Ids as produced by ``get_paper_id``.
            path: Base output directory.
        """
        path_author = os.path.join(path, self.author_name.replace(" ", "_"))
        for paper_id in tqdm(paper_ids):
            paper = Arxiv(paper_id)
            paper.load()
            paper.get_meta()
            # Return value is not used here; the call is kept because it
            # presumably records the references on the paper object.
            paper.get_refs(
                extractor=self.extractor,
                text_splitter=self.text_splitter,
            )
            paper.chunker()
            paper.save_chunks(include_metadata=True, path=path_author)