File size: 1,817 Bytes
59abadc
 
 
 
 
59a1246
59abadc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59a1246
59abadc
 
 
 
 
 
 
 
59a1246
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from prompts.templates import reference_extraction
from scrapper.arxiv import get_paper_id,Arxiv
from scrapper.extractor import get_google_scrape,init_extractor
from tqdm import tqdm
import os
from config import OPENAI_API_KEY

class ArxivPaper:
    """Find an author's arXiv papers via a Google scrape and process them.

    Typical flow: ``get_results_google`` -> ``get_paper_id`` ->
    ``get_paper_details_batch``.
    """

    def __init__(self, author_name: str):
        """Store the author name and build the reference extractor.

        Args:
            author_name: Name used in the search query and, with spaces
                replaced by underscores, as the output sub-directory name.
        """
        self.author_name = author_name
        # init_extractor returns an (extractor, text_splitter) pair; both are
        # later handed to Arxiv.get_refs.
        self.extractor, self.text_splitter = init_extractor(
            template=reference_extraction['template'],
            openai_api_key=OPENAI_API_KEY,
        )

    def get_results_google(self, number_of_results: int = 25) -> list:
        """Search Google for the author's papers and keep the arXiv links.

        Args:
            number_of_results: Passed to ``get_google_scrape`` as ``num``.

        Returns:
            The ``link`` of every entry in the response's
            ``organic_results`` that contains ``"arxiv.org"``.

        Raises:
            KeyError: If the response has no ``organic_results`` key or an
                entry has no ``link`` key.
        """
        result_dict = get_google_scrape(
            self.author_name + " research papers arxiv.org",
            num=number_of_results,
        )
        paper_links = [
            entry['link']
            for entry in result_dict['organic_results']
            if "arxiv.org" in entry['link']
        ]
        print(f"Found {len(paper_links)} papers")
        return paper_links

    def get_paper_id(self, paper_link: list) -> list:
        """Extract paper ids from a list of arXiv links.

        The id is the last ``/``-separated segment of the link, with
        anything from ``.pdf`` onwards removed. Links that do not contain
        ``"arxiv.org"`` are ignored.

        Args:
            paper_link: Iterable of URL strings.

        Returns:
            Unique, non-empty ids in order of first appearance.
        """
        candidates = (
            # split(".pdf") is a no-op for links without a ".pdf" suffix, so
            # one expression covers both abstract and PDF links.
            link.split("/")[-1].split(".pdf")[0]
            for link in paper_link
            if "arxiv.org" in link
        )
        # Drop *every* empty id (links ending in "/"), not just the first,
        # and de-duplicate while preserving order so the same paper is not
        # processed more than once downstream.
        return list(dict.fromkeys(pid for pid in candidates if pid))

    def get_paper_details_batch(self, paper_ids: list, path: str = "./data/papers") -> None:
        """Load, chunk and save every paper in ``paper_ids``.

        For each id an ``Arxiv`` object is created and its ``load``,
        ``get_meta``, ``get_refs``, ``chunker`` and ``save_chunks`` methods
        are called in that order.

        Args:
            paper_ids: Ids as produced by ``get_paper_id``.
            path: Base output directory; chunks are saved under
                ``<path>/<author name with spaces replaced by "_">``.
        """
        path_author = os.path.join(path, self.author_name.replace(" ", "_"))
        for paper_id in tqdm(paper_ids):
            paper = Arxiv(paper_id)
            paper.load()
            paper.get_meta()
            # The return value is not used here; the call is kept because it
            # presumably records the references on `paper` -- TODO confirm.
            paper.get_refs(
                extractor=self.extractor,
                text_splitter=self.text_splitter,
            )
            paper.chunker()
            paper.save_chunks(include_metadata=True, path=path_author)