Spaces:
Runtime error
Runtime error
File size: 1,817 Bytes
59abadc 59a1246 59abadc 59a1246 59abadc 59a1246 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
from prompts.templates import reference_extraction
from scrapper.arxiv import get_paper_id,Arxiv
from scrapper.extractor import get_google_scrape,init_extractor
from tqdm import tqdm
import os
from config import OPENAI_API_KEY
class ArxivPaper:
    """Find an author's arXiv papers via a Google scrape and process them.

    Typical flow: ``get_results_google`` -> ``get_paper_id`` ->
    ``get_paper_details_batch``.
    """

    def __init__(self, author_name: str):
        """Store the author name and build the reference extractor.

        Args:
            author_name: Name used to build the search query and the
                per-author output directory.
        """
        self.author_name = author_name
        # init_extractor returns a pair; both parts are handed to
        # ``Arxiv.get_refs`` later on.
        self.extractor, self.text_splitter = init_extractor(
            template=reference_extraction['template'],
            openai_api_key=OPENAI_API_KEY,
        )

    def get_results_google(self, number_of_results: int = 25) -> list:
        """Search Google for the author's papers and keep the arXiv links.

        Args:
            number_of_results: Forwarded to ``get_google_scrape`` as ``num``.

        Returns:
            The ``link`` of every organic result whose link contains
            ``"arxiv.org"``, in result order.

        Raises:
            KeyError: If the scrape result has no ``'organic_results'`` entry
                or a result has no ``'link'`` entry.
        """
        query = self.author_name + " research papers arxiv.org"
        result_dict = get_google_scrape(query, num=number_of_results)
        paper_links = [
            result['link']
            for result in result_dict['organic_results']
            if "arxiv.org" in result['link']
        ]
        print(f"Found {len(paper_links)} papers")
        return paper_links

    def get_paper_id(self, paper_link: list) -> list:
        """Extract paper ids from a list of arXiv links.

        The id is the last ``/``-separated segment of the link; for links
        containing ``"pdf"`` anything from ``.pdf`` onwards is cut off.

        Args:
            paper_link: Iterable of URL strings. Links that do not contain
                ``"arxiv.org"`` are ignored.

        Returns:
            Unique, non-empty ids in first-seen order.
        """
        paper_ids = []
        for link in paper_link:
            if "arxiv.org" not in link:
                continue
            candidate = link.split("/")[-1]
            if "pdf" in link:
                candidate = candidate.split(".pdf")[0]
            # A trailing slash yields an empty last segment. Filter it here,
            # per link, so that *every* empty id is dropped (a single
            # ``list.remove('')`` only drops the first one).
            if candidate:
                paper_ids.append(candidate)
        # The abs and pdf links of one paper map to the same id; keep the
        # first occurrence only so each paper is processed once.
        return list(dict.fromkeys(paper_ids))

    def get_paper_details_batch(self, paper_ids: list, path: str = "./data/papers") -> None:
        """Load, analyse, chunk and save every paper in ``paper_ids``.

        Args:
            paper_ids: Ids passed one by one to ``Arxiv``.
            path: Base output directory. Chunks are saved under
                ``<path>/<author name with spaces replaced by underscores>``.
        """
        path_author = os.path.join(path, self.author_name.replace(" ", "_"))
        for paper_id in tqdm(paper_ids):
            paper = Arxiv(paper_id)
            paper.load()
            paper.get_meta()
            # The return value is not used here. NOTE(review): presumably
            # get_refs also records the references on ``paper`` - confirm.
            paper.get_refs(
                extractor=self.extractor,
                text_splitter=self.text_splitter,
            )
            paper.chunker()
            paper.save_chunks(include_metadata=True, path=path_author)
|