gamingflexer committed on
Commit
59a1246
1 Parent(s): 424c175
src/scrapper/extractor.py CHANGED
@@ -1,5 +1,6 @@
1
  from langchain_community.chat_models import ChatOpenAI
2
- from langchain import PromptTemplate, LLMChain
 
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  import tiktoken
5
  from typing import Union
 
1
  from langchain_community.chat_models import ChatOpenAI
2
+ from langchain.prompts import PromptTemplate
3
+ from langchain.chains import LLMChain
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  import tiktoken
6
  from typing import Union
src/scrapper/main.py CHANGED
@@ -3,6 +3,7 @@ from scrapper.arxiv import get_paper_id,Arxiv
3
  from scrapper.extractor import get_google_scrape,init_extractor
4
  from tqdm import tqdm
5
  import os
 
6
 
7
  class ArxivPaper:
8
 
@@ -33,6 +34,7 @@ class ArxivPaper:
33
 
34
  def get_paper_details_batch(self, paper_ids: list, path: str = "./data/papers"):
35
  path_author = os.path.join(path, self.author_name.replace(" ", "_"))
 
36
  for i in tqdm(paper_ids):
37
  paper = Arxiv(i)
38
  paper.load()
@@ -41,4 +43,5 @@ class ArxivPaper:
41
  extractor=self.extractor,
42
  text_splitter=self.text_splitter,)
43
  paper.chunker()
44
- paper.save_chunks(include_metadata=True, path=path_author)
 
 
3
  from scrapper.extractor import get_google_scrape,init_extractor
4
  from tqdm import tqdm
5
  import os
6
+ from config import OPENAI_API_KEY
7
 
8
  class ArxivPaper:
9
 
 
34
 
35
  def get_paper_details_batch(self, paper_ids: list, path: str = "./data/papers"):
36
  path_author = os.path.join(path, self.author_name.replace(" ", "_"))
37
+ data = {}
38
  for i in tqdm(paper_ids):
39
  paper = Arxiv(i)
40
  paper.load()
 
43
  extractor=self.extractor,
44
  text_splitter=self.text_splitter,)
45
  paper.chunker()
46
+ paper.save_chunks(include_metadata=True, path=path_author)
47
+